출처: https://blog.breezymind.com/2018/03/02/sklearn-feature_extraction-text-2/

In [1]:
import pandas as pd
import numpy as np
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None
# Seed NumPy's global random number generator.
np.random.seed(0)

from konlpy.tag import Okt
# Shared KoNLPy Okt tagger instance; tokenizer() calls its .pos() method.
twitter = Okt()


# NOTE(review): CountVectorizer, linear_kernel and cosine_similarity are not
# used in the cells visible here — confirm before removing.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

def tokenizer(raw, pos=None, stopword=None):
    """Extract index terms from a sentence with the module-level Okt tagger.

    The text is POS-tagged with normalization and stemming enabled, then only
    tokens longer than one character, carrying one of the requested tags and
    not listed as stopwords, are kept.

    Args:
        raw: Text to tokenize.
        pos: Iterable of POS tags to keep. Defaults to nouns, alphabetic
            tokens, verbs and numbers.
        stopword: Iterable of words to drop. Defaults to none.

    Returns:
        List of the surviving (normalized, stemmed) tokens in input order.
    """
    # None sentinels instead of list defaults: a default list is shared by
    # every call and would leak state if it were ever mutated.
    if pos is None:
        pos = ("Noun", "Alpha", "Verb", "Number")
    # A set makes the per-token stopword test O(1) even for long lists.
    stopwords = frozenset(stopword) if stopword is not None else frozenset()

    tagged = twitter.pos(
        raw,
        norm=True,   # normalize, e.g. 그랰ㅋㅋ -> 그래ㅋㅋ
        stem=True,   # stem, e.g. 바뀌나 -> 바뀌다
    )
    return [
        word
        for word, tag in tagged
        if len(word) > 1 and tag in pos and word not in stopwords
    ]

# Test documents: load the corpus from CSV. Later cells read its
# 'Content_txt' and 'exercise_name' columns.
df = pd.read_csv("word2vec_wrangling.csv")
# Bare expression so the notebook displays the (rows, columns) shape.
df.shape

(61, 2)

In [2]:
df.head()

Unnamed: 0,exercise_name,Content_txt
0,PT,"💯 What I try to educate my clients around, doe..."
1,검도,#20200115\n저녁 초대!\n와인잔 속에 비치는\n모든 것들이 화려한\n도심속...
2,기구필라테스,#오늘의동작\n캐딜락 동작의 완성 '행잉'\n⠀\n중력을 이용해 척추를 늘려주고\n...
3,다빈치바디보드,#mbn생생정보마당 \n#고투\n#고투GX\n#다빈치바디보드\n#생방송 #GOTOL...
4,드럼스틱,#드럼스틱 #고무팁 #테크라스틱 #전자드럼용스틱\n\n1. 전자드럼타격시 덜 시끄럽...


In [3]:
import re

def preprocessing(text):
    """Blank out everything in *text* except Hangul and ASCII letters.

    Literal backslash-n sequences (the two characters, not a real newline)
    are replaced by a space first, so the trailing "n" cannot survive as a
    stray letter. Then every character that is not a Hangul syllable, a
    Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ) or an ASCII letter is replaced one-for-one
    by a space; digits, punctuation, emoji and whitespace all fall in
    that group.

    Args:
        text: Raw document string.

    Returns:
        The cleaned string, same length as the input after the first step.
    """
    escaped_newline = re.compile('\\\\n')
    disallowed_char = re.compile('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]')
    without_breaks = escaped_newline.sub(' ', text)
    return disallowed_char.sub(' ', without_breaks)

In [4]:
# Run preprocessing() on every 'Content_txt' entry; %time is an IPython magic
# that reports how long the statement took.
%time rawdata = df['Content_txt'].apply(preprocessing)

CPU times: user 985 ms, sys: 19.6 ms, total: 1 s
Wall time: 1.01 s


In [5]:
# Build the TF-IDF document-term matrix with the custom Korean tokenizer.
vectorize = TfidfVectorizer(
    tokenizer=tokenizer,
    min_df=5,            # drop terms that appear in fewer than 5 documents
    sublinear_tf=True,   # use 1 + log(tf) so term counts cannot grow unbounded
)
X = vectorize.fit_transform(rawdata)

# X is a sparse matrix with one row per document and one column per feature;
# it is densified here only for display.
print(X.toarray())

# Feature (vocabulary term) names, ordered by their column index in X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on old releases.
# list() keeps `features` a plain list, as later cells expect.
if hasattr(vectorize, "get_feature_names_out"):
    features = list(vectorize.get_feature_names_out())
else:
    features = vectorize.get_feature_names()

[[0.01845768 0.         0.00949002 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.01298815 0.        ]
 [0.         0.02342017 0.         ... 0.01069141 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.01089691 0.         0.02467502 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.06379826 0.        ]]


In [6]:
# Number of features (vocabulary terms) returned by the vectorizer.
print(len(features))

14098


In [7]:
# Display the first 12 feature names.
features[:12]

['ab',
 'abc',
 'abdominal',
 'able',
 'about',
 'abs',
 'abt',
 'ac',
 'academy',
 'account',
 'ace',
 'acrobatics']

In [8]:
# Tokenize the query sentence and keep only terms the fitted vectorizer knows.
# vocabulary_ is a dict (term -> column index), so each membership test is
# O(1) instead of a linear scan over the full feature-name list.
srch = [t for t in tokenizer('복근이 생기는 운동') if t in vectorize.vocabulary_]
print(srch)

['복근', '생기다', '운동']


In [9]:
# Pull only the query-term columns out of the document-term matrix.
# vocabulary_ maps each feature to its column index in X; indexing with []
# raises KeyError for an unknown term instead of passing None as an index.
srch_cols = [vectorize.vocabulary_[term] for term in srch]
# Slice the sparse matrix first and densify just the selected columns,
# rather than materializing the full dense matrix.
srch_dtm = X[:, srch_cols].toarray()

print(len(srch_dtm))
srch_dtm

61


array([[0.01171391, 0.00792608, 0.02049993],
       [0.00513336, 0.01197104, 0.02970128],
       [0.02089141, 0.01434384, 0.02915134],
       [0.01628533, 0.01316946, 0.03158734],
       [0.        , 0.01566441, 0.01792581],
       [0.00493877, 0.01215319, 0.02576911],
       [0.        , 0.00661835, 0.02554627],
       [0.01857743, 0.01282613, 0.02925842],
       [0.0208113 , 0.01213876, 0.03383111],
       [0.        , 0.00432354, 0.02644911],
       [0.01161136, 0.01094984, 0.02423812],
       [0.01360429, 0.01136392, 0.02828777],
       [0.01280858, 0.00804281, 0.02407355],
       [0.        , 0.01143355, 0.03262212],
       [0.00872243, 0.0142118 , 0.03306699],
       [0.01825001, 0.01334535, 0.03178166],
       [0.01673009, 0.00376882, 0.02967879],
       [0.        , 0.01330769, 0.02513198],
       [0.00715791, 0.01411042, 0.02382444],
       [0.01317643, 0.00784084, 0.02792821],
       [0.        , 0.00685088, 0.02494428],
       [0.        , 0.01369138, 0.02189571],
       [0.

In [10]:
# Per-document relevance score: sum of the TF-IDF weights of the query terms
# (one row of srch_dtm per document, one column per query term).
score = srch_dtm.sum(axis=1)
# Bare expression so the notebook displays the score vector.
score

array([0.04013992, 0.04680568, 0.06438658, 0.06104212, 0.03359023,
       0.04286107, 0.03216461, 0.06066198, 0.06678117, 0.03077264,
       0.04679932, 0.05325599, 0.04492494, 0.04405567, 0.05600122,
       0.06337703, 0.0501777 , 0.03843967, 0.04509277, 0.04894548,
       0.03179516, 0.03558709, 0.05857831, 0.0439106 , 0.0515506 ,
       0.05437354, 0.04188015, 0.05429697, 0.07344442, 0.07612417,
       0.0624695 , 0.05870695, 0.08823715, 0.05167066, 0.06889149,
       0.0312876 , 0.04949212, 0.03508067, 0.06571965, 0.05813553,
       0.0533715 , 0.05632672, 0.0512531 , 0.05683219, 0.06726428,
       0.03515854, 0.05252364, 0.06102369, 0.0399493 , 0.10587997,
       0.04233741, 0.06927108, 0.05421435, 0.05577485, 0.07353819,
       0.09692596, 0.06939733, 0.03655585, 0.05601148, 0.06686755,
       0.05038051])

In [11]:
# Walk the documents from highest to lowest score and print
# (exercise name, score) for every one scoring above 0.065.
ranking = score.argsort()[::-1]
hits = [
    (df['exercise_name'].iloc[doc_idx], score[doc_idx])
    for doc_idx in ranking
    if score[doc_idx] > 0.065
]
for hit in hits:
    print(hit)

('패들핏', 0.10587996957803501)
('필라테스', 0.09692596328676756)
('에이리얼후프', 0.08823715393762888)
('아쿠아테크', 0.07612417377408401)
('플라잉필라테스', 0.07353819008961215)
('아쿠아바이크', 0.0734444246412008)
('필록싱', 0.06939733047412792)
('폴댄스', 0.06927108455991757)
('요가쿠아', 0.06889149455714522)
('타바타', 0.0672642836739537)
('헬스', 0.06686754664399552)
('뮤직복싱', 0.06678116705520054)
('점핑피트니스', 0.0657196515343316)
