출처: https://blog.breezymind.com/2018/03/02/sklearn-feature_extraction-text-2/

In [35]:
import pandas as pd
import numpy as np
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

from konlpy.tag import Mecab
# Module-level MeCab tagger instance; tokenizer() calls mecab.pos() on it.
mecab = Mecab()


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# tokenizer: extracts index terms from a sentence. With the default `pos`
# only common nouns (NNG) and proper nouns (NNP) are kept. The function only
# filters MeCab's output; it performs no normalization or stemming itself.
def tokenizer(raw, pos=("NNG", "NNP"), stopword=(
        '수', '퀄리티', '도시', '분', '전문', '스타', '년', '원',
        '월', '화', '목', '금', '시', '앤', '일', '그램', '문')):
    """Return the index terms found in *raw*.

    The text is tagged with the module-level ``mecab`` tagger and each
    (word, tag) pair is kept only when all of the following hold:

    * the word is longer than one character,
    * its tag is contained in ``pos``,
    * the word is not contained in ``stopword``.

    Args:
        raw: Text to tokenize.
        pos: Collection of part-of-speech tags to keep.
        stopword: Collection of words to discard. Because single-character
            words are always dropped by the length filter, only entries of
            two or more characters have any effect.

    Returns:
        A list of the surviving words, in the order ``mecab.pos`` produced
        them.
    """
    # Defaults are tuples rather than lists so no mutable object is shared
    # between calls; callers may still pass lists, sets or tuples.
    return [
        word for word, tag in mecab.pos(raw)
        if len(word) > 1 and tag in pos and word not in stopword
    ]

# Test sentences: load the corpus from CSV, then display its (rows, columns) shape.
df = pd.read_csv("word2vec_wrangling.csv")
df.shape

(61, 2)

In [36]:
# Preview the first rows of the loaded corpus.
df.head()

Unnamed: 0,exercise_name,Content_txt
0,PT,"💯 What I try to educate my clients around, doe..."
1,검도,#20200115\n저녁 초대!\n와인잔 속에 비치는\n모든 것들이 화려한\n도심속...
2,기구필라테스,#오늘의동작\n캐딜락 동작의 완성 '행잉'\n⠀\n중력을 이용해 척추를 늘려주고\n...
3,다빈치바디보드,#mbn생생정보마당 \n#고투\n#고투GX\n#다빈치바디보드\n#생방송 #GOTOL...
4,드럼스틱,#드럼스틱 #고무팁 #테크라스틱 #전자드럼용스틱\n\n1. 전자드럼타격시 덜 시끄럽...


In [37]:
import re

def preprocessing(text):
    """Return *text* with everything except Hangul and ASCII letters blanked out.

    Each substitution below is applied in order, and every match is replaced
    by exactly one space, so removed characters leave a gap rather than
    joining their neighbours.
    """
    substitutions = (
        # Escaped newlines stored as two characters (a backslash followed by
        # "n"). Handled first so the "n" is not left behind as a letter.
        '\\\\n',
        # Anything that is not a Hangul syllable, a Hangul consonant/vowel
        # jamo, or an ASCII letter. Digits, punctuation, emoji and whitespace
        # all fall in this class; they sometimes carry meaning but are
        # deliberately discarded here.
        '[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]',
    )
    cleaned = text
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [38]:
# Clean the Content_txt column with preprocessing(), one value at a time.
# %time is the IPython magic that reports how long the statement took.
%time rawdata = df['Content_txt'].apply(preprocessing)

CPU times: user 1.05 s, sys: 7.03 ms, total: 1.06 s
Wall time: 1.08 s


In [45]:
vectorize = TfidfVectorizer(
    tokenizer=tokenizer,
    min_df=5,            # ignore terms that appear in fewer than 5 documents
    sublinear_tf=True    # use 1 + log(tf) so the tf value cannot grow without bound
)
# Learn the vocabulary and IDF weights from rawdata and build the
# document-term matrix: one row per document, one column per vocabulary term.
X = vectorize.fit_transform(rawdata)

print(X.toarray())

# Names of the features (terms) extracted from the documents, in the column
# order of X. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back only on
# older releases. tolist() keeps `features` a plain list of str either way.
if hasattr(vectorize, "get_feature_names_out"):
    features = vectorize.get_feature_names_out().tolist()
else:
    features = vectorize.get_feature_names()

[[0.         0.02304771 0.         ... 0.         0.         0.        ]
 [0.00731699 0.00520238 0.         ... 0.         0.         0.01417282]
 [0.01754004 0.01806715 0.         ... 0.         0.         0.        ]
 ...
 [0.01625052 0.02232546 0.         ... 0.01161061 0.         0.        ]
 [0.01834636 0.0212857  0.         ... 0.         0.         0.        ]
 [0.         0.02156329 0.         ... 0.         0.         0.07388052]]


In [46]:
# Number of terms in the fitted vocabulary (columns of X).
print(len(features))

9661


In [47]:
# Peek at the first 12 feature names.
features[:12]

['가게', '가격', '가경동', '가공', '가구', '가금', '가까이', '가네샤', '가늠', '가능', '가동', '가드']

In [55]:
# Extract the query's terms, keeping only those the vectorizer knows.
# vocabulary_ is a dict (term -> column index of X), so the membership test
# is a hash lookup instead of a scan over the whole feature list, and every
# term kept here is guaranteed to have a column index.
srch = [t for t in tokenizer('복근이 생기는 운동') if t in vectorize.vocabulary_]
print(srch)

['복근', '운동']


In [57]:
# Pull only the query terms' columns out of the document-term matrix.
# vocabulary_ maps each term to its column index in X; a direct lookup is
# used so an unknown term fails loudly instead of yielding a None index.
srch_cols = np.array(
    [vectorize.vocabulary_[term] for term in srch],
    dtype=np.intp,  # keeps an empty query a valid (integer) index array
)
# Slice the sparse matrix first and densify just the selected columns,
# rather than materializing the full dense matrix to keep a few columns.
srch_dtm = X[:, srch_cols].toarray()

print(len(srch_dtm))
srch_dtm

61


array([[0.03629959, 0.06352614],
       [0.00583699, 0.0338335 ],
       [0.02324348, 0.03261501],
       [0.01850878, 0.03615956],
       [0.        , 0.02087139],
       [0.00598205, 0.03129328],
       [0.        , 0.03310441],
       [0.02149545, 0.03386523],
       [0.02270879, 0.03693905],
       [0.        , 0.03386949],
       [0.01263085, 0.02884251],
       [0.01572474, 0.03272411],
       [0.01535327, 0.02905333],
       [0.        , 0.03782721],
       [0.01235037, 0.03787701],
       [0.02095056, 0.03649797],
       [0.019482  , 0.03464524],
       [0.        , 0.02868688],
       [0.00859096, 0.0286425 ],
       [0.01559038, 0.03315197],
       [0.        , 0.03190364],
       [0.        , 0.02639931],
       [0.01929779, 0.03356973],
       [0.01036368, 0.02669378],
       [0.01554211, 0.03055554],
       [0.02038296, 0.0312415 ],
       [0.00487456, 0.02702579],
       [0.01301678, 0.03419773],
       [0.02120869, 0.06090998],
       [0.        , 0.08576145],
       [0.

In [58]:
# Relevance score per document: the row-wise sum of the query terms' weights.
score = srch_dtm.sum(axis=1)
score

array([0.09982573, 0.0396705 , 0.05585849, 0.05466834, 0.02087139,
       0.03727532, 0.03310441, 0.05536068, 0.05964784, 0.03386949,
       0.04147337, 0.04844884, 0.04440661, 0.03782721, 0.05022738,
       0.05744853, 0.05412724, 0.02868688, 0.03723346, 0.04874235,
       0.03190364, 0.02639931, 0.05286752, 0.03705746, 0.04609764,
       0.05162445, 0.03190035, 0.04721451, 0.08211868, 0.08576145,
       0.05929364, 0.0518931 , 0.12866966, 0.0469615 , 0.06013438,
       0.02191491, 0.04939207, 0.03561605, 0.06098917, 0.05068541,
       0.04541064, 0.06200047, 0.05668582, 0.05663816, 0.06261358,
       0.026668  , 0.04610933, 0.06083362, 0.03585804, 0.12235442,
       0.03965288, 0.06465027, 0.04801312, 0.05146894, 0.06858613,
       0.10583776, 0.06413124, 0.02789233, 0.04843761, 0.0611904 ,
       0.05433567])

In [61]:
# Walk the documents from highest to lowest score and print an
# (exercise name, score) tuple for each one scoring above 0.065.
ranked = score.argsort()[::-1]
for doc_idx in ranked:
    doc_score = score[doc_idx]
    if doc_score > 0.065:
        print((df['exercise_name'].iloc[doc_idx], doc_score))

('에이리얼후프', 0.12866966339086583)
('패들핏', 0.12235441995412155)
('필라테스', 0.10583776104833163)
('PT', 0.09982573437229232)
('아쿠아테크', 0.08576145092896134)
('아쿠아바이크', 0.08211867738708024)
('플라잉필라테스', 0.06858613060515403)
