출처: https://blog.breezymind.com/2018/03/02/sklearn-feature_extraction-text-2/

In [13]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None
np.random.seed(0)

from konlpy.tag import Mecab
mecab = Mecab()


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# tokenizer: extract index terms from a sentence. By default only words tagged
# NNG/NNP (Mecab noun tags) that are longer than one character and are not
# stopwords are kept; no normalization or stemming is applied here.
def tokenizer(raw, pos=("NNG", "NNP"), stopword=(
        '수', '퀄리티', '도시', '분', '전문', '스타', '년', '원',
        '월', '화', '목', '금', '시', '앤', '일', '그램', '문')):
    """Return the index terms found in ``raw`` as a list of words.

    Args:
        raw: Text to analyse with the module-level ``mecab`` tagger.
        pos: Container of part-of-speech tags to keep.
        stopword: Container of words to drop even when their tag matches.

    Returns:
        The words from ``mecab.pos(raw)`` that are longer than one
        character, whose tag is in ``pos`` and which are not in ``stopword``,
        in their original order.
    """
    # The defaults are tuples so they cannot be mutated between calls.
    # Single-character stopwords are already excluded by the length filter.
    return [
        word for word, tag in mecab.pos(raw)
        if len(word) > 1 and tag in pos and word not in stopword
    ]

# Load the dataset from CSV; later cells read its `exercise_name` and
# `Content_txt` columns.
df = pd.read_csv("word2vec_wrangling.csv")
df.shape

(61, 2)

In [14]:
df.head()

Unnamed: 0,exercise_name,Content_txt
0,PT,"💯 What I try to educate my clients around, doe..."
1,검도,#20200115\n저녁 초대!\n와인잔 속에 비치는\n모든 것들이 화려한\n도심속...
2,기구필라테스,#오늘의동작\n캐딜락 동작의 완성 '행잉'\n⠀\n중력을 이용해 척추를 늘려주고\n...
3,다빈치바디보드,#mbn생생정보마당 \n#고투\n#고투GX\n#다빈치바디보드\n#생방송 #GOTOL...
4,드럼스틱,#드럼스틱 #고무팁 #테크라스틱 #전자드럼용스틱\n\n1. 전자드럼타격시 덜 시끄럽...


In [15]:
import re

def preprocessing(text):
    """Reduce ``text`` to Hangul and ASCII letters separated by spaces.

    Each two-character sequence "backslash + n" becomes a single space.
    After that, every character that is not a Hangul syllable, a standalone
    Hangul jamo or an ASCII letter (digits, punctuation, emoji, whitespace)
    is replaced by one space, so no characters are collapsed.
    """
    # Handle escaped "\n" sequences first so the stray "n" is not kept as a letter.
    without_escapes = re.compile('\\\\n').sub(' ', text)
    # Special characters and emoji sometimes carry meaning, but are dropped here.
    return re.compile('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]').sub(' ', without_escapes)

In [16]:
# Clean every entry of the `Content_txt` column with preprocessing();
# `%time` is an IPython magic that reports how long the statement took.
%time rawdata = df['Content_txt'].apply(preprocessing)

CPU times: user 1.01 s, sys: 29.9 ms, total: 1.04 s
Wall time: 1.15 s


In [5]:
# Build the TF-IDF document-term matrix from the cleaned documents.
vectorize = TfidfVectorizer(
    tokenizer=tokenizer,
    min_df=5,            # ignore terms that occur in fewer than 5 documents
    sublinear_tf=True    # use 1 + log(tf) instead of tf so large counts are damped
)
X = vectorize.fit_transform(rawdata)

# X is a sparse matrix with one row per document and one column per feature.
print(X.toarray())

# Feature (term) names, index-aligned with the columns of X.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists, and keep `features` a plain list
# so the cells that use it behave the same on either version.
if hasattr(vectorize, "get_feature_names_out"):
    features = vectorize.get_feature_names_out().tolist()
else:
    features = vectorize.get_feature_names()

[[0.         0.02304771 0.         ... 0.         0.         0.        ]
 [0.00731699 0.00520238 0.         ... 0.         0.         0.01417282]
 [0.01754004 0.01806715 0.         ... 0.         0.         0.        ]
 ...
 [0.01625052 0.02232546 0.         ... 0.01161061 0.         0.        ]
 [0.01834636 0.0212857  0.         ... 0.         0.         0.        ]
 [0.         0.02156329 0.         ... 0.         0.         0.07388052]]


In [6]:
# Number of features (terms) kept by the vectorizer.
print(len(features))

9661


In [7]:
# Peek at the first 12 feature names.
features[:12]

['가게', '가격', '가경동', '가공', '가구', '가금', '가까이', '가네샤', '가늠', '가능', '가동', '가드']

In [9]:
# Tokenize the search sentence and keep only tokens present in the fitted
# vocabulary. vocabulary_ is a dict keyed by feature name, so the membership
# test does not scan the whole feature list for every token.
srch = [t for t in tokenizer('살 빠지는 운동, 다이어트') if t in vectorize.vocabulary_]
print(srch)

['운동', '다이어트']


In [10]:
# Extract only the query terms' columns from the document-term matrix.
# vectorize.vocabulary_ maps each feature to its column index in X.
srch_cols = [vectorize.vocabulary_[term] for term in srch]
# Slice the sparse matrix first and densify only the selected columns,
# instead of converting the whole matrix to a dense array.
srch_dtm = X[:, srch_cols].toarray()

print(len(srch_dtm))
srch_dtm

61


array([[0.06352614, 0.0552553 ],
       [0.0338335 , 0.02703658],
       [0.03261501, 0.02843541],
       [0.03615956, 0.03098877],
       [0.02087139, 0.01433861],
       [0.03129328, 0.02663654],
       [0.03310441, 0.03280607],
       [0.03386523, 0.02767317],
       [0.03693905, 0.03727112],
       [0.03386949, 0.03358315],
       [0.02884251, 0.02493719],
       [0.03272411, 0.02920457],
       [0.02905333, 0.02857701],
       [0.03782721, 0.03336418],
       [0.03787701, 0.03330625],
       [0.03649797, 0.0309647 ],
       [0.03464524, 0.03375994],
       [0.02868688, 0.01445444],
       [0.0286425 , 0.02518272],
       [0.03315197, 0.02619818],
       [0.03190364, 0.0316416 ],
       [0.02639931, 0.01724693],
       [0.03356973, 0.02866724],
       [0.02669378, 0.01658048],
       [0.03055554, 0.02441574],
       [0.0312415 , 0.02727438],
       [0.02702579, 0.0226714 ],
       [0.03419773, 0.02798583],
       [0.06090998, 0.04282384],
       [0.08576145, 0.04947413],
       [0.

In [11]:
# Relevance of each document: sum the query terms' TF-IDF weights row by row.
score = np.sum(srch_dtm, axis=1)
score

array([0.11878144, 0.06087009, 0.06105042, 0.06714834, 0.03521   ,
       0.05792982, 0.06591048, 0.0615384 , 0.07421017, 0.06745265,
       0.05377971, 0.06192868, 0.05763035, 0.07119139, 0.07118326,
       0.06746267, 0.06840518, 0.04314132, 0.05382522, 0.05935015,
       0.06354523, 0.04364623, 0.06223696, 0.04327426, 0.05497127,
       0.05851587, 0.04969719, 0.06218356, 0.10373383, 0.13523559,
       0.0681916 , 0.0671112 , 0.17227331, 0.05512534, 0.09558456,
       0.04266173, 0.06282246, 0.06628397, 0.07075479, 0.07006523,
       0.05832708, 0.07166561, 0.06676516, 0.06563571, 0.07134709,
       0.04320907, 0.0604957 , 0.07909717, 0.0507627 , 0.12235442,
       0.06749065, 0.06748331, 0.06325683, 0.0587346 , 0.07541042,
       0.19420956, 0.07681479, 0.04284953, 0.06466524, 0.06646528,
       0.07941882])

In [12]:
# Print (exercise name, score) from the highest score down, keeping only
# documents whose score is above 0.065.
ranked = score.argsort()[::-1]
for doc_idx in (j for j in ranked if score[j] > 0.065):
    print((df['exercise_name'].iloc[doc_idx], score[doc_idx]))

('필라테스', 0.19420956231914588)
('에이리얼후프', 0.17227330852319234)
('아쿠아테크', 0.13523558540317412)
('패들핏', 0.12235441995412155)
('PT', 0.11878144239726723)
('아쿠아바이크', 0.10373382751965243)
('요가쿠아', 0.09558456162754625)
('힙레', 0.07941882243444937)
('파운드핏', 0.0790971689574306)
('필록싱', 0.07681478914890219)
('플라잉필라테스', 0.07541042021477098)
('뮤직복싱', 0.07421017128839055)
('줌바', 0.07166561039781895)
('타바타', 0.07134708844525589)
('번지댄스', 0.07119138701656397)
('번지요가', 0.07118326121878153)
('점핑피트니스', 0.07075479457842335)
('조깅', 0.07006522786254865)
('복싱', 0.06840518099959292)
('암벽등반', 0.06819160000798577)
('펜싱', 0.06749064976822002)
('폴댄스', 0.06748330685316975)
('번지피트니스', 0.06746267048465115)
('바차타', 0.06745264847120716)
('다빈치바디보드', 0.06714833548626901)
('에어로빅', 0.06711120067304044)
('크로스핏', 0.06676515684623441)
('헬스', 0.06646528178858763)
('재즈댄스', 0.0662839745529735)
('라틴댄스', 0.06591048295033884)
('키네시스', 0.06563570888330617)
