출처: https://blog.breezymind.com/2018/03/02/sklearn-feature_extraction-text-2/

In [2]:
import pandas as pd
import numpy as np
# Setting this option to None turns off pandas' chained-assignment
# (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Seed NumPy's global random number generator.
np.random.seed(0)

from konlpy.tag import Okt
# Okt tagger instance; tokenizer() calls its pos() method.
twitter = Okt()


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# tokenizer: extracts index terms from a sentence, keeping only nouns, verbs,
# alphabetic words and numbers, with normalization and stemming applied.
def tokenizer(raw, pos=("Noun", "Alpha", "Verb", "Number"), stopword=()):
    """Return the index terms found in ``raw``.

    The text is tagged with the module-level Okt tagger (``twitter``) with
    normalization and stemming enabled, and the tagged words are filtered.

    Args:
        raw: Text to tokenize.
        pos: Part-of-speech tags to keep; any container supporting ``in``.
        stopword: Words to drop; any container supporting ``in``.

    Returns:
        A list, in tagging order, of the words that are longer than one
        character, whose tag is in ``pos`` and that are not in ``stopword``.
    """
    # The defaults are immutable tuples rather than lists so that no mutable
    # object is shared between calls.
    tagged = twitter.pos(
        raw,
        norm=True,  # normalize, e.g. 그랰ㅋㅋ -> 그래ㅋㅋ
        stem=True,  # stem, e.g. 바뀌나 -> 바뀌다
    )
    return [
        word
        for word, tag in tagged
        if len(word) > 1 and tag in pos and word not in stopword
    ]

# Test sentences: read the CSV file into a DataFrame.
df = pd.read_csv("word2vec_wrangling.csv")
# Display the (rows, columns) shape of the loaded frame.
df.shape

(61, 2)

In [3]:
# Preview the first rows of df.
df.head()

Unnamed: 0,exercise_name,Content_txt
0,PT,"💯 What I try to educate my clients around, doe..."
1,검도,#20200115\n저녁 초대!\n와인잔 속에 비치는\n모든 것들이 화려한\n도심속...
2,기구필라테스,#오늘의동작\n캐딜락 동작의 완성 '행잉'\n⠀\n중력을 이용해 척추를 늘려주고\n...
3,다빈치바디보드,#mbn생생정보마당 \n#고투\n#고투GX\n#다빈치바디보드\n#생방송 #GOTOL...
4,드럼스틱,#드럼스틱 #고무팁 #테크라스틱 #전자드럼용스틱\n\n1. 전자드럼타격시 덜 시끄럽...


In [4]:
import re

# Matches a backslash followed by the letter "n", i.e. a newline that is
# present in the text as the two literal characters "\" and "n".
_ESCAPED_NEWLINE = re.compile(r'\\n')
# Matches every character that is neither Hangul (syllables or jamo) nor an
# ASCII letter.
# Alternative from the original notebook: '[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]' would
# keep digits as well.
_NOT_HANGUL_OR_LATIN = re.compile('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]')


def preprocessing(text):
    """Return ``text`` with everything but Hangul and ASCII letters blanked.

    Each literal ``\\n`` sequence (backslash + "n") is first replaced by a
    single space, so that no stray "n" is left behind. After that every
    remaining character that is not Hangul or an ASCII letter (digits,
    punctuation, emoji, real newlines, ...) is replaced by a space. Special
    characters and emoticons can carry meaning, but they are dropped here.
    """
    without_escapes = _ESCAPED_NEWLINE.sub(' ', text)
    return _NOT_HANGUL_OR_LATIN.sub(' ', without_escapes)

In [5]:
# Apply preprocessing() to every value of the Content_txt column; the %time
# IPython magic reports how long the statement takes.
%time rawdata = df['Content_txt'].apply(preprocessing)

CPU times: user 950 ms, sys: 18.8 ms, total: 968 ms
Wall time: 1.13 s


In [6]:
vectorize = TfidfVectorizer(
    tokenizer=tokenizer,
    min_df=5,
    sublinear_tf=True,  # use 1 + log(tf) in place of tf so that tf cannot grow without bound
)
X = vectorize.fit_transform(rawdata)

# fit_transform, (sentence 5, feature 7)
# NOTE(review): this shape and the 5 x 7 array quoted below look like they were
# carried over from the source blog post's toy example; they do not match the
# output printed for this dataset -- confirm and remove if stale.

print(X.toarray())

# ([[0.        , 0.40824829, 0.81649658, 0.        , 0.        , 0.        , 0.40824829],
# [0.        , 0.40824829, 0.40824829, 0.        , 0.        , 0.        , 0.81649658],
# [0.41680418, 0.        , 0.        , 0.69197025, 0.41680418, 0.41680418, 0.        ],
# [0.76944707, 0.        , 0.        , 0.63871058, 0.        , 0.        , 0.        ],
# [0.        , 0.        , 0.        , 0.8695635 , 0.34918428, 0.34918428, 0.        ]])

# List of the features extracted from the sentences.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so use whichever this version provides.
# tolist() keeps `features` a plain list of str, as get_feature_names() returned.
if hasattr(vectorize, "get_feature_names_out"):
    features = vectorize.get_feature_names_out().tolist()
else:
    features = vectorize.get_feature_names()

[[0.01845768 0.         0.00949002 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.01298815 0.        ]
 [0.         0.02342017 0.         ... 0.01069141 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.01089691 0.         0.02467502 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.06379826 0.        ]]


In [10]:
# Print how many features the vectorizer extracted.
print(len(features))

14098


In [11]:
# Show the first 12 feature names.
features[:12]

['ab',
 'abc',
 'abdominal',
 'able',
 'about',
 'abs',
 'abt',
 'ac',
 'academy',
 'account',
 'ace',
 'acrobatics']

In [27]:
# Tokenize the search phrase and keep only the tokens that are known features.
query_tokens = tokenizer('다이어트')
srch = [token for token in query_tokens if token in features]
print(srch)

['다이어트']


In [28]:
# Pull out of the DTM only the columns of the features being searched for.
# vectorize.vocabulary_ maps a feature to the column index it has in the DTM.
# Direct lookup (instead of .get) fails with a KeyError naming the missing
# feature rather than passing None on as a column index.
srch_cols = np.array(
    [vectorize.vocabulary_[term] for term in srch],
    dtype=np.intp,  # explicit integer dtype so an empty search still indexes cleanly
)
# Slice the sparse matrix first and densify only the selected columns, rather
# than converting the whole documents x features matrix to a dense array.
srch_dtm = X[:, srch_cols].toarray()

print(len(srch_dtm))
srch_dtm

61


array([[0.017641  ],
       [0.0237774 ],
       [0.02528911],
       [0.02724556],
       [0.01239386],
       [0.0216599 ],
       [0.02558193],
       [0.02385327],
       [0.03424987],
       [0.02624148],
       [0.02098979],
       [0.02519342],
       [0.02386945],
       [0.02890046],
       [0.02915551],
       [0.02697332],
       [0.02901546],
       [0.01267853],
       [0.02098199],
       [0.02206395],
       [0.0247678 ],
       [0.0142348 ],
       [0.02526471],
       [0.01456254],
       [0.02049488],
       [0.02360243],
       [0.01913793],
       [0.02480334],
       [0.03867534],
       [0.04431178],
       [0.02268235],
       [0.0293995 ],
       [0.02471063],
       [0.02163656],
       [0.02883565],
       [0.01745941],
       [0.02195834],
       [0.02420044],
       [0.02967505],
       [0.02812108],
       [0.02442093],
       [0.02773653],
       [0.02447233],
       [0.02298574],
       [0.02977711],
       [0.01789315],
       [0.02467978],
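       [0.03061894],
       [0.01760445],
       [0.        ],
       [0.01998535],
       [0.02478731],
       [0.02585175],
       [0.02293988],
       [0.02896992],
       [0.08093068],
       [0.03114556],
       [0.01704866],
       [0.02854832],
       [0.02653628],
       [0.02759242]])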

In [29]:
# Per-document score: add up the selected feature weights along each row.
score = np.sum(srch_dtm, axis=1)
score

array([0.017641  , 0.0237774 , 0.02528911, 0.02724556, 0.01239386,
       0.0216599 , 0.02558193, 0.02385327, 0.03424987, 0.02624148,
       0.02098979, 0.02519342, 0.02386945, 0.02890046, 0.02915551,
       0.02697332, 0.02901546, 0.01267853, 0.02098199, 0.02206395,
       0.0247678 , 0.0142348 , 0.02526471, 0.01456254, 0.02049488,
       0.02360243, 0.01913793, 0.02480334, 0.03867534, 0.04431178,
       0.02268235, 0.0293995 , 0.02471063, 0.02163656, 0.02883565,
       0.01745941, 0.02195834, 0.02420044, 0.02967505, 0.02812108,
       0.02442093, 0.02773653, 0.02447233, 0.02298574, 0.02977711,
       0.01789315, 0.02467978, 0.03061894, 0.01760445, 0.        ,
       0.01998535, 0.02478731, 0.02585175, 0.02293988, 0.02896992,
       0.08093068, 0.03114556, 0.01704866, 0.02854832, 0.02653628,
       0.02759242])

In [30]:
# Walk the documents from highest to lowest score and print the exercise name
# and score of every document whose score is above the cut-off.
min_score = 0.065
for i in score.argsort()[::-1]:
    if not score[i] > min_score:
        continue
    print((df['exercise_name'].iloc[i], score[i]))

('필라테스', 0.08093067994559094)
