In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

%matplotlib inline
# 시각화 결과가 선명하게 표시되도록
%config InlineBackend.figure_format = 'retina'
df = pd.read_csv("word2vec_wrangling.csv")
df.shape

(61, 2)

In [2]:
df.head()

Unnamed: 0,exercise_name,Content_txt
0,PT,"💯 What I try to educate my clients around, doe..."
1,검도,#20200115\n저녁 초대!\n와인잔 속에 비치는\n모든 것들이 화려한\n도심속...
2,기구필라테스,#오늘의동작\n캐딜락 동작의 완성 '행잉'\n⠀\n중력을 이용해 척추를 늘려주고\n...
3,다빈치바디보드,#mbn생생정보마당 \n#고투\n#고투GX\n#다빈치바디보드\n#생방송 #GOTOL...
4,드럼스틱,#드럼스틱 #고무팁 #테크라스틱 #전자드럼용스틱\n\n1. 전자드럼타격시 덜 시끄럽...


In [4]:
def preprocessing(text):
    # 개행문자 제거
    text = re.sub('\\\\n', ' ', text)
    # 특수문자 제거
    # 특수문자나 이모티콘 등은 때로는 의미를 갖기도 하지만 여기에서는 제거했습니다.
    # text = re.sub('[?.,;:|\)*~`’!^\-_+<>@\#$%&-=#}※]', '', text)
    # 한글, 영문, 숫자만 남기고 모두 제거하도록 합니다.
    # text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', text)
    # 한글, 영문만 남기고 모두 제거하도록 합니다.
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', text)
    return text

In [5]:
# %time을 찍어주면 해당 코드를 실행할 때 걸리는 시간을 출력해 줍니다
%time sentences = df['Content_txt'].apply(preprocessing)

CPU times: user 789 ms, sys: 6.42 ms, total: 795 ms
Wall time: 794 ms


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = 'word', # 캐릭터 단위로 벡터화 할 수도 있습니다.
                             tokenizer = None, # 토크나이저를 따로 지정해 줄 수도 있습니다.
                             preprocessor = None, # 전처리 도구
                             stop_words = None, # 불용어 nltk등의 도구를 사용할 수도 있습니다.
                             min_df = 2, # 토큰이 나타날 최소 문서 개수로 오타나 자주 나오지 않는 특수한 전문용어 제거에 좋습니다. 
                             ngram_range=(1,2), # BOW의 단위 갯수의 범위를 지정합니다.
                             max_features = 2000 # 만들 피처의 수, 단어의 수
                            )
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=2000, min_df=2,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
feature_vector = vectorizer.fit_transform(df['Content_txt'])
feature_vector.shape

(61, 2000)

In [19]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:500]

2000


['00',
 '00 00',
 '00 pm',
 '000',
 '000개',
 '000개 증가',
 '000명',
 '000원',
 '000원 좋아요',
 '000원 팔로워',
 '01',
 '010',
 '010 4811',
 '01044170198',
 '02',
 '03',
 '031',
 '032',
 '041',
 '043',
 '05',
 '051',
 '053',
 '06',
 '07',
 '070',
 '08',
 '09',
 '09 00',
 '10',
 '10 00',
 '10 30',
 '10 할인',
 '100',
 '10분',
 '10시',
 '10월',
 '11',
 '11 00',
 '11시',
 '11월',
 '12',
 '12 00',
 '12 202호',
 '12시',
 '12월',
 '13',
 '14',
 '15',
 '15일',
 '16',
 '17',
 '18',
 '19',
 '1pt',
 '1pt 그룹pt',
 '1개월',
 '1년',
 '1시간',
 '1월',
 '1인',
 '1일',
 '1회',
 '20',
 '2019',
 '2019년',
 '2020',
 '2020 01',
 '20200125',
 '2020년',
 '2020년 1월',
 '2020년 2월',
 '2020년도',
 '202호',
 '202호 서울시',
 '2030',
 '20만원',
 '21',
 '213',
 '213 12',
 '22',
 '23',
 '24',
 '24시간',
 '24일',
 '25',
 '26',
 '27',
 '27일',
 '28',
 '28일',
 '29',
 '2시',
 '2시간',
 '2시간 무료',
 '2월',
 '2일',
 '2층',
 '2호점',
 '30',
 '30 30',
 '30pm',
 '30분',
 '31',
 '31일',
 '35',
 '3570',
 '3570 따뜻한',
 '36',
 '36 현대하이엘',
 '3f',
 '3개월',
 '3시',
 '3월',
 '3일',
 '3층',
 '40',


In [20]:
# 각 리뷰마다 등장하는 단어에 빈도수가 표현됩니다. 0 은 등장하지 않음을 의미합니다.
pd.DataFrame(feature_vector[:10].toarray(), columns=vocab).head()

Unnamed: 0,00,00 00,00 pm,000,000개,000개 증가,000명,000원,000원 좋아요,000원 팔로워,...,휘트니스,휴무,힐링,힐링요가,힘든,힘을,힘이,힙레,힙합,힙합댄스
0,17,0,0,1,0,0,0,0,0,0,...,5,1,0,0,0,0,1,0,0,0
1,44,0,0,0,0,0,0,0,0,0,...,1,0,3,0,2,0,3,0,1,0
2,55,1,4,9,0,0,0,11,0,0,...,35,2,4,3,5,13,15,0,0,0
3,126,0,0,4,0,0,0,126,0,0,...,11,1,2,3,3,6,3,0,0,0
4,7,0,0,8,0,0,0,63,0,0,...,0,2,3,0,2,0,2,0,0,0


In [21]:
# 위에서 구한 단어벡터를 더하면 단어가 전체에서 등장하는 횟수를 알 수 있습니다.
# 벡터화 된 피처를 확인해 봄
# Bag of words 에 몇 개의 단어가 들어있는지 확인
dist = np.sum(feature_vector, axis=0)
    
df_freq = pd.DataFrame(dist, columns=vocab)
df_freq

Unnamed: 0,00,00 00,00 pm,000,000개,000개 증가,000명,000원,000원 좋아요,000원 팔로워,...,휘트니스,휴무,힐링,힐링요가,힘든,힘을,힘이,힙레,힙합,힙합댄스
0,2952,169,181,184,185,185,192,1464,226,186,...,497,209,823,468,208,258,254,658,302,200


In [22]:
# 행과 열의 축을 T로 바꿔주고 빈도수로 정렬
df_freq.T.sort_values(by=0, ascending=False).head(30)

Unnamed: 0,0
운동,9966
다이어트,9858
요가,8372
필라테스,8161
운동하는여자,6526
일상,4843
010,4468
함께,3748
있는,3438
운동하는남자,3375


In [24]:
df_freq_T = df_freq.T.reset_index()
df_freq_T.columns = ["keyword", "freq"]
# 강의명을 토큰 3개로 중복제거하기 위해, 강좌명에서 지식공유자의 이름을 빈문자열로 변경
df_freq_T["keyword"] = df_freq_T["keyword"]
df_freq_T["keyword"] = df_freq_T["keyword"].apply(lambda x : " ". join(x.split()[:4]))
df_freq_T.sort_values(["keyword", "freq"], ascending=False).head(10)

Unnamed: 0,keyword,freq
1999,힙합댄스,200
1998,힙합,302
1997,힙레,658
1996,힘이,254
1995,힘을,258
1994,힘든,208
1993,힐링요가,468
1992,힐링,823
1991,휴무,209
1990,휘트니스,497


In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)

In [26]:
%%time 
feature_tfidf = transformer.fit_transform(feature_vector)
feature_tfidf.shape

CPU times: user 6.82 ms, sys: 972 µs, total: 7.79 ms
Wall time: 6.77 ms


(61, 2000)

In [27]:
feature_tfidf

<61x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 60269 stored elements in Compressed Sparse Row format>

In [28]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 one-hot-vector에 TF-IDF 가중치 반영
tfidf_freq = pd.DataFrame(feature_tfidf.toarray(), columns=vocab)
tfidf_freq.head() 

Unnamed: 0,00,00 00,00 pm,000,000개,000개 증가,000명,000원,000원 좋아요,000원 팔로워,...,휘트니스,휴무,힐링,힐링요가,힘든,힘을,힘이,힙레,힙합,힙합댄스
0,0.005964,0.0,0.0,0.00056,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002551,0.000459,0.0,0.0,0.0,0.0,0.000394,0.0,0.0,0.0
1,0.015845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000524,0.0,0.001135,0.0,0.00085,0.0,0.001213,0.0,0.000743,0.0
2,0.037123,0.001269,0.005077,0.009697,0.0,0.0,0.0,0.008334,0.0,0.0,...,0.034349,0.001765,0.002836,0.003983,0.003983,0.010894,0.011364,0.0,0.0,0.0
3,0.026986,0.0,0.0,0.001368,0.0,0.0,0.0,0.03029,0.0,0.0,...,0.003426,0.00028,0.00045,0.001264,0.000758,0.001596,0.000721,0.0,0.0,0.0
4,0.001482,0.0,0.0,0.002704,0.0,0.0,0.0,0.014972,0.0,0.0,...,0.0,0.000554,0.000667,0.0,0.0005,0.0,0.000475,0.0,0.0,0.0


In [29]:
df_tfidf = pd.DataFrame(tfidf_freq.sum())
df_tfidf_top = df_tfidf.sort_values(by=0, ascending=False)
df_tfidf_top.head(10)

Unnamed: 0,0
운동,4.416694
필라테스,4.055385
다이어트,4.055243
요가,4.040307
운동하는여자,2.921596
일상,2.224476
아쿠아바이크,2.022201
플라잉요가,1.983279
헬스,1.885728
sup,1.823076


In [30]:
# 중간에 생략되는 단어를 자세히 보고자 할 때
for t in df_tfidf_top.index[:30]:
    print(t)

운동
필라테스
다이어트
요가
운동하는여자
일상
아쿠아바이크
플라잉요가
헬스
sup
010
yoga
패들보드
기구필라테스
운동하는남자
플라잉
함께
빈야사
스피닝
운동스타그램
아쉬탕가
문의
있는
수영
서핑
줌바
너무
발레
번지피트니스
pilates


In [None]:
from math import log10

# =======================================
# -- TF-IDF function
# =======================================
def f(t, d):
    # d is document == tokens
    return d.count(t)

def tf(t, d):
    # d is document == tokens
    return 0.5 + 0.5*f(t,d)/max([f(w,d) for w in d])

def idf(t, D):
    # D is documents == document list
    numerator = len(D)
    denominator = 1 + len([ True for d in D if t in d])
    return log10(numerator/denominator)

def tfidf(t, d, D):
    return tf(t,d)*idf(t, D)

def tokenizer(d):
    # return [ t for t in d.split() if len(t) > 1 ]
    return d.split()

def tfidfScorer(D):
    tokenized_D = [tokenizer(d) for d in D]
    result = []
    for d in tokenized_D:
        result.append([(t, tfidf(t, d, tokenized_D)) for t in d])
    return result


if __name__ == '__main__':
    corpus = df['Content_txt'].tolist()

    for i, doc in enumerate(tfidfScorer(corpus)):
        print('====== document[%d] ======' % i)
        print(doc)

        # http://blog.naver.com/PostView.nhn?blogId=vangarang&logNo=221072014624&parentCategoryNo=&categoryNo=35&viewDate=&isShowPopularPosts=true&from=search