In [6]:
import pandas as pd
import numpy as np
from konlpy.tag import Twitter
import re
import os

import time
import hashlib
import json
from functools import partial

In [3]:
# Every input file shares the same "▒" delimiter and needs the python parser engine.
_read_delimited = partial(pd.read_csv, sep="▒", engine='python')

# Item table: missing cells are replaced by empty strings.
item_df = _read_delimited("data/sample_item.csv", na_values="").fillna("")

# Review table and the token score table.
review_df = _read_delimited("data/sample_review.csv", na_values="")
score_df = _read_delimited("data/score.csv")

# review_tag is stored as a JSON string; restore it to a Python list
# before the remaining missing cells are blanked out.
review_df['review_tag'] = review_df['review_tag'].apply(json.loads)
review_df = review_df.fillna("")

### 크롤링 데이터 후처리

#### 1. 고유한 명사 수 세기 (리뷰에 의미 있는 단어가 얼마나 들어 있는지에 대한 지표)

In [7]:
# Shared KoNLPy tagger instance used by the cells below.
# NOTE(review): newer KoNLPy releases appear to rename this tagger to Okt - confirm
# against the installed version.
twitter = Twitter()


def count_unique_nouns(text):
    """Return how many distinct nouns the tagger extracts from ``text``."""
    distinct_nouns = set(twitter.nouns(text))
    return len(distinct_nouns)

# Disabled: would store the per-review distinct-noun count as `review_accuracy`.
# review_df['review_accuracy'] = review_df.review.apply(count_unique_nouns)

#### 2. 리뷰의 고유 ID 지정해주기

In [None]:
def convert_hash(text):
    """Return the SHA-224 hex digest of ``text`` encoded as UTF-8."""
    return hashlib.sha224(text.encode("utf-8")).hexdigest()


# review_id = hash(nv_mid) followed by hash(review_atc + review_title + review_date),
# every field being cast to str first.
mid_digest = review_df.nv_mid.astype(str).apply(convert_hash)
review_key = (
    review_df.review_atc.astype(str)
    + review_df.review_title.astype(str)
    + review_df.review_date.astype(str)
)
review_df['review_id'] = mid_digest + review_key.apply(convert_hash)

#### 3. 리뷰 별 Tag값 지정하기

In [14]:
def calculate_tag(threshold,review):
    """Return the category labels whose summed token scores reach ``threshold``.

    The review is split into morphemes with the module-level ``twitter`` tagger
    (normalized and stemmed) and reduced to a set, so a token counts once no
    matter how often it occurs. Rows of the module-level ``score_df`` whose
    ``token`` is in that set are summed per ``type``.

    Parameters
    ----------
    threshold : number
        Minimum total score (inclusive) a type needs in order to be returned.
    review : str
        Raw review text.

    Returns
    -------
    numpy.ndarray
        The ``type`` labels whose total score is >= ``threshold``.
    """
    # Distinct morphemes found in the review.
    review_tokens = set(twitter.morphs(review, stem=True, norm=True))
    # Keep only the scored tokens that appear in the review.
    is_present = score_df.token.isin(review_tokens)
    matched = score_df.loc[is_present, ['score', 'type']]
    # Total score per category.
    totals = matched.groupby('type')['score'].sum()
    return totals[totals >= threshold].index.values

In [None]:
# Tag every review with the categories whose summed token score is at least 0.5.
tag_review = partial(calculate_tag, 0.5)
review_df['review_tag'] = review_df['review'].apply(tag_review)

In [None]:
def _tags_to_json(tags):
    """Serialize one review's tag collection as a JSON list string."""
    return json.dumps(list(tags))


# Store the tags as JSON text so they can be restored with json.loads on reload.
review_df['review_tag'] = review_df['review_tag'].apply(_tags_to_json)
# NOTE: this writes to the same path the review table was loaded from.
review_df.to_csv("data/sample_review.csv", index=False, sep='▒')

------------------- 
리뷰 별 Tag값 설정할 때의 Score는 Word2Vec 연산으로 도출해냄

Word2Vec 연산은 아래와 같음

In [8]:
start_time = time.time()
# `review_accuracy` (distinct-noun count per review) is only produced by the
# commented-out post-processing step, so on a fresh kernel it exists only if the
# CSV already carries it. Compute it here when it is missing instead of failing.
if 'review_accuracy' not in review_df.columns:
    review_df['review_accuracy'] = review_df.review.apply(count_unique_nouns)

# Reviews with review_accuracy > 5 start to look like real sentences, so only
# those sentence-like reviews are used as the Word2Vec training corpus.
reviews = review_df[review_df.review_accuracy > 5]
# Tokenize each selected review into normalized, stemmed morphemes.
review_morph = reviews.review.apply(partial(twitter.morphs,norm=True,stem=True))
# One token list per review.
sentences = review_morph.values
print('consumed time ---- {}'.format(time.time()-start_time))

consumed time ---- 594.6248161792755


In [16]:
from gensim.models import Word2Vec

In [35]:
start_time = time.time()
# Training hyperparameters, collected in one place.
w2v_params = dict(
    size=200,       # embed tokens in a 200-dimensional space
    window=2,       # look at up to 2 tokens on each side
    min_count=100,  # drop tokens that occur fewer than 100 times in the corpus
    workers=2,      # use 2 workers (CPUs)
    iter=100,       # 100 training iterations
    sg=1,           # of CBOW and Skip-Gram, train with the latter
)
embedding_model = Word2Vec(sentences, **w2v_params)
print('consumed time ---- {}'.format(time.time()-start_time))

consumed time ---- 2461.6247868537903


In [36]:
# Price: tokens close to the price-related seed words and away from the
# quality/brand words.
pos_tokens = ['가격','금액','저렴','비싼','값']
neg_tokens = ['품질','브랜드']
prices = embedding_model.wv.most_similar(positive=pos_tokens,negative=neg_tokens,topn=50)
# The seed words themselves are added with a fixed score of 0.5.
# The table is built in one go from a record list: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every appended row.
price_records = list(prices) + [(token, 0.5) for token in pos_tokens]
price_df = pd.DataFrame(price_records,columns=['token','score'])
price_df['type'] = '가격'


In [38]:
# Exploratory look at a longer (top 100) price neighbour list; relies on the
# pos_tokens / neg_tokens currently defined in the kernel.
prices = embedding_model.wv.most_similar(
    positive=pos_tokens,
    negative=neg_tokens,
    topn=100,
)
prices

[('싼', 0.5054810047149658),
 ('싸게', 0.5040369033813477),
 ('돈', 0.4785744249820709),
 ('비싸게', 0.4498073160648346),
 ('만원', 0.4479735791683197),
 ('비싸', 0.4366980791091919),
 ('비싸지', 0.4230850040912628),
 ('할인', 0.3967007100582123),
 ('이정', 0.3931207060813904),
 ('값어치', 0.39125561714172363),
 ('싼값', 0.3898414671421051),
 ('비싸긴', 0.3888387680053711),
 ('비싸서', 0.37992507219314575),
 ('잘산', 0.37859806418418884),
 ('반값', 0.37528109550476074),
 ('착한', 0.3733900487422943),
 ('싸서', 0.36421823501586914),
 ('비싸고', 0.3635864853858948),
 ('싸고', 0.3635164499282837),
 ('비용', 0.3596600294113159),
 ('배송비', 0.35479292273521423),
 ('쿠폰', 0.35332295298576355),
 ('싸지', 0.350532203912735),
 ('득템', 0.34816092252731323),
 ('제값', 0.3458954095840454),
 ('십만원', 0.3438461422920227),
 ('투자', 0.3431118130683899),
 ('비싸다', 0.337691068649292),
 ('시중', 0.33513981103897095),
 ('이만', 0.32668745517730713),
 ('높은', 0.3241426348686218),
 ('싸니', 0.32204920053482056),
 ('천원', 0.32103508710861206),
 ('송료', 0.3172229528427124

In [44]:
# Delivery: exploratory look at the top 150 neighbours of the delivery seed
# words, pushed away from the generic word for "item".
pos_tokens = ['배송', '배달', '택배']
neg_tokens = ['물건']
deliveries = embedding_model.wv.most_similar(
    positive=pos_tokens,
    negative=neg_tokens,
    topn=150,
)
deliveries

[('시공', 0.5436540246009827),
 ('출고', 0.47407475113868713),
 ('송이', 0.4582885503768921),
 ('아저씨', 0.44014111161231995),
 ('기사', 0.4315301179885864),
 ('해피', 0.4307760000228882),
 ('토요일', 0.41523978114128113),
 ('운송', 0.41475847363471985),
 ('전화', 0.40838292241096497),
 ('지방', 0.4030998647212982),
 ('발송', 0.4020189046859741),
 ('업체', 0.3958789110183716),
 ('오전', 0.39191991090774536),
 ('상담', 0.3891047239303589),
 ('날짜', 0.38860392570495605),
 ('AS', 0.38429200649261475),
 ('콜센터', 0.376819908618927),
 ('설치', 0.3753424882888794),
 ('전날', 0.37398844957351685),
 ('포장', 0.3729618489742279),
 ('경동', 0.37017127871513367),
 ('늦게', 0.3698875904083252),
 ('송전', 0.3687395453453064),
 ('운반', 0.3670470714569092),
 ('저녁', 0.36628586053848267),
 ('접수', 0.3660777509212494),
 ('배송비', 0.3649386167526245),
 ('도착', 0.3644053637981415),
 ('오셔', 0.36439308524131775),
 ('제조', 0.3626255691051483),
 ('배송지', 0.36095768213272095),
 ('집앞', 0.35982680320739746),
 ('지정', 0.35917165875434875),
 ('콜', 0.357774317264556

In [None]:
def _build_score_frame(similar, seed_tokens, label, seed_score=0.5):
    """Build the token/score/type table for one review category.

    Parameters
    ----------
    similar : iterable of (token, score) pairs
        Neighbour list as returned by ``most_similar``.
    seed_tokens : list of str
        The seed words of the category; each is added with ``seed_score``.
    label : str
        Category name stored in the ``type`` column of every row.
    seed_score : float, default 0.5
        Score given to the seed words. 0.5 equals the threshold passed to
        ``calculate_tag`` in this notebook, so a single seed word is enough to
        reach it.

    Returns
    -------
    pandas.DataFrame
        Columns ``token``, ``score`` and ``type`` with a fresh 0..n-1 index.
    """
    # Build the frame once from a record list: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every appended row.
    records = list(similar) + [(token, seed_score) for token in seed_tokens]
    frame = pd.DataFrame(records, columns=['token', 'score'])
    frame['type'] = label
    return frame


# Price
pos_tokens = ['가격','금액','저렴','비싼','값']
neg_tokens = ['품질','브랜드']
prices = embedding_model.wv.most_similar(positive=pos_tokens,negative=neg_tokens,topn=50)
price_df = _build_score_frame(prices, pos_tokens, '가격')

# Delivery
pos_tokens = ['배송','배달','택배']
neg_tokens = ['물건']
deliveries = embedding_model.wv.most_similar(positive=pos_tokens,negative=neg_tokens,topn=30)
delivery_df = _build_score_frame(deliveries, pos_tokens, '배송')

# Quality
pos_tokens = ['품질','성능','퀄리티']
neg_tokens = ['가격','서비스']
quality = embedding_model.wv.most_similar(positive=pos_tokens,negative=neg_tokens,topn=40)
quality_df = _build_score_frame(quality, pos_tokens, '품질')

# Design
pos_tokens = ['디자인','모양','스타일']
neg_tokens = ['색깔']
design = embedding_model.wv.most_similar(positive=pos_tokens,negative=neg_tokens,topn=80)
design_df = _build_score_frame(design, pos_tokens, '디자인')

# Assembly
pos_tokens = ['조립','설치','시공']
neg_tokens = ['배송','배치']
make = embedding_model.wv.most_similar(positive=pos_tokens,negative=neg_tokens,topn=50)
make_df = _build_score_frame(make, pos_tokens, '조립')

# Color (no negative words for this category)
pos_tokens = ['색','톤','색깔','색상','색감','색도']
color = embedding_model.wv.most_similar(positive=pos_tokens,topn=200)
color_df = _build_score_frame(color, pos_tokens, '색')

In [None]:
# Stack the per-category score tables into one lookup table; each table keeps
# its own row labels, so the combined index is not unique.
category_frames = [
    price_df,
    delivery_df,
    quality_df,
    design_df,
    make_df,
    color_df,
]
score_df = pd.concat(category_frames)