In [None]:
import re, os
import platform
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from matplotlib import font_manager, rc
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm

from konlpy.tag import Mecab
# Dictionary directory handed to MeCab (a machine-specific absolute path).
MECAB_DIC_PATH = '/usr/local/lib/mecab/dic/mecab-ko-dic'
# Morphological analyser; its `morphs` method is used further down to
# tokenise each review.
tokenizer = Mecab(dicpath=MECAB_DIC_PATH)

In [None]:
# Fix broken Korean (Hangul) text in Matplotlib figures by making NanumGothic
# the default font family.
# Note: the previous `font_manager.get_fontconfig_fonts()` call is gone on
# purpose -- its result was never used, and the function was deprecated in
# Matplotlib 3.5 and later removed, so it fails on current releases.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font_name = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font_name)
# Render the minus sign as an ASCII hyphen instead of U+2212; many Hangul
# fonts do not ship that glyph, which would turn negative tick labels into
# empty boxes.
rc('axes', unicode_minus=False)

# Input files are expected in ./data relative to the current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
print (data_dir)
print (plt.rcParams['font.family'])

In [None]:
# Load the review file, discard the 'id' and 'label' columns, and remove every
# row that contains a missing value.
data_file = os.path.join(data_dir, 'naver_review.txt')
train_data = (
    pd.read_table(data_file)
    .drop(columns=['id', 'label'])
    .dropna(how='any')
)

print (train_data.shape)
train_data.head()

In [None]:
%time train_data['document'] = [re.sub(r'\s+', ' ', sent).strip() for sent in train_data['document']]
%time train_data['document'] = [re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', sent).strip() for sent in train_data['document']]
train_data.head()

In [None]:
from pprint import pprint
from time import sleep

# Split every review into morphemes with the MeCab tokenizer; tqdm wraps the
# column to display a progress bar.
tokenized_data = list(map(tokenizer.morphs, tqdm(train_data['document'])))

# Short pause before printing -- presumably so the progress bar finishes
# rendering first (TODO confirm).
sleep(0.1)

# Show the tokens of the first review as a sanity check.
pprint (tokenized_data[0])

In [None]:
# 리뷰 길이 분포 확인
print('리뷰의 최대 길이 :',max(len(l) for l in tokenized_data))
print('리뷰의 평균 길이 :',sum(map(len, tokenized_data))/len(tokenized_data))

%matplotlib inline
plt.hist([len(s) for s in tokenized_data], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
%time model = Word2Vec(sentences = tokenized_data, size = 100, window = 5, min_count = 10, workers = 4, sg = 0)
print (model.wv.vectors.shape)

word_vectors = model.wv

vocabs = list(word_vectors.vocab.keys())
word_vectors = [word_vectors[v] for v in vocabs]
print (len(vocabs))
print (vocabs[:5])

In [None]:
# Look the embedding up through `model.wv`: indexing the Word2Vec model itself
# is deprecated in gensim 3.x and removed in gensim 4, whereas `model.wv[...]`
# works in both.
model.wv["스파이"]

In [None]:
# Top-10 most similar words for a given query word.
word = "첩보"
print (word, '\n')
similar_words = model.wv.most_similar(word)
pprint(similar_words)

In [None]:
# Similarity between two words, queried via `model.wv`: calling `similarity`
# on the Word2Vec model itself is deprecated in gensim 3.x and removed in
# gensim 4, whereas `model.wv.similarity` works in both.
model.wv.similarity("북한", "한국")

In [None]:
def plot_2d_graph(vocabs, x, y):
    """Scatter-plot 2-D points and label each one with its word.

    Args:
        vocabs: iterable of labels; the k-th label is drawn at (x[k], y[k]).
        x: indexable sequence of horizontal coordinates.
        y: indexable sequence of vertical coordinates.

    A new 10x10 figure is created on every call; nothing is returned.
    """
    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, marker='o')
    for idx, label in enumerate(vocabs):
        point = (x[idx], y[idx])
        plt.annotate(label, xy=point)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
xy = pca.fit_transform(word_vectors)

i = 100
%time plot_2d_graph(vocabs[:i], xy[:, 0][:i], xy[:, 1][:i])