### Word2Vec

#### * 라이브러리

In [None]:
import pandas as pd
import numpy as np
import re
import networkx as nx
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')

#### * 파일 불러오기

In [None]:
# Load the final dataset and turn the pre-tokenised text columns into lists.
data = pd.read_excel('최종 데이터셋.xlsx')

# Renumber the rows 0..n-1 without materialising the old index as a column,
# then drop the leftover "Unnamed: 0" column
# (presumably an index column written by an earlier export - confirm).
data = data.reset_index(drop=True)
data.drop(['Unnamed: 0'], axis=1, inplace=True)

# Missing cells become empty strings so every row can be split below.
data = data.fillna('')

# Split each whitespace-separated token column into a list of tokens.
# str.split() without an argument is used instead of split(' '): an empty cell
# then yields [] rather than [''], and repeated spaces no longer produce
# empty-string tokens that would otherwise be counted as a word downstream.
for column in ('noun_va', 'nouns', 'va'):
    data[column] = data[column].apply(lambda text: text.split())

#### * 한글 글꼴 깨짐 방지

In [None]:
# Prevent broken Korean glyphs in matplotlib figures.
import matplotlib
from matplotlib import font_manager, rc
import platform

# Choose a Korean-capable font for the current operating system instead of
# unconditionally reading a Windows-only font file.
_system = platform.system()
if _system == 'Windows':
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
elif _system == 'Darwin':
    # NOTE(review): assumes the AppleGothic system font is available - confirm.
    font_name = 'AppleGothic'
else:
    # NOTE(review): assumes the NanumGothic font is installed on this machine - confirm.
    font_name = 'NanumGothic'
rc('font', family=font_name)

# Draw negative tick labels with an ASCII hyphen; Korean fonts commonly lack
# the Unicode minus glyph, which would otherwise render as an empty box.
matplotlib.rcParams['axes.unicode_minus'] = False

#### * Word2Vec 분석

In [None]:
# Train Word2Vec to inspect the distances between keywords.

from gensim.models import Word2Vec

model = Word2Vec(
    data['noun_va'],
    # Architecture: sg=1 selects skip-gram.
    sg=1,
    vector_size=300,
    window=5,
    # Objective: hs=0 turns hierarchical softmax off, so negative sampling
    # with 6 noise words is used.
    hs=0,
    negative=6,
    ns_exponent=0.75,
    # Vocabulary: keep only words seen at least 500 times and down-sample
    # very frequent words.
    min_count=500,
    sample=0.00001,
    # Training mechanics.
    batch_words=10000,
    workers=5,
)

# Keep the keyed vectors, the vocabulary in model order, and one vector per
# vocabulary entry (same order as `vocabs`).
word_vectors = model.wv
vocabs = list(word_vectors.index_to_key)
word_vectors_list = [word_vectors[token] for token in vocabs]

#### * PCA를 이용한 시각화

In [None]:
# Visualisation with PCA: project the word vectors onto two components.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
xys = pca.fit_transform(word_vectors_list)

# First column is the x coordinate, second column the y coordinate.
xs, ys = xys[:, 0], xys[:, 1]

In [None]:
def plot_2d_graph(vocabs, xs, ys):
    """Draw a 10x10 scatter plot and label every point with its word.

    Args:
        vocabs: Iterable of labels; the label at position i is attached to
            the point (xs[i], ys[i]).
        xs: Indexable sequence of x coordinates.
        ys: Indexable sequence of y coordinates.
    """
    plt.figure(figsize=(10, 10))
    plt.scatter(xs, ys, marker='o')
    for position, label in enumerate(vocabs):
        point = (xs[position], ys[position])
        plt.annotate(label, xy=point)

In [None]:
# Select the "nbagg" notebook backend via the IPython magic, then draw the
# PCA coordinates (xs, ys) with each point labelled by its vocabulary word.
# NOTE(review): pyplot is already imported at the top of the notebook; this
# second import is redundant but harmless.
import matplotlib.pyplot as plt
%matplotlib nbagg

plot_2d_graph(vocabs,xs,ys)

In [None]:
# Show the 50 words most similar to '충전' (charging). Repeat with other
# keywords such as 충전 (charging), 배터리 (battery), 비싸다 (expensive),
# 느리다 (slow).
model.wv.most_similar(positive=['충전'], topn=50)

#### * T-SNE를 이용한 시각화

In [None]:
# Visualisation with t-SNE: embed the word vectors in two dimensions.
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
# Pass an ndarray rather than a plain list of vectors.
# NOTE(review): some scikit-learn releases appear to read X.shape before
# validating the input, which fails on a list - confirm against the installed version.
transformed = tsne.fit_transform(np.asarray(word_vectors_list))

xs2 = transformed[:, 0]
ys2 = transformed[:, 1]

plt.figure(figsize=(10, 10))
# Plot the t-SNE coordinates. The previous code scattered the PCA coordinates
# (xs, ys) here, so the points did not line up with the labels placed at
# (xs2, ys2) below.
plt.scatter(xs2, ys2)

# Label each point with its vocabulary word.
for i, v in enumerate(vocabs):
    plt.annotate(v, xy=(xs2[i], ys2[i]))
plt.show()

#### * Word2Vec 모델 저장하고 로드하기 -> 구글 임베딩 프로젝터에 로드해서 시각화

In [None]:
from gensim.models import KeyedVectors

# Write the trained word vectors to disk in word2vec format ...
model.wv.save_word2vec_format(fname='word2vec')

# ... and read the same file back as a stand-alone KeyedVectors object.
loaded_model = KeyedVectors.load_word2vec_format(fname="word2vec")

In [None]:
# Shell command: run gensim's word2vec2tensor script on the saved "word2vec"
# file, using "word2vec" as the output prefix. The resulting files are meant
# to be loaded into the Embedding Projector for visualisation.
!python -m gensim.scripts.word2vec2tensor --input word2vec --output word2vec