# 015. word2vec 작성

- skipgram, window size 2 의 simplified word2vec model 작성

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Input
import numpy as np
import re

### Toy 말뭉치

In [14]:
# Toy English corpus, one sentence per entry. The first and ninth sentences
# carry non-letter noise (symbols, digits, Hangul jamo) that the regex
# cleaning cell strips out.
corpus = [
    'king is a very strong man $$@df212#@',
    'queen is a wise and pretty woman',
    'boy is a young man',
    'girl is a young and pretty woman',
    'prince is young and will be a strong and wise king',
    'princess is a young woman and will be a pretty and wise queen',
    'man is strong',
    'woman is pretty',
    'prince is a boy will be a kingㄱㅇㄹ',
    'princess is a girl will be a queen',
]

# corpus = ['왕은 매우 강한 남자이다',
#           '여왕은 현명한 예쁜 여자이다',
#           '소년은 젊은 남자이다',
#           '소녀는 젊은 예쁜 여자이다',
#           '왕자는 젊고 현명한 왕이 될 것이다',
#           '공주는 젊고 예쁜 현명한 여왕이 될 것이다',
#           '남자는 강하다',
#           '여자는 예쁘다',
#           '왕자는 왕이 될 소년이다',
#           '공주는 왕비가 될 소녀이다']

In [None]:
# Keep only ASCII letters and spaces in every sentence, echo the stripped
# sentence, and store a lower-cased copy of it.
cleaned_corpus = []
for raw_sentence in corpus:
    letters_only = re.sub(r'[^a-zA-Z ]', '', raw_sentence)  # English
    # letters_only = re.sub(r'[^가-힣 ]', '', raw_sentence)  # Korean
    print(letters_only)
    cleaned_corpus.append(letters_only.lower())

### stopword 제거

In [None]:
from konlpy.tag import Okt
okt = Okt()
# morphs() needs the text to analyze; calling it with no argument raises
# TypeError. Demonstrate the tokenizer on one sentence of the Korean variant
# of the corpus.
okt.morphs('왕은 매우 강한 남자이다')

In [5]:
# Drop stop words from every sentence and re-join the remaining tokens with
# single spaces.
stop_words = ['is', 'a', 'will', 'be', 'and']
# stop_words = ['은', '가', '이다', '는', '이', '될']  # Korean variant
results = []
# Filter the regex-cleaned, lower-cased sentences. Iterating the raw `corpus`
# here would silently discard the cleaning done in the previous cell.
for text in cleaned_corpus:
    # Korean variant: tokenize with okt.morphs(text) instead of text.split()
    kept = [word for word in text.split() if word not in stop_words]
    results.append(' '.join(kept))

In [None]:
cleaned_corpus = results
# Every later cell (vocabulary, sentence splitting, skip-gram pairs) reads
# `corpus`, so rebind it to the stop-word-free sentences; otherwise the
# preprocessing above never reaches the training data.
corpus = cleaned_corpus
cleaned_corpus

### vocabulary 모음 작성

In [None]:
# Collect every distinct token of the corpus.
# Sorting makes the word order, and therefore the word <-> index mapping
# derived from it, identical on every run; plain set iteration order is not
# stable across interpreter runs.
# split() without arguments matches the tokenization used to build
# `sentences` and never yields empty tokens.
words = sorted({word for sentence in corpus for word in sentence.split()})
words

### word-to-index, index-to-word 작성

- word 를 index 로 변환  

- sentence 를 word index 로 변환  

- window size 에 따라 train data 생성

In [8]:
# Lookup tables between each vocabulary word and its position in `words`.
word2index = {token: position for position, token in enumerate(words)}
index2word = dict(enumerate(words))

In [None]:
# Display the word -> index mapping.
word2index

### skip-gram 으로 training data 생성

In [None]:
# Tokenize each corpus sentence on whitespace into a list of words.
sentences = [line.split() for line in corpus]
sentences

In [11]:
WINDOW_SIZE = 2

# Build [center word, context word] skip-gram pairs: every word is paired
# with each word at most WINDOW_SIZE positions to its left or right within
# the same sentence.
data = []
for sentence in sentences:
    for idx, word in enumerate(sentence):
        start = max(idx - WINDOW_SIZE, 0)
        stop = min(idx + WINDOW_SIZE + 1, len(sentence))
        for pos in range(start, stop):
            # Skip only the center position itself. Comparing the words
            # instead would also drop genuine context occurrences of a word
            # that repeats inside the window.
            if pos != idx:
                data.append([word, sentence[pos]])

In [None]:
# Preview the first ten [input word, neighbor word] pairs.
data[:10]

In [None]:
import pandas as pd

# Tabulate the skip-gram pairs: first element of each pair goes to 'input',
# second to 'label'.
df = pd.DataFrame(data=data, columns=['input', 'label'])
df.head()

### One hot encoding

In [None]:
from tensorflow.keras.utils import to_categorical

# Vocabulary size; passed to to_categorical below as the one-hot vector length.
len(words)

### One hot encoding 된 train, label data

In [15]:
# One-hot encode every skip-gram pair: X holds the input (center) word,
# Y holds the label (context) word.
X = []  # input word
Y = []  # target word

for x, y in zip(df['input'], df['label']):
    X.append(to_categorical(word2index[x], len(words)))
    # The target must be encoded from the label word `y`; encoding `x` again
    # would train the network to reproduce its own input.
    Y.append(to_categorical(word2index[y], len(words)))

In [None]:
# Show the first three one-hot input vectors and the first three label vectors.
print(X[:3])
print(Y[:3])

In [17]:
# Stack the per-sample one-hot vectors into NumPy arrays for training.
X_train, Y_train = np.array(X), np.array(Y)

**시각화를 위해 hidden layer 의 unit 을 2 로 제한**

In [18]:
# Two Dense layers on top of a one-hot input of vocabulary size: a 2-unit
# linear projection (the word embedding) and an output layer with one unit
# per vocabulary word.
model = Sequential()
model.add(Input(shape=(len(words),)))
model.add(Dense(2))
# categorical_crossentropy expects class probabilities by default
# (from_logits=False), so the output layer needs a softmax activation.
model.add(Dense(len(words), activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the one-hot pairs for 500 epochs with mini-batches of 3 samples.
model.fit(X_train, Y_train, epochs=500, batch_size=3)

### 첫번째 Hidden Layer 추출 및 weight + bias 를 vector 로 합산

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Inspect the parameter arrays of the first layer in model.layers
# (presumably [kernel, bias] of the 2-unit Dense layer; confirm with the
# summary above).
model.layers[0].get_weights()

In [None]:
# Sum the first layer's two parameter arrays (weights + bias) into the
# word vectors.
kernel, bias = model.layers[0].get_weights()[:2]
vectors = kernel + bias
vectors[:5]

In [None]:
# Display the vocabulary list.
words

In [None]:
# Table of word vectors: the two vector columns as x1 / x2, plus the
# vocabulary word assigned to each row.
w2v = pd.DataFrame(vectors, columns=['x1', 'x2']).assign(word=words)
w2v

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
#한글 폰트 사용
# from matplotlib import font_manager
# import matplotlib
# font_path = "C:/Windows/Fonts/H2GTRM.TTF"                       #폰트 경로
# font_name = font_manager.FontProperties(fname=font_path).get_name()  #폰트 이름 얻어오기
# matplotlib.rc('font', family=font_name)                                 #font 지정
# matplotlib.rcParams['axes.unicode_minus'] = False               #한글사용시 마이너스 사인 깨짐 방지

fig, ax = plt.subplots(figsize=(12,8))

for word, x1, x2 in zip(w2v['word'], w2v['x1'], w2v['x2']):
    ax.annotate(word, (x1, x2))

PADDING = 1.0
x_axis_min = np.min(vectors, axis=0)[0] - PADDING
y_axis_min = np.min(vectors, axis=0)[1] - PADDING
x_axis_max = np.max(vectors, axis=0)[0] + PADDING
y_axis_max = np.max(vectors, axis=0)[1] + PADDING

plt.xlim(x_axis_min, x_axis_max)
plt.ylim(y_axis_min, y_axis_max)
plt.xlabel('x1')
plt.ylabel('x2')

plt.show()