##**3. Word2Vec**
1. 주어진 단어들을 word2vec 모델에 들어갈 수 있는 형태로 만듭니다.
2. CBOW, Skip-gram 모델을 각각 구현합니다.
3. 모델을 실제로 학습해보고 결과를 확인합니다.
4. 산점도를 그려 단어들의 대략적인 위치를 확인해봅니다

### **필요 패키지 import**

In [2]:
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [3]:
# !pip install konlpy

In [4]:
from tqdm import tqdm
from konlpy.tag import Mecab,Twitter,Okt,Kkma
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

import torch
import copy
import numpy as np

### **데이터 전처리**



데이터를 확인하고 Word2Vec 형식에 맞게 전처리합니다.  

In [5]:
train_data = [
  "정말 맛있습니다. 추천합니다.",
  "기대했던 것보단 별로였네요.",
  "다 좋은데 가격이 너무 비싸서 다시 가고 싶다는 생각이 안 드네요.",
  "완전 최고입니다! 재방문 의사 있습니다.",
  "음식도 서비스도 다 만족스러웠습니다.",
  "위생 상태가 좀 별로였습니다. 좀 더 개선되기를 바랍니다.",
  "맛도 좋았고 직원분들 서비스도 너무 친절했습니다.",
  "기념일에 방문했는데 음식도 분위기도 서비스도 다 좋았습니다.",
  "전반적으로 음식이 너무 짰습니다. 저는 별로였네요.",
  "위생에 조금 더 신경 썼으면 좋겠습니다. 조금 불쾌했습니다."       
]

test_words = ["음식", "맛", "서비스", "위생", "가격"]

Tokenization과 vocab을 만드는 과정은 이전 실습과 유사합니다.

In [6]:
tokenizer = Okt()

In [7]:
def make_tokenized(data):
  tokenized = []
  for sent in tqdm(data):
    tokens = tokenizer.morphs(sent, stem=True)
    tokenized.append(tokens)

  return tokenized

In [8]:
train_tokenized = make_tokenized(train_data)

100%|██████████| 10/10 [00:02<00:00,  3.73it/s]


In [9]:
word_count = defaultdict(int)

for tokens in tqdm(train_tokenized):
  for token in tokens:
    word_count[token] += 1

100%|██████████| 10/10 [00:00<00:00, 119495.84it/s]


In [10]:
word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
print(list(word_count))

[('.', 14), ('도', 7), ('이다', 4), ('좋다', 4), ('별로', 3), ('다', 3), ('이', 3), ('너무', 3), ('음식', 3), ('서비스', 3), ('하다', 2), ('방문', 2), ('위생', 2), ('좀', 2), ('더', 2), ('에', 2), ('조금', 2), ('정말', 1), ('맛있다', 1), ('추천', 1), ('기대하다', 1), ('것', 1), ('보단', 1), ('가격', 1), ('비싸다', 1), ('다시', 1), ('가다', 1), ('싶다', 1), ('생각', 1), ('안', 1), ('드네', 1), ('요', 1), ('완전', 1), ('최고', 1), ('!', 1), ('재', 1), ('의사', 1), ('있다', 1), ('만족스럽다', 1), ('상태', 1), ('가', 1), ('개선', 1), ('되다', 1), ('기르다', 1), ('바라다', 1), ('맛', 1), ('직원', 1), ('분들', 1), ('친절하다', 1), ('기념일', 1), ('분위기', 1), ('전반', 1), ('적', 1), ('으로', 1), ('짜다', 1), ('저', 1), ('는', 1), ('신경', 1), ('써다', 1), ('불쾌하다', 1)]


In [11]:
w2i = {}
for pair in tqdm(word_count):
  if pair[0] not in w2i:
    w2i[pair[0]] = len(w2i)

i2w  ={v:k for k,v in w2i.items()}

100%|██████████| 60/60 [00:00<00:00, 1165084.44it/s]


In [12]:
print(train_tokenized)
print(w2i)

[['정말', '맛있다', '.', '추천', '하다', '.'], ['기대하다', '것', '보단', '별로', '이다', '.'], ['다', '좋다', '가격', '이', '너무', '비싸다', '다시', '가다', '싶다', '생각', '이', '안', '드네', '요', '.'], ['완전', '최고', '이다', '!', '재', '방문', '의사', '있다', '.'], ['음식', '도', '서비스', '도', '다', '만족스럽다', '.'], ['위생', '상태', '가', '좀', '별로', '이다', '.', '좀', '더', '개선', '되다', '기르다', '바라다', '.'], ['맛', '도', '좋다', '직원', '분들', '서비스', '도', '너무', '친절하다', '.'], ['기념일', '에', '방문', '하다', '음식', '도', '분위기', '도', '서비스', '도', '다', '좋다', '.'], ['전반', '적', '으로', '음식', '이', '너무', '짜다', '.', '저', '는', '별로', '이다', '.'], ['위생', '에', '조금', '더', '신경', '써다', '좋다', '.', '조금', '불쾌하다', '.']]
{'.': 0, '도': 1, '이다': 2, '좋다': 3, '별로': 4, '다': 5, '이': 6, '너무': 7, '음식': 8, '서비스': 9, '하다': 10, '방문': 11, '위생': 12, '좀': 13, '더': 14, '에': 15, '조금': 16, '정말': 17, '맛있다': 18, '추천': 19, '기대하다': 20, '것': 21, '보단': 22, '가격': 23, '비싸다': 24, '다시': 25, '가다': 26, '싶다': 27, '생각': 28, '안': 29, '드네': 30, '요': 31, '완전': 32, '최고': 33, '!': 34, '재': 35, '의사': 36, '있다': 37, '만족스럽다': 38, '상태

### 다음은 Word2Vec을 학습시키는 대표적인 방법인 Skipgram과 CBoW를 다룹니다. 

* CboW는 주변단어를 이용해, 주어진 단어를 예측하는 방법입니다.
* Skipgram은 중심 단어를 이용하여 주변 단어를 예측하는 방법입니다.
* 즉 데이터셋을 구성할때, input x 와 target y를 어떻게 설정하는지에 차이가 있습니다.

참고자료 

* https://simonezz.tistory.com/35 

* https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314 



실제 모델에 들어가기 위한 input을 만들기 위해 `Dataset` 클래스를 정의합니다.

In [13]:
class CBOWDataset(Dataset):
  def __init__(self, train_tokenized, window_size=2):
    self.x = [] # input word
    self.y = [] # target word

    for tokens in tqdm(train_tokenized):
      token_ids = [w2i[token] for token in tokens]
      for i, id in enumerate(token_ids):
        if i-window_size >= 0 and i+window_size < len(token_ids):
            ############################ ANSWER HERE ################################
            # TODO 1: insert tokens for input self.x
            # TODO 2: insert tokens for targets self.y
            #########################################################################        
            tokens_in_window = token_ids[i-window_size:i+window_size+1]
            self.x.append(tokens_in_window[:window_size] + token_ids[-window_size:])
            self.y.append(id)

    self.x = torch.LongTensor(self.x)  # (전체 데이터 개수, 2 * window_size)
    self.y = torch.LongTensor(self.y)  # (전체 데이터 개수)

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [14]:
class SkipGramDataset(Dataset):
  def __init__(self, train_tokenized, window_size=2):
    self.x = []
    self.y = []

    for tokens in tqdm(train_tokenized):
      token_ids = [w2i[token] for token in tokens]
      for i, id in enumerate(token_ids):
        if i-window_size >= 0 and i+window_size < len(token_ids):
            ############################ ANSWER HERE ################################
            # TODO 1: insert tokens for input self.x
            # TODO 2: insert tokens for targets self.y
            #########################################################################        
            tokens_in_window = token_ids[i-window_size:i+window_size+1]
            self.x.append(id)
            self.y.append(tokens_in_window[:window_size] + token_ids[-window_size:])


    self.x = torch.LongTensor(self.x)  # (전체 데이터 개수)
    self.y = torch.LongTensor(self.y)  # (전체 데이터 개수)

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

각 모델에 맞는 `Dataset` 객체를 생성합니다.

In [15]:
cbow_set = CBOWDataset(train_tokenized)
skipgram_set = SkipGramDataset(train_tokenized)
print(list(skipgram_set)[:10])
print(list(cbow_set)[:10])

100%|██████████| 10/10 [00:00<00:00, 127486.44it/s]
100%|██████████| 10/10 [00:00<00:00, 143150.31it/s]

[(tensor(0), tensor([17, 18, 10,  0])), (tensor(19), tensor([18,  0, 10,  0])), (tensor(22), tensor([20, 21,  2,  0])), (tensor(4), tensor([21, 22,  2,  0])), (tensor(23), tensor([ 5,  3, 31,  0])), (tensor(6), tensor([ 3, 23, 31,  0])), (tensor(7), tensor([23,  6, 31,  0])), (tensor(24), tensor([ 6,  7, 31,  0])), (tensor(25), tensor([ 7, 24, 31,  0])), (tensor(26), tensor([24, 25, 31,  0]))]
[(tensor([17, 18, 10,  0]), tensor(0)), (tensor([18,  0, 10,  0]), tensor(19)), (tensor([20, 21,  2,  0]), tensor(22)), (tensor([21, 22,  2,  0]), tensor(4)), (tensor([ 5,  3, 31,  0]), tensor(23)), (tensor([ 3, 23, 31,  0]), tensor(6)), (tensor([23,  6, 31,  0]), tensor(7)), (tensor([ 6,  7, 31,  0]), tensor(24)), (tensor([ 7, 24, 31,  0]), tensor(25)), (tensor([24, 25, 31,  0]), tensor(26))]





### **모델 Class 구현**

차례대로 두 가지 Word2Vec 모델을 구현합니다.  


*   `self.embedding`: `vocab_size` 크기의 one-hot vector를 특정 크기의 `dim` 차원으로 embedding 시키는 layer.
*   `self.linear`: 변환된 embedding vector를 다시 원래 `vocab_size`로 바꾸는 layer.


In [19]:
class CBOW(nn.Module):
  def __init__(self, vocab_size, dim):
    super(CBOW, self).__init__()
    self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
    self.linear = nn.Linear(dim, vocab_size)

  # B: batch size, W: window size, d_w: word embedding size, V: vocab size
  def forward(self, x):  # x: (B, 2W)
    embeddings = self.embedding(x)  # (B, 2W, d_w)
    embeddings = torch.sum(embeddings, dim=1)  # (B, d_w)
    output = self.linear(embeddings)  # (B, V)
    return output

In [20]:
class SkipGram(nn.Module):
  def __init__(self, vocab_size, dim):
    super(SkipGram, self).__init__()
    self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
    self.linear = nn.Linear(dim, vocab_size)

  # B: batch size, W: window size, d_w: word embedding size, V: vocab size
  def forward(self, x): # x: (B)
    embeddings = self.embedding(x)  # (B, d_w)
    output = self.linear(embeddings)  # (B, V)
    return output

두 가지 모델을 생성합니다.

In [21]:
cbow = CBOW(vocab_size=len(w2i), dim=256)
skipgram = SkipGram(vocab_size=len(w2i), dim=256)

### **모델 학습**

다음과 같이 hyperparamter를 세팅하고 `DataLoader` 객체를 만듭니다.

In [34]:
batch_size=4
learning_rate = 5e-4
num_epochs = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

cbow_loader = DataLoader(cbow_set, batch_size=batch_size)
skipgram_loader = DataLoader(skipgram_set, batch_size=batch_size)

첫번째로 CBOW 모델 학습입니다.

In [40]:
cbow.train()
cbow = cbow.to(device)
optim = torch.optim.SGD(cbow.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
  print("#" * 50)
  print(f"Epoch: {e}")
  for batch in tqdm(cbow_loader):
    x, y = batch
    x, y = x.to(device), y.to(device) # (B, W), (B)
    output = cbow(x)  # (B, V)
    print(output)
 
    optim.zero_grad()
    loss = loss_function(output, y)
    loss.backward()
    optim.step()

    print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


 62%|██████▎   | 10/16 [00:00<00:00, 92.54it/s]

tensor([[ 1.0408,  2.1834,  1.2132,  1.4921,  0.9885, -0.0318,  1.0552,  1.4811,
         -0.3071, -1.3791,  0.6100,  0.2818, -1.7696, -0.1848,  0.5822, -1.0957,
          2.9273, -1.0136, -0.5658,  1.5122,  0.0145, -1.4383,  0.3867, -0.1641,
          0.3945, -0.4107, -1.3123,  0.9716, -0.5681, -0.4191, -0.8905, -0.0285,
         -0.0716, -1.1472,  1.7334,  1.9473,  0.7373,  0.4110, -0.4248,  0.2013,
          0.6578, -0.3174,  0.7684,  0.1214, -0.2693,  0.2610,  1.4476,  0.4923,
         -0.2431,  0.3138,  0.7676, -0.4077, -1.7197, -0.2270, -0.4953, -0.7551,
          0.7694, -1.8131,  0.9691, -0.7655],
        [ 1.9161,  2.8341,  1.9032,  2.4667,  1.5939,  0.6072,  2.3792,  0.4431,
          0.6241, -0.8793, -0.0754,  0.7524, -2.3735,  0.5804,  1.3382, -1.6917,
          2.9280, -1.7718,  0.1946,  1.3966, -1.1989, -1.9288,  1.2821,  0.7063,
          1.1269, -0.0184, -0.9382,  1.0702,  0.0420,  0.0921, -0.1413, -0.5144,
          0.1458, -0.8280,  1.8154,  1.6058,  1.1314, -1.3854, 

100%|██████████| 16/16 [00:00<00:00, 77.82it/s]


Train loss: 3.478600025177002
##################################################
Epoch: 2


 31%|███▏      | 5/16 [00:00<00:00, 44.70it/s]

tensor([[ 1.1241e+00,  2.1979e+00,  1.2050e+00,  1.4702e+00,  9.8107e-01,
         -3.0833e-02,  1.0736e+00,  1.5363e+00, -2.7146e-01, -1.3316e+00,
          6.1634e-01,  3.1633e-01, -1.7822e+00, -1.6317e-01,  6.2183e-01,
         -1.1108e+00,  2.9510e+00, -1.0263e+00, -5.9447e-01,  1.5945e+00,
         -8.4604e-04, -1.4475e+00,  3.8468e-01, -1.8652e-01,  3.8117e-01,
         -4.3084e-01, -1.3224e+00,  9.7374e-01, -5.7808e-01, -4.2476e-01,
         -8.8331e-01, -3.8512e-02, -9.8279e-02, -1.1667e+00,  1.7225e+00,
          1.9402e+00,  7.3645e-01,  4.0461e-01, -4.3799e-01,  1.8294e-01,
          6.6351e-01, -3.1605e-01,  7.6643e-01,  1.0302e-01, -2.8674e-01,
          2.4609e-01,  1.4254e+00,  4.8909e-01, -2.6028e-01,  2.8463e-01,
          7.4791e-01, -4.1731e-01, -1.7294e+00, -2.3485e-01, -4.9412e-01,
         -7.4128e-01,  7.8883e-01, -1.8131e+00,  9.7036e-01, -7.8231e-01],
        [ 1.9978e+00,  2.8538e+00,  1.9034e+00,  2.4510e+00,  1.5795e+00,
          6.1310e-01,  2.4306e+00,  5

100%|██████████| 16/16 [00:00<00:00, 56.68it/s]


tensor([[ 5.0756e-01,  1.7137e+00,  1.2052e+00,  6.0762e-01,  1.6724e+00,
          3.7545e-01,  1.5995e+00, -1.0036e+00,  1.1134e+00,  1.3936e+00,
          1.2573e+00, -1.3693e+00, -3.9353e-01, -1.5589e+00,  1.7779e+00,
         -3.5952e-01,  5.4238e-01,  4.0802e-01,  7.1984e-01, -2.6484e-01,
         -2.0994e+00, -1.6772e+00, -2.5942e-01,  4.6564e-01,  3.8463e-01,
         -2.9140e+00,  3.4598e-01,  1.6125e+00,  7.5069e-01,  1.9438e+00,
         -1.0663e+00,  1.5605e-01,  5.1157e-01, -4.7815e-02, -3.1511e-01,
         -1.4068e+00,  1.4173e+00, -3.0518e+00,  6.8396e-02, -1.4308e+00,
          4.9795e-01,  1.5906e+00,  2.2986e+00, -6.5501e-02, -1.3708e+00,
         -1.8018e+00,  6.9489e-01, -2.3804e-02, -3.4622e-01,  6.4145e-01,
          7.6949e-01, -1.1104e+00, -1.1244e+00,  1.2998e+00,  1.2452e+00,
          8.2355e-01, -2.7892e-01,  4.4908e-01, -8.5986e-01, -1.7502e+00],
        [ 5.7597e-01,  1.7597e+00,  1.3242e+00,  1.6998e-01,  1.7493e+00,
          1.5420e-01,  1.0885e+00,  2

  0%|          | 0/16 [00:00<?, ?it/s]

tensor([[ 1.2042,  2.2107,  1.1966,  1.4501,  0.9744, -0.0298,  1.0893,  1.5889,
         -0.2368, -1.2869,  0.6223,  0.3495, -1.7944, -0.1442,  0.6586, -1.1254,
          2.9704, -1.0387, -0.6217,  1.6753, -0.0158, -1.4564,  0.3832, -0.2075,
          0.3688, -0.4499, -1.3319,  0.9758, -0.5873, -0.4295, -0.8763, -0.0483,
         -0.1236, -1.1854,  1.7127,  1.9338,  0.7358,  0.3983, -0.4507,  0.1653,
          0.6694, -0.3145,  0.7650,  0.0863, -0.3035,  0.2316,  1.4044,  0.4863,
         -0.2768,  0.2569,  0.7291, -0.4267, -1.7390, -0.2422, -0.4925, -0.7279,
          0.8071, -1.8125,  0.9719, -0.7986],
        [ 2.0724,  2.8696,  1.9028,  2.4388,  1.5668,  0.6191,  2.4761,  0.6766,
          0.7340, -0.6682, -0.0364,  0.8930, -2.4314,  0.6538,  1.5273, -1.7584,
          3.0921, -1.8283,  0.0670,  1.6750, -1.2643, -1.9694,  1.2617,  0.6404,
          1.0771, -0.0896, -0.9559,  1.1011,  0.0052,  0.0616, -0.1085, -0.5558,
          0.0309, -0.9158,  1.7676,  1.5614,  1.1445, -1.4102, 

 44%|████▍     | 7/16 [00:00<00:00, 64.80it/s]


tensor([[ 8.9326e-01,  1.7309e+00, -1.3143e-01,  1.1943e+00,  1.5738e+00,
          1.4460e+00, -5.7762e-01, -1.2826e+00,  4.5098e-01, -9.5921e-01,
         -4.2150e-01,  4.8303e-01, -5.3810e-01,  3.4137e+00, -7.5681e-01,
         -1.6627e+00,  3.4962e-01, -5.5041e-01, -9.8092e-01, -3.4196e-01,
         -2.1089e+00, -1.4512e+00, -8.7693e-02,  1.0105e+00,  1.7088e-01,
         -3.3177e-01, -2.2088e-01,  1.1069e+00,  3.8265e-01, -1.6402e+00,
         -9.1212e-01, -9.6126e-01,  1.7100e+00, -2.9530e-01,  3.2685e-01,
          1.0123e+00, -1.0413e-01, -2.1402e+00, -1.7984e-01, -5.1438e-01,
          1.8181e+00,  2.9878e-01, -2.1297e-02,  1.0010e+00, -4.5067e-02,
         -1.1383e+00, -1.1183e-01,  1.4565e+00, -1.7068e+00,  1.9955e+00,
         -5.4725e-01,  5.3043e-01, -1.6935e+00,  7.0751e-01,  5.4893e-01,
          1.1434e+00,  3.9499e-01, -7.6134e-01,  9.5237e-01, -1.5679e+00],
        [ 4.5132e-01,  1.6206e+00,  1.0663e+00,  7.9728e-01,  1.8225e+00,
          1.0625e+00,  3.0077e-02, -

100%|██████████| 16/16 [00:00<00:00, 61.23it/s]


Train loss: 3.3949594497680664
tensor([[ 0.3758,  0.8212,  0.2987,  0.1259,  3.3948,  0.6173,  1.1034,  0.2581,
          1.2792, -1.5741,  1.1073,  0.7127, -2.0127,  0.3084,  0.0284, -0.3838,
         -1.6985, -0.1244,  0.5136, -1.2909, -1.0297, -1.5925,  1.3362,  0.9368,
          1.7052, -0.4887,  1.2175,  0.3557, -0.8901, -0.6253, -0.2488, -1.3160,
          0.6627, -0.3838,  0.6993,  0.0209,  0.2486, -0.2093, -0.8450, -0.9063,
          0.4147,  0.5190, -0.3266,  0.5537, -0.8797, -1.8453,  1.4374, -1.1880,
         -0.4430,  0.7771,  1.4797, -0.9512, -0.7143,  0.7867,  1.4658,  2.1170,
          1.7049, -0.3191, -1.4444,  0.0701],
        [ 2.0723,  0.1254, -0.6464,  1.9000,  0.6121, -0.3385,  1.3187, -0.3949,
          0.8581, -1.3362, -0.8401, -0.7219,  0.4169,  0.5493,  0.4747, -0.7553,
          3.0687, -1.1834,  0.9275, -0.2755, -2.1997, -1.4076, -0.0434,  0.1269,
          0.7852, -1.0818,  1.5551,  0.5724,  1.1395, -0.1143, -0.0993,  0.1398,
         -0.5891, -1.2574, -0.95

  0%|          | 0/16 [00:00<?, ?it/s]

tensor([[ 1.2813,  2.2225,  1.1881,  1.4315,  0.9684, -0.0287,  1.1027,  1.6388,
         -0.2029, -1.2449,  0.6279,  0.3813, -1.8062, -0.1276,  0.6926, -1.1396,
          2.9858, -1.0508, -0.6477,  1.7546, -0.0303, -1.4651,  0.3822, -0.2272,
          0.3574, -0.4681, -1.3408,  0.9779, -0.5959, -0.4334, -0.8695, -0.0578,
         -0.1476, -1.2033,  1.7038,  1.9281,  0.7354,  0.3921, -0.4631,  0.1482,
          0.6756, -0.3127,  0.7642,  0.0712, -0.3198,  0.2176,  1.3844,  0.4839,
         -0.2928,  0.2306,  0.7113, -0.4359, -1.7482, -0.2491, -0.4904, -0.7148,
          0.8244, -1.8115,  0.9739, -0.8143],
        [ 2.1409,  2.8828,  1.9016,  2.4297,  1.5556,  0.6249,  2.5162,  0.7847,
          0.7861, -0.5726, -0.0181,  0.9589, -2.4591,  0.6824,  1.6122, -1.7902,
          3.1615, -1.8553,  0.0078,  1.8101, -1.2956, -1.9890,  1.2534,  0.6120,
          1.0552, -0.1217, -0.9629,  1.1164, -0.0109,  0.0492, -0.0928, -0.5758,
         -0.0222, -0.9570,  1.7471,  1.5414,  1.1516, -1.4223, 

 50%|█████     | 8/16 [00:00<00:00, 71.84it/s]

Train loss: 2.7686409950256348
tensor([[ 1.3721, -0.3274,  2.9621,  1.3231,  2.4083,  2.3280,  0.5460,  1.4978,
          0.1665, -1.1283, -0.2446,  1.6080, -0.5587,  1.0685,  0.9446,  2.1973,
          1.2852, -0.1319,  1.5625,  0.2087,  1.2524, -1.2393,  0.3426, -0.2401,
          0.1610,  0.4414, -0.0252, -0.1179, -0.6558, -0.4468, -0.7756, -0.9370,
          1.0402,  0.0609,  1.4847,  1.9933, -0.0206, -0.6351,  0.1737, -1.7953,
         -0.1710, -0.3934,  2.1484,  1.0707,  0.7863, -2.1642, -0.0852, -0.6520,
         -2.5179,  0.8769,  0.9014, -0.8982,  0.1806,  0.7664,  0.9104,  1.9254,
         -0.3950,  0.4641, -0.9962, -0.0048],
        [ 2.9333,  0.6717,  1.6226,  1.6136,  1.0176,  1.9468,  1.4382,  0.8714,
         -0.4084, -0.5539, -0.5364,  1.8047, -1.2331,  0.2697, -0.8387,  0.5412,
          2.2224, -1.0887,  0.4367,  0.0131,  0.6258, -1.2635,  0.3430,  1.0321,
          1.4354,  0.6910,  0.0999, -1.0297, -0.5258, -0.4695, -1.1578, -1.4154,
         -0.0770, -0.5336, -0.64

100%|██████████| 16/16 [00:00<00:00, 70.26it/s]


tensor([[ 1.8493e+00, -2.7032e-01,  6.0165e-01,  2.6675e-01, -1.4325e-01,
          5.8614e-01,  7.0172e-02, -1.5699e-01,  1.0918e-01, -1.2992e+00,
          1.5810e+00, -1.9221e-01,  1.7123e-01,  1.5156e+00,  1.0807e+00,
         -5.9603e-01,  2.2532e+00, -1.9469e+00,  4.3532e-01, -4.8185e-01,
         -1.4592e+00,  1.9758e-01,  9.7081e-02,  7.8672e-02, -3.4188e-01,
         -1.2625e+00, -3.1890e-01,  5.0958e-01,  1.8655e-01,  4.7922e-01,
          1.6011e+00, -1.3953e+00,  2.6137e-01, -1.5063e+00,  4.5220e-01,
          6.3522e-01, -5.6465e-01, -3.1825e+00, -6.0645e-01,  7.1095e-01,
         -4.8672e-01,  1.1093e+00,  5.3256e-01, -7.6919e-01,  4.3196e-01,
          3.1336e-01, -1.0811e+00,  1.4922e+00,  4.9946e-01, -1.0322e-01,
         -7.5980e-01, -1.5411e+00,  3.2891e-02, -4.0049e-01,  1.2113e+00,
          6.0220e-01,  5.8127e-01,  1.1863e+00,  8.5258e-01, -9.6051e-01],
        [ 3.2751e+00,  4.3884e-03, -4.4656e-01,  9.1940e-01, -5.3454e-01,
         -5.3378e-01,  8.9758e-01, -9

  0%|          | 0/16 [00:00<?, ?it/s]

tensor([[ 1.3559e+00,  2.2336e+00,  1.1795e+00,  1.4142e+00,  9.6309e-01,
         -2.7616e-02,  1.1139e+00,  1.6859e+00, -1.6993e-01, -1.2056e+00,
          6.3319e-01,  4.1180e-01, -1.8177e+00, -1.1299e-01,  7.2405e-01,
         -1.1534e+00,  2.9974e+00, -1.0625e+00, -6.7247e-01,  1.8324e+00,
         -4.4355e-02, -1.4737e+00,  3.8174e-01, -2.4563e-01,  3.4684e-01,
         -4.8526e-01, -1.3491e+00,  9.7998e-01, -6.0390e-01, -4.3663e-01,
         -8.6296e-01, -6.7082e-02, -1.7048e-01, -1.2205e+00,  1.6960e+00,
          1.9229e+00,  7.3514e-01,  3.8598e-01, -4.7505e-01,  1.3182e-01,
          6.8186e-01, -3.1078e-01,  7.6381e-01,  5.7535e-02, -3.3543e-01,
          2.0399e-01,  1.3654e+00,  4.8184e-01, -3.0830e-01,  2.0548e-01,
          6.9416e-01, -4.4484e-01, -1.7573e+00, -2.5548e-01, -4.8794e-01,
         -7.0203e-01,  8.4060e-01, -1.8101e+00,  9.7610e-01, -8.2950e-01],
        [ 2.2040e+00,  2.8944e+00,  1.8999e+00,  2.4233e+00,  1.5457e+00,
          6.3051e-01,  2.5512e+00,  8

 44%|████▍     | 7/16 [00:00<00:00, 60.86it/s]


tensor([[ 2.6032e+00,  2.2666e+00,  9.1454e-01,  1.0925e+00,  2.9840e+00,
          5.8831e-01,  8.8013e-01, -1.4537e-01,  8.2414e-01, -1.2560e+00,
         -1.0375e+00,  1.1815e+00, -2.2690e+00,  2.9834e+00,  1.3732e+00,
         -4.6490e-01,  4.9172e-01, -1.2270e+00, -4.0194e-01,  3.9218e-01,
         -7.2020e-01, -2.1606e+00,  1.0548e+00,  1.8800e+00,  4.1683e-01,
         -3.2273e-01,  7.7708e-01,  8.5619e-01, -1.5295e-02, -6.7061e-01,
         -1.2328e+00, -2.6205e+00,  4.9795e-01,  1.2160e-01,  1.0561e+00,
          7.5789e-01, -6.7984e-01, -2.6550e+00, -2.3898e+00, -1.7104e+00,
          1.1410e+00,  6.6527e-01, -1.5470e-01,  2.7437e+00, -7.7829e-02,
         -2.3207e+00,  6.0151e-01,  1.1733e+00, -2.5617e-01,  2.2673e-01,
         -7.2858e-01, -1.4145e+00, -1.3819e+00,  2.9084e-01,  1.1662e+00,
          2.3904e+00,  1.4533e+00,  4.4681e-01, -3.0819e-01, -2.6042e-01],
        [ 2.2566e+00,  2.4217e+00,  2.3618e+00,  8.8368e-01,  2.3808e+00,
          9.5306e-01,  1.2823e+00, -

100%|██████████| 16/16 [00:00<00:00, 55.87it/s]

tensor([[ 2.2928,  1.5597,  0.9299,  0.4562,  1.2856, -0.0552,  0.3171,  1.5097,
          1.3310, -0.2329, -1.0645,  1.1327, -0.5094,  1.6494, -0.2314,  0.2861,
         -0.9200, -0.7633,  1.1418, -0.3255,  1.3370,  0.3185,  0.6981, -0.3601,
          1.6702,  1.2993,  0.9100,  0.4981,  0.8854,  1.0403,  1.5080, -0.9647,
         -0.5078, -1.0496,  0.7191,  0.6505,  0.5623,  0.5525, -1.3839,  0.6696,
         -0.5389,  0.2563, -0.2601,  0.1391, -1.4496, -0.6300,  0.2446,  0.0659,
         -0.6867,  1.2920,  0.6573, -3.1035, -0.6859, -0.9168, -0.1487,  1.2326,
          0.2164, -0.9635, -0.9727, -0.1688],
        [ 2.8525,  1.0253,  0.9185,  0.8780,  2.7477,  0.1943,  0.1855,  0.7302,
          1.7005,  0.2246, -0.5303,  2.1373, -1.2680,  2.4642,  0.3563, -0.3696,
         -0.0556, -1.6344, -0.4409,  0.4635,  0.2592, -0.7241,  0.1481,  0.7987,
          1.2018,  1.6441,  0.4824,  0.1310,  1.3212,  1.2258, -1.1986, -0.8758,
         -0.5323,  0.6488,  0.1113,  0.9712,  0.2145,  0.1134, 




다음으로 Skip-gram 모델 학습입니다.

In [39]:
skipgram.train()
skipgram = skipgram.to(device)
optim = torch.optim.SGD(skipgram.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
  print("#" * 50)
  print(f"Epoch: {e}")
  for batch in tqdm(skipgram_loader):
    x, y = batch
    print(x, y)
    x, y = x.to(device), y.to(device) # (B, W), (B)
    output = skipgram(x)  # (B, V)
    print(output)

    optim.zero_grad()
    loss = loss_function(output, y)
    loss.backward()
    optim.step()

  print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


  0%|          | 0/16 [00:00<?, ?it/s]

tensor([ 0, 19, 22,  4]) tensor([[17, 18, 10,  0],
        [18,  0, 10,  0],
        [20, 21,  2,  0],
        [21, 22,  2,  0]])
tensor([[ 8.4696e-01,  4.5186e-01, -7.6211e-01, -9.8847e-01, -5.1020e-01,
          8.4968e-01,  4.3323e-01, -1.3089e-01, -6.1150e-04, -4.7985e-01,
         -4.8498e-01,  1.8916e-01,  2.7265e-01,  2.1745e-01,  1.8405e-01,
          1.0745e-01,  5.1802e-01, -3.8191e-01, -4.9379e-01, -9.7829e-01,
         -6.9195e-01, -2.3854e-01,  3.3708e-01,  8.6720e-01, -3.6726e-01,
         -2.3638e-01, -2.6499e-01, -5.7010e-01, -1.4501e-01, -8.5553e-01,
          9.4232e-01, -6.7211e-01, -1.2916e+00, -7.1563e-01, -3.9079e-01,
         -4.5412e-01, -9.7598e-01, -5.9034e-02, -2.7046e-01, -1.6879e-01,
         -1.6920e-01, -7.4195e-01, -3.7926e-01, -3.5485e-01,  3.7658e-01,
         -4.6398e-01, -1.7111e+00, -4.9455e-01,  1.6241e-01, -4.0992e-01,
          3.8433e-02,  3.9330e-01, -1.5288e+00,  1.1906e+00, -1.7570e-01,
          4.9121e-01, -4.0274e-01, -2.0103e-01, -6.9216e




RuntimeError: ignored

### **테스트**

학습된 각 모델을 이용하여 test 단어들의 word embedding을 확인합니다.

In [None]:
for word in test_words:
  input_id = torch.LongTensor([w2i[word]]).to(device)
  emb = cbow.embedding(input_id)

  print(f"Word: {word}")
  print(emb.squeeze(0))

In [None]:
for word in test_words:
  input_id = torch.LongTensor([w2i[word]]).to(device)
  emb = skipgram.embedding(input_id)

  print(f"Word: {word}")
  print(max(emb.squeeze(0)))

In [None]:
test_words

In [None]:
i2w[25]

In [None]:
def most_similar(word,top_k=5):
  input_id = torch.LongTensor([w2i[word]]).to(device)
  input_emb = skipgram.embedding(input_id)
  score=torch.matmul(input_emb,skipgram.embedding.weight.transpose(1,0)).view(-1)

  _,top_k_ids=torch.topk(score,top_k)

  return [i2w[word_id.item()] for word_id in top_k_ids][1:]

In [None]:
most_similar("가격")

## Word2Vec 시각화

In [None]:
from sklearn.decomposition import PCA

In [None]:
import matplotlib.pyplot as plt
#matplotlib 패키지 한글 깨짐 처리 시작
plt.rc('font', family='NanumBarunGothic') 
#plt.rc('font', family='AppleGothic') #맥

In [None]:
pca=PCA(n_components=2)

In [None]:
pc_weight=pca.fit_transform(skipgram.embedding.weight.data.cpu().numpy())

In [None]:
plt.figure(figsize=(15,15))

for word_id,(x_coordinate,y_coordinate) in enumerate(pc_weight):
  plt.scatter(x_coordinate,y_coordinate,color="blue")
  plt.annotate(i2w[word_id], (x_coordinate, y_coordinate))