In [2]:
from tqdm import tqdm
from konlpy.tag import Mecab,Twitter,Okt,Kkma
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

import torch
import copy
import numpy as np

In [3]:
train_data = [
    "정말 맛있습니다. 추천합니다.",
    "기대했던 것보단 별로였네요.",
    "다 좋은데 가격이 너무 비싸서 다시 가고 싶다는 생각이 안 드네요.",
    "완전 최고입니다! 재방문 의사 있습니다.",
    "음식도 서비스도 다 만족스러웠습니다.",
    "위생 상태가 좀 별로였습니다. 좀 더 개선되기를 바랍니다.",
    "맛도 좋았고 직원분들 서비스도 너무 친절했습니다.",
    "기념일에 방문했는데 음식도 분위기도 서비스도 다 좋았습니다.",
    "전반적으로 음식이 너무 짰습니다. 저는 별로였네요.",
    "위생에 조금 더 신경 썼으면 좋겠습니다. 조금 불쾌했습니다."       
]

test_words = ["음식", "맛", "서비스", "위생", "가격"]

In [4]:
tokenizer = Okt()

In [5]:
def make_tokenized(data):
    tokenized = []
    for sent in tqdm(data):
        tokens = tokenizer.morphs(sent, stem=True)
        tokenized.append(tokens)

    return tokenized  

In [6]:
train_tokenized = make_tokenized(train_data)

word_count = defaultdict(int)

for tokens in tqdm(train_tokenized):
    for token in tokens:
        word_count[token] += 1

100%|██████████| 10/10 [00:02<00:00,  3.88it/s]
100%|██████████| 10/10 [00:00<00:00, 179243.76it/s]


In [7]:
word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
print(list(word_count))

[('.', 14), ('도', 7), ('이다', 4), ('좋다', 4), ('별로', 3), ('다', 3), ('이', 3), ('너무', 3), ('음식', 3), ('서비스', 3), ('하다', 2), ('방문', 2), ('위생', 2), ('좀', 2), ('더', 2), ('에', 2), ('조금', 2), ('정말', 1), ('맛있다', 1), ('추천', 1), ('기대하다', 1), ('것', 1), ('보단', 1), ('가격', 1), ('비싸다', 1), ('다시', 1), ('가다', 1), ('싶다', 1), ('생각', 1), ('안', 1), ('드네', 1), ('요', 1), ('완전', 1), ('최고', 1), ('!', 1), ('재', 1), ('의사', 1), ('있다', 1), ('만족스럽다', 1), ('상태', 1), ('가', 1), ('개선', 1), ('되다', 1), ('기르다', 1), ('바라다', 1), ('맛', 1), ('직원', 1), ('분들', 1), ('친절하다', 1), ('기념일', 1), ('분위기', 1), ('전반', 1), ('적', 1), ('으로', 1), ('짜다', 1), ('저', 1), ('는', 1), ('신경', 1), ('써다', 1), ('불쾌하다', 1)]


In [8]:
w2i = {}
for pair in tqdm(word_count):
    if pair[0] not in w2i:
        w2i[pair[0]] = len(w2i)

i2w  ={v:k for k,v in w2i.items()}  
print(train_tokenized)
print(w2i)

100%|██████████| 60/60 [00:00<00:00, 788897.30it/s]

[['정말', '맛있다', '.', '추천', '하다', '.'], ['기대하다', '것', '보단', '별로', '이다', '.'], ['다', '좋다', '가격', '이', '너무', '비싸다', '다시', '가다', '싶다', '생각', '이', '안', '드네', '요', '.'], ['완전', '최고', '이다', '!', '재', '방문', '의사', '있다', '.'], ['음식', '도', '서비스', '도', '다', '만족스럽다', '.'], ['위생', '상태', '가', '좀', '별로', '이다', '.', '좀', '더', '개선', '되다', '기르다', '바라다', '.'], ['맛', '도', '좋다', '직원', '분들', '서비스', '도', '너무', '친절하다', '.'], ['기념일', '에', '방문', '하다', '음식', '도', '분위기', '도', '서비스', '도', '다', '좋다', '.'], ['전반', '적', '으로', '음식', '이', '너무', '짜다', '.', '저', '는', '별로', '이다', '.'], ['위생', '에', '조금', '더', '신경', '써다', '좋다', '.', '조금', '불쾌하다', '.']]
{'.': 0, '도': 1, '이다': 2, '좋다': 3, '별로': 4, '다': 5, '이': 6, '너무': 7, '음식': 8, '서비스': 9, '하다': 10, '방문': 11, '위생': 12, '좀': 13, '더': 14, '에': 15, '조금': 16, '정말': 17, '맛있다': 18, '추천': 19, '기대하다': 20, '것': 21, '보단': 22, '가격': 23, '비싸다': 24, '다시': 25, '가다': 26, '싶다': 27, '생각': 28, '안': 29, '드네': 30, '요': 31, '완전': 32, '최고': 33, '!': 34, '재': 35, '의사': 36, '있다': 37, '만족스럽다': 38, '상태




In [9]:
token_ids = [w2i[token] for token in tokens]
token_ids

[12, 15, 16, 14, 57, 58, 3, 0, 16, 59, 0]

In [10]:
class CBOWDataset(Dataset):
    def __init__(self, train_tokenized, window_size=2):
        self.x = torch.LongTensor([]) # input word
        self.y = torch.LongTensor([]) # target word
        
        for tokens in tqdm(train_tokenized):
            token_ids = [w2i[token] for token in tokens]
            for i, id in enumerate(token_ids):
                if i-window_size >= 0 and i+window_size < len(token_ids):
                    ############################ ANSWER HERE ################################
                    # TODO 1: insert tokens for input self.x
                    # TODO 2: insert tokens for targets self.y
                    #########################################################################        
                    tokens_in_window = token_ids[i-window_size:i+window_size+1]
                    x_tensor = torch.LongTensor(tokens_in_window[:window_size] + token_ids[-window_size:])
                    y_tensor = torch.LongTensor([id])

                    self.x = torch.cat((self.x, F.one_hot(x_tensor, num_classes=len(w2i))))
                    self.y = torch.cat((self.y, F.one_hot(y_tensor, num_classes=len(w2i))))

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [11]:
class SkipGramDataset(Dataset):
    def __init__(self, train_tokenized, window_size=2):
        self.x = torch.Tensor([])
        self.y = torch.Tensor([])

        for tokens in tqdm(train_tokenized):
            token_ids = [w2i[token] for token in tokens]
            for i, id in enumerate(token_ids):
                if i-window_size >= 0 and i+window_size < len(token_ids):
                    ############################ ANSWER HERE ################################
                    # TODO 1: insert tokens for input self.x
                    # TODO 2: insert tokens for targets self.y
                    #########################################################################        
                    tokens_in_window = token_ids[i-window_size:i+window_size+1]
                    x_tensor = torch.LongTensor([id])
                    y_tensor = torch.LongTensor(tokens_in_window[:window_size] + token_ids[-window_size:])

                    self.x = torch.cat((self.x, F.one_hot(x_tensor, num_classes=len(w2i))))
                    self.y = torch.cat((self.y, F.one_hot(y_tensor, num_classes=len(w2i))))

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

In [12]:
cbow_set = CBOWDataset(train_tokenized)
skipgram_set = SkipGramDataset(train_tokenized)
print(list(skipgram_set)[:3])
print(list(cbow_set)[:3])

100%|██████████| 10/10 [00:00<00:00, 1605.91it/s]
100%|██████████| 10/10 [00:00<00:00, 3981.30it/s]

[(tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]), tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.])), (tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]), tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.




In [13]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, dim):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
        self.linear = nn.Linear(dim, vocab_size)

    # B: batch size, W: window size, d_w: word embedding size, V: vocab size
    def forward(self, x):  # x: (B, 2W)
        embeddings = self.embedding(x)  # (B, 2W, d_w)
        embeddings = torch.sum(embeddings, dim=1)  # (B, d_w)
        output = self.linear(embeddings)  # (B, V)
        return output

In [14]:
class SkipGram(nn.Module):
  def __init__(self, vocab_size, dim, window_size=2):
    super(SkipGram, self).__init__()
    self.window_size = window_size
    self.vocab_size = vocab_size
    self.embedding = nn.Embedding(self.vocab_size, dim, sparse=True)
    self.linear = nn.Linear(dim, self.vocab_size * self.window_size * 2)

  # B: batch size, W: window size, d_w: word embedding size, V: vocab size
  def forward(self, x): # x: (B)
    embeddings = self.embedding(x)  # (B, d_w)
    output = self.linear(embeddings)  # (B, V * W)
    output = output.view(-1, self.window_size * 2, self.vocab_size)
    return output

In [15]:
cbow = CBOW(vocab_size=len(w2i), dim=256)
skipgram = SkipGram(vocab_size=len(w2i), dim=256)

In [16]:
batch_size=4
learning_rate = 5e-4
num_epochs = 5
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

cbow_loader = DataLoader(cbow_set, batch_size=batch_size)
skipgram_loader = DataLoader(skipgram_set, batch_size=batch_size)

In [17]:
cbow.train()
cbow = cbow.to(device)
optim = torch.optim.SGD(cbow.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
    print("#" * 50)
    print(f"Epoch: {e}")
    for batch in tqdm(cbow_loader):
        x, y = batch
        x, y = x.to(device), y.to(device) # (B, W), (B)
        output = cbow(x)  # (B, V)
    
        optim.zero_grad()
        loss = loss_function(output, y)
        loss.backward()
        optim.step()

        print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


  0%|          | 0/64 [00:00<?, ?it/s]


RuntimeError: Expected floating point type for target with class probabilities, got Long

In [None]:
skipgram.train()
skipgram = skipgram.to(device)
optim = torch.optim.SGD(skipgram.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
    print("#" * 50)
    print(f"Epoch: {e}")
    for batch in tqdm(skipgram_loader):
        x, y = batch
        print(x, y)
        x, y = x.to(device), y.to(device) # (B, W), (B)
        output = skipgram(x)  # (B, V)
        print(output)

        optim.zero_grad()
        loss = loss_function(output, y)
        loss.backward()
        optim.step()

    print(f"Train loss: {loss.item()}")

print("Finished.")

##################################################
Epoch: 1


  0%|          | 0/16 [00:00<?, ?it/s]

tensor([ 0, 19, 22,  4]) tensor([[17, 18, 10,  0],
        [18,  0, 10,  0],
        [20, 21,  2,  0],
        [21, 22,  2,  0]])
tensor([[-1.4292, -0.5033,  0.3522,  0.0107, -0.2754,  0.3695,  0.0407, -0.4617,
         -0.1086,  0.6471,  0.0039,  0.1783,  0.9259,  0.1736,  1.0363,  0.0282,
          0.0685,  0.1797,  0.3769, -0.4552, -0.6126, -0.6303, -0.0737, -0.2349,
          0.8764, -0.5239, -1.2522,  0.2579, -0.4421, -0.9994, -0.4711,  0.1125,
         -0.3440, -0.3643,  0.9844,  0.2379, -0.4586,  0.1930,  0.2406, -0.8169,
          0.0051, -0.7984,  0.3381,  0.0923, -0.1215, -0.2333, -0.2954,  0.6058,
         -0.1679,  0.1534, -0.4813, -0.0274,  0.1318, -0.2471, -0.4369, -0.3706,
          0.4833, -0.5089, -0.5341,  0.7908],
        [-0.2617,  0.0820, -0.8226, -0.1968,  0.7589, -0.6089,  0.0048, -0.5277,
          0.4508,  0.2631,  0.5016,  0.4225, -0.0973,  0.0471,  0.7230,  0.0104,
         -0.9979, -1.0905,  0.4761, -0.3524,  0.7428, -0.6272,  0.3647, -0.4594,
          0.34




RuntimeError: 0D or 1D target tensor expected, multi-target not supported

In [None]:
for word in test_words:
    input_id = torch.LongTensor([w2i[word]]).to(device)
    emb = cbow.embedding(input_id)

    print(f"Word: {word}")
    print(emb.squeeze(0))

In [None]:
for word in test_words:
    input_id = torch.LongTensor([w2i[word]]).to(device)
    emb = skipgram.embedding(input_id)

    print(f"Word: {word}")
    print(max(emb.squeeze(0)))

In [None]:
test_words

In [None]:
i2w[25]

In [None]:
def most_similar(word,top_k=5):
    input_id = torch.LongTensor([w2i[word]]).to(device)
    input_emb = skipgram.embedding(input_id)
    score=torch.matmul(input_emb,skipgram.embedding.weight.transpose(1,0)).view(-1)

    _,top_k_ids=torch.topk(score,top_k)

    return [i2w[word_id.item()] for word_id in top_k_ids][1:]

In [None]:
most_similar("가격")

In [None]:
from sklearn.decomposition import PCA

In [None]:
import matplotlib.pyplot as plt
#matplotlib 패키지 한글 깨짐 처리 시작
plt.rc('font', family='NanumBarunGothic') 
#plt.rc('font', family='AppleGothic') #맥

In [None]:
pca = PCA(n_componenbts=2)

In [None]:
pc_weight = pca.fit_transform(skipgram.embedding.weight.data.cpu().numpy())

In [None]:
plt.figure(figsize=(15,15))

for word_id,(x_coordinate,y_coordinate) in enumerate(pc_weight):
    plt.scatter(x_coordinate,y_coordinate,color="blue")
    plt.annotate(i2w[word_id], (x_coordinate, y_coordinate))