[View in Colaboratory](https://colab.research.google.com/github/Gangsss/BOAZ_NLP_STUDY/blob/master/01_Skip_gram_Naive_Softmax_ipynb_colab.ipynb)

## 1. Skip-gram with Naive Softmax

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l : [item for sublist in l for item in sublist]
random.seed(777)

In [0]:
print(torch.__version__)
print(nltk.__version__)

0.3.0.post4
3.2.5


## ENV

In [0]:
USE_CUDA = torch.cuda.is_available()
#gpus = [0]
#torch.cuda.set_device(0)

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [4]:
USE_CUDA  #colab 이용하기

True

In [0]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0 # Start Index
    eindex = batch_size # End Index
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        sindex  = eindex
        eindex = eindex + batch_size
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [0]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] 
            if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) 
            if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

### Data Load and Preprocessing

In [6]:
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [14]:
nltk.download('punkt')
#The sents() function divides the text up into its sentences, where each sentence is a list of words:
#nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
corpus = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')[:100]
#corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
#print(corpus)
corpus = [[word.lower() for word in sent] for sent in corpus] # lower() 소문자 변환
print(corpus[:10])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'], ['etymology', '.'], ['(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to', 'a', 'grammar', 'school', ')'], ['the', 'pale', 'usher', '--', 'threadbare', 'in', 'coat', ',', 'heart', ',', 'body', ',', 'and', 'brain', ';', 'i', 'see', 'him', 'now', '.'], ['he', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'and', 'grammars', ',', 'with', 'a', 'queer', 'handkerchief', ',', 'mockingly', 'embellished', 'with', 'all', 'the', 'gay', 'flags', 'of', 'all', 'the', 'known', 'nations', 'of', 'the', 'world', '.'], ['he', 'loved', 'to', 'dust', 'his', 'old', 'grammars', ';', 'it', 'somehow', 'mildly', 'reminded', 'him', 'of', 'his', 'mortality', '.'], ['"', 'while', 'you', 'take', 'in', 'hand', 'to', 'school', 'others', ',', 'and', 'to', 'teach', 'them', 'by', 'what', 'name', 'a', 'whale', '-', 'fish', 'is', 't

In [16]:
print(flatten(corpus)[:10])

['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']', 'etymology', '.']


### Extract Stopwords from unigram distributions's tails

In [18]:
# Word Count
word_count = Counter(flatten(corpus))
border = int(len(word_count) * 0.01)

# Stopwords (가장 자주 나온 words를 stopwords로)
# 가장 자주 나온 words + Inversed Matrix에서 가장 자주 나온(?) words
stopwords = word_count.most_common()[:border] + reversed(word_count.most_common())[:border]
stopwords = [s[0] for s in stopwords]
stopwords

TypeError: ignored

### Build Vocab

In [0]:
# Corpus에서 Stopwords를 제거
vocab = list(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')
print('Corpus : ', len(set(flatten(corpus))))
print('Vocab : ', len(vocab))

Corpus :  2607
Vocab :  2556


In [0]:
word2index = {'<UNK>' : 0}

# word2index - word : index 형태
for word in vocab:
    if word2index.get(word) is None:
        word2index[word] = len(word2index)

# index2word - index : word 형태로 변경
index2word = {v:k for k, v in word2index.items()}

In [0]:
# Window Size 기준으로 
WINDOW_SIZE = 3
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c 
              + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])
print(windows[0])
print(windows[1])
print(windows[2])
print(windows[-1])

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')
('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman')
('<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville')
('be', 'ready', 'directly', '."', '<DUMMY>', '<DUMMY>', '<DUMMY>')


In [0]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))

print(train_data[:WINDOW_SIZE * 2])
print(train_data[0])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]
('[', 'moby')


In [0]:
X_p = []
y_p = []

# 위에서 선언한 prepare_word 함수 사용
for data in train_data:
    X_p.append(prepare_word(data[0], word2index).view(1, -1))
    y_p.append(prepare_word(data[1], word2index).view(1, -1))

train_data = list(zip(X_p, y_p))


print(X_p[0])
print(y_p[0])
print(train_data[:WINDOW_SIZE * 2])
print(train_data[0])
len(train_data)

Variable containing:
 1284
[torch.LongTensor of size 1x1]

Variable containing:
 26
[torch.LongTensor of size 1x1]

[(Variable containing:
 1284
[torch.LongTensor of size 1x1]
, Variable containing:
 26
[torch.LongTensor of size 1x1]
), (Variable containing:
 1284
[torch.LongTensor of size 1x1]
, Variable containing:
 2030
[torch.LongTensor of size 1x1]
), (Variable containing:
 1284
[torch.LongTensor of size 1x1]
, Variable containing:
 1227
[torch.LongTensor of size 1x1]
), (Variable containing:
 26
[torch.LongTensor of size 1x1]
, Variable containing:
 1284
[torch.LongTensor of size 1x1]
), (Variable containing:
 26
[torch.LongTensor of size 1x1]
, Variable containing:
 2030
[torch.LongTensor of size 1x1]
), (Variable containing:
 26
[torch.LongTensor of size 1x1]
, Variable containing:
 1227
[torch.LongTensor of size 1x1]
)]
(Variable containing:
 1284
[torch.LongTensor of size 1x1]
, Variable containing:
 26
[torch.LongTensor of size 1x1]
)


55758

In [0]:
class Skipgram(nn.Module):
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)
        
        # Weight Initialization
        self.embedding_v.weight.data.uniform_(-1, 1) 
        self.embedding_u.weight.data.uniform_(0, 0)
        
    def forward(self, center_words, target_words, outer_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        outer_embeds = self.embedding_u(outer_words) # B x V x D
        
        # Bx1xD * BxDx1 = Bx1 (Batch Matrix Multiplication)
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        
        # BxVxD * BxDx1 = BxV (Batch Matrix Multiplication)
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        
        # Negative Log Likelihood (Log Softmax)
        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1)))
        
        return nll
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### Train

In [0]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100
LEARNING_RATE = 0.005

losses = []
model = Skipgram(len(word2index), EMBEDDING_SIZE)

if USE_CUDA:
    model = model.cuda()

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [0]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # Bx1
        targets = torch.cat(targets) # Bx1
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab)) # BxV
        loss = model(inputs, targets, vocabs)
        
        # Backpropagation
        model.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.data[0])
        
    if epoch % 10 == 0:
        print("Epoch : %d, Mean_Loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, Mean_Loss : 6.33


### Test

In [0]:
def word_similarity(target, vocab):  
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    
    for i in range(len(vocab)):
        if vocab[i] == target:
            continue
        
        vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    # Sort by similarity (Top 10 return)
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [0]:
test = random.choice(list(vocab))
test

'butt'

In [0]:
word_similarity(test, vocab)

[['wharton', 0.6740432977676392],
 ['bowes', 0.6615484356880188],
 ['cuvier', 0.6574292182922363],
 ['desks', 0.6492806077003479],
 ['salted', 0.6338767409324646],
 ['elizabeth', 0.6323466300964355],
 ['turn', 0.6268795728683472],
 [').', 0.6265820860862732],
 ['dreary', 0.6198908686637878],
 ['sharks', 0.6184421181678772]]