###### source
* http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
* http://www.claudiobellei.com/2018/01/07/backprop-word2vec-python/

In [3]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import re
from collections import Counter

In [4]:
dataset = pd.read_csv('./datasets/imdb_master.csv', header=1, encoding='iso-8859-1', names=['id', 'type', 'review', 'label', 'file'])

del dataset['id']
del dataset['type']
del dataset['label']
del dataset['file']

dataset = dataset.iloc[:100]
dataset.head()

Unnamed: 0,review
0,This is an example of why the majority of acti...
1,"First of all I hate those moronic rappers, who..."
2,Not even the Beatles could write songs everyon...
3,Brass pictures (movies is not a fitting word f...
4,"A funny thing happened to me while watching ""M..."


#### few preprocessing stuffs

In [5]:
def remove_html(s):
    parser = BeautifulSoup(s, 'html5lib')
    return parser.getText()

In [6]:
def remove_punctuation(s):
    return re.sub('[^a-zA-Z0-9]', ' ', s)

In [7]:
dataset = dataset.applymap(lambda x:x.lower())
dataset = dataset.applymap(remove_punctuation)
dataset = dataset.applymap(remove_html)

In [8]:
def softmax(arr):
    arr = np.exp(arr - np.max(arr))
    return arr / np.sum(arr)

class Word2Vec(object):
    
    def __init__(self, vocab_size, h_size, w_size, seed=None):
        self.h_size = h_size
        self.w_size = w_size
        self.vocab_size = vocab_size
        
        if seed:
            np.random.seed(seed)
        
        self._build()
    
    def _build(self):
        self.W = np.random.randn(self.vocab_size, self.h_size)
    
    def train(self, X, y):
        pass
    
model = Word2Vec(10, 300, 2, 42)

assert model.W.shape == (10, 300)

model.train(['hello word! haha'])

In [17]:
reviews = []
for review in dataset[:100].review:
    reviews.append([word for word in review.split(' ') if word])

In [18]:
words = Counter()
for review in reviews:
    for word in review:
        words[word] += 1
        
vocab_size = len(words)
print(f'Vocab size: {vocab_size}')

word_to_ix, ix_to_word = {}, {}
for ix, (word, _) in enumerate(words.most_common()):
    word_to_ix[word] = ix
    ix_to_word[ix] = word
    
words.most_common()[:5]

Vocab size: 4147


[('the', 1316), ('a', 595), ('and', 568), ('of', 535), ('to', 499)]

In [19]:
def get_data(reviews, window_size):
    for review in reviews:
        size = len(review)
        for ix in range(size):
            floor = max(0, ix-window_size)
            ceil = min(size, ix+window_size+1)
            
            X = word_to_ix[review[ix]]
            y = []
            for word in review[floor:ix]:
                y.append(word_to_ix[word])
            for word in review[ix+1:ceil]:
                y.append(word_to_ix[word])
            
            yield X, y

In [None]:
# np.random.seed(42)

epochs = 100
h_size = 30
window_size = 2

learning_rate = 1e-2

# W1 = np.random.randn(vocab_size, h_size) * .5
# W2 = np.random.randn(h_size, vocab_size) * .5

for epoch in range(epochs):
    loss = 0
    for X, labels in get_data(reviews, window_size):
        h = W1[None, X]
        u = np.dot(h, W2)
        y = softmax(u)
        
        loss += -np.sum([u[0, ix] for ix in labels])
        loss += len(labels)*np.log(np.sum(np.exp(u)))
        
        dy = np.copy(y) * len(labels)
        for label in labels:
            dy[0, label] -= 1
        
        dW2 = np.dot(h.T, dy)
        dW1 = np.dot(dy, W2.T)
        
        W2 -= learning_rate * dW2
        W1[None, X] -= learning_rate * dW1
    
    print(f'\rEpoch {epoch+1:3}/{epochs:3} - Loss: {loss:.5}', end='')
    if (epoch+1) % 5 == 0:
        print('')

Epoch   5/100 - Loss: 5.0303e+05
Epoch  10/100 - Loss: 4.9232e+05
Epoch  15/100 - Loss: 4.8266e+05
Epoch  20/100 - Loss: 4.7394e+05
Epoch  25/100 - Loss: 4.661e+05
Epoch  30/100 - Loss: 4.5909e+05
Epoch  35/100 - Loss: 4.5285e+05
Epoch  40/100 - Loss: 4.4733e+05
Epoch  45/100 - Loss: 4.4245e+05
Epoch  50/100 - Loss: 4.3815e+05
Epoch  55/100 - Loss: 4.3436e+05
Epoch  60/100 - Loss: 4.3102e+05
Epoch  65/100 - Loss: 4.2807e+05
Epoch  70/100 - Loss: 4.2546e+05
Epoch  74/100 - Loss: 4.2359e+05