# Word2Vec

## 1. Environment Setup

### 1.0. Check GPU

In [1]:
# Show the NVIDIA driver / CUDA version and the current state of each GPU
!nvidia-smi

Tue Apr 28 04:35:33 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64       Driver Version: 440.64       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN Xp            Off  | 00000000:17:00.0 Off |                  N/A |
|  0%   25C    P8     9W / 250W |    661MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:65:00.0 Off |                  N/A |
|  0%   32C    P8     8W / 250W |     12MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            Off  | 00000000:66:00.0  On |                  N/

### 1.1. Install nltk

In [2]:
# Install nltk
!pip install nltk

# # Make Directory nltk_data for Download nltk
# !mkdir nltk_data

# # Download nltk book on Directory nltk_data
# !python -m nltk.downloader -d ./nltk_data book

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### 1.2. PyTorch

In [3]:
!pip install torch torchvision

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


### 1.3. Import

In [4]:
# Add the local ./nltk_data directory to nltk's data search path.
# The path is relative, so it resolves against the notebook's working directory.
import nltk
nltk.data.path.append('./nltk_data')

In [5]:
# Project-local modules. The star imports are expected to provide the names this
# notebook uses later without defining them: word2vec, make_dataset,
# word_euclidean and word_cosine.
# NOTE(review): which module defines which name is not visible from here —
# prefer explicit `from <module> import <name>` once confirmed.
from model import *
from train import *
from utils import *

import itertools
import re

# Ignore all warnings for the rest of the kernel session.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# runtime warnings raised by any library used below.
import warnings
warnings.filterwarnings(action='ignore')

## 2. Data Preprocessing

In [6]:
# List the file ids of the texts available in nltk's Gutenberg corpus
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [7]:
# sentences : [[word, word, ...], [word, word, ...], ...]
#
# Only 'shakespeare-hamlet.txt' is used for training (not the whole corpus).
# Each kept token is lower-cased; tokens that do not start with an ASCII letter
# (punctuation, numbers, ...) are dropped. A sentence that ends up with no
# tokens is still kept, as an empty list.

# Compiled once instead of being looked up for every token.
starts_with_letter = re.compile('^[a-zA-Z]+').match

sentences = [
    [word.lower() for word in raw_sentence if starts_with_letter(word)]
    for raw_sentence in nltk.corpus.gutenberg.sents('shakespeare-hamlet.txt')
]

In [8]:
# vocab_set : every distinct word that occurs anywhere in sentences
vocab_set = {word for sentence in sentences for word in sentence}

# total_word : number of word tokens over all sentences (repeats counted)
total_word = sum(len(sentence) for sentence in sentences)

In [9]:
# Corpus statistics, one per line: token count, then vocabulary size
print(total_word, len(vocab_set), sep='\n')

30266
4699


In [10]:
# Word <-> integer id lookup tables.
# Ids follow sorted word order. Enumerating the raw set would number the words
# by set iteration order, which for strings can change from one interpreter
# session to the next (string hashing is randomised per process), so the same
# word would get a different id on every kernel restart.
vocab2id = {vocab: i for i, vocab in enumerate(sorted(vocab_set))}
# Built by inverting vocab2id so the two tables can never disagree.
id2vocab = {i: vocab for vocab, i in vocab2id.items()}

## 3. Train Model

In [11]:
# int vocab_dim : vocabulary size (number of distinct words in vocab_set)
vocab_dim = len(vocab_set)

# str mode : model variant, 'cbow' or 'skip-gram'
# (passed to both make_dataset and word2vec below)
mode = 'skip-gram'

# int embed_dim : dimension of embedding layer
embed_dim = 70

# bool sparse : activate/deactivate embedding layer sparse
# (presumably forwarded to the embedding layer's sparse option — confirm in model.py)
sparse = False

# int C : window size (passed to make_dataset as window_size)
C = 3

In [12]:
# Build the training dataset for the selected mode from the tokenised
# sentences; vocab2id is handed over as the word lookup and C as the window size
train_dataset = make_dataset(mode, sentences, vocab2id, window_size=C)

In [13]:
# epoch, learning rate, scheduler are fixed (if you want to change, edit train.py)

# One-line summary of the run configuration, printed under a separator line
run_config = 'Mode : {}, Embed dim : {}, Sparse : {}, C : {}'.format(mode, embed_dim, sparse, C)

# Build the model first, then report the configuration, then train
model = word2vec(mode, vocab_dim, embed_dim, sparse)
print('*' * 45, run_config, sep='\n')
model.train(train_dataset)

*********************************************
Mode : skip-gram, Embed dim : 70, Sparse : False, C : 3
Epoch 1 Started...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Iteration : 0, Loss : 8.976986
Iteration : 50000, Loss : 1.676742
Iteration : 100000, Loss : 5.453240

Epoch 2 Started...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Iteration : 0, Loss : 8.559054
Iteration : 50000, Loss : 2.319291
Iteration : 100000, Loss : 4.996808

Epoch 3 Started...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Iteration : 0, Loss : 8.308043
Iteration : 50000, Loss : 3.274035
Iteration : 100000, Loss : 4.634839



In [14]:
# Query word for the neighbour lookups in the following cells; the bare
# expression displays whether the word is part of the vocabulary
target = 'horse'
target in vocab_set

True

In [15]:
# Header: metric name, separator line, run configuration
print(
    'Euclidean Distance',
    '*' * 45,
    'Mode : {}, Embed dim : {}, Sparse : {}, C : {}'.format(mode, embed_dim, sparse, C),
    sep='\n',
)

# List the (score, word) pairs returned by word_euclidean for `target`,
# numbered from 1 in the order the helper yields them
euclidean_neighbours = word_euclidean(model, target, vocab_set, vocab2id)
for rank, (score, vocab) in enumerate(euclidean_neighbours, start=1):
    print('Top {} word : {}    score : {}'.format(rank, vocab, score))

Euclidean Distance
*********************************************
Mode : skip-gram, Embed dim : 70, Sparse : False, C : 3
Top 1 word : you    score : 7.92763
Top 2 word : tooke    score : 10.033244
Top 3 word : princes    score : 10.88969
Top 4 word : lunacie    score : 11.051412
Top 5 word : wager    score : 11.081146
Top 6 word : dreadfull    score : 11.150335
Top 7 word : barr    score : 11.242523
Top 8 word : assistant    score : 11.594965
Top 9 word : stoopes    score : 11.672582
Top 10 word : strick    score : 11.834057
Top 11 word : forgiuenesse    score : 12.008977
Top 12 word : affliction    score : 12.076011
Top 13 word : reueale    score : 12.22012
Top 14 word : doue    score : 12.305252
Top 15 word : dominions    score : 13.264203


In [16]:
# Header: metric name, separator line, run configuration
print(
    'Cosine Similarity',
    '*' * 45,
    'Mode : {}, Embed dim : {}, Sparse : {}, C : {}'.format(mode, embed_dim, sparse, C),
    sep='\n',
)

# List the (score, word) pairs returned by word_cosine for `target`,
# numbered from 1 in the order the helper yields them.
# NOTE(review): in the recorded output the scores increase with the rank. If
# they are similarities (higher = closer), "Top 1" is the least similar of the
# listed words — confirm the ordering returned by word_cosine.
cosine_neighbours = word_cosine(model, target, vocab_set, vocab2id)
for rank, (score, vocab) in enumerate(cosine_neighbours, start=1):
    print('Top {} word : {}    score : {}'.format(rank, vocab, score))

Cosine Similarity
*********************************************
Mode : skip-gram, Embed dim : 70, Sparse : False, C : 3
Top 1 word : deeds    score : 0.323547
Top 2 word : ha    score : 0.325871
Top 3 word : rocke    score : 0.327787
Top 4 word : sounding    score : 0.327995
Top 5 word : going    score : 0.328931
Top 6 word : grosser    score : 0.329292
Top 7 word : you    score : 0.331229
Top 8 word : starres    score : 0.332285
Top 9 word : lead    score : 0.335527
Top 10 word : document    score : 0.34243
Top 11 word : pat    score : 0.346876
Top 12 word : color    score : 0.348081
Top 13 word : besides    score : 0.350819
Top 14 word : pretty    score : 0.359215
Top 15 word : mingled    score : 0.370598
