In [1]:
import nltk
from nltk.corpus import brown, movie_reviews

from sklearn.decomposition import PCA
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

import numpy as np
import matplotlib.pyplot as plt
import random
import os

## Skip gram vs CBOW

Simply put, the CBOW model learns the embedding by predicting the current word based on its context. The skip-gram model learns by predicting the surrounding words given a current word.

![](https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2017/08/Word2Vec-Training-Models.png)

#### Using Skipgram method

In [2]:
sentences = word2vec.Text8Corpus('./data/text8')
model_sg = word2vec.Word2Vec(sentences, vector_size=100,sg=1)

In [3]:
model_sg.wv.most_similar("money")

[('dividends', 0.7778456807136536),
 ('credit', 0.759507954120636),
 ('accrued', 0.7531089186668396),
 ('repayment', 0.7517944574356079),
 ('repay', 0.7475109100341797),
 ('lending', 0.7417505383491516),
 ('specie', 0.7380414605140686),
 ('payments', 0.7368233799934387),
 ('profits', 0.735568106174469),
 ('cheques', 0.7277211546897888)]

In [4]:
model_sg.wv.most_similar("queen")

[('highness', 0.7714400291442871),
 ('elizabeth', 0.7651525139808655),
 ('prince', 0.7617310285568237),
 ('consort', 0.7615090012550354),
 ('regnant', 0.7459958791732788),
 ('isabella', 0.7270307540893555),
 ('victoria', 0.7214810252189636),
 ('margrethe', 0.7168237566947937),
 ('hrh', 0.7142952680587769),
 ('princess', 0.7080675959587097)]

In [5]:
model_sg.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('daughter', 0.6312880516052246),
 ('throne', 0.6281335353851318),
 ('matilda', 0.6162846684455872),
 ('jagiellon', 0.6146883368492126),
 ('queen', 0.6145486831665039)]

In [6]:
model_sg.wv.most_similar("meandering")

[('sinuous', 0.905532717704773),
 ('wadis', 0.9010744690895081),
 ('basaltic', 0.8984476923942566),
 ('watercourses', 0.8929212093353271),
 ('troughs', 0.890213131904602),
 ('sinkholes', 0.8894224166870117),
 ('uplifted', 0.8881569504737854),
 ('featureless', 0.8815904259681702),
 ('undulating', 0.8756296038627625),
 ('outwash', 0.8742260932922363)]

#### Learning:
In CBOW the vectors from the context words are averaged before predicting the center word. In skip-gram there is no averaging of embedding vectors. It seems like the model can learn better representations for the rare words when their vectors are not averaged with the other context words in the process of making the predictions. 

## Word vectors trained on different contexts
 - We'll load different corpora, from different contexts and see how the embeddings vary
 - The text8 corpus is wikipedia pages, while Brown corpus is from 15 different topics, and movie reviews are from IMDB

In [7]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/mz195/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [8]:
model_brown = Word2Vec(brown.sents(), sg=1)
model_movie = Word2Vec(movie_reviews.sents(), sg=1, window=5)

In [9]:
model_sg.wv.most_similar('money', topn=5)

[('dividends', 0.7778456807136536),
 ('credit', 0.759507954120636),
 ('accrued', 0.7531089186668396),
 ('repayment', 0.7517944574356079),
 ('repay', 0.7475109100341797)]

In [10]:
model_brown.wv.most_similar('money', topn=5)

[('care', 0.8328453898429871),
 ('job', 0.8285400867462158),
 ('friendship', 0.8189941048622131),
 ('risk', 0.8062323927879333),
 ('joy', 0.8045491576194763)]

In [11]:
model_movie.wv.most_similar('money', topn=5)

[('cash', 0.7353739738464355),
 ('paid', 0.7042819857597351),
 ('ransom', 0.6965006589889526),
 ('record', 0.6914075613021851),
 ('risk', 0.6876891851425171)]

## Using pre-trained word vectors

### A quick note on Glove:
   - Developed by Stanford by training on 6 Billion tokens
   - Objective is slightly different
   - End result very similar to Google's word2vec

https://nlp.stanford.edu/projects/glove/

- We'll use the 100D vectors for this example.
- The trained vectors are available in a text file
- The format is slightly different from that of word2vec, necessitating the use of a utility to format accordingly

In [12]:
glove_input_file = './data/glove.6B/glove.6B.100d.txt'
word2vec_output_file = './data/glove.6B/glove.6B.100d.w2vformat.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [13]:
glove_model = KeyedVectors.load_word2vec_format("./data/glove.6B/glove.6B.100d.w2vformat.txt", binary=False)

#### Now you can use all the methods you used with word2vec models

In [14]:
glove_model.most_similar("king")

[('prince', 0.7682328820228577),
 ('queen', 0.7507690787315369),
 ('son', 0.7020888328552246),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919989585876465),
 ('kingdom', 0.6811409592628479),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712858080863953),
 ('ii', 0.6676074266433716)]

In [15]:
glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('queen', 0.7698540687561035),
 ('monarch', 0.6843381524085999),
 ('throne', 0.6755736470222473),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520534157752991)]

In [16]:
glove_model.most_similar(positive=['woman', 'hero'], negative=['man'], topn=5)

[('heroine', 0.732498049736023),
 ('heroes', 0.6356217861175537),
 ('icon', 0.6185224056243896),
 ('beloved', 0.6136684417724609),
 ('herself', 0.5904076099395752)]