In [1]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """Restartable stream of tokenised headlines read from a CSV file.

    Every line is treated as ``<first field>,<text>``: everything after the
    first comma is tokenised with ``gensim.utils.simple_preprocess`` and
    yielded as a list of str. A line without a comma is tokenised whole.

    The file is opened anew on each ``__iter__`` call, so the same object can
    be iterated any number of times.

    Args:
        path: Location of the CSV file. Defaults to the path this notebook
            originally hard-coded, so ``MyCorpus()`` keeps working.
    """

    def __init__(self, path='abc_news/abcnews-date-text.csv'):
        self.path = path

    def __iter__(self):
        # The context manager closes the handle when the generator is
        # exhausted *or* closed early (e.g. the consumer hits `break`),
        # instead of leaking it as a bare `open()` in the loop header would.
        with open(self.path) as handle:
            for line in handle:
                # Keep only what follows the first comma; with no comma the
                # split returns the whole line as its single (last) element.
                text = line.split(",", 1)[-1]
                yield utils.simple_preprocess(text)

In [2]:
# Lazy corpus object: no data is read here; each loop over `sentences`
# opens the CSV afresh inside MyCorpus.__iter__.
sentences = MyCorpus()

In [4]:
# Sanity check: show the first two tokenised headlines, then stop reading.
i = 0  # stays 0 if the corpus turns out to be empty
for i, s in enumerate(sentences, start=1):
    print(s)
    if i == 2:
        break

['aba', 'decides', 'against', 'community', 'broadcasting', 'licence']
['act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation']


In [5]:
import gensim.models

In [6]:
# Train a Word2Vec model on the streamed headlines. Only `sentences` is
# passed, so every other hyperparameter keeps gensim's default.
# Takes a few minutes...
# The trailing comment lists keyword arguments one could pass to override
# the defaults (see the gensim Word2Vec documentation for their meaning).
model = gensim.models.Word2Vec(sentences=sentences) #min_count=5, vector_size=200

In [7]:
# Number of words in the trained model's vocabulary.
len(model.wv.index_to_key)

38684

In [8]:
# Print the first ten vocabulary entries together with their index.
for index, word in enumerate(model.wv.index_to_key[:10]):
    print(f"word #{index} is {word}")

word #0 is to
word #1 is in
word #2 is for
word #3 is of
word #4 is on
word #5 is the
word #6 is over
word #7 is police
word #8 is at
word #9 is with


In [11]:
# Look up the learned embedding vector of a single word.
model.wv['police']

array([ 2.876929  ,  0.36573046, -0.04436797, -0.2312297 , -0.23336677,
        1.5372736 ,  1.6806175 ,  1.4925967 , -0.70252985, -1.3188128 ,
        1.2768271 ,  0.51103354, -0.61636156,  0.20204471,  1.8007569 ,
        1.0954375 ,  2.4577975 ,  1.4593645 ,  0.37467524, -1.1313852 ,
        0.3397876 ,  1.6261564 ,  0.54296994, -1.0222702 , -2.9926198 ,
       -0.3150511 , -0.8980939 ,  0.7713534 , -0.27746835,  0.6877973 ,
       -0.7031958 , -0.28893924, -1.3481305 ,  0.68166167, -0.4411558 ,
        0.15603477, -2.4622796 , -1.3708696 ,  0.47186565,  1.5288067 ,
        0.3202819 , -0.31492466,  0.20830353, -1.094427  , -1.3252416 ,
        1.6239321 , -0.9848096 ,  1.0463514 ,  0.5173541 , -1.6201061 ,
       -0.5579231 , -1.2227696 ,  1.8716978 , -1.770485  ,  1.1623493 ,
       -0.45479202, -3.1006906 , -0.00998674,  1.1050769 ,  0.6193704 ,
        1.292312  , -1.2837234 ,  0.65003306,  1.5189573 ,  1.0474945 ,
        0.35978016, -2.854416  , -1.2052798 , -0.82887965, -0.04

In [12]:
# The five words most similar to "police", printed as (word, similarity) pairs.
print(model.wv.most_similar(positive=['police'], topn=5))

[('detectives', 0.7062063217163086), ('afp', 0.6775110960006714), ('authorities', 0.6539719700813293), ('customs', 0.5348871350288391), ('qps', 0.5265768766403198)]


In [10]:
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's
# downloader API.
# NOTE: the loaded model requires a few GB of RAM.
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [None]:
# man:woman :: king:?
# man - woman = king - ?
# ? = woman - man + king
# we add woman and king and subtract man
# positive=['king', 'woman'] and negative=['man']

In [11]:
# Analogy query  king - man + woman ≈ ?  (top four candidates).
wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=4)

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072)]

In [12]:
# Analogy query  Paris - France + Germany ≈ ?  (top four candidates).
wv.most_similar(positive=['Germany', 'Paris'], negative=['France'], topn=4)

[('Berlin', 0.7644002437591553),
 ('Frankfurt', 0.7329736351966858),
 ('Dusseldorf', 0.7009456753730774),
 ('Munich', 0.6773864030838013)]

In [13]:
# Number of entries in the pre-trained model's vocabulary.
len(wv.index_to_key)

3000000

In [14]:
from numpy import dot
from numpy.linalg import norm

def cosine(a, b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    The value is ``dot(a, b)`` divided by the product of the two norms as
    computed by ``numpy.linalg.norm``.
    """
    scale = norm(a) * norm(b)
    return dot(a, b) / scale

In [15]:
# Cosine similarity of two pre-trained word vectors, via the `cosine` helper.
cosine(wv['tiger'], wv['lion'])

0.51210403

# GloVe

<img src="glove_loss.png">

In [16]:
import torch

In [17]:
import torch.nn as nn

In [141]:
class Glove(nn.Module):
    """GloVe embedding model that returns the weighted squared error per pair.

    For a word/context pair ``(i, j)`` with co-occurrence count ``X_ij`` the
    forward pass computes

        f(X_ij) * (W[i] . W_hat[j] + b[i] + b_hat[j] - log(X_ij)) ** 2

    where ``f(x) = min(x / x_max, 1) ** alpha`` is the GloVe weighting
    function.

    Args:
        vocab_size: Number of rows in each embedding / bias table.
        embed_dim: Width of every embedding vector.
        x_max: Count at which the weighting function saturates at 1.
        alpha: Exponent of the weighting function.
    """

    def __init__(self, vocab_size, embed_dim, x_max=100, alpha=0.75):
        super().__init__()
        self.W = nn.Parameter(torch.randn(vocab_size, embed_dim))
        self.W_hat = nn.Parameter(torch.randn(vocab_size, embed_dim))
        # Biases keep their (vocab_size, 1) shape; they are squeezed on lookup.
        self.b = nn.Parameter(torch.randn(vocab_size, 1))
        self.b_hat = nn.Parameter(torch.randn(vocab_size, 1))
        self.x_max = x_max
        self.alpha = alpha

    def normalize(self, x):
        """Return the GloVe weight ``min(x / x_max, 1) ** alpha`` for counts ``x``."""
        x = torch.as_tensor(x, dtype=self.W.dtype)
        # Cap at 1 so that very frequent pairs are not over-weighted.
        return torch.clamp(x / self.x_max, max=1.0) ** self.alpha

    def forward(self, w_i, w_j, x_ij):
        """Return the weighted squared error of each pair in the batch.

        Args:
            w_i: 1-D integer tensor of word indices, shape ``(B,)``.
            w_j: 1-D integer tensor of context indices, shape ``(B,)``.
            x_ij: Co-occurrence counts of the pairs, shape ``(B,)``. Counts
                must be positive: ``log(0)`` is ``-inf`` and yields NaN loss.

        Returns:
            Tensor of shape ``(B,)`` with one loss term per pair.
        """
        # Integer counts are converted once so log / division are well defined.
        x_ij = torch.as_tensor(x_ij, dtype=self.W.dtype)
        prod = torch.sum(self.W[w_i] * self.W_hat[w_j], dim=1)
        # Bias lookups are (B, 1); squeeze to (B,) so the sum below stays
        # element-wise instead of broadcasting to a (B, B) matrix.
        bias = self.b[w_i].squeeze(1) + self.b_hat[w_j].squeeze(1)
        residual = prod + bias - torch.log(x_ij)
        return self.normalize(x_ij) * residual.pow(2)

In [142]:
# Tiny GloVe instance: vocab_size=10, embed_dim=20.
# NOTE: this rebinds `model`, which previously held the Word2Vec model.
model = Glove(10, 20)

In [143]:
# Toy batch of two pairs: word indices, context indices and the count
# attached to each pair, all stored as 64-bit integer tensors.
w_1 = torch.tensor([0, 1], dtype=torch.long)
w_2 = torch.tensor([1, 2], dtype=torch.long)
x_12 = torch.tensor([1, 5], dtype=torch.long)

In [144]:
# Forward pass on the toy batch; calling the module runs Glove.forward.
out = model(w_1, w_2, x_12)

In [135]:
# Plain SGD over every parameter registered on `model`, learning rate 0.001.
optimizer = torch.optim.SGD(model.parameters(),lr=0.001)

In [148]:
# One optimisation step: clear stale gradients, average the per-pair losses
# into a scalar, back-propagate, then update the parameters.
optimizer.zero_grad()
loss = out.mean()
loss.backward()
optimizer.step()

# Tasks

1) Check out the documentation from Gensim for fasttext and doc2vec and try out these models

2) Download the word analogy dataset from http://download.tensorflow.org/data/questions-words.txt

Example: Athens Greece Baghdad Iraq

Frame the analogy question as Athens:Greece :: Baghdad: x

Using the pre-trained word2vec model predict x. Check if x is correctly predicted.

Do this across all the examples and report the accuracy.

3) Consider the GOT corpus from last week's exercise. Create a word-word co-occurrence matrix (after doing the necessary pre-processing) and train the GloVe model on this corpus.

In [None]:
for e in range(epochs):
    for W_i, W_j, X_ij in batches:
        # Fill this out