In [25]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [26]:
from nltk.corpus import brown
brown.categories()
news_corpus = brown.sents(categories=['news'])
# reference : https://www.nltk.org/howto/corpus.html (brown corpus)

# specify windows size which will be used in the random batch funciton for pair the center word with specific numbers of words. 
windows_size = 4
  

In [27]:
news_corpus
loss_all = {}

## Load data

In [28]:

flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(news_corpus))) 
vocabs.append('<UNK>')
word2index = {v:idx for idx, v in enumerate(vocabs)}
index2word = {v:k for k, v in word2index.items()}

## Prepare train data for Skipgram & Skipgram Negative

In [29]:


def random_batch_skipgrapm(batch_size, news_corpus,windows_size=2): # default windows_size = 2

    skipgrams = []


    for doc in news_corpus:
        #look from word at the window size number on first and last word
        for i in range(windows_size, len(doc)-windows_size):
            #center word
            center = word2index[doc[i]]
            #outside words = windows size * 2
            outside = []
            for j in range(windows_size):
                outside.append(word2index[doc[i-j-1]])
                outside.append(word2index[doc[i+j+1]]) 

            for _,each_out in enumerate(outside):
                skipgrams.append([center, each_out])

                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)
            
x_skipgrapm, y_skipgramp = random_batch_skipgrapm(2, news_corpus,windows_size)

## Negative Sampling

In [30]:
z = 0.001

from collections import Counter

word_count = Counter(flatten(news_corpus))
word_count

num_total_words = sum([c for w, c in word_count.items()])
num_total_words

unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)
    
Counter(unigram_table)

Counter({'the': 114,
         ',': 108,
         '.': 89,
         'of': 69,
         'to': 55,
         'and': 55,
         'a': 52,
         'in': 50,
         'for': 30,
         'that': 26,
         'The': 26,
         "''": 24,
         'was': 24,
         'is': 24,
         '``': 24,
         'on': 22,
         'at': 21,
         'with': 19,
         'be': 19,
         'by': 18,
         'as': 18,
         'he': 17,
         'said': 15,
         'his': 15,
         'will': 15,
         'it': 14,
         'from': 14,
         ';': 13,
         'are': 13,
         'had': 12,
         'has': 12,
         '--': 12,
         'an': 12,
         'not': 11,
         'Mrs.': 11,
         'have': 11,
         'this': 11,
         'were': 11,
         'who': 11,
         'would': 10,
         'their': 10,
         'which': 10,
         'been': 9,
         'they': 9,
         'He': 9,
         'its': 8,
         ')': 8,
         'last': 8,
         'Mr.': 8,
         'out': 8,
         'more

## Co-occurence Matrix X

In [31]:
from collections import Counter

X_i = Counter(flatten(news_corpus))

skip_grams = []

for doc in news_corpus:
    for i in range(windows_size, len(doc)-windows_size):
        center = doc[i]
        outside = []
        for j in range(windows_size):
            outside.append(doc[i-j-1])
            outside.append(doc[i+j+1]) 
        for each_out in outside:
            skip_grams.append((center, each_out))
            
X_ik_skipgrams = Counter(skip_grams)


## Weight Function

In [32]:
def weighting(w_i, w_j, X_ik):
    

    try:
        x_ij = X_ik[(w_i, w_j)]

    except:
        x_ij = 1
        

    x_max = 100

    alpha = 0.75
    

    if x_ij < x_max:
        result = (x_ij / x_max)**alpha

    else:
        result = 1
    
    return result

from itertools import combinations_with_replacement

X_ik = {} 
weighting_dic = {} 

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 
        X_ik[(bigram[1], bigram[0])] = co + 1 
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## Prepare Data for GloVe

In [33]:
import math

def random_batch_glove(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [34]:
batch_size = 2
x_glove, y_glove, cooc, weighting = random_batch_glove(batch_size, news_corpus, skip_grams, X_ik, weighting_dic)

## Skipgram Model


In [35]:
embedding = nn.Embedding(7, 2)

In [36]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss
        

In [37]:
#prepare all vocabs

batch_size = 2
voc_size   = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 14392, 14393, 14394],
        [    0,     1,     2,  ..., 14392, 14393, 14394]])

In [38]:
input_tensor = torch.LongTensor(x_skipgrapm)
label_tensor = torch.LongTensor(y_skipgramp)

## Training Skipgram

In [39]:
num_epochs = 1000
batch_size = 2
emb_size   = 2
model_skipgram  = Skipgram(voc_size, emb_size)
optimizer  = optim.Adam(model_skipgram.parameters(), lr=0.001)

for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch_skipgrapm(batch_size, news_corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    loss = model_skipgram(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

Epoch    100 | Loss: 9.847179
Epoch    200 | Loss: 8.592554
Epoch    300 | Loss: 10.407146
Epoch    400 | Loss: 9.902851
Epoch    500 | Loss: 9.603545
Epoch    600 | Loss: 10.437670
Epoch    700 | Loss: 9.978737
Epoch    800 | Loss: 9.363317
Epoch    900 | Loss: 11.957073
Epoch   1000 | Loss: 10.606616


## Skipgram Model (Neg Sampling)

In [40]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index['<UNK>'], seq))
    return torch.LongTensor(idxs)


import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

batch_size = 2
x_skipgram_neg, y_skipgram_neg = random_batch_skipgrapm(batch_size, news_corpus,2)
x_tensor = torch.LongTensor(x_skipgram_neg)
y_tensor = torch.LongTensor(y_skipgram_neg)

k = 5
neg_samples = negative_sampling(y_tensor, unigram_table, k)

In [41]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

## Training Skipgram (Neg Sampling)

In [42]:
num_epochs = 1000
model_neg = SkipgramNeg(voc_size, emb_size)
optimizer = optim.Adam(model_neg.parameters(), lr=0.001)
batch_size = 2
emb_size   = 2



for epoch in range(num_epochs):
    
    #get batch
    input_batch, label_batch = random_batch_skipgrapm(batch_size, news_corpus)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model_neg(x_tensor, y_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()

    
    #update alpha
    optimizer.step()

    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

Epoch    100 | Loss: 5.088034
Epoch    200 | Loss: 0.805303
Epoch    300 | Loss: 1.973518
Epoch    400 | Loss: 1.082703
Epoch    500 | Loss: 1.483647
Epoch    600 | Loss: 2.702628
Epoch    700 | Loss: 1.002316
Epoch    800 | Loss: 0.735861
Epoch    900 | Loss: 0.856802
Epoch   1000 | Loss: 3.371357


## GloVe Model

In [43]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [44]:
x_tensor = torch.LongTensor(x_glove)
y_tensor = torch.LongTensor(y_glove)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

## Training GloVe

In [45]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
voc_size = len(vocabs)
model_glove          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_glove.parameters(), lr=0.001)

In [46]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [47]:
import time

num_epochs = 1000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch_glove(batch_size, news_corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model_glove(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 100 | cost: 55.958790 | time: 0m 0s
Epoch: 200 | cost: 31.641260 | time: 0m 0s
Epoch: 300 | cost: 31.378723 | time: 0m 0s
Epoch: 400 | cost: 59.562737 | time: 0m 0s
Epoch: 500 | cost: 15.261617 | time: 0m 0s
Epoch: 600 | cost: 37.775932 | time: 0m 0s
Epoch: 700 | cost: 44.425762 | time: 0m 0s
Epoch: 800 | cost: 9.504477 | time: 0m 0s
Epoch: 900 | cost: 79.401909 | time: 0m 0s
Epoch: 1000 | cost: 4.851114 | time: 0m 0s


## GloVe (Gensim)

In [48]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
# glove_file = datapath('glove.6B.100d.txt')  #search on the google
glove_file = 'glove.6B.100d.txt'  #se
model_gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

## Embed Function

In [50]:
def get_embed_skipgram(word):
    try:
        index = word2index[word]
        word = torch.LongTensor([word2index[word]])
    except:
        index = word2index['<UNK>']
        word = torch.LongTensor([word2index['<UNK>']])
        
    
    embed_c = model_skipgram.embedding_center(word)
    embed_o = model_skipgram.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

In [51]:
def get_embed_skipgram_neg(word):
    try:
        index = word2index[word]
        word = torch.LongTensor([word2index[word]])
    except:
        index = word2index['<UNK>']
        word = torch.LongTensor([word2index['<UNK>']])
        
    
    embed_c = model_neg.embedding_center(word)
    embed_o = model_neg.embedding_outside(word)
    embed   = (embed_c + embed_o) / 2
    
    return embed[0][0].item(), embed[0][1].item()

In [52]:
#let's write a function to get embedding given a word
def get_embed_glove(word):
    try:
        id_tensor = torch.LongTensor([word2index[word]])
    except:
        id_tensor = torch.LongTensor([word2index['<UNK>']])

    v_embed = model_glove.center_embedding(id_tensor)
    u_embed = model_glove.outside_embedding(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    x, y = word_embed[0][0].item(), word_embed[0][1].item()

    return x, y

## Cosine Similarity Function

In [53]:
#more formally is to divide by its norm
def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

## Semantic & Syntactic Evaluation

In [54]:
## Skipgram Semantic & Syntactic Evaluation

f = open("word_test_semantic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed_skipgram(a)).flatten()
        b_emb = np.array(get_embed_skipgram(b)).flatten()
        c_emb = np.array(get_embed_skipgram(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed_skipgram(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()

print('Semantic Accurancy is',(count_correct/count_all) * 100)

f = open("word_test_syntactic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed_skipgram(a)).flatten()
        b_emb = np.array(get_embed_skipgram(b)).flatten()
        c_emb = np.array(get_embed_skipgram(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed_skipgram(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()

print('Syntactic Accurancy is ',(count_correct/count_all) * 100)

Semantic Accurancy is 0.1976284584980237
Syntactic Accurancy is  0.0641025641025641


In [55]:
## Skipgram (Neg Sampling) Semantic & Syntactic Evaluation

f = open("word_test_semantic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed_skipgram_neg(a)).flatten()
        b_emb = np.array(get_embed_skipgram_neg(b)).flatten()
        c_emb = np.array(get_embed_skipgram_neg(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed_skipgram_neg(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()

print('Semantic Accurancy is',(count_correct/count_all) * 100)

f = open("word_test_syntactic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed_skipgram_neg(a)).flatten()
        b_emb = np.array(get_embed_skipgram_neg(b)).flatten()
        c_emb = np.array(get_embed_skipgram_neg(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed_skipgram_neg(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()

print('Syntactic Accurancy is ',(count_correct/count_all) * 100)

Semantic Accurancy is 0.0
Syntactic Accurancy is  0.0


In [56]:
## GloVe Semantic & Syntactic Evaluation

f = open("word_test_semantic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    try:
        count_all+=1
        a_emb = np.array(get_embed_glove(a)).flatten()
        b_emb = np.array(get_embed_glove(b)).flatten()
        c_emb = np.array(get_embed_glove(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed_glove(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()

print('Semantic Accurancy is',(count_correct/count_all) * 100)

f = open("word_test_syntactic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words

    try:
        count_all+=1
        a_emb = np.array(get_embed_glove(a)).flatten()
        b_emb = np.array(get_embed_glove(b)).flatten()
        c_emb = np.array(get_embed_glove(c)).flatten()
        temp = np.subtract(b_emb,np.add(a_emb,c_emb)).flatten()
        max = -1 
        similar_word = ''
        for word in vocabs:
            word_temp = get_embed_glove(word)
            similar_value = cosine_similarity(temp,np.array(word_temp))
            if similar_value > max:
                max = similar_value
                similar_word = word
        if similar_word == d:
            count_correct+=1

    except:
        pass

f.close()

print('Syntactic Accurancy is ',(count_correct/count_all) * 100)

Semantic Accurancy is 0.0
Syntactic Accurancy is  0.0


In [59]:
## GloVe Gensim Semantic & Syntactic Evaluation

f = open("word_test_semantic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    count_all+=1
    try:
        result = model_gensim.most_similar(positive=[a, c], negative=[b]) 
        if result[0][0] == d:
            count_correct+=1
    except:
        pass


f.close()

print('Semantic Accurancy is',(count_correct/count_all) * 100)

f = open("word_test_syntactic.txt", "r")
count_all = 0
count_correct = 0

for line in f:
    words = line.strip().split()
    a, b, c, d = words
    count_all+=1
    try:
        result = model_gensim.most_similar(positive=[a, c], negative=[b]) 
        if result[0][0] == d:
            count_correct+=1
    except:
        pass

f.close()

print('Syntactic Accurancy is ',(count_correct/count_all) * 100)

Semantic Accurancy is 0.0
Syntactic Accurancy is  2.307692307692308


## Human Judgement Evaluation

In [60]:
f = open("wordsim_similarity_goldstandard.txt", "r")
human_mean = []
model_similar_skipgram = []
for line in f:
    words = line.strip().split()
    a, b, c = words
    try:
        model_similar_skipgram.append(cosine_similarity(np.array(get_embed_skipgram(a)),np.array(get_embed_skipgram(b))))
        human_mean.append(c)
    except:
        continue
f.close()



In [61]:
f = open("wordsim_similarity_goldstandard.txt", "r")
model_similar_skipgram_neg = []
for line in f:
    words = line.strip().split()
    a, b, c = words
    try:
        model_similar_skipgram_neg.append(cosine_similarity(np.array(get_embed_skipgram_neg(a)),np.array(get_embed_skipgram_neg(b))))
    except:
        continue
f.close()



In [71]:
f = open("wordsim_similarity_goldstandard.txt", "r")
human_mean = []
model_similar_glove = []
for line in f:
    words = line.strip().split()
    a, b, c = words
    try:
        model_similar_glove.append(cosine_similarity(np.array(get_embed_glove(a)),np.array(get_embed_glove(b))))
        human_mean.append(c)
    except:
        continue
f.close()



In [66]:
f = open("wordsim_similarity_goldstandard.txt", "r")
human_mean = []
model_similar_glove_gensim = []
for line in f:
    words = line.strip().split()
    a, b, c = words
    try:
        model_similar_glove_gensim.append(model_gensim.distance(a,b))
        human_mean.append(c)
    except:
        continue
f.close()



In [63]:
from scipy.stats import spearmanr

res = spearmanr(human_mean, model_similar_skipgram)
res.statistic


0.021424759410375093

In [65]:
from scipy.stats import spearmanr

res = spearmanr(human_mean, model_similar_skipgram_neg)
res.statistic


0.10198918072519858

In [72]:
from scipy.stats import spearmanr

res = spearmanr(human_mean, model_similar_glove)
res.statistic


0.10187570060495589

In [68]:
from scipy.stats import spearmanr

res = spearmanr(human_mean, model_similar_glove_gensim)
res.statistic


-0.57701049257972

In [83]:
from scipy.stats import spearmanr

res = spearmanr(human_mean, human_mean)
res.statistic

1.0

In [77]:
from tabulate import tabulate

In [82]:
print(tabulate([['Skipgram', 4, 10.606616,'6 min 44 sec',0.19762,0.06410], 
                ['Skipgram (NEG)', 4,3.371357,'5 min 35 sec',0.0,0.0],
                ['GloVe', 4,4.851114,'1 min 37 sec',0.0,0.0],
                ['GloVe (Gensim)', 4,'-','-',0.0,2.30769]], 
                headers=["Model","Window Size","Training Loss","Training time","Syntactic Accuracy",'Semantic accuracy'], tablefmt='orgtbl'))

| Model          |   Window Size | Training Loss   | Training time   |   Syntactic Accuracy |   Semantic accuracy |
|----------------+---------------+-----------------+-----------------+----------------------+---------------------|
| Skipgram       |             4 | 10.606616       | 6 min 44 sec    |              0.19762 |             0.0641  |
| Skipgram (NEG) |             4 | 3.371357        | 5 min 35 sec    |              0       |             0       |
| GloVe          |             4 | 4.851114        | 1 min 37 sec    |              0       |             0       |
| GloVe (Gensim) |             4 | -               | -               |              0       |             2.30769 |


In [85]:
print(tabulate([['MSE', 0.0214, 0.1019,0.1018,-0.5770,1]], 
                headers=['Model',"Skipgram","Skipgram (NEG)","GloVe","GloVe (Gensim)",'Y true'], tablefmt='orgtbl'))

| Model   |   Skipgram |   Skipgram (NEG) |   GloVe |   GloVe (Gensim) |   Y true |
|---------+------------+------------------+---------+------------------+----------|
| MSE     |     0.0214 |           0.1019 |  0.1018 |           -0.577 |        1 |


In [64]:
# torch.save(model, 'model_skipgram')

In [86]:
# torch.save(model_neg, 'model_skipgram_neg')