In [1]:
# import libraries

import torch.nn.functional as F
import torch
import numpy as np
from scipy.stats import spearmanr

#### load data

In [2]:
file_path_capitals = 'data/capital-common.txt'
file_path_past = 'data/past-tense.txt'
file_path_given_dataset = 'data/similarity_gold_standard.txt'

with open(file_path_capitals, 'r') as file:
    capital_common = file.readlines()

with open(file_path_past, 'r') as file:
    past_tense = file.readlines()

with open(file_path_given_dataset, 'r') as file:
    similarity_gold_standard = file.readlines()

In [3]:
# load data from pickle

import pickle
with open('data/data.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

voc_size = loaded_data['voc_size']
emb_size = loaded_data['emb_size']
word2index = loaded_data['word2index']
vocab = loaded_data['vocab']

In [4]:
voc_size

8558

In [5]:
# sample word extraction

for line in capital_common:
    print(line)
    break

words = line.split()

for word in words:
    print(word)

Athens Greece Baghdad Iraq

Athens
Greece
Baghdad
Iraq


#### Load models

In [6]:
from utils import Skipgram,SkipgramNeg,Glove

In [7]:
# load skipgram.pth model

skipgram = Skipgram(voc_size, emb_size)
skipgram.load_state_dict(torch.load('models/skipgram.pth',map_location=torch.device('cpu')))
skipgram.eval()

Skipgram(
  (embedding_center): Embedding(8558, 30)
  (embedding_outside): Embedding(8558, 30)
)

In [8]:
# load skipgramNEG.pth model

skipgramNEG = SkipgramNeg(voc_size, emb_size)
skipgramNEG.load_state_dict(torch.load('models/skipgramNEG.pth',map_location=torch.device('cpu')))
skipgramNEG.eval()

SkipgramNeg(
  (embedding_center): Embedding(8558, 30)
  (embedding_outside): Embedding(8558, 30)
  (logsigmoid): LogSigmoid()
)

In [9]:
# load GloVe.pth model

GloVe = Glove(voc_size, emb_size)
GloVe.load_state_dict(torch.load('models/GloVe.pth',map_location=torch.device('cpu')))
GloVe.eval()

Glove(
  (embedding_center): Embedding(8558, 30)
  (embedding_outside): Embedding(8558, 30)
  (center_bias): Embedding(8558, 1)
  (outside_bias): Embedding(8558, 1)
)

In [10]:
# load Gensim model

from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.100d.txt')  #search on the google
gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True, limit=None)

#### Sematic and syntatic calcualtion function

In [11]:
# tensor_size testing

tensor_size = skipgram.get_embed('FEAR') 
print(tensor_size.size())

torch.Size([1, 30])


In [12]:
def prepare_vectorized_words(vocab, model):
    word_vectors = []

    # Iterate over words in vocab
    
    for word in vocab:
        vector = model.get_embed(word)
        word_vectors.append(vector)

    vectorized_words = torch.stack(word_vectors)

    return vectorized_words


In [13]:
def offset_word_technique(lines, model, vocab):

    correct = 0
    vectorized_words = prepare_vectorized_words(vocab, model) 

    for line in lines:
        words = line.split() # Example: 'Athens', 'Greece', Baghdad Iraq'

        # Ensure all words are in vocabulary
        vec = [model.get_embed(word if word in vocab else '<UNK>') for word in words]

        # Vector algebraic operation: vec2 - vec1 + vec3
        final_vector = vec[1] - vec[0] + vec[2]

        final_vector = final_vector.unsqueeze(0)

        # Cosine similarities
        cos_sim = F.cosine_similarity(final_vector, vectorized_words)

        # Find the index of the word with the highest similarity
        closest_word_index = torch.argmax(cos_sim).item()
        closest_word = vocab[closest_word_index]
        
        if closest_word == words[3]:
            correct += 1

    accuracy = (correct / len(lines)) * 100
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy

In [14]:
# for word in words:
#     vec1 = skipgram.get_embed(words[0])
#     vec2 = skipgram.get_embed(words[1])
#     vec3 = skipgram.get_embed(words[2])

#     result_vector = vec2[0] - vec1[0] + vec3[0]

In [15]:
# vec1 = skipgram.get_embed('FEAR')
# vec2 = skipgram.get_embed('ASIAN')
# vec3 = skipgram.get_embed('JAPAN')

# result2 = vec2[0] - vec1[0] + vec3[0]
# print(result2)
# print(result2.size())
# result3 = result2.unsqueeze(0)
# print(result3.size())

#### Inferencing models

##### skipgram

In [16]:
offset_word_technique(capital_common, skipgram, vocab)

Accuracy: 0.00%


0.0

In [17]:
offset_word_technique(past_tense, skipgram, vocab)

Accuracy: 0.00%


0.0

##### skipgramNEG

In [18]:
offset_word_technique(capital_common, skipgramNEG, vocab)

Accuracy: 0.00%


0.0

In [19]:
offset_word_technique(past_tense, skipgramNEG, vocab)

Accuracy: 0.00%


0.0

##### GloVe

In [20]:
offset_word_technique(capital_common, GloVe, vocab)

Accuracy: 0.00%


0.0

In [21]:
offset_word_technique(past_tense, GloVe, vocab)

Accuracy: 0.00%


0.0

##### gensim

In [22]:
# syntactic and semantic similarity function for gensim

def analogy(lines):

    correct = 0
    
    # Vector algebraic operation: vec2 - vec1 + vec3
    
    for line in lines:
        words = line.split()
        
        for i in range(len(words)):
            words[i] = words[i].lower() # Convert all words to lower case
            if words[i] not in gensim: # Check if gensim contains the word
                words[i] = 'unknown' # Set as unknown if not
        
        # used gensim's built in function 
        result = gensim.most_similar(positive=[words[2], words[1]], negative=[words[0]])

        # Get the closest word
        closest_word = result[0][0]
        if closest_word == words[3]:
            correct += 1

    accuracy = (correct / len(lines)) * 100
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy

In [23]:
analogy(capital_common)


Accuracy: 93.87%


93.87351778656127

In [24]:
analogy(past_tense)

Accuracy: 55.45%


55.44871794871795

#### Finding correlation between models’ dot product and offset_word_technique

In [25]:
# Read the text file and create a list of tuples
with open('data/similarity_gold_standard.txt', 'r') as file:
    given_dataset = file.readlines()

In [26]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    a = a.flatten()  # Flatten the array to 1D
    b = b.flatten()  # Flatten the array to 1D
    cos_sim = dot(a, b) / (norm(a) * norm(b))
    return cos_sim

In [37]:
def correlation(lines, model):

    y_truth = []; y_predict = []

    # input words from .txt
    # append y-true value to y_truth vector
    
    for line in lines:
        words = line.split()
        
        vec = []
        for word in words[:2]:
            try:
                vec.append(model.get_embed(word).detach().numpy())
            except:
                vec.append(model.get_embed('<UNK>').detach().numpy())
                            
        y_truth.append(float(words[2]))
        y_predict.append(cos_sim(np.array(vec[0]), np.array(vec[1])))
    
    # spearmanr correlation 
    correlation_score, p_value = spearmanr(y_truth, y_predict)
    print(f'Correlation score: {correlation_score:.2f}, P-value: {p_value:.2f}')
    return correlation_score

In [38]:
# skipgram

correlation(similarity_gold_standard, skipgram)

Correlation score: 0.11, P-value: 0.12


0.10802765308471039

In [39]:
# skipgramNEG

correlation(similarity_gold_standard, skipgramNEG)

Correlation score: 0.10, P-value: 0.18


0.09511098355406604

In [40]:
# GloVe

correlation(similarity_gold_standard, GloVe)

Correlation score: 0.16, P-value: 0.02


0.1636033039515042

In [41]:
# get_vector function is the reason why I seperate the correlation function

def correlation_gensim(lines, model):

    y_truth = []; y_predict = []

    for line in lines:
        words = line.split()
        
        vec = []
        for word in words[:2]:
            try:
                vec.append(model.get_vector(word)) 
            except:
                vec.append(model.get_vector('unknown'))
                            
        y_truth.append(float(words[2]))
        y_predict.append(cos_sim(np.array(vec[0]), np.array(vec[1])))
        
    correlation_score, p_value = spearmanr(y_truth, y_predict)
    print(f'Correlation score: {correlation_score:.2f}, P-value: {p_value:.2f}')
    return correlation_score


In [43]:
correlation_gensim(similarity_gold_standard, gensim)

Correlation score: 0.60, P-value: 0.00


0.5958258410203774