# Experiments for Task 2

According to Task 2, I will conduct the following experiments:
- (1) Compare Skip-gram, Skip-gram negative sampling, GloVe models on training loss, training time. 
- (2) Use the word analogies dataset to calculate syntactic and semantic accuracy, similar to the
methods in the Word2Vec and GloVe papers. 
- (3) Use the similarity dataset to find the correlation between your models’ dot product and the provided similarity metrics. (from scipy.stats import spearmanr) Assess if your embeddings correlate with human judgment.

## 1. Importing

In [1]:
# import 3 models from main.py
from main import Skipgram, SkipgramNeg, Glove

In [2]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [3]:
import torch
import torch.nn.functional as F
import pickle
import numpy as np
from scipy.stats import spearmanr

In [4]:
# Load the pickled training data (corpus, vocabulary, index mapping and sizes).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./model/Data.pkl', 'rb') as data_file:  # closes the handle when done
    Data = pickle.load(data_file)
corpus = Data['corpus']
vocab = Data['vocab']
word2index = Data['word2index']
voc_size = Data['voc_size']
embed_size = Data['embedding_size']

## 2. Load the Model

In [5]:
# Create an instance of the Skipgram model and load its saved parameters.
# NOTE(review): this rebinds the name `Skipgram` from the class imported from
# main to the instance, so the class is no longer reachable under that name;
# re-import from main before re-running this cell.
Skipgram = Skipgram(voc_size, embed_size)
Skipgram.load_state_dict(torch.load('model/A1-Skipgram.pt'))
# Last expression of the cell: its return value is what the notebook displays.
Skipgram.eval()

Skipgram(
  (embedding_center): Embedding(4167, 2)
  (embedding_outside): Embedding(4167, 2)
)

In [6]:
# Create an instance of the SkipgramNeg model and load its saved parameters.
# NOTE(review): this rebinds the name `SkipgramNeg` from the class imported
# from main to the instance, so the class is no longer reachable under that
# name; re-import from main before re-running this cell.
SkipgramNeg = SkipgramNeg(voc_size, embed_size)
SkipgramNeg.load_state_dict(torch.load('model/A1-NegSampling.pt'))
# Last expression of the cell: its return value is what the notebook displays.
SkipgramNeg.eval()

SkipgramNeg(
  (embedding_center): Embedding(4167, 2)
  (embedding_outside): Embedding(4167, 2)
  (logsigmoid): LogSigmoid()
)

In [7]:
# Build the GloVe model, then restore its saved parameters from disk
glove = Glove(voc_size, embed_size)
glove_state = torch.load('model/A1-Glove.pt')
glove.load_state_dict(glove_state)
# Last expression of the cell: its return value is what the notebook displays.
glove.eval()

Glove(
  (center_embedding): Embedding(4167, 2)
  (outside_embedding): Embedding(4167, 2)
  (center_bias): Embedding(4167, 1)
  (outside_bias): Embedding(4167, 1)
)

In [8]:
# Pre-trained GloVe vectors (glove.6B.100d); the file must exist at this relative path
glove_file = './glove.6B/glove.6B.100d.txt'
# no_header=True: the text file is read without a leading word2vec header line
gensim = KeyedVectors.load_word2vec_format(
    glove_file,
    binary=False,
    no_header=True,
)

## 3. Functions for semantic and syntactic analysis

The functions below find the closest word using cosine similarity.

In [9]:
import torch
import torch.nn.functional as F

def calculate_similarity(word_vectors, result_vector):
    """Return the index of the candidate vector closest to ``result_vector``.

    Closeness is measured with cosine similarity along the embedding axis.

    Args:
        word_vectors: tensor of candidate embeddings whose last axis is the
            embedding dimension, e.g. ``[voc_size, emb_size]`` or
            ``[voc_size, 1, emb_size]``.
        result_vector: tensor holding a single query embedding, e.g.
            ``[1, emb_size]`` or ``[emb_size]``.

    Returns:
        int: row index (into the flattened candidate list) of the most
        similar candidate.
    """
    emb_size = word_vectors.shape[-1]

    # Flatten to [num_candidates, emb_size]. With a [voc_size, 1, emb_size]
    # input and the default dim=1, cosine_similarity would reduce the singleton
    # axis instead of the embedding axis and argmax would return a meaningless
    # (possibly out-of-range) index.
    candidates = word_vectors.reshape(-1, emb_size)
    query = result_vector.reshape(1, emb_size)

    # One score per candidate: shape [num_candidates]
    scores = F.cosine_similarity(query, candidates, dim=-1)

    # Index of the candidate with the highest similarity
    closest_word_index = torch.argmax(scores).item()

    return closest_word_index

def similarities(lines, model, vocab):
    """Return the percentage of analogy lines the model completes correctly.

    For each line ``[a, b, c, d]`` the vector ``b - a + c`` is built from the
    model's embeddings, and the vocabulary word with the highest cosine
    similarity to it is compared with ``d``.

    Args:
        lines: list of four-word lists, e.g.
            ``['dancing', 'danced', 'falling', 'fell']``.
        model: object exposing ``get_embed(word)``; presumably returns a
            ``[1, emb_size]`` tensor (per the original shape notes) — verify
            against main.py.
        vocab: indexable sequence of vocabulary words.

    Returns:
        float: accuracy in percent; ``0.0`` when ``lines`` is empty.
    """
    # Nothing to evaluate: avoid dividing by zero below
    if len(lines) == 0:
        return 0.0

    # Set membership is O(1); testing against the vocab list is O(voc_size) per word
    known_words = set(vocab)

    # Embeddings of every vocabulary word, flattened to [voc_size, emb_size].
    # Leaving the stacked [voc_size, 1, emb_size] shape would make a cosine
    # similarity taken over dim=1 reduce the singleton axis, not the embedding.
    all_word_vectors = torch.stack([model.get_embed(word) for word in vocab])
    all_word_vectors = all_word_vectors.reshape(len(vocab), -1)

    correct_count = 0
    for words in lines:  # ['dancing', 'danced', 'falling', 'fell']
        # Out-of-vocabulary words fall back to the '<UNK>' embedding
        vectors = [
            model.get_embed(word if word in known_words else '<UNK>').reshape(-1)
            for word in words
        ]

        # b - a + c, with a batch dimension added: [1, emb_size]
        result_vector = (vectors[1] - vectors[0] + vectors[2]).unsqueeze(0)

        # NOTE(review): the three query words are not excluded from the
        # candidates here; consider masking them if the score should be
        # comparable with gensim's most_similar.
        closest_word_index = calculate_similarity(all_word_vectors, result_vector)
        closest_word = vocab[closest_word_index]

        if closest_word == words[3]:
            correct_count += 1

    return (correct_count / len(lines)) * 100


## 4. Testing Phase

### Load the Data

For the testing data, I will use the word analogies dataset to calculate syntactic and semantic accuracy.

In [10]:
# Path to the word-analogy test file, relative to the notebook's working directory
txt_file = '../A1 - Search Engine/word-test.v1.txt'

In [11]:
# Parse the analogy file into a list of {'section': name, 'pairs': [[w1, w2, w3, w4], ...]}
analogy = []

with open(txt_file, 'r', encoding='utf-8') as file:
    # Read all lines from the file and store them in a list
    lines = file.readlines()

current_analogy_set = None
for line in lines:
    line = line.strip()  # remove surrounding whitespace / newline

    # Blank lines carry no analogy; skip them instead of storing empty pairs
    if not line:
        continue

    if line.startswith(':'):
        # Section declaration line, e.g. ": capital-common-countries"
        current_section = line[1:].strip()
        current_analogy_set = {'section': current_section, 'pairs': []}
        # Register the section right away so the last one in the file is kept
        # too (it has no following header that could trigger the append).
        analogy.append(current_analogy_set)
    elif current_analogy_set is not None:
        # Regular line: the words of one analogy, added to the current section.
        # Lines appearing before the first section header are ignored.
        current_analogy_set['pairs'].append(line.split())

# Display the first analogy set for illustration
print(analogy[0])


{'section': 'capital-common-countries', 'pairs': [['Athens', 'Greece', 'Baghdad', 'Iraq'], ['Athens', 'Greece', 'Bangkok', 'Thailand'], ['Athens', 'Greece', 'Beijing', 'China'], ['Athens', 'Greece', 'Berlin', 'Germany'], ['Athens', 'Greece', 'Bern', 'Switzerland'], ['Athens', 'Greece', 'Cairo', 'Egypt'], ['Athens', 'Greece', 'Canberra', 'Australia'], ['Athens', 'Greece', 'Hanoi', 'Vietnam'], ['Athens', 'Greece', 'Havana', 'Cuba'], ['Athens', 'Greece', 'Helsinki', 'Finland'], ['Athens', 'Greece', 'Islamabad', 'Pakistan'], ['Athens', 'Greece', 'Kabul', 'Afghanistan'], ['Athens', 'Greece', 'London', 'England'], ['Athens', 'Greece', 'Madrid', 'Spain'], ['Athens', 'Greece', 'Moscow', 'Russia'], ['Athens', 'Greece', 'Oslo', 'Norway'], ['Athens', 'Greece', 'Ottawa', 'Canada'], ['Athens', 'Greece', 'Paris', 'France'], ['Athens', 'Greece', 'Rome', 'Italy'], ['Athens', 'Greece', 'Stockholm', 'Sweden'], ['Athens', 'Greece', 'Tehran', 'Iran'], ['Athens', 'Greece', 'Tokyo', 'Japan'], ['Baghdad', 'I

In [12]:
# Pick out the analogy pairs of the two sections evaluated below:
# 'capital-common-countries' (semantic) and 'gram7-past-tense' (syntactic)
for analogy_set in analogy:
    section_name = analogy_set['section']
    if section_name == 'capital-common-countries':
        corpus_capital_country = analogy_set['pairs']
    elif section_name == 'gram7-past-tense':
        corpus_past_tense = analogy_set['pairs']
        
# corpus_capital_country
# corpus_past_tense

### (1) syntactic and semantic accuracy

#### Skipgram Model

In [13]:
# Semantic accuracy: Skipgram on the 'capital-common-countries' analogy pairs
accuracy = similarities (corpus_capital_country, Skipgram, vocab)

print(f"Accuracy : {accuracy:.2f} %")

Accuracy : 0.00 %


In [14]:
# Syntactic accuracy: Skipgram on the 'gram7-past-tense' analogy pairs
accuracy = similarities (corpus_past_tense, Skipgram, vocab)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 0.00


#### Skipgram (Neg) Model

In [15]:
# Semantic accuracy: SkipgramNeg on the 'capital-common-countries' analogy pairs
accuracy = similarities (corpus_capital_country, SkipgramNeg, vocab)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 0.00


In [16]:
# Syntactic accuracy: SkipgramNeg on the 'gram7-past-tense' analogy pairs
accuracy = similarities (corpus_past_tense, SkipgramNeg, vocab)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 0.00


#### GloVe Model

In [17]:
# Semantic accuracy: GloVe on the 'capital-common-countries' analogy pairs
accuracy = similarities (corpus_capital_country, glove, vocab)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 0.00


In [18]:
# Syntactic accuracy: GloVe on the 'gram7-past-tense' analogy pairs
accuracy = similarities (corpus_past_tense, glove, vocab)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 0.00


#### Gensim Model

In [None]:
# analogy
def analogy(x1, x2, y1):
    """Return the word that best completes "x1 is to x2 as y1 is to ?".

    Queries the module-level ``gensim`` vectors with ``y1`` and ``x2`` as
    positive terms and ``x1`` as the negative term, and returns the top-ranked
    word (without its score).

    NOTE(review): this function reuses the name of the ``analogy`` list built
    while parsing the test file, so that list is no longer reachable once this
    cell has run.
    """
    ranked = gensim.most_similar(positive=[y1, x2], negative=[x1])
    best_word, _score = ranked[0]  # entries are (word, similarity) pairs
    return best_word

In [35]:
def gensim_similarity(lines):
    """Return the percentage of analogy lines the gensim vectors complete correctly.

    Each line ``[a, b, c, d]`` is lower-cased, words missing from the
    module-level ``gensim`` vectors are replaced by ``'unknown'``, and the
    prediction of ``analogy(a, b, c)`` is compared with ``d``.

    Args:
        lines: list of four-word lists, e.g.
            ``['dancing', 'danced', 'falling', 'fell']``.

    Returns:
        float: accuracy in percent; ``0.0`` when ``lines`` is empty.
    """
    # Nothing to evaluate: avoid dividing by zero below
    if len(lines) == 0:
        return 0.0

    correct_count = 0

    for words in lines:  # ['dancing', 'danced', 'falling', 'fell']
        # Lower-case every word (presumably the loaded vectors are keyed by
        # lower-case words — verify) and fall back to 'unknown' when missing
        words_checked = [
            word.lower() if word.lower() in gensim else 'unknown'
            for word in words
        ]

        # Top-ranked word for "a is to b as c is to ?"
        predicted = analogy(words_checked[0], words_checked[1], words_checked[2])

        if predicted == words_checked[3]:
            correct_count += 1

    return (correct_count / len(lines)) * 100

In [36]:
# Semantic accuracy: gensim GloVe vectors on the 'capital-common-countries' analogy pairs
accuracy = gensim_similarity (corpus_capital_country)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 93.87


In [37]:
# Syntactic accuracy: gensim GloVe vectors on the 'gram7-past-tense' analogy pairs
accuracy = gensim_similarity (corpus_past_tense)

print(f"Accuracy : {accuracy:.2f}")

Accuracy : 55.45


### (2) Similarity

### Human Model