## 1. Import Libraires

In [1]:
import torch
import torch.nn.functional as F
import pickle
import numpy as np
from scipy.stats import spearmanr
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from utils import Skipgram, SkipgramNeg, Glove

### Load trained data

In [2]:
skipgram_data = pickle.load(open('app/code/models/skipgrams.pkl', 'rb'))
skipgram_neg_data = pickle.load(open('app/code/models/skipgrams-neg.pkl', 'rb'))
glove_data = pickle.load(open('app/code/models/glove.pkl', 'rb'))

skipgram_word2index = skipgram_data['word2index']
skipgram_neg_word2index = skipgram_neg_data['word2index']
glove_word2index = glove_data['word2index']

skipgram_voc_size = skipgram_data['voc_size']
skipgram_neg_voc_size = skipgram_neg_data['voc_size']
glove_voc_size = glove_data['voc_size']

skipgram_emb_size = skipgram_data['emb_size']
skipgram_neg_emb_size = skipgram_neg_data['emb_size']
glove_emb_size = glove_data['emb_size']

### Instantiate Model

In [4]:
# checkpoint = torch.load('app/code/models/skipgram.pt')
# print("Keys in saved state_dict:", checkpoint.keys())
# print("Keys in model state_dict:", skipgram.state_dict().keys())

In [3]:
skipgram = Skipgram(skipgram_voc_size, skipgram_emb_size, skipgram_word2index)
skipgram.load_state_dict(torch.load('app/code/models/skipgram.pt'))
skipgram.eval()

  skipgram.load_state_dict(torch.load('app/code/models/skipgram.pt'))


Skipgram(
  (embedding_center): Embedding(18046, 2)
  (embedding_outside): Embedding(18046, 2)
)

In [4]:
skipgramNeg = SkipgramNeg(skipgram_neg_voc_size, skipgram_neg_emb_size, skipgram_neg_word2index)
skipgramNeg.load_state_dict(torch.load('app/code/models/skipgram-neg.pt'))
skipgramNeg.eval()

  skipgramNeg.load_state_dict(torch.load('app/code/models/skipgram-neg.pt'))


SkipgramNeg(
  (embedding_center): Embedding(18046, 2)
  (embedding_outside): Embedding(18046, 2)
  (logsigmoid): LogSigmoid()
)

In [5]:
glove = Glove(glove_voc_size, glove_emb_size, glove_word2index)
glove.load_state_dict(torch.load('app/code/models/glove.pt'))
glove.eval()

  glove.load_state_dict(torch.load('app/code/models/glove.pt'))


Glove(
  (center_embedding): Embedding(18046, 2)
  (outside_embedding): Embedding(18046, 2)
  (center_bias): Embedding(18046, 1)
  (outside_bias): Embedding(18046, 1)
)

In [6]:
glove_file = datapath('glove.6B.100d.txt')  #search on the google
gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True, limit=None)

In [7]:
# Function to calculate semantic and syntactic similarities
def evaluate_analogies(analogy_lines, model, word2index):
    """
    Evaluate the model's performance on semantic and syntactic analogy tasks.

    Args:
        analogy_lines (list): List of analogy strings in the format "word1 word2 word3 word4".
        model: The embedding model (e.g., Skip-gram or GloVe).
        word2index (dict): Mapping of words to their indices in the model vocabulary.

    Returns:
        float: Accuracy of the model on the analogy task.
    """
    # Extract vocabulary from word2index
    vocabs = list(word2index.keys())

    # Prepare embedding vectors for all vocabulary words
    all_word_vectors = []
    for word in vocabs:
        all_word_vectors.append(model.get_embed(word))
    all_word_vectors = torch.stack(all_word_vectors)

    correct_predictions = 0

    # Perform vector manipulations for each analogy
    for analogy in analogy_lines:
        words = analogy.split()

        # Assuming the analogy line has four words
        vectors = []
        for word in words[:3]:  # Only need the first three words for manipulation
            if word in vocabs:
                vectors.append(model.get_embed(word.lower()))
            else:
                vectors.append(model.get_embed('<UNK>'))  # Handle unknown words

        # Perform vector manipulation (e.g., subtraction and addition)
        result_vector = vectors[1] - vectors[0] + vectors[2]

        # Add a batch dimension to the result vector
        result_vector = result_vector.unsqueeze(0)

        # Calculate cosine similarities between the result vector and all vocabulary embeddings
        cosine_similarities = F.cosine_similarity(result_vector, all_word_vectors)

        # Find the index of the closest word in the vocabulary
        closest_word_index = torch.argmax(cosine_similarities).item()
        closest_word = vocabs[closest_word_index]

        # Check if the predicted word matches the target word (4th word in analogy line)
        if closest_word == words[3]:
            correct_predictions += 1

    accuracy = (correct_predictions / len(analogy_lines)) * 100
    print(f"Accuracy: {accuracy:.2f}%")
    return

In [8]:
def evaluate_analogy_gensim(analogy_lines, gensim_model):
    """
    Evaluate analogy gensim using a pre-trained gensim model.

    Args:
        analogy_lines (list): List of analogy questions in the format "word1 word2 word3 word4".
        gensim_model: The pre-trained gensim word embedding model.

    Returns:
        float: Accuracy of the model on the analogy task.
    """
    correct_predictions = 0  # Counter for correct answers

    # Process each analogy line
    for analogy in analogy_lines:
        # Split and preprocess words
        words = analogy.split()
        processed_words = []
        
        for word in words:
            word = word.lower()  # Convert to lowercase
            # Check if the word exists in the gensim model vocabulary
            if word in gensim_model:
                processed_words.append(word)
            else:
                processed_words.append('unknown')  # Use 'unknown' for missing words
        
        # Perform vector arithmetic using the gensim model
        try:
            most_similar_words = gensim_model.most_similar(
                positive=[processed_words[2], processed_words[1]], 
                negative=[processed_words[0]]
            )

            # Get the most similar word
            predicted_word = most_similar_words[0][0]

            # Check if the predicted word matches the target word (4th word)
            if predicted_word == processed_words[3]:
                correct_predictions += 1
        except KeyError:
            # Skip analogy if one or more words are missing in the model
            continue

    # Calculate accuracy
    accuracy = (correct_predictions / len(analogy_lines)) * 100
    print(f'Analogy Accuracy: {accuracy:.2f}%')
    return

## 2. Semantic and Syntatic Analysis

### Load test files

In [9]:
# Read capital-common-countries text file and create a list of tuples
with open('test_data/capital-common-countries.txt', 'r') as file:
    semantic_analogies = file.readlines()

In [10]:
# Read past-tense text file and create a list of tuples
with open('test_data/past-tense.txt', 'r') as file:
    syntatic_analogies = file.readlines()

### Semantic

In [11]:
#skipgram model
evaluate_analogies(semantic_analogies, skipgram, skipgram_word2index)

Accuracy: 0.00%


In [12]:
# skipgram negative sampling model
evaluate_analogies(semantic_analogies, skipgramNeg, skipgram_neg_word2index)

Accuracy: 0.00%


In [13]:
# glove model
evaluate_analogies(semantic_analogies, glove, glove_word2index)

Accuracy: 0.00%


In [14]:
# glove gensim model
evaluate_analogy_gensim(semantic_analogies, gensim)

Analogy Accuracy: 93.87%


### Syntatic

In [16]:
#skipgram model
evaluate_analogies(syntatic_analogies, skipgram, skipgram_word2index)

Accuracy: 0.00%


In [17]:
# skipgram negative sampling model
evaluate_analogies(syntatic_analogies, skipgramNeg, skipgram_neg_word2index)

Accuracy: 0.00%


In [18]:
# glove model
evaluate_analogies(syntatic_analogies, glove, glove_word2index)

Accuracy: 0.00%


In [19]:
# glove gensim model
evaluate_analogy_gensim(syntatic_analogies, gensim)

Analogy Accuracy: 55.45%


## 3. Similarity Analysis

In [24]:
def cosine_similarity(A, B):
    dot_product = np.dot(A.flatten(), B.flatten())
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [39]:
def evaluate_similarity(lines, model, is_gensim=False):
    """
    Evaluate the similarity between word pairs using a given model.
    
    Args:
        lines (list): List of lines containing word pairs and their similarity scores.
                      Each line should be in the format: "word1 word2 similarity_score".
        model: The word embedding model (e.g., Gensim, PyTorch-based).
        is_gensim (bool): Set to True if the model is a Gensim model.
    
    Returns:
        tuple: Spearman rank correlation (correlation coefficient and p-value).
    """
    scores_real = []  # Store the ground truth similarity scores
    scores_pred = []  # Store the model-predicted cosine similarities

    # Loop through each line in the dataset
    for line in lines:
        words = line.split()  # Split the line into components
        vec = []  # List to store word vectors

        # Extract vectors for the first two words
        for word in words[:2]:
            try:
                if is_gensim:
                    # Fetch vector directly for Gensim models
                    vec.append(model.get_vector(word))
                else:
                    # Fetch vector for non-Gensim models
                    vec.append(model.get_embed(word).detach().numpy())
            except:
                if is_gensim:
                    vec.append(model.get_vector('unknown'))
                else:
                    vec.append(model.get_embed('<UNK>').detach().numpy())

        # Append real similarity score (3rd column)
        scores_real.append(float(words[2]))

        # Compute cosine similarity between the two word vectors
        # scores_pred.append(cosine_similarity(np.array(vec[0]).reshape(1, -1), np.array(vec[1]).reshape(1, -1))[0][0])
        scores_pred.append(cosine_similarity(np.array(vec[0]), np.array(vec[1])))

    # Calculate Spearman rank correlation
    return spearmanr(scores_real, scores_pred)

In [30]:
# Read wordsim_similarity_goldstandard text file and create a list of tuples
with open('test_data/wordsim_similarity_goldstandard.txt', 'r') as file:
    similarity_lines = file.readlines()

In [34]:
# skipgram model
spearman_corr_skipgram = evaluate_similarity(similarity_lines, skipgram, is_gensim=False)
print(f"Skipgram Model Spearman Correlation: {spearman_corr_skipgram[0]}")

Skipgram Model Spearman Correlation: 0.04280092775965694


In [35]:
# skipgram negative sampling model
spearman_corr_skipgram_neg = evaluate_similarity(similarity_lines, skipgramNeg, is_gensim=False)
print(f"Skipgram Negative Sampling Model Spearman Correlation: {spearman_corr_skipgram_neg[0]}")

Skipgram Negative Sampling Model Spearman Correlation: -0.03984931349350284


In [36]:
# glove model
spearman_corr_glove = evaluate_similarity(similarity_lines, glove, is_gensim=False)
print(f"Glove Model Spearman Correlation: {spearman_corr_glove[0]}")

Glove Model Spearman Correlation: 0.10537471804776563


In [40]:
# glove gensim
spearman_corr_gensim = evaluate_similarity(similarity_lines, gensim, is_gensim=True)
print(f"Gensim Model Spearman Correlation: {spearman_corr_gensim[0]}")

Gensim Model Spearman Correlation: 0.5962863369934295


### Human Model

In [47]:
def evaluate_human_similarity(analogy_lines, human_scores_file):
    """
    Evaluate similarity scores by comparing analogy line scores and human-provided scores.

    Args:
        analogy_lines (list): List of strings, where each string is formatted as
                              "word1 word2 similarity_score".
        human_scores_file (str): Path to the file containing human-provided similarity scores.
                                 Each line should contain a single numeric value.

    Returns:
        tuple: Spearman rank correlation (correlation coefficient and p-value).
    """
    scores_real = []  # Store ground truth similarity scores
    scores_human = []  # Store human-provided similarity scores

    # Read human-provided similarity scores from file and store it in a dictionary
    human_scores_dict = {}
    with open(human_scores_file, 'r') as file_human:
        for line in file_human:
            words = line.split()  # Split each line into words
            if len(words) >= 3:  # Ensure the line has at least 3 columns
                # Use the first two words as the key and the third column (score) as the value
                human_scores_dict[(words[0], words[1])] = float(words[2])

    # Extract real similarity scores from analogy lines
    for line in analogy_lines:
        words = line.split()
        if len(words) >= 3:  # Ensure valid format
            try:
                scores_real.append(float(words[2]))
            except ValueError:
                print(f"Invalid similarity score in analogy line: {line} (skipping this entry).")

            # Fetch the predicted score from the human input file using the first two words as the key
            key = (words[0], words[1])

            if key in human_scores_dict:
                # Append the corresponding human-provided score to predicted scores
                scores_human.append(human_scores_dict[key])
            else:
                # If no match is found, handle it by appending a default value or handling differently
                print(f"No human score found for: {words[0]} - {words[1]}")
                scores_human.append(0)  # Default value

    # Check if the lengths of both lists match
    if len(scores_real) != len(scores_human):
        raise ValueError("Mismatch in the number of real and human-provided scores.")

    # Calculate Spearman correlation
    return spearmanr(scores_real, scores_human)

In [48]:
human_scores_file = "test_data/human_scores.txt"
correlation = evaluate_human_similarity(similarity_lines, human_scores_file)
print(f"Spearman Correlation: {correlation[0]}, p-value: {correlation[1]}")

Spearman Correlation: 0.9721074309203155, p-value: 1.493074200772165e-128
