In [3]:
import pandas as pd
import numpy as np
import gensim.downloader as api
from sklearn.decomposition import PCA
import copy
from gensim.models import KeyedVectors

# Loading Word2Vec Model
Loading Word2vec pretrained model on google news 300

In [4]:
# Word2Vec embeddings 
word2vec_model = api.load('word2vec-google-news-300')

# Loading Glove Model 
Loading Glove model with 42B word vectors and 300 dimensions

In [5]:
def load_glove_model(glove_file_path):
    # Convert GloVe to Word2Vec format directly in Gensim
    model = KeyedVectors.load_word2vec_format(glove_file_path, no_header=True, binary=False)
    return model

glove_file_path = "/mnt/hdd/karmpatel/naman/demo/glove/glove.42B.300d.txt"
glove_model = load_glove_model(glove_file_path)


# Loading Dataset

1. **WordSim353**
2. **RG65**
3. **Men3000**

In [6]:
file_names = ["wordsim353.csv", "rg65.csv", "men3000.csv"]
file_path = f"/mnt/hdd/karmpatel/naman/demo/all_but_the_top/{file_names[0]}"
df = pd.read_csv(file_path, header=None, names=['word1', 'word2', 'similarity_score'], delimiter=';')

# Normalizing score
mean = df['similarity_score'].mean()
std = df['similarity_score'].std()
df['similarity_score_normalized'] = (df['similarity_score'] - mean) / std

print(f"Data frame shape : {df.shape}")
print("Dataframe top 5 rows")
print(df.head)

# Get unique words
unique_words = pd.concat([df['word1'], df['word2']]).unique()
unique_words_ls = unique_words.tolist()
# Sort all the unique words 
unique_words_ls.sort() 

# Create unique word mapping 
# Key : Word , Value : Index 
unique_words_mapper_word2idx = {} 
for i in range(len(unique_words_ls)) : 
    unique_words_mapper_word2idx[unique_words_ls[i]] = i 

print(f"Unique keys : {len(unique_words_mapper_word2idx)}")


Data frame shape : (353, 4)
Dataframe top 5 rows
<bound method NDFrame.head of             word1     word2  similarity_score  similarity_score_normalized
0            love       sex              6.77                     0.420200
1           tiger       cat              7.35                     0.686808
2           tiger     tiger             10.00                     1.904931
3            book     paper              7.46                     0.737372
4        computer  keyboard              7.62                     0.810919
..            ...       ...               ...                          ...
348        shower     flood              6.03                     0.080045
349       weather  forecast              8.34                     1.141880
350      disaster      area              6.25                     0.181172
351      governor    office              6.34                     0.222542
352  architecture   century              3.78                    -0.954210

[353 rows x 4 column

# Creating embedding matrix 
1. Word2Vec
2. Glove

In [8]:
# Word2Vec Matrix
word2vec_matrix = [] 
for i in range(len(unique_words_ls)) : 
    word2vec_matrix.append(word2vec_model[unique_words_ls[i]])
    
word2vec_matrix = np.vstack(word2vec_matrix)
print(f"Word2vec matrix shape : {word2vec_matrix.shape}")

Word2vec matrix shape : (437, 300)


In [18]:
# Glove Matrix 
glove_matrix = [] 
for i in range(len(unique_words_ls)) : 
    word = unique_words_ls[i]
    vector = glove_model.get_vector(word) if word in glove_model else None
    if vector is not None : 
        glove_matrix.append(vector)
    else : 
        glove_matrix.append(np.zeros(300))

glove_matrix = np.vstack(glove_matrix)
print(f"Glove matrix shape : {glove_matrix.shape}")

Glove matrix shape : (437, 300)


# Post processing algorithm 
1. Zero mean vector
2. Subtracting top D dimension, D can vary from d = 1, 2, 3, ...

In paper they have mentioned that D is a hyperparameter, and for illustration purpose we have fixed it to 1, where it is giving best performance, but generally it is around d/100 size

In [13]:
def get_processed_embeddings(embedding_matrix_orig, n_components = 1):
	pca = PCA(n_components=n_components)
	embedding_matrix = copy.deepcopy(embedding_matrix_orig)
	mean = np.average(embedding_matrix, axis=0)
	temp = embedding_matrix - mean
	principalComponents = pca.fit_transform(temp)
	principalAxes = pca.components_
	toSubstract = np.matmul(np.matmul(embedding_matrix, principalAxes.T), principalAxes)
	processed = temp - toSubstract
	return processed



In [20]:
# Word2vec
word2vec_matrix_postprocessed = get_processed_embeddings(word2vec_matrix, n_components = 1)
print(f"Word2vec matrix postprocess shape : {word2vec_matrix_postprocessed.shape}")

# Glove
glove_matrix_postprocessed = get_processed_embeddings(glove_matrix, n_components = 1)
print(f"Glove matrix postprocess shape : {glove_matrix_postprocessed.shape}")

Word2vec matrix postprocess shape : (437, 300)
Glove matrix postprocess shape : (437, 300)


In [21]:
# Cosine score of word2vec original 
df['word2vec_cs_original'] = df.apply(lambda row: (
    row['similarity_score_normalized'] * (
        np.dot(
            word2vec_matrix[unique_words_mapper_word2idx[row['word1']]],
            word2vec_matrix[unique_words_mapper_word2idx[row['word2']]]
        ) / (
            np.linalg.norm(word2vec_matrix[unique_words_mapper_word2idx[row['word1']]]) *
            np.linalg.norm(word2vec_matrix[unique_words_mapper_word2idx[row['word2']]])
        )
    )
), axis=1)

df['word2vec_cs_postprocess'] = df.apply(lambda row: (
    row['similarity_score_normalized'] * (
        np.dot(
            word2vec_matrix_postprocessed[unique_words_mapper_word2idx[row['word1']]],
            word2vec_matrix_postprocessed[unique_words_mapper_word2idx[row['word2']]]
        ) / (
            np.linalg.norm(word2vec_matrix_postprocessed[unique_words_mapper_word2idx[row['word1']]]) *
            np.linalg.norm(word2vec_matrix_postprocessed[unique_words_mapper_word2idx[row['word2']]])
        )
    )
), axis=1)

df['glove_cs_original'] = df.apply(lambda row: (
    row['similarity_score_normalized'] * (
        np.dot(
            glove_matrix[unique_words_mapper_word2idx[row['word1']]],
            glove_matrix[unique_words_mapper_word2idx[row['word2']]]
        ) / (
            np.linalg.norm(glove_matrix[unique_words_mapper_word2idx[row['word1']]]) *
            np.linalg.norm(glove_matrix[unique_words_mapper_word2idx[row['word2']]])
        )
    ) if np.all(glove_matrix[unique_words_mapper_word2idx[row['word1']]] != 0) and np.all(glove_matrix[unique_words_mapper_word2idx[row['word2']]] != 0)
    else 0
), axis=1)


df['glove_cs_postprocess'] = df.apply(lambda row: (
    row['similarity_score_normalized'] * (
        np.dot(
            glove_matrix_postprocessed[unique_words_mapper_word2idx[row['word1']]],
            glove_matrix_postprocessed[unique_words_mapper_word2idx[row['word2']]]
        ) / (
            np.linalg.norm(glove_matrix_postprocessed[unique_words_mapper_word2idx[row['word1']]]) *
            np.linalg.norm(glove_matrix_postprocessed[unique_words_mapper_word2idx[row['word2']]])
        )
    )  if np.all(glove_matrix[unique_words_mapper_word2idx[row['word1']]] != 0) and np.all(glove_matrix[unique_words_mapper_word2idx[row['word2']]] != 0)
    else 0
), axis=1)

df.head()

Unnamed: 0,word1,word2,similarity_score,similarity_score_normalized,word2vec_cs_original,word2vec_cs_postprocess,glove_cs_original,glove_cs_postprocess
0,love,sex,6.77,0.4202,0.110907,0.061679,0.211496,0.16304
1,tiger,cat,7.35,0.686808,0.355283,0.261792,0.344203,0.312226
2,tiger,tiger,10.0,1.904931,1.904931,1.904931,1.904931,1.904931
3,book,paper,7.46,0.737372,0.268007,0.227952,0.401583,0.302883
4,computer,keyboard,7.62,0.810919,0.321441,0.279749,0.455347,0.45102


In [22]:
print("Cosine Similarity")
print(f" Word2vec Original :{df['word2vec_cs_original'].mean()}") 
print(f" Word2vec Postprocess :{df['word2vec_cs_postprocess'].mean()}") 
print(f" Glove Original :{df['glove_cs_original'].mean()}") 
print(f" Glove Postprocess :{df['glove_cs_postprocess'].mean()}") 

Cosine Similarity
 Word2vec Original :0.11486492291948834
 Word2vec Postprocess :0.12044304573534262
 Glove Original :0.0894694131641246
 Glove Postprocess :0.09754523810300944


# Word Anology Dataset

In [24]:
file_path = f"/mnt/hdd/karmpatel/naman/demo/all_but_the_top/questions-words.csv"
df = pd.read_csv(file_path)
df = df.drop(columns = ['row_id', 'category'])
df

Unnamed: 0,word_one,word_two,word_three,word_four
0,Athens,Greece,Baghdad,Iraq
1,Athens,Greece,Bangkok,Thailand
2,Athens,Greece,Beijing,China
3,Athens,Greece,Berlin,Germany
4,Athens,Greece,Bern,Switzerland
...,...,...,...,...
19539,write,writes,talk,talks
19540,write,writes,think,thinks
19541,write,writes,vanish,vanishes
19542,write,writes,walk,walks


In [28]:
# Get unique words
unique_words = pd.concat([df['word_one'], df['word_two'], df['word_three'], df['word_four']]).unique()
unique_words_ls = unique_words.tolist()
# Sort all the unique words 
unique_words_ls.sort() 

# Create unique word mapping 
# Key : Word , Value : Index 
unique_words_mapper_word2idx = {} 
for i in range(len(unique_words_ls)) : 
    unique_words_mapper_word2idx[unique_words_ls[i]] = i 

print(f"Unique keys : {len(unique_words_mapper_word2idx)}")

Unique keys : 905


In [29]:
# Compute Vector v(w2) - v(w1) + v(3)
# Word2Vec Matrix
word2vec_matrix = [] 
for i in range(len(unique_words_ls)) : 
    word2vec_matrix.append(word2vec_model[unique_words_ls[i]])
    
word2vec_matrix = np.vstack(word2vec_matrix)
print(f"Word2vec matrix shape : {word2vec_matrix.shape}")

Word2vec matrix shape : (905, 300)


In [30]:
# Glove Matrix 
glove_matrix = [] 
for i in range(len(unique_words_ls)) : 
    word = unique_words_ls[i]
    vector = glove_model.get_vector(word) if word in glove_model else None
    if vector is not None : 
        glove_matrix.append(vector)
    else : 
        glove_matrix.append(np.zeros(300))

glove_matrix = np.vstack(glove_matrix)
print(f"Glove matrix shape : {glove_matrix.shape}")

Glove matrix shape : (905, 300)


In [31]:
# Word2vec
word2vec_matrix_postprocessed = get_processed_embeddings(word2vec_matrix, n_components = 1)
print(f"Word2vec matrix postprocess shape : {word2vec_matrix_postprocessed.shape}")

# Glove
glove_matrix_postprocessed = get_processed_embeddings(glove_matrix, n_components = 1)
print(f"Glove matrix postprocess shape : {glove_matrix_postprocessed.shape}")

Word2vec matrix postprocess shape : (905, 300)
Glove matrix postprocess shape : (905, 300)
