In [1]:
import pickle
import string

import time
from tqdm import tqdm
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import scipy
import sklearn
from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

from utils import (cosine_similarity, get_dict,
                   process_tweet)
from os import getcwd



In [2]:
from gensim.models import KeyedVectors

en_embeddings = KeyedVectors.load_word2vec_format('word_vecs/GoogleNews-vectors-negative300.bin', binary = True)
fr_embeddings = KeyedVectors.load_word2vec_format('word_vecs/wiki.r.vec')

In [3]:
en_fr_train = get_dict('en-fr.train.txt')
en_fr_test = get_dict('en-fr.test.txt')

english_set = set(en_embeddings.key_to_index)
french_set = set(fr_embeddings.key_to_index)
en_embeddings_subset = {}
fr_embeddings_subset = {}
french_words = set(en_fr_train.values())

In [4]:
for en_word in en_fr_train.keys():
    fr_word = en_fr_train[en_word]
    if fr_word in french_set and en_word in english_set:
        en_embeddings_subset[en_word] = en_embeddings[en_word]
        fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]


In [5]:
for en_word in en_fr_test.keys():
    fr_word = en_fr_test[en_word]
    if fr_word in french_set and en_word in english_set:
        en_embeddings_subset[en_word] = en_embeddings[en_word]
        fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]

In [6]:
def generate_translation_embeddings(en_fr,en_embed,fr_embed):
    X = []
    Y = []
    english_set = en_embed.keys()
    french_set = fr_embed.keys()
    
    for eng,fre in en_fr.items():
        if eng in english_set and fre in french_set:
            X.append(en_embed[eng])
            Y.append(fr_embed[fre])
            
            
    X = np.vstack(X)
    Y = np.vstack(Y)
    return (X,Y)


# getting the training set:
X_train, Y_train = generate_translation_embeddings(
    en_fr_train, fr_embeddings_subset, en_embeddings_subset)
        
        

In [7]:
def computing_loss(X,Y,R):
    
    m = X.shape[0]
    diff = np.dot(X,R)- Y
    diff_squared = diff**2
    sum_diff_squared = np.sum(diff_squared)
    loss = sum_diff_squared/m
    return loss


def train(X,Y,iters=10000, alpha = 0.1):
    m = X.shape[0]
    np.random.seed(123)
    #initialize R
    R = np.random.rand(X.shape[1], X.shape[1])
    
    for i in tqdm(range(iters)):
        
        gradient =  np.dot(X.transpose(),np.dot(X,R)-Y) * (2/m)
        
        R -= alpha * gradient
        
    return R

In [12]:
R_train = train(X_train,Y_train,iters=500,alpha=0.1)

100%|██████████| 500/500 [00:01<00:00, 289.64it/s]


In [13]:
def KNN(v, candidates, k = 1):
    
    similarity_l = []

    # for each candidate vector...
    for row in candidates:
        # get the cosine similarity
        cos_similarity = cosine_similarity(v,row)

        # append the similarity to the list
        similarity_l.append(cos_similarity)
        
    # sort the similarity list and get the indices of the sorted list
    sorted_ids = np.argsort(similarity_l)

    # get the indices of the k most similar candidate vectors
    k_idx = sorted_ids[-k:]
    ### END CODE HERE ###
    return k_idx
        


In [14]:
def test_accuracy(X,Y,R):
    
    prediction = np.dot(X,R)
    num_correct = 0
    
    for i in tqdm(range(len(prediction))):
        pred_idx = KNN(prediction[i],Y)
        if pred_idx == i:
            num_correct +=1
            
            
    accuracy = num_correct/len(prediction)
            
    return accuracy

In [17]:

R_train = train(X_train,Y_train,iters=100,alpha=.8)
X_val,Y_val = generate_translation_embeddings(en_fr_test, en_embeddings_subset,fr_embeddings_subset)
accuracy = test_accuracy(X_val,Y_val,R_train)
print(accuracy)

100%|██████████| 100/100 [00:00<00:00, 265.88it/s]
100%|██████████| 1438/1438 [00:19<00:00, 72.76it/s]

0.0006954102920723226



