In [1]:
import warnings
# Install a blanket "ignore" filter so no warning is shown in any later cell.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries; consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [2]:
import pdb
import pickle
import string
import time
import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import scipy
import sklearn

from gensim.models import KeyedVectors
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from utils import (cosine_similarity, get_dict, process_tweet)
from os import getcwd

In [3]:
# Add <current working directory>/Datasets/ to the list of directories NLTK
# searches for its corpora.
filePath = f"{getcwd()}/Datasets/"
nltk.data.path.append(filePath)

# Load the pretrained embeddings from word2vec-format files in the working
# directory: the English file is read in binary mode, the French file with the
# loader's default (non-binary) setting.
# NOTE(review): presumably both files are large downloads — confirm they are
# present before running this cell.
en_embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
fr_embeddings = KeyedVectors.load_word2vec_format('./wiki.multi.fr.vec')

In [4]:
# Load the English-to-French training dictionary with utils.get_dict and
# report how many entries it holds. Later cells call .keys()/.values()/.items()
# on the result, so it is used as a mapping from English word to French word.
en_fr_train = get_dict('en-fr.train.txt')
print(f'The Length of the English to French Training Dictionary is: {len(en_fr_train)}')

The Length of the English to French Training Dictionary is: 5000


In [5]:
# Load the held-out English-to-French test dictionary the same way and report
# its size; it is used further down to build the evaluation matrices.
en_fr_test = get_dict('en-fr.test.txt')
print(f'The Length of the English to French Test Dictionary is: {len(en_fr_test)}')

The Length of the English to French Test Dictionary is: 1500


# Build small word -> vector dictionaries restricted to the words that occur in
# the translation dictionaries, then cache them on disk so later runs do not
# need the full embedding files.
# NOTE(review): `.vocab` is read as-is from the loaded KeyedVectors objects;
# confirm it is available in the installed gensim version.
english_set = set(en_embeddings.vocab)
french_set = set(fr_embeddings.vocab)
en_embeddings_subset = {}
fr_embeddings_subset = {}

french_words = set(en_fr_train.values())

# Cover the test pairs as well as the training pairs: the evaluation cell looks
# test words up in these same subsets, so a train-only subset would silently
# drop almost every test pair.
for en_fr_pairs in (en_fr_train, en_fr_test):
    for en_word, fr_word in en_fr_pairs.items():
        # Keep a pair only when both sides have a pretrained vector.
        if fr_word in french_set and en_word in english_set:
            en_embeddings_subset[en_word] = en_embeddings[en_word]
            fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]

# Each subset goes to its own file (the French file previously received the
# English subset by mistake); `with` guarantees the handles are flushed/closed.
with open("en_embeddings.p", "wb") as en_file:
    pickle.dump(en_embeddings_subset, en_file)
with open("fr_embeddings.p", "wb") as fr_file:
    pickle.dump(fr_embeddings_subset, fr_file)

In [6]:
# Reload the cached embedding subsets. `with` closes each file handle as soon
# as it has been read instead of leaking it for the life of the kernel.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles that come from a trusted source.
with open("en_embeddings.p", "rb") as en_file:
    en_embeddings_subset = pickle.load(en_file)
with open("fr_embeddings.p", "rb") as fr_file:
    fr_embeddings_subset = pickle.load(fr_file)

In [7]:
# Load both English-to-French dictionaries and report their sizes.
# (The second message previously mislabelled the test dictionary as "Training".)
en_fr_train = get_dict('en-fr.train.txt')
print(f"The Length of the English to French Training Dictionary is: {len(en_fr_train)}")
en_fr_test = get_dict('en-fr.test.txt')
print(f"The Length of the English to French Test Dictionary is: {len(en_fr_test)}")

The Length of the English to French Training Dictionary is: 5000
The Length of the English to French Training Dictionary is: 1500


In [8]:
def get_matrices(en_fr, french_vecs, english_vecs):
    """
    Build row-aligned English and French embedding matrices from a
    translation dictionary.

    Input:
        en_fr: English to French dictionary (English word -> French word).
        french_vecs: French words to their corresponding word embeddings.
        english_vecs: English words to their corresponding word embeddings.
    Output:
        X: a matrix where each row is the embedding of an English word.
        Y: a matrix where row i is the embedding of the French translation
           of the English word stored in row i of X.
    Raises:
        ValueError: if no pair in en_fr has both of its words present in the
            embedding dictionaries (there is nothing to stack).
    """
    # X_l and Y_l are lists of the english and french word embeddings,
    # appended in lockstep so that equal indices form a translation pair.
    X_l = []
    Y_l = []

    for en_word, fr_word in en_fr.items():
        # Skip pairs for which either side has no embedding.
        if fr_word in french_vecs and en_word in english_vecs:
            X_l.append(english_vecs[en_word])
            Y_l.append(french_vecs[fr_word])

    if not X_l:
        # np.vstack would also raise ValueError here, but with a message that
        # says nothing about the actual cause.
        raise ValueError(
            "No dictionary pair has embeddings for both its English and French word."
        )

    X = np.vstack(X_l)
    Y = np.vstack(Y_l)
    return X, Y

In [9]:
# Build the row-aligned training matrices from the training dictionary.
# Argument order matters: get_matrices takes the French vectors before the
# English ones.
X_train, Y_train = get_matrices(en_fr_train,fr_embeddings_subset, en_embeddings_subset)

In [10]:
# Display both shapes; the row counts match because get_matrices appends to
# the English and French lists together, one pair at a time.
X_train.shape, Y_train.shape

((4932, 300), (4932, 300))

In [11]:
def compute_loss(X, Y, R):
    """
    Mean (over the rows of X) of the squared Frobenius norm of X R - Y.

    Input:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings, same shape as X R.
        R: transformation matrix applied to X.
    Output:
        loss: sum of all squared entries of (X R - Y) divided by the number
              of rows in X.
    """
    residual = np.dot(X, R) - Y
    n_rows = X.shape[0]
    return np.square(residual).sum() / n_rows

In [12]:
def compute_gradient(X, Y, R):
    """
    Gradient of the loss from compute_loss with respect to R.

    Input:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings.
        R: current transformation matrix.
    Output:
        gradient: X^T (X R - Y) scaled by 2 / (number of rows in X); it has
                  the same shape as R.
    """
    n_rows = X.shape[0]
    residual = np.dot(X, R) - Y
    scale = 2 / n_rows
    return np.dot(X.T, residual) * scale

In [13]:
def align_embeddings(X, Y, train_steps=100, learning_rate=0.0003):
    """
    Learn a square matrix R that maps the rows of X onto the rows of Y by
    plain gradient descent on compute_loss.

    Input:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings.
        train_steps: number of gradient-descent updates to perform.
        learning_rate: step size applied to each gradient.
    Output:
        R: the (X.shape[1], X.shape[1]) matrix after the final update.

    Side effects:
        Reseeds NumPy's global random generator with 129 on every call, so
        the starting R is the same each time; prints the loss every 25th
        step, starting at step 0.
    """
    np.random.seed(129)
    dim = X.shape[1]
    R = np.random.rand(dim, dim)

    for step in range(train_steps):
        if not step % 25:
            # Loss is reported before the update made at this step.
            loss = compute_loss(X, Y, R)
            print(f"Loss at Iteration {step} is: {loss:.4f}")
        R -= learning_rate * compute_gradient(X, Y, R)

    return R

In [14]:
# Smoke-test align_embeddings on a tiny random problem with the default
# 100 steps and learning rate; the loss printed every 25 steps should fall.
# Seed here fixes the random X and Y; align_embeddings reseeds with the same
# value itself before drawing its initial R.
np.random.seed(129)

m = 10  # number of rows (samples)
n = 5   # number of columns (embedding dimensions)
X = np.random.rand(m, n)
Y = np.random.rand(m, n) * .1  # targets scaled down by a factor of 10
R = align_embeddings(X, Y)

Loss at Iteration 0 is: 3.7242
Loss at Iteration 25 is: 3.6283
Loss at Iteration 50 is: 3.5350
Loss at Iteration 75 is: 3.4442


In [15]:
# Fit the transformation on the real training matrices, overriding the
# defaults with 400 steps and a learning rate of 0.8.
R_train = align_embeddings(X_train, Y_train, train_steps=400, learning_rate=0.8)

Loss at Iteration 0 is: 963.0146
Loss at Iteration 25 is: 97.8292
Loss at Iteration 50 is: 26.8329
Loss at Iteration 75 is: 9.7893
Loss at Iteration 100 is: 4.3776
Loss at Iteration 125 is: 2.3281
Loss at Iteration 150 is: 1.4480
Loss at Iteration 175 is: 1.0338
Loss at Iteration 200 is: 0.8251
Loss at Iteration 225 is: 0.7145
Loss at Iteration 250 is: 0.6534
Loss at Iteration 275 is: 0.6185
Loss at Iteration 300 is: 0.5981
Loss at Iteration 325 is: 0.5858
Loss at Iteration 350 is: 0.5782
Loss at Iteration 375 is: 0.5735


In [16]:
def nearest_neighbour(v, candidates, k=1):
    """
    Find the k candidates whose cosine similarity to v is largest.

    Input:
        v: the query vector.
        candidates: an iterable of vectors (e.g. the rows of a matrix) to
                    compare against v.
        k: how many neighbours to return (default 1).
    Output:
        k_idx: array of indices into candidates, ordered by increasing
               similarity, so the best match is the last element. It holds
               fewer than k entries when there are fewer than k candidates
               and is empty when k <= 0.
    """
    # NOTE(review): cosine_similarity comes from utils; presumably a larger
    # value means "more similar" — confirm against that helper.
    similarity_l = [cosine_similarity(v, row) for row in candidates]

    # argsort is ascending, so the most similar candidates sit at the end.
    sorted_ids = np.argsort(similarity_l)

    if k <= 0:
        # `sorted_ids[-0:]` is the whole array, not an empty one, so a
        # request for zero neighbours has to be handled explicitly.
        return sorted_ids[:0]

    return sorted_ids[-k:]

In [18]:
def test_vocabulary(X, Y, R):
    """
    Measure how often a projected row of X lands closest to its own row of Y.

    Input:
        X: matrix whose rows are the source embeddings.
        Y: matrix whose rows are the target embeddings, row-aligned with X.
        R: transformation matrix applied to X.
    Output:
        accuracy: fraction of rows i for which the nearest neighbour of
                  (X R)[i] among the rows of Y is row i.
    """
    projected = np.dot(X, R)
    total = len(projected)

    hits = 0
    for row_id, vec in enumerate(projected):
        # nearest_neighbour is called with its default k=1.
        if nearest_neighbour(vec, Y) == row_id:
            hits += 1

    return hits / total

In [19]:
# Build the evaluation matrices from the test dictionary, looking words up in
# the same embedding subsets that were used for training.
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings_subset, en_embeddings_subset)

In [20]:
# Score the trained transformation on the held-out pairs. test_vocabulary runs
# a nearest-neighbour search over every row of Y_val for each row of X_val.
acc = test_vocabulary(X_val, Y_val, R_train)

In [21]:
# Report the accuracy as a fraction in [0, 1], rounded to three decimals.
print(f"Accuracy on Test Set is: {acc:.3f}")

Accuracy on Test Set is: 0.557


In [22]:
# Read the raw tweet strings from NLTK's twitter_samples corpus and join them
# into one list, with all positive tweets placed before the negative ones.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets