
<h1 style="font-family:verdana;font-size:300%;text-align:center;background-color:#f2f2f2;color:#0d0d0d">AMMI NLP - Review sessions</h1>

<h1 style="font-family:verdana;font-size:180%;text-align:Center;color:#993333"> Lab 2: Introduction to wordvectors </h1>

**Big thanks to Amr Khalifa who improved this lab and made it to a Jupyter Notebook!**

In [1]:
import io, sys
import numpy as np

In [2]:
def load_vectors(filename):
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
    return data

In [4]:
# Loading word vectors

print('')
print(' ** Word vectors ** ')
print('')

'''
word_vectors is a dictionary that maps words to their numerical word vector
[word (string)] = [np-array] 
'''
word_vectors = load_vectors('wiki.en.vec')

tree_vector = word_vectors['tree']
print(type(tree_vector), len(tree_vector))


 ** Word vectors ** 

<class 'numpy.ndarray'> 300


In [5]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    '''
    Parameters:
    u : 1-D numpy array
    v : 1-D numpy array 
    
    Returns:
    cos (float) : value of the cosine similairy between vectors u, v 
    '''
    # norm_u = round(np.sqrt(sum([a*a for a in u])),3)
    # norm_v = round(np.sqrt(sum([a*a for a in v])),3)
    # uv = sum(a*b for a,b in zip(u,v))
    # cos = uv/(norm_u * norm_v)
    norm_u = np.sqrt(u.T@u)
    norm_v = np.sqrt(v.T@v)
    
    ## FILL CODE
    #print(u.T@v)
    cos = u.T@v/(norm_u * norm_v)
    return cos 


In [6]:
# compute similarity between words

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [7]:
## Functions for nearest neighbor
## This function returns the word corresponding to 
## nearest neighbor vector of x
## The list exclude_words can be used to exclude some
## words from the nearest neighbors search

def nearest_neighbor(x, word_vectors, exclude_words=[]):
    '''
    Parameters:
    x (string): word to find its nearest neighbour 
    word_vectors (Python dict): {word (string): np-array of word vector}
    exclude_words (list of strings): words to be excluded from the search
    
    Returns:
    best_word (string) : the word whose word vector is the nearest neighbour 
    to the word vector of x
    '''
    best_score = -1.0
    best_word = None
    for word, value in word_vectors.items():
      cos = cosine(x, value)
      if cos >= best_score and word not in exclude_words:
        best_score = cos
        best_word = word
    ## FILL CODE
            
    return best_word

In [8]:
print('')
print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))


The nearest neighbor of cat is: dog


#### Hint (using python priorty queues with the heapq datastructure): 
if you don't want to store all the words and scores you can use the priortiy queue and only store the best K element so far. 

In [9]:
## This function return the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.
from heapq import  heappush, nlargest
def knn(x, vectors, k):
    '''
    Parameters:
    x (string): word to find its nearest neighbour 
    word_vectors (Python dict): {word (string): np-array of word vector}
    k (int): number of nearest neighbours to be found
    
    Returns: 
    k_nearest_neighbors (list of tuples): [(score, word), (score, word), ....]
    '''

    k_nearest_neighbors = []
    cos_array = []
    ## FILL CODE
    for word, value in word_vectors.items():
      cos = cosine(x, value)
      heappush(cos_array, (cos, word))
    k_nearest_neighbors = nlargest(k,cos_array)
    
    return k_nearest_neighbors

In [10]:
knn_cat = knn(word_vectors['cat'], word_vectors, 7)
knn_cat
print('')
print('cat')
print('--------------')
for score, word in knn(word_vectors['cat'], word_vectors, 5):
    print (word + '\t%.3f' % score)



cat
--------------
cat	1.000
cats	0.732
dog	0.638
pet	0.573
rabbit	0.549


#### Hint: 
To find the analogies, we find the nearest neighbour associated with the wordvector d
$$ d = \frac{c}{\Vert {c} \Vert} + \frac{b}{\Vert {b} \Vert} - \frac{a}{\Vert {a} \Vert}$$


In [11]:
## This function return the words d, such that a:b and c:d
## verifies the same relation

def analogy(a, b, c, word_vectors):
    '''
    Parameters:
    a (string): word a
    b (string): word b
    c (string): word c
    word_vectors (Python dict): {word (string): np-array of word vector}
    
    Returnrs: 
    the word d (string) associated with c such that c:d is similar to a:b 
    
    '''
    val_a = word_vectors[a]
    val_b = word_vectors[b]
    val_c = word_vectors[c]
    val_d = val_c/ np.sqrt(val_c.T@val_c) + val_b/ np.sqrt(val_b.T@val_b) - val_a /np.sqrt(val_a.T@val_a)
    ## FILL CODE
    d = nearest_neighbor(val_d, word_vectors, exclude_words=["paris", "france", "rome"])
    return d

In [12]:
# Word analogies

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))


france - paris + rome = italy


## A word about biases in word vectors

In [13]:
## A word about biases in word vectors:

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [14]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    '''
    Parameters:
    w (string): word w
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}
    
    Returnrs: 
    strength (float): the value of the association strength 
    '''
    
    strength = 0.0
    part_a = 0.0
    part_b = 0.0 

    for a in A:
      cos_a= cosine(word_vectors[w],word_vectors[a])
      part_a = part_a + cos_a
    part_a /= len(A)
    for b in B:
      cos_b= cosine(word_vectors[w],word_vectors[b])
      part_b = part_b + cos_b
    part_b /= len(B)
    strength = part_a- part_b
    ## FILL CODE
    return strength

In [15]:
## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    '''
    Parameters:
    X (list of strings): The words belonging to set X
    Y (list of strings): The words belonging to set Y
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}
    
    Returns: 
    score (float): the value of the group association strength  
    '''
    
    score = 0.0
    asso_x = 0.0
    asso_y = 0.0
    for x in X:
      strength_x = association_strength(x, A, B, vectors)
      asso_x = asso_x + strength_x
    for y in Y:
      strength_y = association_strength(y, A, B, vectors)
      asso_y = asso_y + strength_y
    score = asso_x - asso_y
    ## FILL CODE
    return score

In [16]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

career = ['executive', 'management', 'professional', 'corporation', 
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847


## Word translation using word vectors

In the following, we will use word vectors in English and French to translate words from English to French. The idea is to learn a linear function that maps English word vectors to their correponding French word vectors. To learn this linear mapping, we will use a small bilingual lexicon, that contains pairs of words in English and French that are translations of each other.

The following function will load the small English-French bilingual lexicon:

In [17]:
def load_lexicon(filename):
    '''
    Parameters:
    filename(string): the path of the lexicon
    
    Returns:
    data(list of pairs of string): the bilingual lexicon
    '''
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    data = []
    for line in fin:
        a, b = line.rstrip().split(' ')
        data.append((a, b))
    return data

In [18]:
word_vectors_en = load_vectors('wiki.en.vec')
word_vectors_fr = load_vectors('wiki.fr.vec')
lexicon = load_lexicon("lexicon-en-fr.txt")
print(lexicon[:5])

[('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut')]


In [19]:
# We split the lexicon into a train and validation set
train = lexicon[:5000]
valid = lexicon[5000:5100]

The following function will learn the mapping from English to French. The idea is to build two matrices $X_{\text{en}}$ and $X_{\text{fr}}$, and to find a mapping $M$ that minimizes $||X_{\text{en}} W - X_{\text{fr}} ||_2$. In numpy, this mapping can be obtained by using the `numpy.linalg.lstsq` function.

In [20]:
def align(word_vectors_en, word_vectors_fr, lexicon):
    '''
    Parameters:
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    lexicon(list of pairs of string): bilingual training lexicon
    
    Returns
    mapping(np.array): the mapping from English to French vectors
    '''
    x_en, x_fr = [], []
    for english, french in lexicon:
      ang = word_vectors_en[english]
      fra = word_vectors_fr[french]

      x_en.append(ang)
      x_fr.append(fra)

    mapper = np.linalg.lstsq(x_en, x_fr,rcond=False)[0]
    ## FILL CODE
    
    return mapper

In [21]:
mapping = align(word_vectors_en, word_vectors_fr, lexicon)

Given a mapping, a set of word English word vector and French word vectors, the next function will translate the English word to French. To do so, we apply the mapping on the English word, and retrieve the nearest neighbor of the obtained vector in the set of French word vectors. The translation is then the corresponding French word.

In [22]:
def translate(word, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    word(string): an English word
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors
    
    Returns
    A string containing the translation of the English word
    '''
    word_en =word_vectors_en[word]
    vect_fr = word_en@mapping
    word_pred = nearest_neighbor(vect_fr, word_vectors_fr)
    ## FILL CODE

    return word_pred

In [23]:
print(translate("dog", word_vectors_en, word_vectors_fr, mapping))
print(translate("cats", word_vectors_en, word_vectors_fr, mapping))
print(translate("learning", word_vectors_en, word_vectors_fr, mapping))

chien
chats
apprentissage


Finally, let's implement a function to evaluate this method on the validation lexicon:

In [24]:
def evaluate(valid, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    valid(a list of pairs of string): the validation lexicon
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors
    
    Returns
    Accuracy(float): the accuracy on the validation lexicon
    '''
    acc, n = 0.0, 0
    for x, y in valid:
      word_pred = translate(x, word_vectors_en, word_vectors_fr, mapping)
      y_equal= np.where(word_pred==y,1,0)
      if y_equal == 1:
        acc +=1
    acc = acc/len(valid)
    ## FILL CODE

    return acc

In [25]:
evaluate(valid, word_vectors_en, word_vectors_fr, mapping)

0.64