
<h1 style="font-family:verdana;font-size:300%;text-align:center;background-color:#f2f2f2;color:#0d0d0d">MMI_2024_NLP - Week 1</h1>

<h1 style="font-family:verdana;font-size:180%;text-align:Center;color:#993333"> Lab 2: Introduction to wordvectors </h1>


In [1]:
import io, sys
import numpy as np

In [17]:

def load_vectors(filename):
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
    return data

In [18]:
# Loading word vectors

print('')
print(' ** Word vectors ** ')
print('')

'''
word_vectors is a dictionary that maps words to their numerical word vector
[word (string)] = [np-array]
'''
word_vectors = load_vectors('wiki.en.vec')

tree_vector = word_vectors['tree']
print(type(tree_vector), len(tree_vector))


 ** Word vectors ** 

<class 'numpy.ndarray'> 300


In [19]:
## This function computes the cosine similarity between vectors u and v
import numpy as np

def cosine(u: np.ndarray, v: np.ndarray) -> float:
    '''
    Parameters:
    u : 1-D numpy array
    v : 1-D numpy array

    Returns:
    cos (float) : value of the cosine similarity between vectors u, v
    '''
    ##########################################################################
    #                      TODO: Implement this function                     #
    ##########################################################################
    # Compute the dot product of u and v
    dot_prod = np.dot(u, v)
    
    # Compute the L2 norm (magnitude) of u and v
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    
    # Compute the cosine similarity
    cos = dot_prod / (norm_u * norm_v)
    ##########################################################################
    #                            END OF YOUR CODE                            #
    ##########################################################################
    
    return cos

In [5]:
# compute similarity between words
print (f"test similarity {cosine(np.array([1,0,0]),np.array([1,0,0]))}", )
print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

test similarity 1.0
similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [29]:
## Functions for nearest neighbor
## This function returns the word corresponding to
## nearest neighbor vector of x
## The list exclude_words can be used to exclude some
## words from the nearest neighbors search

import numpy as np

def nearest_neighbor(x: str, word_vectors: dict, exclude_words: list = []) -> str:
    '''
    Parameters:
    x (string): word to find its nearest neighbour
    word_vectors (Python dict): {word (string): np-array of word vector}
    exclude_words (list of strings): words to be excluded from the search

    Returns:
    best_word (string) : the word whose word vector is the nearest neighbour
    to the word vector of x
    '''
    best_score = -1.0
    best_word = None
    ##########################################################################
    #                      TODO: Implement this function                     #
    ##########################################################################
    #print(type(x))
    x_vector = x 
    if x_vector is None:
        raise ValueError(f"The word '{x}' is not in the word_vectors dictionary.")
    
    for word, vector in word_vectors.items():
        if word != x and word not in exclude_words:
            score = cosine(x_vector, vector)
            if score > best_score:
                best_score = score
                best_word = word
    
    return best_word
    ##########################################################################
    #                            END OF YOUR CODE                            #
    ##########################################################################

In [30]:
print('')
print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))




  if word != x and word not in exclude_words:


The nearest neighbor of cat is: dog


#### Hint (using python priorty queues with the heapq datastructure):
if you don't want to store all the words and scores you can use the priortiy queue and only store the best K element so far.

In [33]:
## This function return the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.
import heapq

def knn(x: str, word_vectors: dict, k: int):
    '''
    Parameters:
    x (string): word to find its nearest neighbour
    word_vectors (Python dict): {word (string): np-array of word vector}
    k (int): number of nearest neighbours to be found

    Returns:
    k_nearest_neighbors (list of tuples): [(score, word), (score, word), ....]
    '''
    k_nearest_neighbors = []
    ##########################################################################
    #                      TODO: Implement this function                     #
    ##########################################################################
    x_vector = x 
    
    if x_vector is None:
        raise ValueError(f"The word '{x}' is not in the word_vectors dictionary.")
    
    for word, vector in word_vectors.items():
        if word != x:
            score = cosine(x_vector, vector)
            heapq.heappush(k_nearest_neighbors, (score, word))
            if len(k_nearest_neighbors) > k:
                heapq.heappop(k_nearest_neighbors)
    
    k_nearest_neighbors.sort(reverse=True, key=lambda x: x[0])  # Sort by score in descending order
    return k_nearest_neighbors
    ##########################################################################
    #                            END OF YOUR CODE                            #
    ##########################################################################

In [34]:
knn_cat = knn(word_vectors['cat'], word_vectors, 5)
print('')
print('cat')
print('--------------')
for score, word in knn(word_vectors['cat'], word_vectors, 5):
    print (word + '\t%.3f' % score)


  if word != x:



cat
--------------
cat	1.000
cats	0.732
dog	0.638
pet	0.573
rabbit	0.549


#### Hint:
To find the analogies, we find the nearest neighbour associated with the wordvector d
$$ d = \frac{c}{\Vert {c} \Vert} + \frac{b}{\Vert {b} \Vert} - \frac{a}{\Vert {a} \Vert}$$


In [35]:

import numpy as np

def analogy(a: str, b: str, c: str, word_vectors: dict) -> str:
    '''
    Parameters:
    a (string): word a
    b (string): word b
    c (string): word c
    word_vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    the word d (string) associated with c such that c:d is similar to a:b
    '''
    ##########################################################################
    #                      TODO: Implement this function                     #
    ##########################################################################
    a_vector = word_vectors.get(a)
    b_vector = word_vectors.get(b)
    c_vector = word_vectors.get(c)
    
    if a_vector is None or b_vector is None or c_vector is None:
        raise ValueError(f"One or more words are not in the word_vectors dictionary.")
    
    target_vector = b_vector - a_vector + c_vector
    
    best_score = -1.0
    best_word = None
    
    for word, vector in word_vectors.items():
        if word not in [a, b, c]:
            score = cosine(target_vector, vector)
            if score > best_score:
                best_score = score
                best_word = word
                
    return best_word
    ##########################################################################
    #                            END OF YOUR CODE                            #
    ##########################################################################

In [36]:
# Word analogies
print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))


france - paris + rome = italy


## A word about biases in word vectors

In [38]:
## A word about biases in word vectors:
print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [39]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B
import numpy as np

def association_strength(w: str, A: list, B: list, vectors: dict) -> float:
    '''
    Parameters:
    w (string): word w
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    strength (float): the value of the association strength
    '''
    strength = 0.0
    part_a = 0.0
    part_b = 0.0
    ##########################################################################
    #                      TODO: Implement this function                     #
    ##########################################################################
    
    w_vector = vectors.get(w)
    if w_vector is None:
        raise ValueError(f"The word '{w}' is not in the vectors dictionary.")
    
    for a in A:
        a_vector = vectors.get(a)
        if a_vector is not None:
            part_a += cosine(w_vector, a_vector)
    
    for b in B:
        b_vector = vectors.get(b)
        if b_vector is not None:
            part_b += cosine(w_vector, b_vector)
    
    # Normalize by the number of words in sets A and B to get the average cosine similarity
    part_a /= len(A)
    part_b /= len(B)
    
    # Association strength is the difference between the average similarities
    strength = part_a - part_b
    
    return strength
    ##########################################################################
    #                            END OF YOUR CODE                            #
    ##########################################################################

In [40]:
## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X: list, Y: list, A: list, B: list, vectors: dict) -> float:
    '''
    Parameters:
    X (list of strings): The words belonging to set X
    Y (list of strings): The words belonging to set Y
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    score (float): the value of the group association strength
    '''
    score_X = 0.0
    score_Y = 0.0
    ##########################################################################
    #                      TODO: Implement this function                     #
    ##########################################################################
    for x in X:
        score_X += association_strength(x, A, B, vectors)
    
    for y in Y:
        score_Y += association_strength(y, A, B, vectors)
    
    score = score_X - score_Y
    
    return score
    ##########################################################################
    #                            END OF YOUR CODE                            #
    ##########################################################################

In [42]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

career = ['executive', 'management', 'professional', 'corporation',
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847


## Word translation using word vectors

In the following, we will use word vectors in English and French to translate words from English to French. The idea is to learn a linear function that maps English word vectors to their correponding French word vectors. To learn this linear mapping, we will use a small bilingual lexicon, that contains pairs of words in English and French that are translations of each other.

The following function will load the small English-French bilingual lexicon:

In [43]:
def load_lexicon(filename):
    '''
    Parameters:
    filename(string): the path of the lexicon

    Returns:
    data(list of pairs of string): the bilingual lexicon
    '''
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    data = []
    for line in fin:
        a, b = line.rstrip().split(' ')
        data.append((a, b))
    return data

In [45]:
word_vectors_en = load_vectors('wiki.en.vec')
word_vectors_fr = load_vectors('wiki.fr.vec')
lexicon = load_lexicon("lexicon-en-fr.txt")
print(lexicon[:5])

[('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut')]


In [46]:
# We split the lexicon into a train and validation set
train = lexicon[:5000]
valid = lexicon[5000:5100]

The following function will learn the mapping from English to French. The idea is to build two matrices $X_{\text{en}}$ and $X_{\text{fr}}$, and to find a mapping $M$ that minimizes $||X_{\text{en}} W - X_{\text{fr}} ||_2$. In numpy, this mapping can be obtained by using the `numpy.linalg.lstsq` function.

In [47]:
def align(word_vectors_en, word_vectors_fr, lexicon):
    '''
    Parameters:
    word_vectors_en (dict: string -> np.array): English word vectors
    word_vectors_fr (dict: string -> np.array): French word vectors
    lexicon (list of pairs of string): bilingual training lexicon

    Returns:
    mapping (np.array): the mapping from English to French vectors
    '''
    x_en, x_fr = [], []
    
    for en_word, fr_word in lexicon:
        if en_word in word_vectors_en and fr_word in word_vectors_fr:
            x_en.append(word_vectors_en[en_word])
            x_fr.append(word_vectors_fr[fr_word])
    
    x_en = np.array(x_en)
    x_fr = np.array(x_fr)
    
    # Compute the mapping using the least squares solution
    mapping, _, _, _ = np.linalg.lstsq(x_en, x_fr, rcond=None)
    
    return mapping

In [48]:
mapping = align(word_vectors_en, word_vectors_fr, lexicon)
mapping

array([[-0.06183285, -0.01071552,  0.00175985, ..., -0.01107046,
         0.01629405, -0.01644996],
       [-0.01655313, -0.02930488,  0.09810107, ..., -0.01744702,
        -0.02848298,  0.02070179],
       [-0.01970861, -0.0147154 ,  0.01231819, ...,  0.03036093,
        -0.00209909, -0.00944313],
       ...,
       [ 0.0669847 ,  0.02351181,  0.02041902, ...,  0.00886501,
         0.08635366,  0.00595836],
       [ 0.01936122,  0.00552446,  0.01234669, ..., -0.00623332,
        -0.05116348,  0.05634361],
       [ 0.00530333, -0.03424679, -0.03369923, ..., -0.01344391,
        -0.00051053, -0.00491391]])

Given a mapping, a set of word English word vector and French word vectors, the next function will translate the English word to French. To do so, we apply the mapping on the English word, and retrieve the nearest neighbor of the obtained vector in the set of French word vectors. The translation is then the corresponding French word.

In [49]:
def translate(word, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    word (string): an English word
    word_vectors_en (dict: string -> np.array): English word vectors
    word_vectors_fr (dict: string -> np.array): French word vectors
    mapping (np.array): the mapping from English to French vectors

    Returns:
    A string containing the translation of the English word
    '''
    if word not in word_vectors_en:
        raise ValueError(f"The word '{word}' is not in the English word vectors dictionary.")
    
    # Get the English word vector
    en_vector = word_vectors_en[word]
    
    # Apply the mapping to get the corresponding French vector
    mapped_vector = np.dot(en_vector, mapping)
    
    # Find the nearest neighbor in the set of French word vectors
    best_score = -1.0
    best_word = None
    
    for fr_word, fr_vector in word_vectors_fr.items():
        score = cosine(mapped_vector, fr_vector)
        if score > best_score:
            best_score = score
            best_word = fr_word
    
    return best_word

In [50]:
print(translate("man", word_vectors_en, word_vectors_fr, mapping))
print(translate("machine", word_vectors_en, word_vectors_fr, mapping))
print(translate("learning", word_vectors_en, word_vectors_fr, mapping))

homme
machine
apprentissage


Finally, let's implement a function to evaluate this method on the validation lexicon:

In [51]:
def evaluate(valid, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    valid (a list of pairs of string): the validation lexicon
    word_vectors_en (dict: string -> np.array): English word vectors
    word_vectors_fr (dict: string -> np.array): French word vectors
    mapping (np.array): the mapping from English to French vectors

    Returns:
    Accuracy (float): the accuracy on the validation lexicon
    '''
    acc, n = 0.0, 0
    
    for en_word, fr_word in valid:
        if en_word in word_vectors_en and fr_word in word_vectors_fr:
            translated_word = translate(en_word, word_vectors_en, word_vectors_fr, mapping)
            if translated_word == fr_word:
                acc += 1
            n += 1
    
    if n == 0:
        return 0.0
    
    return acc / n

In [52]:
evaluate(valid, word_vectors_en, word_vectors_fr, mapping)

0.64