<a href="https://colab.research.google.com/github/Muhirwakyeyune/AMMI-NATURAL-LANGUAGE-PROCESSING/blob/main/Muhirwa_Salomon_intro_to_wordvectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<h1 style="font-family:verdana;font-size:300%;text-align:center;background-color:#f2f2f2;color:#0d0d0d">AMMI NLP - Review sessions</h1>

<h1 style="font-family:verdana;font-size:180%;text-align:Center;color:#993333"> Lab 2: Introduction to wordvectors </h1>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Big thanks to Amr Khalifa who improved this lab and made it to a Jupyter Notebook!**

In [2]:
import io, sys
import numpy as np

In [13]:
def load_vectors(filename):
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
    return data

In [14]:
# Loading word vectors

print('')
print(' ** Word vectors ** ')
print('')

'''
word_vectors is a dictionary that maps words to their numerical word vector
[word (string)] = [np-array]
'''
word_vectors = load_vectors('/content/drive/MyDrive/session2/wiki.en.vec')

tree_vector = word_vectors['tree']
print(type(tree_vector), len(tree_vector))


 ** Word vectors ** 

<class 'numpy.ndarray'> 300


In [15]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    '''
    Parameters:
    u : 1-D numpy array
    v : 1-D numpy array

    Returns:
    cos (float) : value of the cosine similairy between vectors u, v
    '''

    ## FILL CODE
    dot_product=np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    cos = dot_product / (norm_u * norm_v)


    return cos


In [16]:
# compute similarity between words

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [17]:
def nearest_neighbor(x, word_vectors, exclude_words=[]):
    '''
    Parameters:
    x (string): word to find its nearest neighbour
    word_vectors (Python dict): {word (string): np-array of word vector}
    exclude_words (list of strings): words to be excluded from the search

    Returns:
    best_word (string) : the word whose word vector is the nearest neighbour
    to the word vector of x
    '''
    best_score = -1.0
    best_word = None

    # Iterate through each word in the word_vectors dictionary
    for word, vector in word_vectors.items():
        # Check if the word should be excluded from the search
        if word in exclude_words:
            continue

        # Calculate the cosine similarity score between x and the current word vector
        score = cosine(x, vector)  # cosine() is a function to calculate cosine similarity

        # Update the best score and best word if the current score is higher
        if score > best_score:
            best_score = score
            best_word = word

    # Return the word with the highest similarity score (nearest neighbor)
    return best_word


In [18]:
print('')
print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))


The nearest neighbor of cat is: dog


#### Hint (using python priorty queues with the heapq datastructure):
if you don't want to store all the words and scores you can use the priortiy queue and only store the best K element so far.

In [30]:
import heapq
import numpy as np

## This function returns the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.

def knn(x, vectors, k):
    '''
    Parameters:
    x (string): word to find its nearest neighbour
    vectors (Python dict): {word (string): np-array of word vector}
    k (int): number of nearest neighbours to be found

    Returns:
    k_nearest_neighbors (list of tuples): [(score, word), (score, word), ....]
    '''

    k_nearest_neighbors = []
    x_vector = vectors[x]  # Fetch the word vector corresponding to x

    for word, vector in vectors.items():
        if word != x:
            score = -np.linalg.norm(vector - x_vector)  # Negative Euclidean distance for max heap
            heapq.heappush(k_nearest_neighbors, (score, word))

            if len(k_nearest_neighbors) > k:
                heapq.heappop(k_nearest_neighbors)

    k_nearest_neighbors = sorted(k_nearest_neighbors, key=lambda x: x[0], reverse=True)

    return k_nearest_neighbors



In [31]:

knn_cat = knn('cat', word_vectors, 5)  # Pass 'cat' as the word, not the vector
print('')
print('cat')
print('--------------')
for score, word in knn_cat:
    print(word + '\t%.3f' % score)



cat
--------------
cats	-3.225
dog	-3.478
pet	-3.889
dogs	-4.050
rabbit	-4.065


#### Hint:
To find the analogies, we find the nearest neighbour associated with the wordvector d
$$ d = \frac{c}{\Vert {c} \Vert} + \frac{b}{\Vert {b} \Vert} - \frac{a}{\Vert {a} \Vert}$$


In [None]:
## This function return the words d, such that a:b and c:d
## verifies the same relation

def analogy(a, b, c, word_vectors):
    '''
    Parameters:
    a (string): word a
    b (string): word b
    c (string): word c
    word_vectors (Python dict): {word (string): np-array of word vector}

    Returnrs:
    the word d (string) associated with c such that c:d is similar to a:b

    '''

    ## FILL CODE

    return None

In [34]:
import numpy as np

def analogy(a, b, c, word_vectors):
    '''
    Parameters:
    a (string): word a
    b (string): word b
    c (string): word c
    word_vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    the word d (string) associated with c such that c:d is similar to a:b
    '''

    # Retrieve the word vectors for a, b, and c from the word_vectors dictionary
    a_vector = word_vectors[a]
    b_vector = word_vectors[b]
    c_vector = word_vectors[c]

    # Compute the vector d using the provided equation
    d_vector = c_vector / np.linalg.norm(c_vector) + (b_vector / np.linalg.norm(b_vector)) - (a_vector / np.linalg.norm(a_vector))

    # Initialize variables for tracking the best score and best word
    best_score = -1.0
    best_word = None

    # Iterate over the word vectors in the word_vectors dictionary
    for word, vector in word_vectors.items():
        # Exclude a, b, and c from consideration
        if word != a and word != b and word != c:
            # Calculate the cosine similarity score between the word vector and d_vector
            score = np.dot(vector, d_vector) / (np.linalg.norm(vector) * np.linalg.norm(d_vector))
            # Update the best_score and best_word if a higher score is found
            if score > best_score:
                best_score = score
                best_word = word

    # Return the best_word, which is the word d associated with c that satisfies the analogy relation
    return best_word


In [35]:
# Word analogies

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))


france - paris + rome = italy


## A word about biases in word vectors

In [36]:
## A word about biases in word vectors:

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [37]:
import numpy as np

def association_strength(w, A, B, vectors):
    '''
    Parameters:
    w (string): word w
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    strength (float): the value of the association strength
    '''

    strength = 0.0
    part_a = 0.0
    part_b = 0.0

    # Compute part_a: the average similarity between word w and the words in set A
    for word in A:
        part_a += np.dot(vectors[w], vectors[word]) / (np.linalg.norm(vectors[w]) * np.linalg.norm(vectors[word]))
    part_a /= len(A)

    # Compute part_b: the average similarity between word w and the words in set B
    for word in B:
        part_b += np.dot(vectors[w], vectors[word]) / (np.linalg.norm(vectors[w]) * np.linalg.norm(vectors[word]))
    part_b /= len(B)

    # Compute the association strength by subtracting part_b from part_a
    strength = part_a - part_b

    return strength


In [38]:
import numpy as np

def weat(X, Y, A, B, vectors):
    '''
    Parameters:
    X (list of strings): The words belonging to set X
    Y (list of strings): The words belonging to set Y
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}

    Returns:
    score (float): the value of the group association strength
    '''

    score = 0.0

    # Compute the association strength for each word in set X
    for x in X:
        score_x = association_strength(x, A, B, vectors)
        score += score_x

    # Compute the association strength for each word in set Y
    for y in Y:
        score_y = association_strength(y, A, B, vectors)
        score -= score_y

    return score


In [39]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

career = ['executive', 'management', 'professional', 'corporation',
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847


## Word translation using word vectors

In the following, we will use word vectors in English and French to translate words from English to French. The idea is to learn a linear function that maps English word vectors to their correponding French word vectors. To learn this linear mapping, we will use a small bilingual lexicon, that contains pairs of words in English and French that are translations of each other.

The following function will load the small English-French bilingual lexicon:

In [40]:
def load_lexicon(filename):
    '''
    Parameters:
    filename(string): the path of the lexicon

    Returns:
    data(list of pairs of string): the bilingual lexicon
    '''
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    data = []
    for line in fin:
        a, b = line.rstrip().split(' ')
        data.append((a, b))
    return data

In [42]:
word_vectors_en = load_vectors('/content/drive/MyDrive/session2/wiki.en.vec')
word_vectors_fr = load_vectors('/content/drive/MyDrive/session2/wiki.en.vec')
lexicon = load_lexicon("/content/drive/MyDrive/session2/lexicon-en-fr.txt")
print(lexicon[:5])

[('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut')]


In [43]:
# We split the lexicon into a train and validation set
train = lexicon[:5000]
valid = lexicon[5000:5100]

The following function will learn the mapping from English to French. The idea is to build two matrices $X_{\text{en}}$ and $X_{\text{fr}}$, and to find a mapping $M$ that minimizes $||X_{\text{en}} W - X_{\text{fr}} ||_2$. In numpy, this mapping can be obtained by using the `numpy.linalg.lstsq` function.

In [44]:
import numpy as np

def align(word_vectors_en, word_vectors_fr, lexicon):
    '''
    Parameters:
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    lexicon(list of pairs of string): bilingual training lexicon

    Returns
    mapping(np.array): the mapping from English to French vectors
    '''
    x_en, x_fr = [], []

    # Gather corresponding word vectors from the lexicon
    for en_word, fr_word in lexicon:
        if en_word in word_vectors_en and fr_word in word_vectors_fr:
            x_en.append(word_vectors_en[en_word])
            x_fr.append(word_vectors_fr[fr_word])

    # Convert the lists to numpy arrays
    x_en = np.array(x_en)
    x_fr = np.array(x_fr)

    # Perform alignment or mapping between the vectors
    mapping = np.linalg.lstsq(x_en, x_fr, rcond=None)[0]

    return mapping


In [45]:
mapping = align(word_vectors_en, word_vectors_fr, lexicon)

Given a mapping, a set of word English word vector and French word vectors, the next function will translate the English word to French. To do so, we apply the mapping on the English word, and retrieve the nearest neighbor of the obtained vector in the set of French word vectors. The translation is then the corresponding French word.

In [46]:
import numpy as np

def translate(word, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    word(string): an English word
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors

    Returns
    A string containing the translation of the English word
    '''

    # Check if the word is present in the English word vectors
    if word in word_vectors_en:
        # Get the English word vector
        en_vector = word_vectors_en[word]
        # Perform translation using the mapping
        fr_vector = np.dot(mapping, en_vector)

        # Find the closest French word to the translated vector
        closest_word = None
        min_distance = float('inf')
        for fr_word, fr_vector_word in word_vectors_fr.items():
            distance = np.linalg.norm(fr_vector_word - fr_vector)
            if distance < min_distance:
                closest_word = fr_word
                min_distance = distance

        return closest_word

    # Return None if the word is not found in the English word vectors
    return None


In [47]:
print(translate("world", word_vectors_en, word_vectors_fr, mapping))
print(translate("machine", word_vectors_en, word_vectors_fr, mapping))
print(translate("learning", word_vectors_en, word_vectors_fr, mapping))

world
machine
learning


Finally, let's implement a function to evaluate this method on the validation lexicon:

In [48]:
import numpy as np

def evaluate(valid, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    valid(a list of pairs of string): the validation lexicon
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors

    Returns
    Accuracy(float): the accuracy on the validation lexicon
    '''
    acc, n = 0.0, 0

    for en_word, fr_word in valid:
        # Check if both English and French words are present in their respective word vectors
        if en_word in word_vectors_en and fr_word in word_vectors_fr:
            # Translate English word to French
            translated_word = translate(en_word, word_vectors_en, word_vectors_fr, mapping)
            # Check if the translation matches the expected French word
            if translated_word == fr_word:
                acc += 1
            n += 1

    # Calculate the accuracy by dividing the number of correct translations by the total number of translations
    accuracy = acc / n if n > 0 else 0.0

    return accuracy


In [49]:
evaluate(valid, word_vectors_en, word_vectors_fr, mapping)

0.7727272727272727