In [1]:
import io, sys
import numpy as np
from heapq import *

In [2]:
def load_vectors(filename):
    """Read a text-format word-vector file into a dictionary.

    The first line is parsed as a header made of two integers. Every
    following non-blank line is split on single spaces into a word followed
    by its float components.

    Parameters
    ----------
    filename : str
        Path of the vector file; it is opened as UTF-8 text.

    Returns
    -------
    dict
        Maps each word (str) to a 1-D numpy array holding its components.

    Raises
    ------
    ValueError
        If the header is not exactly two integers, or a component cannot be
        converted to float.
    """
    data = {}
    # newline='\n' makes '\n' the only line terminator when reading.
    # The `with` block guarantees the handle is closed, even on a parse error.
    with io.open(filename, 'r', encoding='utf-8', newline='\n') as fin:
        # The header values are not used afterwards, but parsing them still
        # rejects files that do not start with the expected two integers.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens == ['']:
                # Blank line: skip it instead of storing an empty vector under ''.
                continue
            data[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
    return data

In [3]:
# Loading word vectors
# Prints a small banner, then reads 'wiki.en.vec' with load_vectors() and
# binds the resulting word -> vector dictionary to the notebook-level name
# `word_vectors`, which the later cells index by word.

print('')
print(' ** Word vectors ** ')
print('')

# NOTE(review): the path is relative to the working directory; presumably the
# fastText English Wikipedia vectors - confirm the file's provenance.
word_vectors = load_vectors('wiki.en.vec')


 ** Word vectors ** 



In [4]:
## Cosine similarity between the vectors u and v

def cosine(u, v):
    """Return the cosine of the angle between ``u`` and ``v``.

    The dot product of the two vectors is divided by the product of their
    Euclidean norms. ``u`` must provide a ``dot`` method (e.g. a numpy array).
    """
    inner_product = u.dot(v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return inner_product / norm_product

## nearest_neighbor (defined in a later cell) returns the word whose
## vector is the nearest neighbor of x.
## Its exclude_words list can be used to leave some words out of the
## nearest-neighbor search.

In [5]:
# !python --version

In [6]:
# compute similarity between words
# Each line prints the cosine similarity (3 decimals) between the vector of
# 'apple' and the vector of another word looked up in `word_vectors`.
# A word missing from the dictionary would raise KeyError here.

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [7]:
import ipdb

In [8]:
## Functions for nearest neighbors

def nearest_neighbor(x, word_vectors, exclude_words=None):
    """Return the word whose vector has the highest cosine similarity to ``x``.

    Parameters
    ----------
    x : numpy array
        Query vector.
    word_vectors : dict
        Maps words to vectors comparable with ``x`` through ``cosine``.
    exclude_words : iterable of str, optional
        Words that must not be returned. ``None`` (the default) excludes
        nothing.

    Returns
    -------
    str
        The best-scoring word that is not excluded. Ties go to the word met
        first while iterating over ``word_vectors``. The empty string is
        returned when there is no candidate (empty dictionary, every word
        excluded, or every score is NaN).
    """
    # A set gives constant-time membership tests; building it on every call
    # also avoids sharing a mutable default argument between calls.
    excluded = set(exclude_words) if exclude_words is not None else set()

    # Start from -inf rather than -1.0 so that a candidate whose similarity is
    # exactly -1.0 can still be selected when nothing better exists.
    best_score = float('-inf')
    best_word = ''

    for word, vector in word_vectors.items():
        if word in excluded:
            # Skip before scoring: no similarity is computed for excluded words.
            continue
        score = cosine(vector, x)
        # A NaN score (e.g. from a zero-norm vector) never compares greater,
        # so such words are ignored.
        if score > best_score:
            best_score = score
            best_word = word

    return best_word

## This function returns the (score, word) pairs corresponding to the
## K nearest neighbors of vector x, best match excluded (see docstring).

def knn(x, vectors, k):
    """Return the ``k`` runners-up among the ``k + 1`` nearest neighbors of ``x``.

    Every word of ``vectors`` is scored once with ``cosine``. The ``k + 1``
    best-scoring words are kept and the single best one is then dropped.
    Callers in this notebook pass a vector taken from ``vectors`` itself, in
    which case the dropped entry is the query word.
    NOTE(review): when ``x`` is not a vector from ``vectors``, the entry that
    is dropped is the true nearest neighbor - confirm this is intended.

    Parameters
    ----------
    x : numpy array
        Query vector.
    vectors : dict
        Maps words to vectors comparable with ``x`` through ``cosine``.
    k : int
        Number of neighbors to return.

    Returns
    -------
    list of (score, word) tuples
        Sorted by decreasing score. Holds fewer than ``k`` items when the
        vocabulary has fewer than ``k + 1`` scorable words, and is empty for
        ``k <= 0``.
    """
    if k <= 0:
        return []

    def scored_words():
        # One pass over the vocabulary; words whose score is NaN are skipped
        # because NaN cannot be ordered.
        for word, vector in vectors.items():
            score = cosine(x, vector)
            if score == score:
                yield score, word

    # nlargest keeps only k + 1 candidates while scanning and, for equal
    # scores, keeps the order in which the words were met.
    ranked = nlargest(k + 1, scored_words(), key=lambda pair: pair[0])

    # Drop the top entry so that at most k neighbors remain.
    return ranked[1:]

In [9]:
# cosine(word_vectorsnearest_neighbor(word_vectors['cat'],  word_vectors), nearest_neighbor(word_vectors['cat'],word_vectors))

In [10]:
# >>> h = []
# >>> heappush(h, (5, 'write code'))
# >>> heappush(h, (7, 'release product'))
# >>> heappush(h, (1, 'write spec'))
# >>> heappush(h, (3, 'create tests'))
# >>> print(heappop(h))
# >>> print(h)

In [11]:
# looking at nearest neighbors of a word

print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, ['cat', 'cats']))

# knn() scores the whole vocabulary, so compute the neighbors once and
# iterate over the stored result instead of calling knn() a second time.
knn_cat = knn(word_vectors['cat'], word_vectors, 5)
print('')
print('cat')
print('--------------')
for score, word in knn_cat:
    print(word + '\t%.3f' % score)

The nearest neighbor of cat is: dog

cat
--------------
cats	0.732
dog	0.638
pet	0.573
rabbit	0.549
dogs	0.538


$$
\mathbf{x}_{d}=\arg \max _{i}\left(\mathbf{x}_{c}-\mathbf{x}_{a}+\mathbf{x}_{b}\right)^{\top} \mathbf{x}_{i}
$$

In [12]:
## This function returns the word d such that the pairs a:b and c:d
## satisfy the same relation
from nltk.stem import PorterStemmer
ps =PorterStemmer()
def analogy(a, b, c, word_vectors):
    """Return the word ``d`` such that ``a`` is to ``b`` what ``c`` is to ``d``.

    The three input words are lower-cased and their vectors scaled to unit
    length. The answer is the word whose unit-length vector has the largest
    dot product with ``x_c + x_b - x_a``.

    Any word that contains ``a``, ``b`` or ``c`` as a substring is skipped, so
    the inputs and words built on them are never returned. This also rules out
    unrelated words that merely contain one of them (e.g. 'human' when
    ``a == 'man'``).

    ``word_vectors`` is only read: the stored vectors are not modified.

    Parameters
    ----------
    a, b, c : str
        Words of the analogy; each must be a key of ``word_vectors`` once
        lower-cased.
    word_vectors : dict
        Maps words to numpy vectors.

    Returns
    -------
    str
        The best-scoring candidate, or '' when no candidate can be scored.

    Raises
    ------
    KeyError
        If ``a``, ``b`` or ``c`` (lower-cased) is not in ``word_vectors``.
    """
    a = a.lower()
    b = b.lower()
    c = c.lower()
    query_words = (a, b, c)

    def unit(vector):
        # Builds a new array, so the caller's dictionary entry is left untouched.
        return vector / np.linalg.norm(vector)

    x_a = unit(word_vectors[a])
    x_b = unit(word_vectors[b])
    x_c = unit(word_vectors[c])

    # The target direction does not depend on the candidate: build it once.
    target = x_c + x_b - x_a

    best_score = -np.inf
    best_word = ''

    for word, vector in word_vectors.items():
        # Skip every word connected to a, b or c (substring match).
        if any(query in word for query in query_words):
            continue

        score = target.dot(unit(vector))
        if score > best_score:
            best_score = score
            best_word = word

    return best_word

In [13]:
# Word analogies
# analogy(a, b, c, word_vectors) looks for the word closest to b - a + c,
# which is why each printed label reads 'b - a + c' for the call next to it.

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))

print('')
print('uncle - father + mother = ' + analogy('father', 'uncle', 'mother', word_vectors))

print('')
print('king - man + woman = ' + analogy('man', 'king', 'woman', word_vectors))


france - paris + rome = italy

uncle - father + mother = aunt

king - man + woman = queen


In [14]:
## A word about biases in word vectors:
## Prints the cosine similarity of 'genius' with 'man' and with 'woman'
## so the two values can be compared side by side.

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [15]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    """Return how much more word ``w`` is associated with ``A`` than with ``B``.

    The result is the mean cosine similarity between ``w`` and the words of
    ``A`` minus the mean cosine similarity between ``w`` and the words of
    ``B``. All words are looked up in the ``vectors`` dictionary.
    """
    def summed_similarity(attributes):
        # Adds up cosine(w, attribute) over the attribute words, in order.
        running = 0.0
        for attribute in attributes:
            running += cosine(vectors[w], vectors[attribute])
        return running

    similarity_to_a = summed_similarity(A)
    similarity_to_b = summed_similarity(B)

    return 1/len(A) * similarity_to_a - 1/len(B) * similarity_to_b

## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    """Return the word embedding association test score.

    The score is the sum of ``association_strength(w, A, B, vectors)`` over
    the words of ``X`` minus the same sum over the words of ``Y``.
    """
    def total_association(targets):
        # Accumulates the association strength of each target word, in order.
        total = 0.0
        for target in targets:
            total += association_strength(target, A, B, vectors)
        return total

    association_x = total_association(X)
    association_y = total_association(Y)
    return association_x - association_y

In [16]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)
##
## In the weat() call below, `career` and `family` are the two target word
## sets (X and Y) and `male` and `female` are the two attribute sets (A and B).
## Every listed word must be a key of `word_vectors`, otherwise the lookup
## raises KeyError.

career = ['executive', 'management', 'professional', 'corporation', 
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

# A positive score means the career words are, in total, more associated with
# the male names (relative to the female names) than the family words are.
print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847


<h1>About the Authors:</h1> 

<a href="https://skabongo.github.io/">Salomon Kabongo</a> is a Master's degree student at <a href="https://aims.ac.za/">the African Master in Machine Intelligence (AMMI, Ghana)</a>. His research focuses on the use of machine learning techniques in the field of Natural Language Processing. Learn more about him [here](https://skabongo.github.io/) or on [Twitter](https://twitter.com/SalomonKabongo1).

**References :** NLP Course at AMMI by [Edouard Grave](https://twitter.com/exgrv?lang=en)

Copyright &copy; 2020. This notebook and its source code are released under the terms of the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License 2.0</a>.