We will work on this notebook to implement the algorithm.
Please check regularly for updates.

Main tasks:
1. Write the code for the algorithm
2. Find better $\alpha$ and $\beta$ values for the model.
3. Find stopping criteria for the model.

In [42]:
# Importing the libraries:
import numpy as np
import gensim.downloader as api
import math
import re
from copy import deepcopy
import sys
import gzip
from nltk.corpus import wordnet 


In [10]:
# Toy corpus of 10 words:
toy_corpus = ['frog', 'toad', 'extrinsic', 'cat', 'intrinsic', 'dog', 'mission', 'true', 'false', 'incorrect']

In [6]:
# Load the Google News word2vec model:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



In [16]:
# Vectorize the toy corpus:
vec_toy_corpus = [wv[word] for word in toy_corpus]

In [30]:
# Compare vectors using cosine similarity:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

# Compute the cosine similarity matrix:
similarity_matrix = np.zeros((len(vec_toy_corpus), len(vec_toy_corpus)))
for i in range(len(vec_toy_corpus)):
    for j in range(len(vec_toy_corpus)):
        similarity_matrix[i][j] = cosine_similarity(vec_toy_corpus[i], vec_toy_corpus[j])

# Print the cosine similarity matrix:
for i in range(len(vec_toy_corpus)):
    for j in range(len(vec_toy_corpus)):
        print(f'Similarity between {toy_corpus[i]} and {toy_corpus[j]} is {similarity_matrix[i][j]:.4f}')

Similarity between frog and frog is 1.0000
Similarity between frog and toad is 0.7050
Similarity between frog and extrinsic is 0.0780
Similarity between frog and cat is 0.4789
Similarity between frog and intrinsic is 0.0972
Similarity between frog and dog is 0.3550
Similarity between frog and mission is 0.0185
Similarity between frog and true is 0.1128
Similarity between frog and false is -0.0008
Similarity between frog and incorrect is 0.0589
Similarity between toad and frog is 0.7050
Similarity between toad and toad is 1.0000
Similarity between toad and extrinsic is 0.0638
Similarity between toad and cat is 0.4725
Similarity between toad and intrinsic is 0.0609
Similarity between toad and dog is 0.3837
Similarity between toad and mission is 0.0607
Similarity between toad and true is 0.0903
Similarity between toad and false is 0.0358
Similarity between toad and incorrect is 0.0119
Similarity between extrinsic and frog is 0.0780
Similarity between extrinsic and toad is 0.0638
Similarit

In [49]:
# We implement the retrofitting algorithm of Faruqui et al. (2015):
# Preprocessing the data:
isNumber = re.compile(r'\d+.*')
def norm_word(word):
  if isNumber.search(word.lower()):
    return '---num---'
  elif re.sub(r'\W+', '', word) == '':
    return '---punc---'
  else:
    return word.lower()

# Read all the word vectors and normalize them:
def read_word_vecs(filename):
  wordVectors = {}
  if filename.endswith('.gz'): fileObject = gzip.open(filename, 'r')
  else: fileObject = open(filename, 'r')
  
  for line in fileObject:
    line = line.strip().lower()
    word = line.split()[0]
    wordVectors[word] = np.zeros(len(line.split())-1, dtype=float)
    for index, vecVal in enumerate(line.split()[1:]):
      wordVectors[word][index] = float(vecVal)
    ''' normalize weight vector '''
    wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6)
    
  sys.stderr.write("Vectors read from: "+filename+" \n")
  return wordVectors

# Read lexicon as a dictionary:
def read_lexicon(filename):
  lexicon = {}
  if filename.endswith('.gz'): fileObject = gzip.open(filename, 'r')
  else: fileObject = open(filename, 'r')
  
  for line in fileObject:
    words = line.lower().strip().split()
    lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
  return lexicon

# Retrofit word vectors to a lexicon:
def retrofit(wordVecs, lexicon, numIters): # wordVecs is a dictionary
  newWordVecs = deepcopy(wordVecs)
  wvVocab = set(newWordVecs.keys())
  loopVocab = wvVocab.intersection(set(lexicon.keys()))
  for it in range(numIters):
    # loop through every node also in ontology (else just use data estimate)
    for word in loopVocab:
      wordNeighbours = set(lexicon[word]).intersection(wvVocab)
      numNeighbours = len(wordNeighbours)
      #no neighbours, pass - use data estimate
      if numNeighbours == 0:
        continue
      # the weight of the data estimate if the number of neighbours
      newVec = numNeighbours * wordVecs[word]
      # loop over neighbours and add to new vector (currently with weight 1)
      for ppWord in wordNeighbours:
        newVec += newWordVecs[ppWord]
      newWordVecs[word] = newVec/(2*numNeighbours)
  return newWordVecs

In [41]:
# Transform the toy corpus into a wordVecs dictionary:
toy_wordVecs = {}
for i in range(len(vec_toy_corpus)):
    toy_wordVecs[toy_corpus[i]] = vec_toy_corpus[i]
print(toy_wordVecs.keys())

dict_keys(['frog', 'toad', 'extrinsic', 'cat', 'intrinsic', 'dog', 'mission', 'true', 'false', 'incorrect'])


In [51]:
# Read the lexicon. We can use NLTK WordNet.
lexicon = read_lexicon('ppdb-2.0-xl-lexical.gz') # I am not sure how to read the lexicon files.
#newWordVecs = retrofit(toy_wordVecs, lexicon, 10)

TypeError: cannot use a string pattern on a bytes-like object