# **CS565 - Intelligent Systems and Interfaces - Assignment 2**
#### **Lavish Gulati - 170101082**

In [None]:
# Read corpus from Google Drive
from google.colab import drive
drive.mount("/content/drive")

import codecs

# Read the corpus inside a context manager so the file handle is closed as soon
# as the text is in memory instead of being left open for the garbage collector.
with codecs.open('/content/drive/My Drive/CS565/Assignment 2/en_wiki.txt', 'r') as corpus_file:
  eng_text = corpus_file.read()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Importing required libraries
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from random import shuffle, uniform
import math
import statistics

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2.1 - N-gram language model

#### 0 - Data Preprocessing and Splitting

In [None]:
# Fraction subset of corpus taken
# (1 keeps the whole text; a smaller value keeps only the leading part).
SMALL_RATIO = 1
# The slice is assigned back to eng_text itself, so with a ratio below 1 every
# re-run of this cell shortens the text again.
eng_text = eng_text[0:int(SMALL_RATIO*len(eng_text))]

In [None]:
# Segment the raw corpus text into sentences with NLTK's sentence tokenizer
sentences = sent_tokenize(eng_text)
print('Number of sentences in corpus:', len(sentences))
print('Sample data:', sentences[:5])

Number of sentences in corpus: 761582
Sample data: ['The word "atom" was coined by ancient Greek philosophers.', 'However, these ideas were founded in philosophical and theological reasoning rather than evidence and experimentation.', 'As a result, their views on what atoms look like and how they behave were incorrect.', 'They also could not convince everybody, so atomism was but one of a number of competing theories on the nature of matter.', 'It was not until the 19th century that the idea was embraced and refined by scientists, when the blossoming science of chemistry produced discoveries that only the concept of atoms could explain.']


In [None]:
# Tokenize every sentence into words, keeping one token list per sentence
word_tokens = [word_tokenize(sentence) for sentence in sentences]
# Total number of tokens over all sentences
word_count = sum(map(len, word_tokens))
print('Number of words in corpus:', word_count)
print('Sample data:', word_tokens[:10])

Number of words in corpus: 19602594
Sample data: [['The', 'word', '``', 'atom', "''", 'was', 'coined', 'by', 'ancient', 'Greek', 'philosophers', '.'], ['However', ',', 'these', 'ideas', 'were', 'founded', 'in', 'philosophical', 'and', 'theological', 'reasoning', 'rather', 'than', 'evidence', 'and', 'experimentation', '.'], ['As', 'a', 'result', ',', 'their', 'views', 'on', 'what', 'atoms', 'look', 'like', 'and', 'how', 'they', 'behave', 'were', 'incorrect', '.'], ['They', 'also', 'could', 'not', 'convince', 'everybody', ',', 'so', 'atomism', 'was', 'but', 'one', 'of', 'a', 'number', 'of', 'competing', 'theories', 'on', 'the', 'nature', 'of', 'matter', '.'], ['It', 'was', 'not', 'until', 'the', '19th', 'century', 'that', 'the', 'idea', 'was', 'embraced', 'and', 'refined', 'by', 'scientists', ',', 'when', 'the', 'blossoming', 'science', 'of', 'chemistry', 'produced', 'discoveries', 'that', 'only', 'the', 'concept', 'of', 'atoms', 'could', 'explain', '.'], ['In', 'the', 'early', '1800s', 

In [None]:
# Shuffle the sentences.
# Seed Python's global random generator first so that the shuffle, the
# train/test split derived from it and the later draws from the `random`
# module are reproducible under Restart & Run All.
import random
RANDOM_SEED = 565
random.seed(RANDOM_SEED)
shuffle(word_tokens)
print(word_tokens[0:5])

[['There', 'were', '244', 'households', 'out', 'of', 'which', '28.7', '%', 'had', 'children', 'under', 'the', 'age', 'of', '18', 'living', 'with', 'them', ',', '54.5', '%', 'were', 'married', 'couples', 'living', 'together', ',', '14.8', '%', 'had', 'a', 'female', 'householder', 'with', 'no', 'husband', 'present', ',', 'and', '27.9', '%', 'were', 'non-families', '.'], ['The', 'city', 'was', 'named', 'after', 'Mora', ',', 'Sweden', '.'], ['In', 'the', 'township', 'the', 'population', 'was', 'spread', 'out', 'with', '25.7', '%', 'under', 'the', 'age', 'of', '18', ',', '8.6', '%', 'from', '18', 'to', '24', ',', '20.0', '%', 'from', '25', 'to', '44', ',', '37.1', '%', 'from', '45', 'to', '64', ',', 'and', '8.6', '%', 'who', 'were', '65', 'years', 'of', 'age', 'or', 'older', '.'], ['Most', 'of', 'these', 'have', 'been', 'public', 'schools', 'of', 'The', 'Jefferson', 'County', 'School', 'System', 'which', 'was', 'founded', 'in', '1898', '.'], ['As', 'of', 'January', '2014', ',', 'Southwest',

In [None]:
# Number of tokenized sentences after the in-place shuffle
print(len(word_tokens))

761582


In [None]:
# Split the sentences into a 90% train set and a 10% test set
split_index = int(0.9*len(word_tokens))
train_set = word_tokens[:split_index]
test_set = word_tokens[split_index:]

#### 1 - Trigram Language Model

In [None]:
# Extract frequency of each ngram from the training set 
def extractNgramFreq(train_set):
  """Count every unigram, bigram and trigram in a list of tokenized sentences.

  N-grams never cross a sentence boundary: each sentence is scanned on its own.

  Args:
    train_set: iterable of sentences, each a sequence of tokens.

  Returns:
    A tuple ``(unigram_freq, bigram_freq, trigram_freq)`` of dicts. Unigram
    keys are single tokens; bigram and trigram keys are tuples of 2 and 3
    consecutive tokens. Values are occurrence counts.
  """
  unigram_freq, bigram_freq, trigram_freq = {}, {}, {}

  for tokens in train_set:
    length = len(tokens)

    for word in tokens:
      unigram_freq[word] = unigram_freq.get(word, 0) + 1

    for start in range(length - 1):
      pair = (tokens[start], tokens[start + 1])
      bigram_freq[pair] = bigram_freq.get(pair, 0) + 1

    for start in range(length - 2):
      triple = (tokens[start], tokens[start + 1], tokens[start + 2])
      trigram_freq[triple] = trigram_freq.get(triple, 0) + 1

  return unigram_freq, bigram_freq, trigram_freq

# Get probability of each unigram based on frequency and total frequency
def getUnigramProb(unigram_freq):
  """Turn unigram counts into relative frequencies.

  Args:
    unigram_freq: dict mapping token -> count.

  Returns:
    Dict mapping token -> count / (sum of all counts). An empty input gives
    an empty dict.
  """
  total = sum(unigram_freq.values())
  return {word: freq / total for word, freq in unigram_freq.items()}


# Get probability of each bigram based on frequency and total frequency
def getBigramProb(unigram_freq, bigram_freq):
  """Maximum-likelihood estimate P(second | first) for every counted bigram.

  Args:
    unigram_freq: dict mapping token -> count.
    bigram_freq: dict mapping (first, second) -> count.

  Returns:
    Dict mapping each bigram to count(first, second) / count(first).
  """
  return {
    bigram: freq / unigram_freq[bigram[0]]
    for bigram, freq in bigram_freq.items()
  }


# Get probability of each trigram based on frequency and total frequency
def getTrigramProb(bigram_freq, trigram_freq):
  """Maximum-likelihood estimate P(third | first, second) for every counted trigram.

  Args:
    bigram_freq: dict mapping (first, second) -> count.
    trigram_freq: dict mapping (first, second, third) -> count.

  Returns:
    Dict mapping each trigram to its count divided by the count of its
    two-word prefix.
  """
  return {
    trigram: freq / bigram_freq[(trigram[0], trigram[1])]
    for trigram, freq in trigram_freq.items()
  }

##### Discounting method

In [None]:
# Get discounted trigrams based on trigram and bigram frequency
def trigramsDiscounting(trigram_freq, bigram_freq, beta):
  """Absolute-discounted probability for every counted trigram.

  Args:
    trigram_freq: dict mapping (w1, w2, w3) -> count.
    bigram_freq: dict mapping (w1, w2) -> count.
    beta: amount subtracted from every trigram count before normalising.

  Returns:
    Dict mapping each trigram to (count - beta) / count(w1, w2).
  """
  return {
    trigram: (freq - beta) / bigram_freq[(trigram[0], trigram[1])]
    for trigram, freq in trigram_freq.items()
  }

# Calculate perplexity of the discounted trigrams 
def getDiscountingPerplexity(test_set, test_trigram_freq, discounted_trigrams):
  """Perplexity of ``test_set`` under the discounted trigram probabilities.

  Args:
    test_set: iterable of tokenized sentences.
    test_trigram_freq: trigram counts taken from ``test_set`` itself.
    discounted_trigrams: dict mapping trigram -> discounted probability.

  Returns:
    2 ** (average negative log2 probability per test trigram).

  Raises:
    ZeroDivisionError: if ``test_set`` contains no trigram at all.

  NOTE: a trigram missing from ``discounted_trigrams`` is scored with its
  relative frequency inside the test set, and a non-positive probability adds
  nothing to the log sum while still being counted in the average.
  """
  total_test_trigrams = sum(test_trigram_freq.values())

  log_prob_sum = 0
  num_trigrams = 0
  for tokens in test_set:
    for start in range(len(tokens) - 2):
      trigram = (tokens[start], tokens[start + 1], tokens[start + 2])
      prob = (discounted_trigrams[trigram] if trigram in discounted_trigrams
              else test_trigram_freq[trigram] / total_test_trigrams)

      if prob > 0:
        log_prob_sum += math.log(prob, 2)
      num_trigrams += 1

  cross_entropy = (-1*log_prob_sum)/num_trigrams
  return math.pow(2, cross_entropy)

In [None]:
# Discounting smoothing of trigrams
def discounting(train_set, test_set, beta=None):
  """Train a discounted trigram model and score it on ``test_set``.

  Args:
    train_set: tokenized sentences used to collect the n-gram counts.
    test_set: tokenized sentences the perplexity is computed on.
    beta: discount subtracted from every trigram count. When omitted, a value
      is drawn uniformly from [0.6, 0.7] and rounded to two decimals.

  Returns:
    Tuple ``(beta, perplexity)`` with the discount actually used.
  """
  # Only the bigram and trigram counts are needed here; the unsmoothed
  # probability tables and the test unigram/bigram counts were never read, so
  # they are no longer built.
  _, bigram_freq, trigram_freq = extractNgramFreq(train_set)
  _, _, test_trigram_freq = extractNgramFreq(test_set)

  if beta is None:
    beta = round(uniform(0.6,0.7), 2)
  discounted_trigrams = trigramsDiscounting(trigram_freq, bigram_freq, beta)
  perplexity = getDiscountingPerplexity(test_set, test_trigram_freq, discounted_trigrams)
  return beta, perplexity

##### Interpolation method

In [None]:
# Calculate lambda set for linear interpolation smoothing
def getLambda(test_set, epsilon, unigram_prob, bigram_prob, trigram_prob):
  """Estimate the linear-interpolation weights by iterative re-estimation.

  For every trigram (w1, w2, w3) in ``test_set`` the interpolated probability
  of the third word is

      lambda[0]*P(w3) + lambda[1]*P(w3 | w2) + lambda[2]*P(w3 | w1, w2)

  Each pass accumulates the share of that probability contributed by each
  component and renormalises the shares into the next weights.

  Args:
    test_set: tokenized held-out sentences the weights are fitted on.
    epsilon: stop once no weight changes by more than this between passes.
    unigram_prob: dict token -> P(token).
    bigram_prob: dict (first, second) -> P(second | first).
    trigram_prob: dict (first, second, third) -> P(third | first, second).

  Returns:
    List ``[lambda_unigram, lambda_bigram, lambda_trigram]``. If no trigram
    of ``test_set`` receives a positive probability, the current weights are
    returned unchanged (the initial ``[0.2, 0.3, 0.5]`` on the first pass).
  """
  lambda_set = [0.2, 0.3, 0.5]
  while True:
    exp_count = [0, 0, 0]
    for sentence in test_set:
      for i in range(len(sentence)-2):
        # All three estimates must score the same event, the third word of the
        # trigram, so the unigram and the bigram end on sentence[i+2] too.
        unigram = sentence[i+2]
        bigram = (sentence[i+1], sentence[i+2])
        trigram = (sentence[i], sentence[i+1], sentence[i+2])

        # N-grams unseen in training contribute probability 0
        prob1 = unigram_prob.get(unigram, 0)
        prob2 = bigram_prob.get(bigram, 0)
        prob3 = trigram_prob.get(trigram, 0)

        prob = lambda_set[0]*prob1 + lambda_set[1]*prob2 + lambda_set[2]*prob3

        if prob > 0:
          exp_count[0] += (lambda_set[0]*prob1)/prob
          exp_count[1] += (lambda_set[1]*prob2)/prob
          exp_count[2] += (lambda_set[2]*prob3)/prob

    exp_sum = sum(exp_count)
    if exp_sum == 0:
      return lambda_set

    new_lambda_set = [exp/exp_sum for exp in exp_count]

    converged = all(abs(new - old) <= epsilon
                    for new, old in zip(new_lambda_set, lambda_set))
    if converged:
      # Return the freshly re-estimated weights, not the previous pass's
      return new_lambda_set
    lambda_set = new_lambda_set


# Calculate perplexity for interpolation
def getInterpolationPerplexity(test_set, lambda_set, unigram_prob, bigram_prob, trigram_prob):
  """Perplexity of ``test_set`` under the linearly interpolated trigram model.

  Every trigram (w1, w2, w3) is scored with

      lambda_set[0]*P(w3) + lambda_set[1]*P(w3 | w2) + lambda_set[2]*P(w3 | w1, w2)

  Args:
    test_set: iterable of tokenized sentences.
    lambda_set: the three interpolation weights (unigram, bigram, trigram).
    unigram_prob: dict token -> P(token).
    bigram_prob: dict (first, second) -> P(second | first).
    trigram_prob: dict (first, second, third) -> P(third | first, second).

  Returns:
    2 ** (average negative log2 probability per test trigram).

  Raises:
    ZeroDivisionError: if ``test_set`` contains no trigram at all.

  NOTE: a trigram whose interpolated probability is 0 adds nothing to the log
  sum but is still counted in the average.
  """
  entropy = 0
  test_count = 0
  for sentence in test_set:
    for i in range(len(sentence)-2):
      # The unigram and bigram must predict the same word as the trigram does
      # (sentence[i+2]); otherwise the weighted sum mixes unrelated events.
      unigram = sentence[i+2]
      bigram = (sentence[i+1], sentence[i+2])
      trigram = (sentence[i], sentence[i+1], sentence[i+2])

      prob1 = unigram_prob.get(unigram, 0)
      prob2 = bigram_prob.get(bigram, 0)
      prob3 = trigram_prob.get(trigram, 0)

      prob = lambda_set[0]*prob1 + lambda_set[1]*prob2 + lambda_set[2]*prob3
      if prob > 0:
        entropy += math.log(prob, 2)

      test_count += 1

  entropy = (-1*entropy)/test_count
  perplexity = math.pow(2, entropy)
  return perplexity

In [None]:
# Linear Interpolation smoothing
def interpolation(train_set, test_set):
  """Train an interpolated trigram model and score it on ``test_set``.

  The n-gram probabilities come from ``train_set``. The interpolation weights
  are fitted on ``test_set`` (convergence threshold 0.005) and the perplexity
  is computed on that same set.

  Returns:
    Tuple ``(lambda_set, perplexity)``.
  """
  unigram_freq, bigram_freq, trigram_freq = extractNgramFreq(train_set)
  prob_tables = (
    getUnigramProb(unigram_freq),
    getBigramProb(unigram_freq, bigram_freq),
    getTrigramProb(bigram_freq, trigram_freq),
  )

  lambda_set = getLambda(test_set, 0.005, *prob_tables)
  return lambda_set, getInterpolationPerplexity(test_set, lambda_set, *prob_tables)

#### 2 - Splitting training dataset

In [None]:
# Split original training set into training set and validation set
def splitTrainSet(train_set):
  """Randomly split ``train_set`` into a 90% training part and a 10% validation part.

  The caller's sequence is left untouched: a shallow copy is shuffled, so
  repeated calls always start from the same input.

  Args:
    train_set: sequence of tokenized sentences.

  Returns:
    Tuple ``(new_train_set, validation_set)`` of lists.
  """
  shuffled = list(train_set)
  shuffle(shuffled)
  cut = int(len(shuffled)*0.9)
  return shuffled[:cut], shuffled[cut:]

In [None]:
# Draw five random 90/10 train/validation splits of the training set
(ts1, vs1), (ts2, vs2), (ts3, vs3), (ts4, vs4), (ts5, vs5) = [
  splitTrainSet(train_set) for _ in range(5)
]

#### 3 - Performance on Validation Set

##### Discounting

In [None]:
# Score the discounting model on every (train, validation) split; each call
# reports the beta it used together with the validation perplexity
(beta1, perp1), (beta2, perp2), (beta3, perp3), (beta4, perp4), (beta5, perp5) = [
  discounting(ts, vs)
  for ts, vs in [(ts1, vs1), (ts2, vs2), (ts3, vs3), (ts4, vs4), (ts5, vs5)]
]
beta = [beta1, beta2, beta3, beta4, beta5]
perp = [perp1, perp2, perp3, perp4, perp5]

print('Discounting performance on validation set')
for i, (set_beta, set_perp) in enumerate(zip(beta, perp)):
  print('Set', i+1, '- beta:', set_beta, ', perplexity:', set_perp)

Discounting performance on validation set
Set 1 - beta: 0.61 , perplexity: 385.98464393175266
Set 2 - beta: 0.61 , perplexity: 335.20801152217865
Set 3 - beta: 0.66 , perplexity: 362.5238493150579
Set 4 - beta: 0.64 , perplexity: 378.2673871942632
Set 5 - beta: 0.62 , perplexity: 346.0446900001597


##### Interpolation

In [None]:
# Score the interpolated model on every (train, validation) split; each call
# reports the fitted lambda set together with the validation perplexity
(ls1, perp1), (ls2, perp2), (ls3, perp3), (ls4, perp4), (ls5, perp5) = [
  interpolation(ts, vs)
  for ts, vs in [(ts1, vs1), (ts2, vs2), (ts3, vs3), (ts4, vs4), (ts5, vs5)]
]
ls = [ls1, ls2, ls3, ls4, ls5]
perp = [perp1, perp2, perp3, perp4, perp5]

print('Interpolation performance on validation set')
for i, (set_lambda, set_perp) in enumerate(zip(ls, perp)):
  print('Set', i+1, '- lambda set:', set_lambda, ', perplexity:', set_perp)

Interpolation performance on validation set
Set 1 - lambda set: [0.3277119588920003, 0.39650946907696855, 0.2757785720310312] , perplexity: 91.22769316275311
Set 2 - lambda set: [0.3295335779794422, 0.3963021384365197, 0.2741642835840382] , perplexity: 92.42521601363109
Set 3 - lambda set: [0.32792929962145495, 0.396991874037794, 0.2750788263407509] , perplexity: 91.77695155824924
Set 4 - lambda set: [0.3307374088539322, 0.39644097666968764, 0.2728216144763801] , perplexity: 93.04468779624617
Set 5 - lambda set: [0.3294072328392538, 0.3964805020221355, 0.27411226513861076] , perplexity: 92.3473527959448


#### 4 - Performance on Test Set

##### Discounting

In [None]:
# Score the discounting model trained on each split against the held-out test
# set and report how much the perplexity varies between the splits.
# NOTE: every discounting() call draws its own beta, so the values printed
# here are not the ones reported for the validation sets.
(beta1, perp1), (beta2, perp2), (beta3, perp3), (beta4, perp4), (beta5, perp5) = [
  discounting(ts, test_set) for ts in (ts1, ts2, ts3, ts4, ts5)
]
beta = [beta1, beta2, beta3, beta4, beta5]
perp = [perp1, perp2, perp3, perp4, perp5]

print('Discounting performance on test set')
for i, (set_beta, set_perp) in enumerate(zip(beta, perp)):
  print('Set', i+1, '- beta:', set_beta, ', perplexity:', set_perp)

print("Variance of perplexity:", statistics.variance(perp))

Discounting performance on test set
Set 1 - beta: 0.65 , perplexity: 384.1294669832734
Set 2 - beta: 0.61 , perplexity: 379.87093784436684
Set 3 - beta: 0.63 , perplexity: 383.77810953043127
Set 4 - beta: 0.64 , perplexity: 382.5792679308408
Set 5 - beta: 0.64 , perplexity: 382.0304207051159
Variance of perplexity: 2.8562457111186608


##### Interpolation

In [None]:
# Score the interpolated model trained on each split against the held-out test
# set and report how much the perplexity varies between the splits.
# NOTE: interpolation() fits the lambda set on the data it is evaluated on,
# which is the test set in this cell.
(ls1, perp1), (ls2, perp2), (ls3, perp3), (ls4, perp4), (ls5, perp5) = [
  interpolation(ts, test_set) for ts in (ts1, ts2, ts3, ts4, ts5)
]
ls = [ls1, ls2, ls3, ls4, ls5]
perp = [perp1, perp2, perp3, perp4, perp5]

print('Interpolation performance on test set')
for i, (set_lambda, set_perp) in enumerate(zip(ls, perp)):
  print('Set', i+1, '- lambda set:', set_lambda, ', perplexity:', set_perp)

print("Variance of perplexity:", statistics.variance(perp))

Interpolation performance on test set
Set 1 - lambda set: [0.3296833397486049, 0.39652110232477983, 0.27379555792661514] , perplexity: 91.99739879716303
Set 2 - lambda set: [0.3295044469515101, 0.39658843144569367, 0.2739071216027961] , perplexity: 91.78268638383216
Set 3 - lambda set: [0.3296018985447135, 0.39650282503347606, 0.2738952764218105] , perplexity: 91.87850940426299
Set 4 - lambda set: [0.3296188623024507, 0.3963528913693927, 0.27402824632815653] , perplexity: 91.91424649511205
Set 5 - lambda set: [0.3294907874206512, 0.3965074778124422, 0.27400173476690654] , perplexity: 91.8246370330346
Variance of perplexity: 0.006872811974727759


#### 5 - Laplace Smoothing

In [None]:
# Get laplace probability of trigrams based on trigram and bigram frequency
def getLaplaceProb(trigram_freq, bigram_freq, k=1, vocab_size=None):
  """Add-k smoothed probability (Laplace for k = 1) of every counted trigram.

      P(w3 | w1, w2) = (count(w1, w2, w3) + k) / (count(w1, w2) + k * V)

  V is the vocabulary size, i.e. the number of different words that can follow
  a history. With V in the denominator the probabilities of all possible
  continuations of a history sum to one.

  Args:
    trigram_freq: dict mapping (w1, w2, w3) -> count.
    bigram_freq: dict mapping (w1, w2) -> count.
    k: pseudo-count added to every trigram count.
    vocab_size: number of word types V. When omitted it is taken as the
      number of distinct words occurring in the keys of the two count tables.

  Returns:
    Dict mapping each trigram in ``trigram_freq`` to its smoothed probability.
  """
  if vocab_size is None:
    vocab_size = len({
      word
      for table in (bigram_freq, trigram_freq)
      for ngram in table
      for word in ngram
    })

  trigram_prob = {}
  for trigram, freq in trigram_freq.items():
    history_count = bigram_freq[(trigram[0], trigram[1])]
    trigram_prob[trigram] = (freq + k) / (history_count + k*vocab_size)
  return trigram_prob


# Calculate perplexity of laplace smoothing
def getLaplacePerplexity(test_set, trigram_prob):
  """Perplexity of ``test_set`` under a table of smoothed trigram probabilities.

  Args:
    test_set: iterable of tokenized sentences.
    trigram_prob: dict mapping trigram -> probability.

  Returns:
    2 ** (average negative log2 probability per test trigram).

  Raises:
    ZeroDivisionError: if ``test_set`` contains no trigram at all.

  NOTE: a trigram missing from ``trigram_prob`` gets probability 0; it adds
  nothing to the log sum but is still counted in the average.
  """
  log_prob_sum = 0
  num_trigrams = 0
  for tokens in test_set:
    for start in range(len(tokens) - 2):
      key = (tokens[start], tokens[start + 1], tokens[start + 2])
      prob = trigram_prob.get(key, 0)
      if prob > 0:
        log_prob_sum += math.log(prob, 2)
      num_trigrams += 1

  cross_entropy = (-1*log_prob_sum)/num_trigrams
  return math.pow(2, cross_entropy)

In [None]:
# Laplace smoothing
def laplace(train_set, test_set):
  """Train a Laplace-smoothed trigram model on ``train_set`` and return its
  perplexity on ``test_set``."""
  _, bigram_freq, trigram_freq = extractNgramFreq(train_set)
  smoothed = getLaplaceProb(trigram_freq, bigram_freq)
  return getLaplacePerplexity(test_set, smoothed)

In [None]:
# Score the Laplace-smoothed model trained on each split against the held-out
# test set and report how much the perplexity varies between the splits
perp1, perp2, perp3, perp4, perp5 = [
  laplace(ts, test_set) for ts in (ts1, ts2, ts3, ts4, ts5)
]
perp = [perp1, perp2, perp3, perp4, perp5]

print('Laplace performance on test set')
for i, set_perp in enumerate(perp):
  print('Set', i+1, '- perplexity:', set_perp)

print("Variance of perplexity:", statistics.variance(perp))

Laplace performance on test set
Set 1 - perplexity: 589.2159070170205
Set 2 - perplexity: 589.9118492378913
Set 3 - perplexity: 588.7354057333338
Set 4 - perplexity: 591.6856934734319
Set 5 - perplexity: 589.5165635032213
Variance of perplexity: 1.2805890031643805


### 2.2 - Vector Semantics: GloVe implementation

#### 1 - GloVe embedding method implementation

In [None]:
# Import required libraries
from collections import Counter
from scipy import sparse
import numpy as np
from math import log
from tqdm import tqdm
import pickle

In [None]:
# Builds a vocabulary of words mapped to word ID and word frequency in the corpus
def build_vocabulary(corpus):
  """Tokenize ``corpus`` and index every distinct word.

  Args:
    corpus: raw text.

  Returns:
    Tuple ``(vocabulary, sentences)``. ``vocabulary`` maps each word to
    ``(word_id, frequency)``, with ids assigned in order of first appearance;
    ``sentences`` is the list of sentence strings from the sentence tokenizer.
  """
  sentences = sent_tokenize(corpus)

  word_counts = Counter()
  for sentence in sentences:
    word_counts.update(word_tokenize(sentence))

  vocabulary = {}
  for word_id, (word, freq) in enumerate(word_counts.items()):
    vocabulary[word] = (word_id, freq)
  return vocabulary, sentences

In [None]:
# Builds the word cooccurrence matrix X for the given vocabulary. The final
# result is a list containing tuples of (center_ID, context_ID, Xij)
def build_cooccurrences(vocabulary, sentences, contextWindow):
  """Build the distance-weighted word co-occurrence counts.

  Args:
    vocabulary: dict word -> (word_id, frequency).
    sentences: iterable of sentence strings; each is word-tokenized again here.
    contextWindow: number of words to the left of a center word that count as
      its context.

  Returns:
    List of ``(center_id, context_id, X_ij)`` tuples, one per stored entry of
    the sparse V x V matrix, in row order.
  """
  vocabSize = len(vocabulary)
  cooccurrences = sparse.lil_matrix((vocabSize, vocabSize), dtype=np.float64)

  for sentence in sentences:
    wordIds = [vocabulary[token][0] for token in word_tokenize(sentence)]

    for position, centerId in enumerate(wordIds):
      # Only the left window is scanned; the symmetric update below also
      # records the pair from the context word's point of view.
      windowStart = max(0, position - contextWindow)
      leftContext = wordIds[windowStart:position]
      contextLen = len(leftContext)

      for offset, contextId in enumerate(leftContext):
        # Nearer words weigh more: the contribution is 1 / distance
        distance = contextLen - offset
        increment = 1.0 / float(distance)

        cooccurrences[centerId, contextId] += increment
        cooccurrences[contextId, centerId] += increment

  # Flatten the sparse rows into (row, column, value) triples
  result = []
  for rowIdx, (cols, values) in enumerate(zip(cooccurrences.rows, cooccurrences.data)):
    for col, value in zip(cols, values):
      result.append((rowIdx, col, value))

  return result

In [None]:
# Runs a single iteration of adaptive gradient descent (AdaGrad) while training
# the GloVe word embeddings. Takes input cooccurrence data, weight vectors,
# biases and gradient histories. Returns the cost associated with
# the given weights and updates the weights by AdaGrad
def run_iteration(vocabulary, data, learningRate, xMax, alpha):
  """Run one AdaGrad pass over all co-occurrence samples.

  Args:
    vocabulary: not used inside this function.
    data: list of 9-tuples ``(vMain, vContext, bMain, bContext, gradWMain,
      gradWContext, gradBMain, gradBContext, cooccurrence)``. The first eight
      entries are arrays updated in place; the list itself is shuffled in place.
    learningRate: global step size.
    xMax: co-occurrence value from which the sample weight is capped at 1.
    alpha: exponent of the weighting function below ``xMax``.

  Returns:
    The summed weighted squared error (times 0.5), computed with the
    parameters as they were before each sample's update.
  """
  # Visit the samples in random order
  shuffle(data)

  totalCost = 0
  for sample in data:
    (wMain, wContext, biasMain, biasContext,
     histWMain, histWContext, histBMain, histBContext, xij) = sample

    # Weighting function f(X_ij)
    if xij < xMax:
      weight = (xij / xMax) ** alpha
    else:
      weight = 1

    # Prediction error: w_i . w_j + b_i + b_j - log(X_ij)
    residual = (wMain.dot(wContext) + biasMain[0] + biasContext[0] - log(xij))
    totalCost += 0.5 * (weight * (residual ** 2))

    # Both vector gradients are taken before either vector is modified
    scaled = weight * residual
    gradMain = scaled * wContext
    gradContext = scaled * wMain

    # AdaGrad step: scale by the root of the accumulated squared gradients
    wMain -= (learningRate * gradMain / np.sqrt(histWMain))
    wContext -= (learningRate * gradContext / np.sqrt(histWContext))
    biasMain -= (learningRate * scaled / np.sqrt(histBMain))
    biasContext -= (learningRate * scaled / np.sqrt(histBContext))

    # Accumulate squared gradients for the following steps
    histWMain += np.square(gradMain)
    histWContext += np.square(gradContext)
    histBMain += scaled ** 2
    histBContext += scaled ** 2

  return totalCost

In [None]:
# Train GloVe vectors given cooccurrences and vocabulary. Takes input other
# parameters such as dimSize, iterations, learningRate, xMax and alpha. Returns
# the computed word vector matrix W of size 2V * d
def train_glove(vocabulary, cooccurrences, dimSize, iterations, learningRate, xMax, alpha):
  """Train GloVe vectors from co-occurrence triples.

  Args:
    vocabulary: dict of words; only its size V is used here (it is also
      passed through to run_iteration).
    cooccurrences: iterable of ``(main_id, context_id, X_ij)`` triples.
    dimSize: embedding dimension d.
    iterations: number of passes over the data.
    learningRate, xMax, alpha: forwarded to run_iteration.

  Returns:
    The 2V x d matrix W. Rows 0..V-1 hold the main-word vectors and rows
    V..2V-1 the context-word vectors.
  """
  vocabSize = len(vocabulary)
  numRows = vocabSize * 2
  scale = float(dimSize + 1)

  # Vectors and biases start as uniform draws from [-0.5, 0.5) divided by
  # (dimSize + 1)
  W = (np.random.rand(numRows, dimSize) - 0.5) / scale
  biases = (np.random.rand(numRows) - 0.5) / scale

  # AdaGrad accumulators start at 1 so the first step uses the global
  # learning rate unchanged
  gradient = np.ones((numRows, dimSize), dtype=np.float64)
  gradientBiases = np.ones(numRows, dtype=np.float64)

  # Every sample holds views into the arrays above, so the in-place updates
  # done by run_iteration write straight into W, biases and the accumulators.
  # Biases are taken as 1-element slices for the same reason.
  data = []
  for iMain, iContext, cooccurrence in cooccurrences:
    contextRow = iContext + vocabSize
    data.append((W[iMain], W[contextRow],
                 biases[iMain : iMain + 1],
                 biases[contextRow : contextRow + 1],
                 gradient[iMain], gradient[contextRow],
                 gradientBiases[iMain : iMain + 1],
                 gradientBiases[contextRow : contextRow + 1],
                 cooccurrence))

  for i in tqdm(range(iterations)):
    cost = run_iteration(vocabulary, data, learningRate, xMax, alpha)
    print('Iteration', i, '- Cost:', cost)

  return W

In [None]:
def glove_embeddings(corpus, contextWindow, dimSize, iterations, learningRate, xMax, alpha):
  """Run the full pipeline: vocabulary, co-occurrence counts, then training.

  Returns:
    Tuple ``(W, vocabulary)`` with the trained 2V x d weight matrix and the
    word -> (id, frequency) mapping.
  """
  vocabulary, sentences = build_vocabulary(corpus)
  W = train_glove(
    vocabulary,
    build_cooccurrences(vocabulary, sentences, contextWindow),
    dimSize, iterations, learningRate, xMax, alpha,
  )
  return W, vocabulary

In [None]:
# Hyper-parameters of the GloVe run
CONTEXT_WINDOW = 10   # words to the left of a center word counted as context
DIM_SIZE = 100        # dimension of each word vector
ITERATIONS = 50       # passes over the co-occurrence data
LEARNING_RATE = 0.05  # global AdaGrad step size
X_MAX = 100           # co-occurrence value from which the sample weight is 1
ALPHA = 0.75          # exponent of the weighting function below X_MAX
# NOTE: this reassigns the SMALL_RATIO name already set in the preprocessing
# section; here it selects the leading 15% of the text for training.
SMALL_RATIO = 0.15
CORPUS = eng_text[0:int(SMALL_RATIO*len(eng_text))]

# Train the embeddings (the recorded run below took about 5 minutes per iteration)
W, vocabulary = glove_embeddings(CORPUS, contextWindow=CONTEXT_WINDOW, dimSize=DIM_SIZE,
                            iterations=ITERATIONS, learningRate=LEARNING_RATE,
                            xMax=X_MAX, alpha=ALPHA)

print('Vocabulary Size:', len(vocabulary))

  2%|▏         | 1/50 [05:05<4:09:47, 305.87s/it]

Iteration 0 - Cost: 246414.4416228635


  4%|▍         | 2/50 [10:08<4:03:57, 304.94s/it]

Iteration 1 - Cost: 154526.7653301233


  6%|▌         | 3/50 [15:10<3:58:04, 303.93s/it]

Iteration 2 - Cost: 129975.49430860451


  8%|▊         | 4/50 [20:08<3:51:40, 302.19s/it]

Iteration 3 - Cost: 115975.06002828087


 10%|█         | 5/50 [25:07<3:46:02, 301.38s/it]

Iteration 4 - Cost: 106917.5988612259


 12%|█▏        | 6/50 [30:00<3:39:09, 298.85s/it]

Iteration 5 - Cost: 100210.47771836862


 14%|█▍        | 7/50 [34:57<3:33:42, 298.19s/it]

Iteration 6 - Cost: 94930.37532468761


 16%|█▌        | 8/50 [40:05<3:30:42, 301.00s/it]

Iteration 7 - Cost: 90562.55627817665


 18%|█▊        | 9/50 [45:27<3:30:02, 307.38s/it]

Iteration 8 - Cost: 86830.5861695506


 20%|██        | 10/50 [50:38<3:25:46, 308.66s/it]

Iteration 9 - Cost: 83600.9301966498


 22%|██▏       | 11/50 [55:52<3:21:37, 310.20s/it]

Iteration 10 - Cost: 80745.79094217354


 24%|██▍       | 12/50 [1:00:57<3:15:30, 308.69s/it]

Iteration 11 - Cost: 78171.45209902308


 26%|██▌       | 13/50 [1:05:53<3:07:58, 304.82s/it]

Iteration 12 - Cost: 75819.23543946167


 28%|██▊       | 14/50 [1:10:49<3:01:19, 302.20s/it]

Iteration 13 - Cost: 73644.20599651117


 30%|███       | 15/50 [1:15:45<2:55:04, 300.14s/it]

Iteration 14 - Cost: 71617.69289320953


 32%|███▏      | 16/50 [1:20:40<2:49:14, 298.66s/it]

Iteration 15 - Cost: 69710.3028750657


 34%|███▍      | 17/50 [1:25:40<2:44:27, 299.01s/it]

Iteration 16 - Cost: 67913.0281358426


 36%|███▌      | 18/50 [1:30:55<2:42:08, 304.00s/it]

Iteration 17 - Cost: 66233.67418178875


 38%|███▊      | 19/50 [1:36:04<2:37:48, 305.45s/it]

Iteration 18 - Cost: 64660.949307642906


 40%|████      | 20/50 [1:41:08<2:32:31, 305.05s/it]

Iteration 19 - Cost: 63164.29752962177


 42%|████▏     | 21/50 [1:46:21<2:28:33, 307.35s/it]

Iteration 20 - Cost: 61758.71037683993


 44%|████▍     | 22/50 [1:51:19<2:22:05, 304.47s/it]

Iteration 21 - Cost: 60434.02070074566


 46%|████▌     | 23/50 [1:56:17<2:16:12, 302.70s/it]

Iteration 22 - Cost: 59188.25196562519


 48%|████▊     | 24/50 [2:01:14<2:10:26, 301.01s/it]

Iteration 23 - Cost: 58001.709574166394


 50%|█████     | 25/50 [2:06:17<2:05:40, 301.61s/it]

Iteration 24 - Cost: 56879.76416359309


 52%|█████▏    | 26/50 [2:11:25<2:01:25, 303.57s/it]

Iteration 25 - Cost: 55811.873864295165


 54%|█████▍    | 27/50 [2:16:24<1:55:50, 302.18s/it]

Iteration 26 - Cost: 54799.311242563985


 56%|█████▌    | 28/50 [2:21:26<1:50:44, 302.03s/it]

Iteration 27 - Cost: 53831.833512562655


 58%|█████▊    | 29/50 [2:26:25<1:45:20, 300.97s/it]

Iteration 28 - Cost: 52911.14058270578


 60%|██████    | 30/50 [2:31:23<1:40:05, 300.27s/it]

Iteration 29 - Cost: 52028.00405656768


 62%|██████▏   | 31/50 [2:36:22<1:34:55, 299.76s/it]

Iteration 30 - Cost: 51182.89150377552


 64%|██████▍   | 32/50 [2:41:21<1:29:53, 299.62s/it]

Iteration 31 - Cost: 50374.914319206444


 66%|██████▌   | 33/50 [2:46:20<1:24:51, 299.53s/it]

Iteration 32 - Cost: 49600.20728149509


 68%|██████▊   | 34/50 [2:51:17<1:19:39, 298.74s/it]

Iteration 33 - Cost: 48853.89665440055


 70%|███████   | 35/50 [2:56:21<1:15:03, 300.25s/it]

Iteration 34 - Cost: 48140.89954903039


 72%|███████▏  | 36/50 [3:01:21<1:10:00, 300.03s/it]

Iteration 35 - Cost: 47452.756402472274


 74%|███████▍  | 37/50 [3:06:24<1:05:12, 301.00s/it]

Iteration 36 - Cost: 46791.227763308445


 76%|███████▌  | 38/50 [3:11:31<1:00:34, 302.84s/it]

Iteration 37 - Cost: 46154.14248940452


 78%|███████▊  | 39/50 [3:16:28<55:12, 301.13s/it]  

Iteration 38 - Cost: 45539.823257775985


 80%|████████  | 40/50 [3:21:27<50:05, 300.58s/it]

Iteration 39 - Cost: 44946.5896295189


 82%|████████▏ | 41/50 [3:26:27<45:01, 300.17s/it]

Iteration 40 - Cost: 44375.06064292489


 84%|████████▍ | 42/50 [3:31:25<39:56, 299.52s/it]

Iteration 41 - Cost: 43822.886799995


 86%|████████▌ | 43/50 [3:36:22<34:52, 298.98s/it]

Iteration 42 - Cost: 43288.36095228255


 88%|████████▊ | 44/50 [3:41:18<29:47, 297.92s/it]

Iteration 43 - Cost: 42771.512271710104


 90%|█████████ | 45/50 [3:46:15<24:49, 297.84s/it]

Iteration 44 - Cost: 42272.99411572941


 92%|█████████▏| 46/50 [3:51:12<19:49, 297.47s/it]

Iteration 45 - Cost: 41785.37080241512


 94%|█████████▍| 47/50 [3:56:10<14:52, 297.54s/it]

Iteration 46 - Cost: 41315.10245755931


 96%|█████████▌| 48/50 [4:01:09<09:56, 298.19s/it]

Iteration 47 - Cost: 40859.07206297673


 98%|█████████▊| 49/50 [4:06:06<04:57, 297.75s/it]

Iteration 48 - Cost: 40415.34033631018


100%|██████████| 50/50 [4:11:06<00:00, 301.34s/it]

Iteration 49 - Cost: 39985.61136303595





Vocabulary Size: 99287


In [None]:
# Final GloVe word embedding is obtained by adding the center word embedding
# and context word embedding for each word
print(W.shape)
vocabSize = len(vocabulary)
# A dict comprehension keeps the loop variables local, so the built-in `id`
# is not overwritten in the notebook's global namespace.
embeddings = {
  word: W[word_id] + W[word_id + vocabSize]
  for word, (word_id, _) in vocabulary.items()
}

# Save the embeddings in a pickle file
with open('embeddings.pickle', 'wb') as handle:
    pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the weights in a pickle file
with open('weights.pickle', 'wb') as handle:
    pickle.dump(W, handle, protocol=pickle.HIGHEST_PROTOCOL)

(153650, 100)


#### 2 - Comparing Word Similarity
Dependencies: embeddings.pickle

In [None]:
# Load the embeddings from pickle file
# (unpickling can execute arbitrary code, so only load a file you produced yourself)
with open('embeddings.pickle', 'rb') as handle:
    embeddings = pickle.load(handle)

In [None]:
# Copy the benchmark tool to the current session
# (any existing local `web` directory is removed first so the copy starts clean)
! rm -rf web
! cp -r '/content/drive/My Drive/CS565/Assignment 2/word-embeddings-benchmarks/web' .

In [None]:
# Import required libraries
from web.datasets.similarity import *
from web.evaluate import evaluate_similarity
from web.embeddings import fetch_GloVe

In [None]:
# Word-similarity benchmarks used for the comparison, keyed by display name
# (the datasets are fetched in the order listed)
tasks = dict([
    ("MEN", fetch_MEN()),
    ("WS353", fetch_WS353()),
    ("SIMLEX999", fetch_SimLex999()),
    ("RG65", fetch_RG65()),
    ("MTurk", fetch_MTurk()),
    ("RW", fetch_RW()),
])

In [None]:
# Show the first word pair of every benchmark together with its assigned score
for name, data in tasks.items():
    first_pair, first_score = data.X[0], data.y[0]
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, first_pair[0], first_pair[1], first_score))

Sample data from MEN: pair "sun" and "sunlight" is assigned score [10.]
Sample data from WS353: pair "love" and "sex" is assigned score 6.77
Sample data from SIMLEX999: pair "old" and "new" is assigned score 1.58
Sample data from RG65: pair "gem" and "jewel" is assigned score 9.85
Sample data from MTurk: pair "episcopal" and "russia" is assigned score 5.5
Sample data from RW: pair "squishing" and "squirt" is assigned score 5.88


In [None]:
# Spearman correlation of the embeddings trained above on every benchmark
for name, data in tasks.items():
    correlation = evaluate_similarity(embeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, correlation))

Missing 803 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.1295455136075063


Missing 52 words. Will replace them with mean vector


Spearman correlation of scores on WS353 0.20866820523498564


Missing 94 words. Will replace them with mean vector


Spearman correlation of scores on SIMLEX999 0.07120656894553914


Missing 30 words. Will replace them with mean vector


Spearman correlation of scores on RG65 0.1629543816722517


Missing 125 words. Will replace them with mean vector


Spearman correlation of scores on MTurk 0.18482107742423962


Missing 1743 words. Will replace them with mean vector


Spearman correlation of scores on RW 0.20551540968104426


In [None]:
# Fetch GloVe embeddings
# (the "twitter-27B" vectors at dimension 100 -- presumably pre-trained; they
# are evaluated on the same benchmarks in the next cell)
gloveEmbeddings = fetch_GloVe(corpus="twitter-27B", dim=100)

File already downloaded, skipping


In [None]:
# Spearman correlation of the fetched GloVe embeddings on every benchmark
for name, data in tasks.items():
    correlation = evaluate_similarity(gloveEmbeddings, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, correlation))

Missing 1 words. Will replace them with mean vector
Missing 25 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector
Missing 1 words. Will replace them with mean vector


Spearman correlation of scores on MEN 0.5773369776281995
Spearman correlation of scores on WS353 0.46979381939437287
Spearman correlation of scores on SIMLEX999 0.12221121100798378
Spearman correlation of scores on RG65 0.6774486160113895


Missing 1071 words. Will replace them with mean vector


Spearman correlation of scores on MTurk 0.5641004632190647
Spearman correlation of scores on RW 0.23074174522387267
