In [None]:
# Mount Google Drive (Colab only); the dataset and the saved model are read from
# and written to paths under /content/gdrive in the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Standard library
import re
from itertools import combinations

# Third-party
import matplotlib.pyplot as plt
import nltk
import nltk.corpus
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK resources needed by the cells below: the punkt sentence tokenizer,
# WordNet and the English stop-word list (downloaded in this order).
for resource in ('punkt', 'wordnet', 'stopwords'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Training the word2vec

The dataset can be found here: http://jmcauley.ucsd.edu/data/amazon/

In [None]:
#read the json that contains the dataset
# lines=True: the file is parsed as one JSON object per line
data = pd.read_json('/content/gdrive/MyDrive/NLP/proiect2/reviews_Sports_and_Outdoors_5.json', lines=True)

In [None]:
def get_sentences(data):
  """Split the review texts into sentences and tokenize each one into words.

  Args:
    data: DataFrame with a 'reviewText' column holding the raw review strings.
      Missing values (NaN/None) in that column are ignored.

  Returns:
    A list of sentences, each one a non-empty list of tokens that consist only
    of ASCII letters (the original casing is kept). Sentences that contain no
    letters at all are dropped.
  """
  tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
  #get only the data in the reviewText column; a missing review would make the
  #string join below fail, so drop those rows first
  reviews = data['reviewText'].dropna().astype(str)
  #separate the reviews by a space so words of adjacent reviews are not glued together
  corpus = " ".join(reviews)

  #compiled once: every run of non-letter characters is replaced by a single space
  non_letters = re.compile("[^A-Za-z]+")

  modified = []
  #tokenize the sentences
  for sentence in tokenizer.tokenize(corpus):
    tokens = non_letters.sub(" ", sentence).split()
    #a sentence such as "10/10." has no letters left; an empty token list
    #carries no information for training, so skip it
    if tokens:
      modified.append(tokens)

  return modified

# tokenized sentences of all reviews; this is the training corpus passed to Word2Vec below
documents = get_sentences(data)

In [None]:
#build the vocabulary and train the word2vec
# NOTE(review): `size=` and `model.wv.vocab` look like the gensim 3.x API
# (renamed in gensim 4) - confirm the gensim version this notebook is pinned to.
model = Word2Vec(size=300, min_count=3, window=7, sg=1, seed=1)
model.build_vocab(sentences=documents)
# a single pass (epochs = 1) over the sentences produced by get_sentences
model.train(sentences=documents, total_examples=model.corpus_count, epochs = 1)
voc = model.wv.vocab
# len(voc) is the number of entries in the model vocabulary
print("Tokens: ", len(voc))

Tokens:  54665


In [None]:
# persist the trained model on Drive; the evaluation section reloads it from this path
model.save("/content/gdrive/MyDrive/NLP/proiect2/word2vec_model.model")

Examples: querying the trained word2vec model for the words most similar to a given entry.

In [None]:
#query the word vectors (model.wv) directly: calling most_similar on the model
#object itself is deprecated and only forwards to model.wv with a warning
model.wv.most_similar("great")

  """Entry point for launching an IPython kernel.


[('fantastic', 0.844652533531189),
 ('wonderful', 0.8247833251953125),
 ('terrific', 0.7980945110321045),
 ('fabulous', 0.7573331594467163),
 ('phenomenal', 0.7482106685638428),
 ('awesome', 0.739443302154541),
 ('Fantastic', 0.7390926480293274),
 ('good', 0.7317873239517212),
 ('prefect', 0.7303392887115479),
 ('excellent', 0.7212550044059753)]

In [None]:
#query the word vectors (model.wv) directly: calling most_similar on the model
#object itself is deprecated and only forwards to model.wv with a warning
model.wv.most_similar("cat")

  """Entry point for launching an IPython kernel.


[('squirrel', 0.7444180846214294),
 ('daddy', 0.7386866807937622),
 ('cats', 0.7356756329536438),
 ('kiddie', 0.7353200912475586),
 ('bird', 0.7261258363723755),
 ('plink', 0.7246809005737305),
 ('coyotes', 0.7220070958137512),
 ('neighbor', 0.7194820642471313),
 ('lacrosse', 0.7177393436431885),
 ('nerf', 0.7172856330871582)]

### Evaluation of Word2vec

In [None]:
# reload the model from the path it was saved to after training
model = Word2Vec.load("/content/gdrive/MyDrive/NLP/proiect2/word2vec_model.model")

In [None]:
def coverage(model):
  """Compute the fraction of the model vocabulary that is known to WordNet.

  A vocabulary entry counts as covered when the entry itself or its lemma
  (WordNetLemmatizer with default settings) has at least one synset.
  As a side effect, a few examples of uncovered entries are printed.

  Args:
    model: trained Word2Vec model; the vocabulary is read from `model.wv`.

  Returns:
    Number of covered entries divided by the number of all entries;
    0.0 when the vocabulary is empty.
  """
  lemmatizer = WordNetLemmatizer()
  #gensim >= 4 exposes the vocabulary as `key_to_index`, older versions as `vocab`
  voc = getattr(model.wv, "key_to_index", None)
  if voc is None:
    voc = model.wv.vocab

  voc_wn = 0
  voc_all = 0
  words_not_wn = []
  for key in voc:
    voc_all += 1
    #check the raw entry first; the lemma is only looked up when the entry has
    #no synset and the lemma actually differs from the entry
    covered = len(wn.synsets(key)) > 0
    if not covered:
      word = lemmatizer.lemmatize(key)
      covered = word != key and len(wn.synsets(word)) > 0

    if covered:
      voc_wn += 1
    else:
      words_not_wn.append(key)

  #print a bunch of examples of words that do not appear in the wordnet;
  #fall back to the first entries when too few words are missing for the slice
  examples = words_not_wn[-10:-5] or words_not_wn[:5]
  print("Examples of words that are not in WN, but are in the vocabulary: {}".format(examples))

  #an empty vocabulary would otherwise raise ZeroDivisionError
  if voc_all == 0:
    return 0.0
  return voc_wn / voc_all


In [None]:
def generate_sample(model, number_samples=1000):
  """Collect the evaluation sample of vocabulary words that are not stop words.

  The sample is not random: it consists of the first `number_samples`
  non-stop-word entries in the iteration order of the vocabulary.

  Args:
    model: trained Word2Vec model; the vocabulary is read from `model.wv`.
    number_samples: maximum number of words to return (default 1000).

  Returns:
    A list with at most `number_samples` vocabulary entries whose lowercase
    form is not an English NLTK stop word; empty if `number_samples` <= 0.
  """
  words = []
  if number_samples <= 0:
    return words

  #gensim >= 4 exposes the vocabulary as `key_to_index`, older versions as `vocab`
  voc = getattr(model.wv, "key_to_index", None)
  if voc is None:
    voc = model.wv.vocab

  stop_words = set(stopwords.words('english'))
  for key in voc:
    #the stop-word list is lowercase, so compare case-insensitively
    if key.lower() in stop_words:
      continue
    words.append(key)
    if len(words) >= number_samples:
      break

  return words

In [None]:
def synonyms_emb(word1, word2, model, threshold):
  """Tell whether two words count as synonyms in the embedding space.

  The pair is accepted when the (cosine) similarity reported by
  `model.wv.similarity` is greater than or equal to `threshold`.

  Returns:
    True if the similarity reaches the threshold, otherwise False.
  """
  similarity = model.wv.similarity(word1, word2)
  return True if similarity >= threshold else False

In [None]:
def synonyms_wn(word1, word2):
  """Tell whether two words count as synonyms according to WordNet.

  Both words are lemmatized first; the pair is accepted when the synsets of
  the two lemmas share at least one element.

  Returns:
    True if the lemmas have a synset in common, otherwise False.
  """
  lemmatize = WordNetLemmatizer().lemmatize
  lemma1, lemma2 = lemmatize(word1), lemmatize(word2)
  shared = set(wn.synsets(lemma1)) & set(wn.synsets(lemma2))
  return bool(shared)

In [None]:
#generate the words
# this sample is the input of combinations(words, 2) in both evaluation cells below
words = generate_sample(model)

In [None]:
thresholds = [0.6, 0.7, 0.8, 0.9]

#generate unique pairs of words; materialised once so every threshold scans the same pairs
pairs = list(combinations(words, 2))

#the WordNet verdict does not depend on the threshold, so it is computed once
#per pair instead of once per pair and per threshold
wn_flags = [synonyms_wn(pair[0], pair[1]) for pair in pairs]
#number of pairs that are "synonyms" in terms of wordnet
syn_wn = sum(wn_flags)

#try various thresholds in order to obtain the best f1 score
for threshold in thresholds:
  syn_emb = 0
  common = 0

  for pair, is_syn_wn in zip(pairs, wn_flags):
    if synonyms_emb(pair[0], pair[1], model, threshold):
      #"synonyms" in terms of embeddings
      syn_emb += 1
      if is_syn_wn:
        #"synonyms" in terms of embeddings and wordnet
        common += 1

  #compute the metrics; a zero denominator gives 0.0 instead of ZeroDivisionError
  precision = common / syn_emb if syn_emb else 0.0
  recall = common / syn_wn if syn_wn else 0.0
  f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
  print("Threshold: {} - Precision: {} - Recall: {} - F1: {}".format(threshold, precision, recall, f1))

Threshold: 0.6 - Precision: 0.019661762683899353 - Recall: 0.15294117647058825 - F1: 0.034844054580896684
Threshold: 0.7 - Precision: 0.02579957356076759 - Recall: 0.06470588235294118 - F1: 0.03689024390243903
Threshold: 0.8 - Precision: 0.01312910284463895 - Recall: 0.012834224598930482 - F1: 0.012979989183342347
Threshold: 0.9 - Precision: 0.0033333333333333335 - Recall: 0.0010695187165775401 - F1: 0.001619433198380567


As can be seen above, the best F1 score is obtained with threshold = 0.7. This threshold is used below to compute the metrics and to show the results for this assignment.

Description:

Chosen corpus: Amazon Product Data - Sports and Outdoors

Language: English

Number of tokens: 54665

Coverage: 0.7258209091740602

Precision: 0.02579957356076759

Recall: 0.06470588235294118

F1: 0.03689024390243903

Threshold: 0.7

In [None]:
# share of vocabulary entries for which WordNet has a synset (value returned by coverage)
cov = coverage(model)
print('Coverage: {}'.format(cov))

Examples of words that are not in WN, but are in the vocabulary: ['Medifast', 'BlenderBall', 'Omaker', 'OnCore', 'Gogogu']
Coverage: 0.7258209091740602


In [None]:
pairs = combinations(words, 2)

threshold = 0.7
syn_emb = 0
syn_wn = 0
common = 0

#pairs on which the two notions of synonymy disagree
pair_emb = []  #synonyms in the embedding space only (precision errors)
pair_wn = []   #synonyms in WordNet only (recall errors)

#same counting as in the threshold search, additionally keeping the disagreeing pairs
for pair in pairs:
  is_syn_emb = synonyms_emb(pair[0], pair[1], model, threshold)
  is_syn_wn = synonyms_wn(pair[0], pair[1])

  if is_syn_emb:
    syn_emb += 1
  if is_syn_wn:
    syn_wn += 1

  if is_syn_emb and is_syn_wn:
    common += 1
  elif is_syn_emb:
    pair_emb.append(pair)
  elif is_syn_wn:
    pair_wn.append(pair)

#a zero denominator gives 0.0 instead of ZeroDivisionError
precision = common / syn_emb if syn_emb else 0.0
recall = common / syn_wn if syn_wn else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
#show some precision and recall errors
print("Threshold: {} - Precision: {} - Recall: {} - F1: {}".format(threshold, precision, recall, f1))
print("Precision errors(word pairs synonyms in the embedding space, but not in the same synset in WN): {}".format(pair_emb[5:10]))
print("Recall errors(word pairs not synonyms in the embedding space, but in the same synset in WN): {}".format(pair_wn[5:10]))

Threshold: 0.7 - Precision: 0.02579957356076759 - Recall: 0.06470588235294118 - F1: 0.03689024390243903
Precision errors(word pairs synonyms in the embedding space, but not in the same synset in WN): [('haved', 'Bit'), ('haved', 'Assembling'), ('haved', 'mice'), ('haved', 'Bounced'), ('haved', 'witch')]
Recall errors(word pairs not synonyms in the embedding space, but in the same synset in WN): [('came', 'got'), ('came', 'amount'), ('came', 'amounts'), ('came', 'number'), ('came', 'Comes')]
