# Word2Vec
This notebook trains out word2vec model.

In [1]:
# importing all necessary modules 
import warnings  
import gensim 
import utils
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize 
from gensim.models import Word2Vec 
from nltk.corpus import stopwords 

warnings.filterwarnings(action = 'ignore')

# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
# https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92#9731

In [2]:
train = utils.load_dataset('datasets/DBpedia/train.json')

In [3]:
stop_words = set(stopwords.words('english'))

lines = []
for doc in train:
    if doc['category'] == 'resource':
        q = doc['question']
        q = " ".join([w for w in word_tokenize(q)  if not w in stop_words])
        lines.append(q + " " + " ".join(doc['type']))
        
data = [] 

for line in lines:
    temp = []
    for j in word_tokenize(line): 
        temp.append(j) 
    data.append(temp)

In [4]:
# Create CBOW model 
model1 = gensim.models.Word2Vec(data, min_count = 1,size = 100, window = 10) 
  
# Create Skip Gram model 
model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100,window = 10, sg = 1) 

In [6]:
word1 = 'church'
word2 = 'Church'
print("CBOW : ", model1.similarity(word1, word2))
print("Skip Gram : ", model2.similarity(word1, word2))

CBOW :  0.9355372
Skip Gram :  0.9712893


In [7]:
model2.most_similar('Church')

[('ReligiousBuilding', 0.9893750548362732),
 ('London', 0.9879487752914429),
 ('hub', 0.9841128587722778),
 ('Airport', 0.9836708903312683),
 ('Hotel', 0.9836187362670898),
 ('Road', 0.9793203473091125),
 ('Theatre', 0.9783246517181396),
 ('Venue', 0.9774261116981506),
 ('Station', 0.976046621799469),
 ('RailwayStation', 0.9749395847320557)]

Save the skip-gram model as it gives the most realistic similarities. 

In [5]:
pickle.dump(model2, open('word2vec_sg.sav', 'wb'))

In [8]:
q_terms = ['what', 'periodical', 'literature', 'delta', 'air', 'lines', 'use', 'moutpiece']
t_terms = ['publication', 'recurring', 'text', 'serial', 'airport', 'aerodrome', 'station', 'class', 'product']
# q_terms = ['who', 'country', 'head', 'state', 'mahmoud', 'abbas']
# t_terms = ['regime', 'state', 'country', 'person', 'omnivore', 'nation', 'republic', 'plutocracy']

for i in q_terms:
    for j in t_terms:
        score = model2.similarity(i, j)
        if score > 0.85:
            print(i, j, score)

KeyError: "word 'what' not in vocabulary"

In [None]:
for i in q_terms:
    for j in t_terms:
        score = model1.similarity(i, j)
        if score > 0.95:
            print(i, j, score)