# Word2Vec
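This notebook trains two word2vec models (CBOW and skip-gram) on the DBpedia training questions of category `resource`, each concatenated with its type labels, and saves the skip-gram model to disk.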

In [1]:
# importing all necessary modules 
import warnings  
import gensim 
import utils
import pickle
from nltk.tokenize import sent_tokenize, word_tokenize 
from gensim.models import Word2Vec 
from nltk.corpus import stopwords 

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation
# warnings raised by the libraries used below -- consider narrowing it.
warnings.filterwarnings(action = 'ignore')

# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
# https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92#9731

In [2]:
# Load the DBpedia training split through the project-local `utils` helper.
train_path = 'datasets/DBpedia/train.json'
train = utils.load_dataset(train_path)

In [3]:
# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# One training line per 'resource' question: the question with stop words
# removed, followed by its type labels.
# NOTE(review): tokens are compared against the stop-word set as-is (no
# lower-casing), so capitalised tokens such as "What" are presumably kept --
# confirm this is intended.
lines = []
for doc in train:
    if doc['category'] != 'resource':
        continue
    kept_tokens = [tok for tok in word_tokenize(doc['question']) if tok not in stop_words]
    lines.append(" ".join(kept_tokens) + " " + " ".join(doc['type']))

# Re-tokenise each assembled line into the list-of-token-lists form that is
# handed to Word2Vec in the next cell.
data = [list(word_tokenize(line)) for line in lines]

In [4]:
# Hyper-parameters shared by both architectures.
shared_params = dict(min_count=1, size=100, window=10)

# CBOW model (no `sg` argument is passed).
model1 = Word2Vec(data, **shared_params)

# Skip-gram model (`sg=1`), otherwise trained with the same settings.
model2 = Word2Vec(data, sg=1, **shared_params)

In [5]:
# Compare how each model scores the same word in two different casings.
# Query through `.wv` (the KeyedVectors holding the trained vectors): the
# model-level `similarity` shortcut is deprecated in gensim 3.x and was
# removed in gensim 4.0, whereas `.wv.similarity` works on both.
word1 = 'church'
word2 = 'Church'
print("CBOW : ", model1.wv.similarity(word1, word2))
print("Skip Gram : ", model2.wv.similarity(word1, word2))

CBOW :  0.8497436
Skip Gram :  0.9767906


In [6]:
# Nearest neighbours of 'Church' in the skip-gram embedding space.
# Queried via `.wv` because the model-level `most_similar` shortcut is
# deprecated in gensim 3.x and was removed in gensim 4.0.
model2.wv.most_similar('Church')

[('London', 0.9873440265655518),
 ('Airport', 0.9852166771888733),
 ('station', 0.9852117300033569),
 ('ReligiousBuilding', 0.9849776029586792),
 ('hub', 0.9812229871749878),
 ('Building', 0.978388786315918),
 ('railway', 0.9768326282501221),
 ('church', 0.9767906069755554),
 ('Road', 0.976337194442749),
 ('Hotel', 0.9728286862373352)]

Save the skip-gram model, as it gives the more realistic similarities of the two.

In [7]:
# Pickle the skip-gram model. The context manager ensures the file handle is
# flushed and closed even if serialisation raises, instead of leaking it.
with open('word2vec_sg.sav', 'wb') as model_file:
    pickle.dump(model2, model_file)