# Training Your Own Word2Vec Model

### *Word2Vec model can learn embeddings from any text corpus!*

* Continuous Bag of Words Model
* Skip Gram Model

Algorithm looks at window of target word(Y) to provide context word(X), the model is trained on (X,Y) pairs in a superwised manner. The algorithm was developed by Tomas Mikolov.

### *Data Preparation*

* Each sentence must be tokenized, into a list of words.
* The sentences can be text loaded into memory once, or we can build a data pipeline which iteratively feeds data to the model.

In [61]:
import gensim
from gensim.models import Word2Vec,KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords

In [73]:
stopWords  = set(stopwords.words('english'))
def readFile(file):
    f = open(file,'r',encoding='utf-8')
    text = f.read()
    
    data = []
    sents = nltk.sent_tokenize(text)
    for sent in sents:
        words = nltk.word_tokenize(sent) 
        words = [w.lower() for w in words if len(w)>2 and w not in stopWords]
        data.append(words)
    
    return data

text = readFile("bollywood.txt")

In [74]:
model = Word2Vec(text,size=300,window=10,min_count=1)
print(model)

Word2Vec(vocab=116, size=300, alpha=0.025)


In [75]:
words = list(model.wv.vocab)
print(words)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'shared', 'video', 'featuring', 'jonaswas', 'celebrating', 'family', 'first', 'celebrated', 'christmas', 'london', 'pictures', 'new', 'outstanding', 'glimpses', 'celebration', 'verbier', 'switzerland', 'married', 'december', 'three', 'receptions', 'delhi', 'mumbai', 'jaggo', 'night', 'made', 'even', 'special', 'industry', 'friends', 'long', '

In [76]:
def predict_actor(a,b,c,word_vectors):
    a,b,c = a.lower(),b.lower(),c.lower()
    max_similarity = -100 
    d = None
    words = ["ranveer","deepika","padukone","singh","nick","jonas","chopra","priyanka","virat","anushka","ginni"]

    wa,wb,wc = word_vectors[a],word_vectors[b],word_vectors[c]
    for w in words:
        
        if w in [a,b,c]:
            continue
        
        wv = word_vectors[w]
        sim = cosine_similarity([wb-wa],[wv-wc])
        
        if sim > max_similarity:
            max_similarity = sim
            d = w
    return d

In [77]:
print(predict_actor("deepika","ranveer","priyanka",model.wv))

chopra
