In [None]:
'''
WORD2VEC MODEL
https://www.youtube.com/watch?v=ouzJPlO64Dg&feature=youtu.be

Links : How does word2vec internally work? 
https://radimrehurek.com/gensim/models/word2vec.html
https://arxiv.org/pdf/1301.3781.pdf
https://arxiv.org/abs/1310.4546

-google pretrained model
-50 Billion Words
-similar context words have similar vectors
- distance/similarity between word vectors is measured by cosine similarity/distance (an alternative to Euclidean distance)

Applications:
-text similarity
-language translation
-find odd words out
-word analogies

WordEmbeddings : 
-numerical representation of words in form of vectors
-how to use pre-trained word2vec model
-NLP package used : Gensim (NLP package)

Gensim's Word2Vec model = CBOW Model + SkipGram Model

models.word2vec -- Word2vec embeddings

'''

In [None]:
#!pip3 install gensim

In [None]:
import gensim
import numpy as np
# gensim also has Doc2vec,Word2vec,FastText
from gensim.models import word2vec
from gensim.models import KeyedVectors
# KeyedVectors - object containing the mapping between words and embeddings
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the pre-trained embeddings from a binary word2vec file located in the
# current working directory.
word_vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Look up the embeddings of two related words and score them against each other.
a = word_vectors["anger"]
b = word_vectors["fury"]
# cosine_similarity works on 2-D inputs, so each vector is passed as a single-row matrix.
cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))

In [None]:
# Display the raw embedding that was looked up for "anger".
a

In [None]:
# Display the shape of the embedding that was looked up for "fury".
b.shape

###     ODD ONE OUT - Min Similarity

In [None]:
def OddOneOut(word_list, vectors=None):
    """Return the word whose embedding is least similar to the group's mean.

    Each word is looked up in ``vectors``, the element-wise mean of all the
    embeddings is taken, and the cosine similarity between every word and
    that mean is printed. The word with the lowest similarity is returned.

    Parameters
    ----------
    word_list : sequence of str
        Words to compare; it is indexed, so it must be a list or tuple.
    vectors : mapping of str -> 1-D vector, optional
        Anything supporting ``vectors[word]``. Defaults to the notebook-level
        ``word_vectors`` model.

    Returns
    -------
    str or None
        The least similar word, or ``None`` when ``word_list`` is empty.
        On ties the earliest word in ``word_list`` wins.

    Notes
    -----
    A word missing from ``vectors`` is not handled here; whatever the lookup
    raises (``KeyError`` for a plain dict) propagates to the caller.
    """
    if vectors is None:
        # Fall back to the model loaded in an earlier notebook cell.
        vectors = word_vectors
    if len(word_list) == 0:
        return None

    embeddings = np.stack(
        [np.asarray(vectors[word], dtype=float) for word in word_list]
    )
    mean_vector = embeddings.mean(axis=0)

    # Cosine similarity of every row against the mean, computed in one pass.
    dots = embeddings @ mean_vector
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(mean_vector)
    # A zero-length vector has no direction; score it 0 instead of dividing by 0.
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    for word, similarity in zip(word_list, similarities):
        print("Similarity between {} and mean vector is {}".format(word, float(similarity)))

    # argmin needs no starting threshold, so a single-word list (similarity
    # 1.0) still yields that word, and it returns the first index on ties.
    return word_list[int(np.argmin(similarities))]

In [None]:
# Example word lists for the odd-one-out search.
input_1 = ["apple", "mango", "juice", "party", "orange"]
input_2 = ["music", "dance", "sleep", "dancer", "food"]
input_3 = ["match", "player", "football", "cricket", "dancer"]
input_4 = ["india", "paris", "russia", "france", "germany"]

In [None]:
# Run the odd-one-out search on every example list.
# The loop variable is deliberately not named `input`: that would rebind the
# builtin input() for the rest of the kernel session.
inputs = [input_1, input_2, input_3, input_4]
for word_group in inputs:
    print("\n\nInput : ", word_group, "\n")
    print("\nOdd One Out : ", OddOneOut(word_group))

### WORD ANALOGIES

In [None]:
# Analogy task:  a : b  ::  c : ?
# Let ? = d
# The relation we want is  a - b = c - d
# so take the offset vector (a - b) and look for the word d whose offset (c - d)
# has the highest cosine similarity to it
# 

In [None]:
import time
from sklearn.metrics.pairwise import cosine_similarity
#find the value of d
def _vocabulary_words(word_vectors):
    """Return every word known to ``word_vectors`` as a list, in the model's own order."""
    # gensim >= 4 exposes `key_to_index`; gensim 3.x exposes `vocab` (on
    # gensim 4 that attribute raises AttributeError, which getattr's default
    # absorbs). Plain mappings fall through to keys().
    for attribute in ("key_to_index", "vocab"):
        mapping = getattr(word_vectors, attribute, None)
        if mapping is not None:
            return list(mapping.keys())
    return list(word_vectors.keys())


def WordAnalogy(a, b, c, word_vectors, batch_size=10000):
    """Solve the analogy ``a : b :: c : d`` and return the best word ``d``.

    The three input words are lower-cased, then every other vocabulary word
    ``w`` is scored by the cosine similarity between ``vec(a) - vec(b)`` and
    ``vec(c) - vec(w)``. The highest-scoring word is returned. The best score
    and the elapsed wall-clock time are printed.

    Parameters
    ----------
    a, b, c : str
        Words of the analogy; looked up in lower case.
    word_vectors : gensim KeyedVectors or mapping of str -> 1-D vector
        Must support ``word_vectors[word]`` and expose its vocabulary through
        ``key_to_index`` (gensim >= 4), ``vocab`` (gensim 3.x) or ``keys()``.
    batch_size : int, optional
        Number of candidate words scored per numpy operation. Bounds the
        size of the temporary matrix built for each batch.

    Returns
    -------
    str or None
        The predicted word, or ``None`` if the vocabulary holds no word other
        than ``a``, ``b`` and ``c``. On ties the earliest vocabulary word wins.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    A word missing from ``word_vectors`` is not handled here; whatever the
    lookup raises (``KeyError`` for a plain dict) propagates to the caller.
    The printed maximum is a plain float (previously a 1x1 array).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    start_time = time.time()
    a, b, c = a.lower(), b.lower(), c.lower()
    vec_a, vec_b, vec_c = (np.asarray(word_vectors[word]) for word in (a, b, c))

    target = vec_a - vec_b
    target_norm = float(np.linalg.norm(target))

    excluded = {a, b, c}
    candidates = [word for word in _vocabulary_words(word_vectors) if word not in excluded]

    pred_d = None
    max_similarity = float("-inf")

    # Score the vocabulary in batches: one matrix product per batch instead of
    # one library call per word, while keeping the temporary matrix small.
    for start in range(0, len(candidates), batch_size):
        batch = candidates[start:start + batch_size]
        offsets = vec_c - np.stack([np.asarray(word_vectors[word]) for word in batch])

        dots = (offsets @ target).astype(float)
        norms = np.linalg.norm(offsets, axis=1).astype(float) * target_norm
        # A zero-length offset has no direction; score it 0 instead of dividing by 0.
        similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

        best = int(np.argmax(similarities))  # first index on ties
        # Strict comparison keeps the earliest word when batches tie.
        if similarities[best] > max_similarity:
            max_similarity = float(similarities[best])
            pred_d = batch[best]

    print("\nThe Max Similarity : ", max_similarity, "\n")
    end_time = time.time()
    print("\nExecution Time : {}\n\n".format(end_time - start_time))
    return pred_d

In [None]:
# (a, b, c) word triples that are unpacked into WordAnalogy.
input_1 = ("man", "code", "woman")
input_2 = ("italy", "italian", "india")
input_3 = ("india", "country", "newyork")

In [None]:
# Solve every example analogy against the loaded model.
# The loop variable is deliberately not named `input`: that would rebind the
# builtin input() for the rest of the kernel session.
inputs = [input_1, input_2, input_3]
for analogy in inputs:
    print("Predicted Value = ", WordAnalogy(*analogy, word_vectors))

#### WORD ANALOGY -MOST SIMILAR METHOD

In [None]:
# Ask the model for its single best match (topn=1) given the positive and
# negative example words.
word_vectors.most_similar(
    positive=['india', 'country'],
    negative=['delhi'],
    topn=1,
)