In [10]:
import warnings
warnings.filterwarnings('ignore')
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
import gensim
import _pickle as pickle
import numpy as np
from sklearn import preprocessing
from termcolor import colored


def init_doc_matrix(doc,w2v):
    """Build a row-wise L2-normalized embedding matrix for a tokenized document.

    Args:
        doc: Sequence of tokens (already lower-cased/stemmed by the caller).
        w2v: Word2Vec-like model exposing ``w2v.wv.key_to_index`` for
            membership tests and ``w2v.wv[word]`` for vector lookup.

    Returns:
        ``numpy.ndarray`` of shape ``(len(doc), dim)``. Row ``i`` is the unit
        length embedding of ``doc[i]``; tokens missing from the vocabulary
        (or whose vector is all zeros) are left as all-zero rows.
    """
    keyed_vectors = w2v.wv
    # Take the embedding width from the model instead of hard-coding it;
    # fall back to the historical value of 100 if the attribute is absent.
    dim = getattr(keyed_vectors, "vector_size", 100)

    matrix = np.zeros((len(doc), dim))
    for i, word in enumerate(doc):
        if word in keyed_vectors.key_to_index:
            matrix[i] = keyed_vectors[word]

    # L2-normalize each row. The ``where`` mask skips zero-norm rows so they
    # stay zero instead of becoming NaN, and no division warning is emitted.
    norm = np.linalg.norm(matrix, axis=1, keepdims=True)
    return np.divide(matrix, norm, out=np.zeros_like(matrix), where=norm != 0)

def init_doc_idf_vector(doc,idf):
    """Collect the IDF weight of every token in ``doc`` into a row vector.

    Args:
        doc: Sequence of tokens.
        idf: Mapping from token to an indexable record whose element at
            index 1 is used as the token's IDF weight.

    Returns:
        ``numpy.ndarray`` of shape ``(1, len(doc))``. Tokens absent from
        ``idf`` keep a weight of 0. Each weight that is found is also
        printed (colourised) as a side effect.
    """
    weights = np.zeros((1, len(doc)))  # one column per token
    for position, token in enumerate(doc):
        if token not in idf:
            continue
        weights[0][position] = idf[token][1]
        print("The idf value for word:",colored(token,"red")," : ",colored(weights[0][position],"green"))

    return weights



def sim_doc_pair(matrix1,matrix2,idf1,idf2):
    """Return the symmetric IDF-weighted similarity of two documents.

    For every word of one document the best-matching word of the other
    document is found (maximum dot product between embedding rows). Those
    best-match scores are averaged with IDF weights in each direction, and
    the two directional scores are combined with a harmonic mean.

    Args:
        matrix1: ``(n1, dim)`` array of word embeddings for document 1.
            Rows are expected to be L2-normalized (as produced by
            ``init_doc_matrix``) so that dot products are cosine similarities.
        matrix2: ``(n2, dim)`` array of word embeddings for document 2.
        idf1: IDF weights of document 1, broadcastable against ``(n1,)``
            (e.g. shape ``(1, n1)``).
        idf2: IDF weights of document 2, broadcastable against ``(n2,)``.

    Returns:
        The harmonic mean of the two directional similarities. ``0.0`` is
        returned when the score is undefined: an empty document, an all-zero
        IDF vector, or directional similarities that sum to zero (previously
        these cases produced NaN or raised ``ValueError``).
    """
    # An empty document has no words to match; max() over an empty axis
    # would raise, so bail out early.
    if matrix1.shape[0] == 0 or matrix2.shape[0] == 0:
        return 0.0

    weight1 = idf1.sum()
    weight2 = idf2.sum()
    if weight1 == 0 or weight2 == 0:
        return 0.0

    # pairwise[i, j] is the similarity between word i of doc 1 and word j of
    # doc 2; the reverse direction is just its transpose.
    pairwise = matrix1.dot(matrix2.T)
    sim12 = (idf1 * pairwise.max(axis=1)).sum() / weight1
    sim21 = (idf2 * pairwise.max(axis=0)).sum() / weight2

    denominator = sim12 + sim21
    if denominator == 0:
        return 0.0
    return 2 * sim12 * sim21 / denominator



def _read_statement(index, tokenizer, stemmer):
    """Prompt for statement number ``index``; return ``(raw_text, stemmed_tokens)``.

    The raw input is lower-cased and tokenized with ``tokenizer``, then each
    token is stemmed with ``stemmer``. Both intermediate results are printed.
    """
    print(colored(f"Enter the statement {index}:","green"))
    text = input()
    tokens = tokenizer.tokenize(text.lower())
    print(f"Statement {index} after Tokenization:")
    print(colored(tokens,"red"))
    stems = [stemmer.stem(token) for token in tokens]
    print(f"Statement {index} after Stemming:")
    print(colored(stems,"blue"))
    return text, stems


if __name__ == "__main__":
    w2v = gensim.models.Word2Vec.load('../data/w2v_model_stemmed')
    # NOTE(security): unpickling can execute arbitrary code; only load an
    # IDF file that comes from a trusted source.
    with open('../data/idf','rb') as idf_file:
        idf = pickle.load(idf_file)

    # Build the tokenizer and stemmer once instead of once per word.
    tokenizer = WordPunctTokenizer()
    stemmer = SnowballStemmer('english')

    text1, question1 = _read_statement(1, tokenizer, stemmer)

    print("-------------------------------------------")

    text2, question2 = _read_statement(2, tokenizer, stemmer)

    matrix1 = init_doc_matrix(question1,w2v)
    matrix2 = init_doc_matrix(question2,w2v)

    # init_doc_idf_vector prints each word's IDF weight as it goes.
    idf1 = init_doc_idf_vector(question1,idf)
    idf2 = init_doc_idf_vector(question2,idf)
    print("\n\nThe similarity Score between the statements: ")
    print(colored(text1,"red")," && " ,colored(text2,"blue")," is: ",sim_doc_pair(matrix1, matrix2, idf1, idf2))

[32mEnter the statement 1:[0m
Float represent decimal numbers
Statement 1 after Tokenization:
[31m['float', 'represent', 'decimal', 'numbers'][0m
Statement 1 after Stemming:
[34m['float', 'repres', 'decim', 'number'][0m
-------------------------------------------
[32mEnter the statement 2:[0m
How are you
Statement 2 after Tokenization:
[31m['how', 'are', 'you'][0m
Statement 2 after Stemming:
[34m['how', 'are', 'you'][0m
The idf value for word: [31mfloat[0m  :  [32m5.063703716400199[0m
The idf value for word: [31mrepres[0m  :  [32m4.225874460716188[0m
The idf value for word: [31mdecim[0m  :  [32m5.690253015564198[0m
The idf value for word: [31mnumber[0m  :  [32m2.934353792096124[0m
The idf value for word: [31mhow[0m  :  [32m2.533656505487701[0m
The idf value for word: [31mare[0m  :  [32m1.3441964759949232[0m
The idf value for word: [31myou[0m  :  [32m0.37254450397982425[0m


The similarity Score between the statements: 
[31mFloat represent decima