Word2Vec

In [55]:
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

# Sample corpus
documents = [
'Owning a car is no longer a luxury, but it has become a necessity. Whether you drive to work or enjoy weekend drives with the family,\
 having a car can simplify your travels and not to forget the ease and comfort it brings',
'Purchasing their dream vehicle is easier than ever before for millions of Indians – thanks to the widespread availability of car loans in India.',
'Car loans offer you the money for the vehicle upfront. You can then comfortably repay the borrowed amount via affordable monthly EMIs.',
'An auto loan is a secured loan, as the car acts as the guarantee. There is no need to provide any additional asset or mortgage while procuring the loan.',
'Before you apply for an auto loan, you need to compare the interest rates charged by lenders. Even slight variations in the interest rates can play a huge role in increasing or reducing your overall burden.',
'To make it easy for you, here in this guide, we list out the interest rates charged by leading lenders for auto loans in India. You can use this handy table to quickly compare the interest rates before you make a decision.'
]

# Sample queries

queries = [
'what does car loan offer',
'guide me about loans'
]

documents_df=pd.DataFrame(documents,columns=['documents'])
queries_df = pd.DataFrame(queries,columns=['queries'])

In [56]:
documents_df

Unnamed: 0,documents
0,"Owning a car is no longer a luxury, but it has..."
1,Purchasing their dream vehicle is easier than ...
2,Car loans offer you the money for the vehicle ...
3,"An auto loan is a secured loan, as the car act..."
4,"Before you apply for an auto loan, you need to..."
5,"To make it easy for you, here in this guide, w..."


In [57]:
queries_df

Unnamed: 0,queries
0,what does car loan offer
1,guide me about loans


In [58]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/himanshujanbandhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
stop_words_l=stopwords.words('english')
documents_df['documents_cleaned']=documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )


In [60]:
documents_df

Unnamed: 0,documents,documents_cleaned
0,"Owning a car is no longer a luxury, but it has...",owning car longer luxury become necessity wh...
1,Purchasing their dream vehicle is easier than ...,purchasing dream vehicle easier ever millions ...
2,Car loans offer you the money for the vehicle ...,car loans offer money vehicle upfront comfort...
3,"An auto loan is a secured loan, as the car act...",auto loan secured loan car acts guarantee ne...
4,"Before you apply for an auto loan, you need to...",apply auto loan need compare interest rates c...
5,"To make it easy for you, here in this guide, w...",make easy you guide list interest rates char...


In [61]:
queries_df['queries_cleaned']=queries_df.queries.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )


In [62]:
queries_df

Unnamed: 0,queries,queries_cleaned
0,what does car loan offer,car loan offer
1,guide me about loans,guide loans


In [63]:
# tokenize and pad every document to make them of the same size
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# tokenizer=Tokenizer()
# tokenizer.fit_on_texts(documents_df.documents_cleaned)
# tokenized_documents=tokenizer.texts_to_sequences(documents_df.documents_cleaned)
# tokenized_paded_documents=pad_sequences(tokenized_documents,maxlen=64,padding='post')
# vocab_size=len(tokenizer.word_index)+1
# print (tokenized_documents[0])

In [64]:
# tokenizer.fit_on_texts(queries_df.queries_cleaned)
# tokenized_queries = tokenizer.texts_to_sequences(queries_df.queries_cleaned)
# tokenized_paded_queries=pad_sequences(tokenized_queries,maxlen=64,padding='post')
# print (tokenized_queries[0])

In [65]:
# from sklearn.metrics.pairwise import cosine_similarity

In [66]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
from gensim.models.word2vec import Word2Vec

In [67]:
lst = documents_df.documents_cleaned.tolist()
corpus = [x.split() for x in lst]
corpus

[['owning',
  'car',
  'longer',
  'luxury',
  'become',
  'necessity',
  'whether',
  'drive',
  'work',
  'enjoy',
  'weekend',
  'drives',
  'family',
  'car',
  'simplify',
  'travels',
  'forget',
  'ease',
  'comfort',
  'brings'],
 ['purchasing',
  'dream',
  'vehicle',
  'easier',
  'ever',
  'millions',
  'indians',
  'thanks',
  'widespread',
  'availability',
  'car',
  'loans',
  'india'],
 ['car',
  'loans',
  'offer',
  'money',
  'vehicle',
  'upfront',
  'comfortably',
  'repay',
  'borrowed',
  'amount',
  'via',
  'affordable',
  'monthly',
  'emis'],
 ['auto',
  'loan',
  'secured',
  'loan',
  'car',
  'acts',
  'guarantee',
  'need',
  'provide',
  'additional',
  'asset',
  'mortgage',
  'procuring',
  'loan'],
 ['apply',
  'auto',
  'loan',
  'need',
  'compare',
  'interest',
  'rates',
  'charged',
  'lenders',
  'even',
  'slight',
  'variations',
  'interest',
  'rates',
  'play',
  'huge',
  'role',
  'increasing',
  'reducing',
  'overall',
  'burden'],
 ['

In [68]:
model = Word2Vec(corpus,min_count=1,size= 50,window =5, sg = 1)

In [69]:
print('Vocabulary size:', len(model.wv.vocab))

Vocabulary size: 80


In [70]:
model.vector_size

50

In [71]:
my_dict = dict({})
for idx, key in enumerate(model.wv.vocab):
    my_dict[key] = model.wv[key]

In [72]:
my_dict['acts']

array([ 0.00385123,  0.00920668, -0.00035922,  0.00590899,  0.00267128,
       -0.00461593,  0.00807486,  0.00076367,  0.00459765,  0.00948801,
        0.00484462, -0.00973404,  0.00692207,  0.00112204, -0.00928194,
       -0.00788446,  0.00061087, -0.00862984,  0.00049357, -0.00087135,
        0.00772012,  0.00149789,  0.0067379 , -0.00079674, -0.00198222,
        0.00890125,  0.00621644, -0.00702879, -0.00236349, -0.00698148,
        0.00117638, -0.00547134, -0.00346957,  0.00467568, -0.00866969,
       -0.00470264,  0.00489274, -0.00421651, -0.00189275,  0.00563362,
       -0.00127553, -0.00630939,  0.00674941, -0.00875689, -0.00832035,
       -0.00430686, -0.0083062 , -0.00689117, -0.00141898,  0.00263147],
      dtype=float32)

In [73]:
model.wv.vocab

{'owning': <gensim.models.keyedvectors.Vocab at 0x7fc729bf8a90>,
 'car': <gensim.models.keyedvectors.Vocab at 0x7fc729bf8ac0>,
 'longer': <gensim.models.keyedvectors.Vocab at 0x7fc729bf8a60>,
 'luxury': <gensim.models.keyedvectors.Vocab at 0x7fc729bf8cd0>,
 'become': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8160>,
 'necessity': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8be0>,
 'whether': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8580>,
 'drive': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b89d0>,
 'work': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b83d0>,
 'enjoy': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b87f0>,
 'weekend': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8af0>,
 'drives': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8b50>,
 'family': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8cd0>,
 'simplify': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b8f70>,
 'travels': <gensim.models.keyedvectors.Vocab at 0x7fc6e80b88e0>,
 'forget': <gensim.model

In [74]:
# Function returning vector reperesentation of a query
# def get_embedding_w2v(query_tokens):
#     embeddings = []
#     if len(query_tokens)<1:
#         return np.zeros(300)
#     else:
#         for tok in query_tokens:
#             if tok in model.wv.vocab:
#                 embeddings.append(model.wv.word_vec(tok))
#             else:
#                 embeddings.append(np.random.rand(300))
#         # mean the vectors of individual words to get the vector of the document
#         return np.mean(embeddings, axis=0)

# # Getting Word2Vec Vectors for Queries
# queries_df['vector']=queries_df['queries_cleaned'].apply(lambda x :get_embedding_w2v(x.split()))

In [75]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Function for calculating average precision for a query
# def average_precision(qid,qvector):
  
#   # Getting the ground truth and document vectors
#   qresult=testing_result.loc[testing_result['qid']==qid,['docid','rel']]
#   qcorpus=testing_corpus.loc[testing_corpus['docid'].isin(qresult['docid']),['docid','vector']]
#   qresult=pd.merge(qresult,qcorpus,on='docid')
  
#   # Ranking documents for the query
#   qresult['similarity']=qresult['vector'].apply(lambda x: cosine_similarity(np.array(qvector).reshape(1, -1),np.array(x).reshape(1, -1)).item())
#   qresult.sort_values(by='similarity',ascending=False,inplace=True)

#   # Taking Top 10 documents for the evaluation
#   ranking=qresult.head(10)['rel'].values
  
#   # Calculating precision
#   precision=[]
#   for i in range(1,11):
#     if ranking[i-1]:
#       precision.append(np.sum(ranking[:i])/i)
  
#   # If no relevant document in list then return 0
#   if precision==[]:
#     return 0

#   return np.mean(precision)

# # Calculating average precision for all queries in the test set
# testing_queries['AP']=testing_queries.apply(lambda x: average_precision(x['qid'],x['vector']),axis=1)

# # Finding Mean Average Precision
# print('Mean Average Precision=>',testing_queries['AP'].mean())


In [76]:
search = queries_df['queries_cleaned'][0].split()

In [77]:
search

['car', 'loan', 'offer']

In [78]:
res = model.wv.most_similar(positive=search,topn=4)
res

[('ease', 0.33007386326789856),
 ('acts', 0.2779266834259033),
 ('drives', 0.27062657475471497),
 ('millions', 0.26138967275619507)]

Doc2Vec

In [79]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec

In [106]:
data = ["The process of searching for a job can be very stressful, but it doesn’t have to be. Start with a\
        well-written resume that has appropriate keywords for your occupation. Next, conduct a targeted job search\
        for positions that meet your needs.",
        "Gardening in mixed beds is a great way to get the most productivity from a small space. Some investment\
        is required, to purchase materials for the beds themselves, as well as soil and compost. The\
        investment will likely pay-off in terms of increased productivity.",
        "Looking for a job can be very stressful, but it doesn’t have to be. Begin by writing a good resume with\
        appropriate keywords for your occupation. Second, target your job search for positions that match your\
        needs."]

In [107]:
# import nltk
# nltk.download('all')

In [108]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [109]:
print (tagged_data)

[TaggedDocument(words=['the', 'process', 'of', 'searching', 'for', 'a', 'job', 'can', 'be', 'very', 'stressful', ',', 'but', 'it', 'doesn', '’', 't', 'have', 'to', 'be', '.', 'start', 'with', 'a', 'well-written', 'resume', 'that', 'has', 'appropriate', 'keywords', 'for', 'your', 'occupation', '.', 'next', ',', 'conduct', 'a', 'targeted', 'job', 'search', 'for', 'positions', 'that', 'meet', 'your', 'needs', '.'], tags=['0']), TaggedDocument(words=['gardening', 'in', 'mixed', 'beds', 'is', 'a', 'great', 'way', 'to', 'get', 'the', 'most', 'productivity', 'from', 'a', 'small', 'space', '.', 'some', 'investment', 'is', 'required', ',', 'to', 'purchase', 'materials', 'for', 'the', 'beds', 'themselves', ',', 'as', 'well', 'as', 'soil', 'and', 'compost', '.', 'the', 'investment', 'will', 'likely', 'pay-off', 'in', 'terms', 'of', 'increased', 'productivity', '.'], tags=['1']), TaggedDocument(words=['looking', 'for', 'a', 'job', 'can', 'be', 'very', 'stressful', ',', 'but', 'it', 'doesn', '’', '

In [110]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=1)

In [111]:
model.build_vocab(tagged_data)

In [112]:
model.corpus_count

3

In [113]:
model.train(tagged_data, total_examples=model.corpus_count,epochs=100)

In [114]:
query = 'process of searching a job'.lower()

In [115]:
#query_tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(query)]

In [90]:
query_tokenized = word_tokenize(query)

In [91]:
print(query)
print(query_tokenized)

process of searching a job
['process', 'of', 'searching', 'a', 'job']


In [116]:
#print(query_tagged_data)
query_vec = model.infer_vector(query_tokenized)

In [117]:
len(query_vec)

50

In [118]:
from sklearn.metrics.pairwise import cosine_similarity

In [119]:
model.docvecs[1]

array([-0.349812  , -0.19321649,  0.75485283,  0.287241  ,  0.1026722 ,
        0.02998208,  0.23706621,  0.04920838, -0.41096556, -0.5342553 ,
       -0.07923943,  0.20176034,  0.38084936, -0.3624449 ,  0.11026239,
       -0.02323695,  0.2919407 , -0.08807204, -0.45580143, -0.19195323,
        0.35513693, -0.00448021,  0.0096084 , -0.0125532 , -0.04710443,
       -0.11648001,  0.18020472, -0.27589712,  0.4860394 , -0.68532294,
        0.39384538,  0.24650384,  0.71603054, -0.587305  , -0.05274221,
       -0.50906616,  0.01653084,  0.17423831, -0.27608195, -0.05666911,
       -0.46533734,  0.05290885,  0.41059574,  0.2032333 ,  0.50715727,
        0.00533207,  0.30471328, -0.16774367,  0.07639784,  0.12832478],
      dtype=float32)

In [120]:
query_vec

array([-1.66936055e-01, -8.42429027e-02,  3.48357022e-01,  1.42835751e-01,
        3.72074246e-02,  1.22742383e-02,  9.50397924e-02,  2.94510126e-02,
       -1.82758465e-01, -2.36456856e-01, -4.27522250e-02,  1.01009697e-01,
        1.72342464e-01, -1.71825230e-01,  5.14724962e-02, -1.22073432e-02,
        1.31342098e-01, -4.72277775e-02, -2.01278940e-01, -8.48445594e-02,
        1.71674162e-01, -8.41954537e-03,  2.12655155e-04,  5.59032895e-03,
       -2.29960699e-02, -4.86994237e-02,  7.20302761e-02, -1.28505260e-01,
        2.21231341e-01, -3.17818075e-01,  1.90943480e-01,  1.11968465e-01,
        3.19611698e-01, -2.80475795e-01, -2.35656165e-02, -2.32289359e-01,
        8.23969673e-03,  8.36651400e-02, -1.22789629e-01, -1.94343757e-02,
       -2.19380513e-01,  2.71922182e-02,  1.90256372e-01,  8.24016556e-02,
        2.29763851e-01,  4.40729968e-03,  1.45249188e-01, -7.85977170e-02,
        3.41023207e-02,  6.12180196e-02], dtype=float32)

In [121]:
model.docvecs[0]

array([-0.27264896, -0.12852068,  0.555219  ,  0.22509257,  0.08267272,
        0.00754713,  0.16109484,  0.04004182, -0.3071199 , -0.3912739 ,
       -0.06857116,  0.15476093,  0.278479  , -0.28543296,  0.07762524,
       -0.02681672,  0.21906453, -0.07512114, -0.3350457 , -0.12997364,
        0.2666728 ,  0.00510854,  0.01602956,  0.00646436, -0.03726865,
       -0.08447666,  0.12572095, -0.2094025 ,  0.371879  , -0.5131045 ,
        0.3048104 ,  0.1911615 ,  0.53533506, -0.4495811 , -0.02740825,
       -0.36685982,  0.01721055,  0.14021617, -0.21501447, -0.04350779,
       -0.3409291 ,  0.05989716,  0.31831512,  0.1502098 ,  0.37299162,
        0.00936915,  0.2242387 , -0.14000219,  0.05430428,  0.10793807],
      dtype=float32)

In [122]:
import numpy as np

In [123]:
data_array = np.array([model.docvecs[0],model.docvecs[1],model.docvecs[2]])

In [124]:
query_v = query_vec.reshape(1,50)

In [125]:
data_array.shape

(3, 50)

In [128]:
results = cosine_similarity(data_array, query_v)
print(results)

[[0.9991844 ]
 [0.99913013]
 [0.9992902 ]]


In [129]:
res = np.argsort(results, axis=0)

In [130]:
res

array([[1],
       [0],
       [2]])

In [131]:
k=1
for i in res[-1:-4:-1]:
    print("result ", k, "=============================")
    print(data[i[0]])
    print("=============================")
    k+=1

Looking for a job can be very stressful, but it doesn’t have to be. Begin by writing a good resume with        appropriate keywords for your occupation. Second, target your job search for positions that match your        needs.
The process of searching for a job can be very stressful, but it doesn’t have to be. Start with a        well-written resume that has appropriate keywords for your occupation. Next, conduct a targeted job search        for positions that meet your needs.
Gardening in mixed beds is a great way to get the most productivity from a small space. Some investment        is required, to purchase materials for the beds themselves, as well as soil and compost. The        investment will likely pay-off in terms of increased productivity.


In [52]:
similar_doc = model.docvecs.most_similar('0')

In [53]:
print(similar_doc)

[('2', 0.999376118183136), ('1', 0.9992620348930359)]


In [54]:
data[int(similar_doc[0][0])]

'Looking for a job can be very stressful, but it doesn’t have to be. Begin by writing a good resume with        appropriate keywords for your occupation. Second, target your job search for positions that match your        needs.'