## Understand Gensim for Document Similarity

In [12]:
import gensim
from nltk.tokenize import word_tokenize

In [13]:
# Toy corpus of six short sentences. The spelling is left exactly as written
# because the tokens printed further down are derived from these strings.
doc = [
    "She was worried about her examinations",
    "He was anxious about his interview",
    "He faught against the enemies",
    "She fights alot",
    "Mango and apple are my favorite fruits",
    "I hate pumpkin and papaya",
]
print("Number of documents:", len(doc))

Number of documents: 6


In [42]:
# Tokenise every sentence with NLTK and lower-case each token, producing one
# list of tokens per document.
gen_docs = [list(map(str.lower, word_tokenize(sentence))) for sentence in doc]
print(gen_docs)

[['she', 'was', 'worried', 'about', 'her', 'examinations'], ['he', 'was', 'anxious', 'about', 'his', 'interview'], ['he', 'faught', 'against', 'the', 'enemies'], ['she', 'fights', 'alot'], ['mango', 'and', 'apple', 'are', 'my', 'favorite', 'fruits'], ['i', 'hate', 'pumpkin', 'and', 'papaya']]


In [43]:
# Build the vocabulary: a mapping between each distinct token and an integer id.
dictionary = gensim.corpora.Dictionary(gen_docs)
print(dictionary)

# Look up the id that was assigned to the token 'she'.
print(dictionary.token2id['she'])
print("Number of words in dictionary:", len(dictionary))

# Dump the whole vocabulary as "id token" lines, in ascending id order.
for token_id in range(len(dictionary)):
    print(token_id, dictionary[token_id])

Dictionary(27 unique tokens: ['about', 'examinations', 'her', 'she', 'was']...)
3
Number of words in dictionary: 27
0 about
1 examinations
2 her
3 she
4 was
5 worried
6 anxious
7 he
8 his
9 interview
10 against
11 enemies
12 faught
13 the
14 alot
15 fights
16 and
17 apple
18 are
19 favorite
20 fruits
21 mango
22 my
23 hate
24 i
25 papaya
26 pumpkin


#### Bag of words 

In [44]:
# Convert each token list into a sparse bag-of-words vector, i.e. a list of
# (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(0, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(7, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(3, 1), (14, 1), (15, 1)], [(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(16, 1), (23, 1), (24, 1), (25, 1), (26, 1)]]


#### Term Frequency - Inverse Document Frequency

In [45]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tf_idf = gensim.models.TfidfModel(corpus=corpus)
print(tf_idf)

TfidfModel(num_docs=6, num_nnz=32)


#### Similarity 

In [46]:
# Re-weight the bag-of-words corpus with the fitted TF-IDF model, then build a
# similarity index over it. 'sim.txt' is the prefix the index is stored under,
# and num_features is the vocabulary size.
tfidf_corpus = tf_idf[corpus]
sims = gensim.similarities.Similarity(
    'sim.txt',
    tfidf_corpus,
    num_features=len(dictionary),
)
print(sims)
print(type(sims))

Similarity index with 6 documents in 0 shards (stored under sim.txt)
<class 'gensim.similarities.docsim.Similarity'>


In [55]:
# Push the query sentence through the same pipeline as the corpus:
# tokenise + lower-case -> bag-of-words -> TF-IDF weights.
query_text = "Simon is afraid of interview"
query_doc = [token.lower() for token in word_tokenize(query_text)]
print(query_doc)

# Tokens that are not in the dictionary do not appear in the resulting vector.
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)

query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

['simon', 'is', 'afraid', 'of', 'interview']
[(9, 1)]
[(9, 1.0)]


In [56]:
# Cosine similarity of the query against every indexed document, in corpus
# order; displayed as the cell's result.
query_sims = sims[query_doc_tf_idf]
query_sims

array([ 0.        ,  0.49219605,  0.        ,  0.        ,  0.        ,  0.        ], dtype=float32)