In [20]:
import numpy as np
import pandas as pd
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Toy corpus: doc1 and doc2 share several words, doc3 shares none with them.
doc1 = (
    "This is a very good and plain paper. this is really "
    "good and interesting"
)
doc2 = "This paper is very good interesting, awesome"
doc3 = "xyz dummy nothing relevant"

In [3]:
def clean_txt(sent):
    """Reduce a piece of text to a space-joined string of content words.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    A token is kept only if it is longer than two characters and is neither
    an English stop word nor a single punctuation character.

    Parameters
    ----------
    sent : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single
        spaces. Empty string when no token survives.
    """
    tokens = word_tokenize(sent.lower())
    # Set membership is O(1) per token, versus a linear scan of a list.
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    final_word = [
        term for term in tokens
        if len(term) > 2 and term not in stop_updated
    ]
    return " ".join(final_word)

In [18]:
# Run every raw document through the same cleaning step.
doc1_clean, doc2_clean, doc3_clean = (
    clean_txt(raw) for raw in (doc1, doc2, doc3)
)

In [19]:
# One row per cleaned document, stored in a single "text" column.
doc = pd.DataFrame({"text": [doc1_clean, doc2_clean, doc3_clean]})
doc

Unnamed: 0,text
0,good plain paper really good interesting
1,paper good interesting awesome
2,xyz dummy nothing relevant


In [21]:
# Build the document-term matrix (DTM): one row per document, one column per
# vocabulary term, each cell holding that term's count in the document.
count_vect = CountVectorizer()
X = count_vect.fit_transform(doc['text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported way to read the vocabulary.
DTM = pd.DataFrame(X.toarray(), columns=count_vect.get_feature_names_out())
DTM



Unnamed: 0,awesome,dummy,good,interesting,nothing,paper,plain,really,relevant,xyz
0,0,0,2,1,0,1,1,1,0,0
1,1,0,1,1,0,1,0,0,0,0
2,0,1,0,0,1,0,0,0,1,1


In [22]:
# The term-document matrix (TDM) is the transpose of the DTM: terms on the
# rows, documents on the columns. This layout is used to compare words.
TDM = DTM.transpose()
TDM

Unnamed: 0,0,1,2
awesome,0,1,0
dummy,0,0,1
good,2,1,0
interesting,1,1,0
nothing,0,0,1
paper,1,1,0
plain,1,0,0
really,1,0,0
relevant,0,0,1
xyz,0,0,1


### Cosine Similarity

In [23]:
# Pairwise cosine similarity between the rows of the DTM, i.e. between documents.
cs = cosine_similarity(DTM)
sim_mat = pd.DataFrame(data=cs)
sim_mat

Unnamed: 0,0,1,2
0,1.0,0.707107,0.0
1,0.707107,1.0,0.0
2,0.0,0.0,1.0


In [24]:
# Rank all documents by their similarity to document 0, most similar first.
sort_val = sim_mat.loc[:, 0].sort_values(ascending=False)
sort_val

0    1.000000
1    0.707107
2    0.000000
Name: 0, dtype: float64

In [25]:
# Transposing the DTM puts terms on the rows, so this is word-to-word similarity.
cs_words = cosine_similarity(DTM.transpose())

In [26]:
# Label both axes with the vocabulary so a word's similarities can be looked
# up by name. NOTE: this rebinds sim_mat to the word-level matrix.
sim_mat = pd.DataFrame(
    data=cs_words,
    index=DTM.columns,
    columns=DTM.columns,
)
sim_mat

Unnamed: 0,awesome,dummy,good,interesting,nothing,paper,plain,really,relevant,xyz
awesome,1.0,0.0,0.447214,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0
dummy,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
good,0.447214,0.0,1.0,0.948683,0.0,0.948683,0.894427,0.894427,0.0,0.0
interesting,0.707107,0.0,0.948683,1.0,0.0,1.0,0.707107,0.707107,0.0,0.0
nothing,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
paper,0.707107,0.0,0.948683,1.0,0.0,1.0,0.707107,0.707107,0.0,0.0
plain,0.0,0.0,0.894427,0.707107,0.0,0.707107,1.0,1.0,0.0,0.0
really,0.0,0.0,0.894427,0.707107,0.0,0.707107,1.0,1.0,0.0,0.0
relevant,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
xyz,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [28]:
# Rank every vocabulary term by its similarity to "good", most similar first.
sort_val = sim_mat.loc[:, "good"].sort_values(ascending=False)
sort_val

good           1.000000
interesting    0.948683
paper          0.948683
plain          0.894427
really         0.894427
awesome        0.447214
dummy          0.000000
nothing        0.000000
relevant       0.000000
xyz            0.000000
Name: good, dtype: float64