# Word Representations and Text/Document Similarity (Count Based)

# Bag of Words

In [41]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [42]:
# Three short movie reviews that form the toy corpus for the examples below.
review_1, review_2, review_3 = (
    'The movie was good and we really like it',
    'the movie was good but the ending was boring',
    'we did not like the movie as it was too lengthy',
)

In [43]:
# Tokenize each review into a list of word tokens.
# The text is tokenized as written (no lowercasing), so 'The' and 'the'
# remain two distinct tokens in the vocabulary built from these lists.
review_1_tokens, review_2_tokens, review_3_tokens = (
    word_tokenize(text) for text in (review_1, review_2, review_3)
)

In [44]:
# Show the raw token list for review 2, then the same tokens de-duplicated as a set.
print(review_2_tokens, set(review_2_tokens), sep='\n')

['the', 'movie', 'was', 'good', 'but', 'the', 'ending', 'was', 'boring']
{'the', 'good', 'but', 'movie', 'boring', 'ending', 'was'}


In [45]:
# Vocabulary: every distinct token that appears in any of the three reviews.
# This is a set, so its iteration order is arbitrary.
review_tokens = set(review_1_tokens) | set(review_2_tokens) | set(review_3_tokens)

In [46]:
# Vocabulary size, followed by the vocabulary itself.
print(len(review_tokens), review_tokens, sep='\n')

18
{'we', 'did', 'too', 'really', 'as', 'ending', 'was', 'the', 'good', 'but', 'lengthy', 'movie', 'boring', 'not', 'The', 'like', 'it', 'and'}


In [51]:
# Start review 1 with a zero count for every vocabulary token.
review1_dict = {token: 0 for token in review_tokens}
print(review1_dict)

{'we': 0, 'did': 0, 'too': 0, 'really': 0, 'as': 0, 'ending': 0, 'was': 0, 'the': 0, 'good': 0, 'but': 0, 'lengthy': 0, 'movie': 0, 'boring': 0, 'not': 0, 'The': 0, 'like': 0, 'it': 0, 'and': 0}


In [52]:
# Two more independent zero-initialised count dicts, one per remaining review.
review2_dict, review3_dict = (
    {token: 0 for token in review_tokens} for _ in range(2)
)
print(review2_dict, review3_dict, sep='\n')

{'we': 0, 'did': 0, 'too': 0, 'really': 0, 'as': 0, 'ending': 0, 'was': 0, 'the': 0, 'good': 0, 'but': 0, 'lengthy': 0, 'movie': 0, 'boring': 0, 'not': 0, 'The': 0, 'like': 0, 'it': 0, 'and': 0}
{'we': 0, 'did': 0, 'too': 0, 'really': 0, 'as': 0, 'ending': 0, 'was': 0, 'the': 0, 'good': 0, 'but': 0, 'lengthy': 0, 'movie': 0, 'boring': 0, 'not': 0, 'The': 0, 'like': 0, 'it': 0, 'and': 0}


In [53]:
# review_1 = 'The movie was good and we really like it'
# Count how often each vocabulary token occurs in review 1.
# The counter is rebuilt from zero here so that re-running this cell does not
# add the counts a second time on top of the previous run.
review1_dict = dict.fromkeys(review_tokens, 0)
for token in review_1_tokens:
    review1_dict[token] += 1
print(review1_dict)

{'we': 1, 'did': 0, 'too': 0, 'really': 1, 'as': 0, 'ending': 0, 'was': 1, 'the': 0, 'good': 1, 'but': 0, 'lengthy': 0, 'movie': 1, 'boring': 0, 'not': 0, 'The': 1, 'like': 1, 'it': 1, 'and': 1}


In [54]:
# Count token occurrences for reviews 2 and 3.
# Each counter is reset to zero first so the cell gives the same result no
# matter how many times it is executed.
review2_dict = dict.fromkeys(review_tokens, 0)
for token in review_2_tokens:
    review2_dict[token] += 1

review3_dict = dict.fromkeys(review_tokens, 0)
for token in review_3_tokens:
    review3_dict[token] += 1

In [56]:
# Per-review token counts, one dict per line, for comparison with the source text:
#   review_1 = 'The movie was good and we really like it'
#   review_2 = 'the movie was good but the ending was boring'
#   review_3 = 'we did not like the movie as it was too lengthy'
print(review1_dict, review2_dict, review3_dict, sep='\n')

{'we': 1, 'did': 0, 'too': 0, 'really': 1, 'as': 0, 'ending': 0, 'was': 1, 'the': 0, 'good': 1, 'but': 0, 'lengthy': 0, 'movie': 1, 'boring': 0, 'not': 0, 'The': 1, 'like': 1, 'it': 1, 'and': 1}
{'we': 0, 'did': 0, 'too': 0, 'really': 0, 'as': 0, 'ending': 1, 'was': 2, 'the': 2, 'good': 1, 'but': 1, 'lengthy': 0, 'movie': 1, 'boring': 1, 'not': 0, 'The': 0, 'like': 0, 'it': 0, 'and': 0}
{'we': 1, 'did': 1, 'too': 1, 'really': 0, 'as': 1, 'ending': 0, 'was': 1, 'the': 1, 'good': 0, 'but': 0, 'lengthy': 1, 'movie': 1, 'boring': 0, 'not': 1, 'The': 0, 'like': 1, 'it': 1, 'and': 0}


In [57]:
# Stack the three count dicts into a table: one row per review, one column per token.
reviews_dict_df = pd.DataFrame(
    data=[review1_dict, review2_dict, review3_dict],
)
reviews_dict_df

Unnamed: 0,we,did,too,really,as,ending,was,the,good,but,lengthy,movie,boring,not,The,like,it,and
0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,1,1,1
1,0,0,0,0,0,1,2,2,1,1,0,1,1,0,0,0,0,0
2,1,1,1,0,1,0,1,1,0,0,1,1,0,1,0,1,1,0


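This table is the Term Frequency Matrix (TFM), also known as the Document-Term Matrix (DTM) or the Count Vectorizer (CV) matrix: one row per document, one column per token, and each cell holds the number of times that token occurs in that document.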

# Count Vectorizer
sklearn implementation of TFM creation using Bag of Words

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
# Corpus passed to the vectorizers: one string per review.
review_list = [
    review_1,
    review_2,
    review_3,
]

In [60]:
# Display the corpus that will be vectorized.
review_list

['The movie was good and we really like it',
 'the movie was good but the ending was boring',
 'we did not like the movie as it was too lengthy']

In [61]:
# Disabled alternative with no arguments:
# count_vect = CountVectorizer()
# Active configuration: apply scikit-learn's built-in English stop-word list.
count_vect = CountVectorizer(stop_words='english')

In [62]:
# Step 1 (fit): learn the bag-of-words vocabulary from the corpus.
count_vect.fit(review_list)
# Step 2 (transform): encode each review as a row of token counts (the DTM / CV matrix).
X_counts = count_vect.transform(review_list)
type(X_counts)

scipy.sparse.csr.csr_matrix

CSR stands for Compressed Sparse Row: a sparse-matrix format that stores only the non-zero entries, organised row by row.

In [63]:
# Convert the sparse count matrix to a dense array so the rows can be read directly.
X_counts.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 1],
       [1, 0, 1, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1, 0]], dtype=int64)

In [66]:
# Show the corpus again for comparison with the count matrix.
review_list

['The movie was good and we really like it',
 'the movie was good but the ending was boring',
 'we did not like the movie as it was too lengthy']

In [65]:
# Vocabulary learned by the vectorizer, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns an ndarray, so .tolist()
# is used to keep the plain-list display.
count_vect.get_feature_names_out().tolist()

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [21]:
# Column labels for the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement. .tolist() keeps X_names a plain list.
X_names = count_vect.get_feature_names_out().tolist()
X_names

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [67]:
# Dense count matrix as a labelled table: one row per review, one column per vocabulary term.
a = pd.DataFrame(
    data=X_counts.toarray(),
    columns=X_names,
)
a

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0,0,0,1,0,1,1,1
1,1,0,1,1,0,0,1,0
2,0,1,0,0,1,1,1,0


# TF-IDF

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [69]:
# TF-IDF vectorizer for the review corpus.
tf_vect = TfidfVectorizer(
    min_df=1,              # keep terms that appear in at least one document
    lowercase=True,        # fold case before tokenizing
    stop_words='english',  # apply the built-in English stop-word list
)

In [70]:
# Learn the vocabulary and IDF weights from the reviews, then encode them as TF-IDF rows.
tf_matrix = tf_vect.fit_transform(review_list)
# Print the container type, then display the (documents, terms) shape.
print(type(tf_matrix))
tf_matrix.shape

<class 'scipy.sparse.csr.csr_matrix'>


(3, 8)

In [71]:
# Convert the sparse TF-IDF matrix to a dense array to inspect the weights.
tf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.4804584 , 0.        ,
        0.4804584 , 0.37311881, 0.63174505],
       [0.5844829 , 0.        , 0.5844829 , 0.44451431, 0.        ,
        0.        , 0.34520502, 0.        ],
       [0.        , 0.5844829 , 0.        , 0.        , 0.5844829 ,
        0.44451431, 0.34520502, 0.        ]])

In [72]:
# Column labels for the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement. .tolist() keeps tf_names a plain list.
tf_names = tf_vect.get_feature_names_out().tolist()
tf_names

['boring', 'did', 'ending', 'good', 'lengthy', 'like', 'movie', 'really']

In [73]:
# Dense TF-IDF weights as a labelled table: one row per review, one column per term.
tf_df = pd.DataFrame(
    data=tf_matrix.toarray(),
    columns=tf_names,
)

In [74]:
# Display the TF-IDF table for the reviews.
tf_df

Unnamed: 0,boring,did,ending,good,lengthy,like,movie,really
0,0.0,0.0,0.0,0.480458,0.0,0.480458,0.373119,0.631745
1,0.584483,0.0,0.584483,0.444514,0.0,0.0,0.345205,0.0
2,0.0,0.584483,0.0,0.0,0.584483,0.444514,0.345205,0.0


# Document Similarity Estimation using TF-IDF

In [75]:
# Four short documents: doc1 and doc3 are about language, doc2 and doc4 about ball sports.
doc1, doc2, doc3, doc4 = (
    'Natural Language Processing is the study of making a machine understand and generate languages like humans',
    'Cricket is a sports played with a bat and a ball. It is not played in many countries',
    'Languages are the cornerstone of human evolution. Making a machine study languages is not easy',
    'Football is a sport played in almost all countries of the world. It is played by kicking a ball',
)

In [76]:
# Corpus for the similarity experiment; row i of the TF-IDF matrix corresponds to documents[i].
documents = [
    doc1,
    doc2,
    doc3,
    doc4,
]

In [77]:
# Separate TF-IDF vectorizer for the document corpus, configured like the one used for the reviews.
tf_vect_docs = TfidfVectorizer(
    min_df=1,
    lowercase=True,
    stop_words='english',
)
# Learn vocabulary and IDF weights from the documents and encode them in one step.
tf_matrix_docs = tf_vect_docs.fit_transform(documents)

In [78]:
# TF-IDF weights for the four documents as a labelled table.
# get_feature_names() was removed in scikit-learn 1.2, so the column labels
# come from get_feature_names_out() (available since 1.0).
# Note: this rebinds tf_df, which previously held the review TF-IDF table.
tf_df = pd.DataFrame(
    tf_matrix_docs.toarray(),
    columns=tf_vect_docs.get_feature_names_out(),
)
tf_df

Unnamed: 0,ball,bat,cornerstone,countries,cricket,easy,evolution,football,generate,human,...,machine,making,natural,played,processing,sport,sports,study,understand,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324676,0.0,...,0.255978,0.255978,0.324676,0.0,0.324676,0.0,0.0,0.255978,0.324676,0.0
1,0.30392,0.385484,0.0,0.30392,0.385484,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.607841,0.0,0.0,0.385484,0.0,0.0,0.0
2,0.0,0.0,0.34604,0.0,0.0,0.34604,0.34604,0.0,0.0,0.34604,...,0.272822,0.272822,0.0,0.0,0.0,0.0,0.0,0.272822,0.0,0.0
3,0.28358,0.0,0.0,0.28358,0.0,0.0,0.0,0.359685,0.0,0.0,...,0.0,0.0,0.0,0.56716,0.0,0.359685,0.0,0.0,0.0,0.359685


### Calculating the document similarities
* Cosine similarity is a metric that measures how similar two documents are, irrespective of their size. Mathematically, it is the cosine of the angle between the two document vectors in a multi-dimensional space: $\cos\theta = \dfrac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$
* Cosine similarity is advantageous because even if two similar documents are far apart by Euclidean distance (for example, because one document is much longer than the other), their vectors may still point in nearly the same direction. The smaller the angle, the higher the cosine similarity.

https://www.machinelearningplus.com/nlp/cosine-similarity/

In [79]:
from sklearn.metrics.pairwise import cosine_similarity as c_sim

In [80]:
#tf_matrix_docs[0:1].toarray()

In [81]:
# Dense TF-IDF vector of the first document (row 0 of the matrix).
tf_matrix_docs[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.32467583, 0.        ,
        0.32467583, 0.        , 0.32467583, 0.25597815, 0.32467583,
        0.25597815, 0.25597815, 0.32467583, 0.        , 0.32467583,
        0.        , 0.        , 0.25597815, 0.32467583, 0.        ]])

In [82]:
# Reference texts (row index in tf_matrix_docs = position in `documents`):
# doc1 = 'Natural Language Processing is the study of making a machine understand and generate languages like humans'
# doc2 = 'Cricket is a sports played with a bat and a ball. It is not played in many countries'
# doc3 = 'Languages are the cornerstone of human evolution. Making a machine study languages is not easy'
# doc4 = 'Football is a sport played in almost all countries of the world. It is played by kicking a ball'
# Cosine similarity between doc1 (row 0) and doc3 (row 2), the two language-related documents.
c_sim(tf_matrix_docs[0], tf_matrix_docs[2])
# Alternative spelling using explicit row slices:
#c_sim(tf_matrix_docs[0:1], tf_matrix_docs[2:3])

array([[0.34918271]])

In [83]:
# Cosine similarity between doc2 (row 1, cricket) and doc4 (row 3, football).
c_sim(tf_matrix_docs[1], tf_matrix_docs[3])

array([[0.51711443]])

In [84]:
# Cosine similarity between doc2 (row 1, cricket) and doc3 (row 2, languages).
c_sim(tf_matrix_docs[1], tf_matrix_docs[2])

array([[0.]])

In [85]:
# Cosine similarity between doc1 (row 0, NLP) and doc4 (row 3, football).
c_sim(tf_matrix_docs[0], tf_matrix_docs[3])

array([[0.]])