In [15]:
import string
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Toy corpus: four short sentences about sports players, consumed by the cells below.
# NOTE(review): 'crickket' in the first sentence looks like a typo for 'cricket';
# as written it is a different token from the 'cricket' in the last sentence,
# so the two sentences do not share that term — confirm whether this is intended.
docs = [
    'Sachin is considered to be one of the greatest crickket players',
    'Nadal is considered one of the greatest tennis players',
    'Federer is considered one of the greatest tennis players',
    'Virat is the captain of the Indian cricket team',
]

In [17]:
def create_vacab(docList):
    '''
    Count token occurrences across a list of documents.

    Each document is echoed to stdout, stripped of ASCII punctuation,
    lower-cased and tokenized with nltk's word_tokenize; every resulting
    token is tallied.

    docList: list of strings
    returns: dict(key:word, value:count), keys in first-seen order
    '''
    # One translation table that deletes every character in string.punctuation.
    strip_punct = str.maketrans('', '', string.punctuation)
    counts = {}
    for raw in docList:
        print(raw)
        cleaned = raw.translate(strip_punct).lower()
        for token in word_tokenize(cleaned):
            # dict.get supplies 0 the first time a token is seen.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [18]:
# Unit test for create_vacab: build the vocabulary of the sample corpus and
# check a few counts before displaying it, so a regression fails loudly
# instead of only changing the printed dict.
vocab_tennis = create_vacab(docs)
assert vocab_tennis['the'] == 5, "'the' should be counted five times across the corpus"
assert vocab_tennis['tennis'] == 2, "'tennis' appears in exactly two sentences"
# The sample sentences contain no punctuation, so every whitespace-separated
# word must have been counted exactly once.
assert sum(vocab_tennis.values()) == sum(len(doc.split()) for doc in docs)
vocab_tennis

Sachin is considered to be one of the greatest crickket players
Nadal is considered one of the greatest tennis players
Federer is considered one of the greatest tennis players
Virat is the captain of the Indian cricket team


{'sachin': 1,
 'is': 4,
 'considered': 3,
 'to': 1,
 'be': 1,
 'one': 3,
 'of': 4,
 'the': 5,
 'greatest': 3,
 'crickket': 1,
 'players': 3,
 'nadal': 1,
 'tennis': 2,
 'federer': 1,
 'virat': 1,
 'captain': 1,
 'indian': 1,
 'cricket': 1,
 'team': 1}

In [20]:
# Build the TF-IDF matrix with scikit-learn. norm=None keeps the raw
# tf * idf weights (no per-row normalisation).
vectorizer = TfidfVectorizer(analyzer='word', norm=None, use_idf=True, smooth_idf=True)
tfidf_matrix = vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
# The names are deliberately not re-sorted: they must stay in the vectorizer's
# own column order to line up with the columns of tfidf_matrix.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# One label per input document, in input order. Sorting these labels would
# mislabel rows as soon as there are ten or more documents ('doc10' < 'doc2').
doc_list = ['doc{}'.format(i) for i in range(1, len(docs) + 1)]

# toarray() gives a plain ndarray; todense() would return the legacy np.matrix.
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=doc_list, columns=feature_names)
df_tfidf

Unnamed: 0,be,captain,considered,cricket,crickket,federer,greatest,indian,is,nadal,of,one,players,sachin,team,tennis,the,to,virat
doc1,1.916291,0.0,1.223144,0.0,1.916291,0.0,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,1.916291,0.0,0.0,1.0,1.916291,0.0
doc2,0.0,0.0,1.223144,0.0,0.0,0.0,1.223144,0.0,1.0,1.916291,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
doc3,0.0,0.0,1.223144,0.0,0.0,1.916291,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
doc4,0.0,1.916291,0.0,1.916291,0.0,0.0,0.0,1.916291,1.0,0.0,1.0,0.0,0.0,0.0,1.916291,0.0,2.0,0.0,1.916291


In [21]:
# Use scikit-learn's cosine_similarity to compare every document with every
# other document (and with itself, hence the diagonal of ones).
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# Rows and columns of cos_sim follow the row order of tfidf_matrix, so the
# labels are applied as-is. Sorting them could detach labels from rows
# (e.g. 'doc10' would sort before 'doc2').
cos_sim_df = pd.DataFrame(cos_sim, index=doc_list, columns=doc_list)
cos_sim_df

Unnamed: 0,doc1,doc2,doc3,doc4
doc1,1.0,0.477745,0.477745,0.166566
doc2,0.477745,1.0,0.75419,0.209677
doc3,0.477745,0.75419,1.0,0.209677
doc4,0.166566,0.209677,0.209677,1.0
