# Cosine Similarity

- a measure of how similar or different two documents (or pieces of text) are
- the similarity is the cosine of the angle between the two documents' term-count vectors

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Toy corpus: each string is treated as one document.
corpus = [
    'This is the first document',
    'Here is another document',
    'And this is the third document'
]

# Text that is compared against every document in the corpus.
query = 'This is the query text'
# Learn the vocabulary from the corpus and build the document-term count
# matrix: one row per document, one column per vocabulary word.
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(corpus)
# Show the learned vocabulary, then the dense count matrix whose columns
# follow that vocabulary order.
print(vectorizer.get_feature_names_out())
print(matrix.toarray())

['and' 'another' 'document' 'first' 'here' 'is' 'the' 'third' 'this']
[[0 0 1 1 0 1 1 0 1]
 [0 1 1 0 1 1 0 0 0]
 [1 0 1 0 0 1 1 1 1]]


In [8]:
# Same count matrix as a DataFrame, with the vocabulary words as column labels
# so each count can be read against its word.
df_matrix = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(df_matrix)

   and  another  document  first  here  is  the  third  this
0    0        0         1      1     0   1    1      0     1
1    0        1         1      0     1   1    0      0     0
2    1        0         1      0     0   1    1      1     1


In [10]:
# Vectorize the query with the already-fitted vectorizer (transform, not
# fit_transform), so it is expressed in the corpus vocabulary.
query_v = vectorizer.transform([query])
# One query row against all document rows; similarities[0] therefore holds the
# query's similarity to each document, in corpus order.
similarities = cosine_similarity(query_v, matrix)
data = {'Document': corpus, 'Similarity': similarities[0]}

In [11]:
# Tabulate each document next to its cosine similarity to the query.
df = pd.DataFrame(data)
df

Unnamed: 0,Document,Similarity
0,This is the first document,0.774597
1,Here is another document,0.288675
2,And this is the third document,0.707107


# TF-IDF
- TF = Term Frequency
- IDF = Inverse Document Frequency
- TF-IDF = TF × IDF; measures how important a word is to a document relative to the other documents in the corpus

In [14]:
from collections import Counter

import nltk
from nltk import word_tokenize

first_sentence = "The sky is blue and beautiful"
second_sentence = "Love is in the air tonight"

# Lower-case before tokenizing so that "The" and "the" count as the same term.
# From here on both names hold token lists rather than the raw strings.
# NOTE(review): word_tokenize needs NLTK's tokenizer data to be installed
# (nltk.download) -- confirm it is available in the target environment.
first_sentence = word_tokenize(first_sentence.lower())
second_sentence = word_tokenize(second_sentence.lower())

# Shared vocabulary: every distinct token that occurs in either sentence.
total = set(first_sentence).union(second_sentence)
print(total)

# Per-sentence term counts over the shared vocabulary. A Counter returns 0 for
# tokens it has not seen, so words that occur only in the other sentence get an
# explicit 0 entry and both dictionaries end up with identical keys.
counts_a = Counter(first_sentence)
counts_b = Counter(second_sentence)
word_dict_a = {word: counts_a[word] for word in total}
word_dict_b = {word: counts_b[word] for word in total}

pd.DataFrame([word_dict_a, word_dict_b])


{'sky', 'and', 'love', 'tonight', 'blue', 'in', 'air', 'beautiful', 'is', 'the'}


Unnamed: 0,sky,and,love,tonight,blue,in,air,beautiful,is,the
0,1,1,0,0,1,0,0,1,1,1
1,0,0,1,1,0,1,1,0,1,1


In [15]:
def computeTF(word_dict, doc):
    """Return the term frequency of every word in ``word_dict``.

    Args:
        word_dict: Mapping of word -> number of occurrences in ``doc``.
        doc: The tokenized document; only its length is used, as the
            denominator of each frequency.

    Returns:
        A dict mapping each word of ``word_dict`` to ``count / len(doc)``.
        For an empty ``doc`` every frequency is ``0.0``.
    """
    token_count = len(doc)
    if token_count == 0:
        # An empty document contains no terms; report 0.0 for every word
        # instead of failing with a division by zero.
        return {word: 0.0 for word in word_dict}
    return {word: count / token_count for word, count in word_dict.items()}

# Term frequencies of each sentence: its word counts divided by its own
# token count.
tf_first = computeTF(word_dict_a, first_sentence)
tf_second = computeTF(word_dict_b, second_sentence)

# One row per sentence, one column per word, for side-by-side comparison.
tf = pd.DataFrame([tf_first, tf_second])
tf

Unnamed: 0,sky,and,love,tonight,blue,in,air,beautiful,is,the
0,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.166667
1,0.0,0.0,0.166667,0.166667,0.0,0.166667,0.166667,0.0,0.166667,0.166667


In [16]:
import math

def computeIDF(doc_list):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        doc_list: List of dicts, one per document, each mapping
            word -> number of occurrences in that document. The dicts do not
            have to share the same keys; a missing word counts as 0.

    Returns:
        A pandas DataFrame with columns ``'word'`` and ``'idf'``, one row per
        distinct word in order of first appearance, where
        ``idf = log10(N / df)``, ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        A word that occurs in no document (``df == 0``) gets ``idf = 0.0``.
    """
    N = len(doc_list)

    # Document frequency: in how many documents each word has a positive count.
    # Every key is registered, even with a zero count, so that it still gets a
    # row in the result.
    doc_freq = {}
    for doc in doc_list:
        for word, count in doc.items():
            doc_freq.setdefault(word, 0)
            if count > 0:
                doc_freq[word] += 1

    # log10(N / df) is undefined for df == 0; such a word carries no
    # information about any document, so it is given a weight of 0.0.
    idf_dict = {
        word: math.log10(N / df) if df else 0.0
        for word, df in doc_freq.items()
    }
    idf_df = pd.DataFrame(list(idf_dict.items()), columns=['word', 'idf'])

    return idf_df

# IDF over the two-sentence corpus, built from both sentences' word-count
# dictionaries; the result is a DataFrame with 'word' and 'idf' columns.
idf = computeIDF([word_dict_a, word_dict_b])
idf



Unnamed: 0,word,idf
0,sky,0.30103
1,the,0.0
2,love,0.30103
3,tonight,0.30103
4,blue,0.30103
5,in,0.30103
6,air,0.30103
7,beautiful,0.30103
8,is,0.0
9,and,0.30103


In [17]:
# Function to compute TF-IDF
def computeTFIDF(tf, idf):
    """Multiply each term frequency by the word's inverse document frequency.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: pandas DataFrame with a ``'word'`` column and an ``'idf'``
            column, one row per word.

    Returns:
        A dict mapping each word of ``tf`` to ``tf[word] * idf_of(word)``,
        in the iteration order of ``tf``.

    Raises:
        IndexError: If a word of ``tf`` has no row in ``idf``.
    """
    # Build the word -> idf lookup once instead of scanning the whole frame
    # for every word. setdefault keeps the first row if a word is repeated.
    idf_by_word = {}
    for word, weight in zip(idf['word'], idf['idf']):
        idf_by_word.setdefault(word, weight)

    tfidf = {}
    for word, val in tf.items():
        if word not in idf_by_word:
            # IndexError is what callers got before for a missing word;
            # the exception type is kept for backward compatibility.
            raise IndexError(f"word {word!r} has no entry in the IDF table")
        tfidf[word] = val * idf_by_word[word]
    return tfidf

# TF-IDF weights of each sentence: its term frequencies scaled by the
# corpus-wide IDF table.
tfidf_first = computeTFIDF(tf_first, idf)
tfidf_second = computeTFIDF(tf_second, idf)

# One row per sentence, one column per word.
tfidf = pd.DataFrame([tfidf_first, tfidf_second])
tfidf

Unnamed: 0,sky,and,love,tonight,blue,in,air,beautiful,is,the
0,0.050172,0.050172,0.0,0.0,0.050172,0.0,0.0,0.050172,0.0,0.0
1,0.0,0.0,0.050172,0.050172,0.0,0.050172,0.050172,0.0,0.0,0.0
