In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Toy corpus: three short movie descriptions used to walk through a manual
# TF-IDF computation step by step.
documents=["Action movies with superheroes and special effects.",
           "Romantic comedy with heartwarming scenes.",
           "Superhero comedy with funny dialogues."]

# Bag-of-words counts; stop_words='english' drops common English words
# (e.g. "with", "and") before counting.
vectorizer=CountVectorizer(stop_words='english')
word_count_matrix=vectorizer.fit_transform(documents)
terms=vectorizer.get_feature_names_out()

# Densify the sparse count matrix once and reuse it below instead of calling
# .toarray() for every intermediate quantity.
counts=word_count_matrix.toarray()

print("Word count Matrix:\n",counts)
print("Vocabulary:\n",terms)

# Term frequency: each row of counts divided by that document's total number
# of counted terms. A document that contains only stop words has a row total
# of 0; the guarded divide leaves its TF row at 0 instead of producing NaN
# (which would otherwise propagate into tf_idf and anything computed from it).
doc_lengths=counts.sum(axis=1,keepdims=True)
tf=np.divide(counts,doc_lengths,
             out=np.zeros(counts.shape,dtype=float),
             where=doc_lengths>0)
print("\nTerm frequency (TF):\n",tf)

# Document frequency: number of documents in which each term occurs at least once.
df=np.sum(counts>0,axis=0)
print("\nDocument Frequency (DF):\n",dict(zip(terms,df)))

# Smoothed inverse document frequency: ln((1+N)/(1+df))+1. The "+1" inside the
# ratio avoids division by zero; the trailing "+1" keeps a term that occurs in
# every document at weight 1 rather than 0.
idf=np.log((1+len(documents))/(1+df))+1
print("\nInverse Document Frequency (IDF):\n",dict(zip(terms,idf)))

# idf has one entry per term, so it broadcasts across the rows (documents) of tf.
tf_idf=tf*idf
print("\nTF-IDF Matrix:\n",tf_idf)

# Per-document listing of every vocabulary term with its TF-IDF score.
for doc_number, doc_scores in enumerate(tf_idf, start=1):
    print(f"\nDocument {doc_number}:")
    for term, score in zip(terms,doc_scores):
        print(f"{term}:{score:.3f}")

Word count Matrix:
 [[1 0 0 1 0 0 1 0 0 1 0 1]
 [0 1 0 0 0 1 0 1 1 0 0 0]
 [0 1 1 0 1 0 0 0 0 0 1 0]]
Vocabulary:
 ['action' 'comedy' 'dialogues' 'effects' 'funny' 'heartwarming' 'movies'
 'romantic' 'scenes' 'special' 'superhero' 'superheroes']

Term frequency (TF):
 [[0.2  0.   0.   0.2  0.   0.   0.2  0.   0.   0.2  0.   0.2 ]
 [0.   0.25 0.   0.   0.   0.25 0.   0.25 0.25 0.   0.   0.  ]
 [0.   0.25 0.25 0.   0.25 0.   0.   0.   0.   0.   0.25 0.  ]]

Document Frequency (DF):
 {'action': 1, 'comedy': 2, 'dialogues': 1, 'effects': 1, 'funny': 1, 'heartwarming': 1, 'movies': 1, 'romantic': 1, 'scenes': 1, 'special': 1, 'superhero': 1, 'superheroes': 1}

Inverse Document Frequency (IDF):
 {'action': 1.6931471805599454, 'comedy': 1.2876820724517808, 'dialogues': 1.6931471805599454, 'effects': 1.6931471805599454, 'funny': 1.6931471805599454, 'heartwarming': 1.6931471805599454, 'movies': 1.6931471805599454, 'romantic': 1.6931471805599454, 'scenes': 1.6931471805599454, 'special': 1.693147

Cosine similarity

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of tf_idf. With a single
# argument, cosine_similarity compares the matrix against itself, so entry
# [i, j] is the similarity between rows i and j.
cosine_sim_matrix=cosine_similarity(tf_idf)

# Heading and matrix on separate lines, emitted by one print call.
print("Cosine Similarity Matrix:",cosine_sim_matrix,sep="\n")

Cosine Similarity Matrix:
[[1.         0.         0.        ]
 [0.         1.         0.16163636]
 [0.         0.16163636 1.        ]]
