In [40]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm


In [41]:
# Toy corpus: three short sentences that every vectorizer below is fitted on.
s1, s2, s3 = (
    "sunshine state enjoy sunshine",
    "brown fox jump high, brown fox run",
    "sunshine state fox run fast",
)
sentences = [s1, s2, s3]
print(sentences)

['sunshine state enjoy sunshine', 'brown fox jump high, brown fox run', 'sunshine state fox run fast']


In [None]:
# Bag-of-words: fit a default-constructed CountVectorizer on the corpus.
# `bow` is the matrix returned by the fit; `bow_array` is its dense form,
# indexed by sentence position (row 0 -> s1, row 1 -> s2, row 2 -> s3).
bow_vectorizer = CountVectorizer()
bow = bow_vectorizer.fit_transform(raw_documents=sentences)
bow_array = bow.toarray()



In [22]:
# Show the count vector of each sentence, one labelled line per row.
# Labels are kept exactly as in the recorded output.
for row_index, label in enumerate(
    ("BoW model for s1:", "Bow model for s2:", "Bow model for s3")
):
    print(label, bow_array[row_index])

BoW model for s1: [0 1 0 0 0 0 0 1 2]
Bow model for s2: [2 0 0 2 1 1 1 0 0]
Bow model for s3 [0 0 1 1 0 0 1 1 1]


In [43]:
# Term-frequency model: a CountVectorizer with its options spelled out.
# NOTE(review): the recorded output of this model is identical to the BoW
# output, i.e. raw token counts rather than length-normalised frequencies;
# the explicit arguments presumably mirror the library defaults. Confirm that
# raw counts are the intended meaning of "TF" here.
tf_vectorizer = CountVectorizer(
    binary=False,
    max_df=1.0,
    max_features=None,
    min_df=1,
    stop_words=None,
    vocabulary=None,
)
tf = tf_vectorizer.fit_transform(raw_documents=sentences)
tf_array = tf.toarray()

In [54]:
# Show the term-count vector of each sentence, one labelled line per row.
# Labels are kept exactly as in the recorded output.
for row_index, label in enumerate(
    ("TF model for s1:", "TF model for s2", "TF model for s3")
):
    print(label, tf_array[row_index])


TF model for s1: [0 1 0 0 0 0 0 1 2]
TF model for s2 [2 0 0 2 1 1 1 0 0]
TF model for s3 [0 0 1 1 0 0 1 1 1]


In [45]:
# "IDF" model: a TfidfVectorizer with norm='l2', smoothed IDF and
# non-sublinear term frequency.
# NOTE(review): with use_idf=True this appears to produce TF-IDF weights, and
# the recorded output matches the TF-IDF model's output exactly. Confirm
# whether bare IDF values were intended here instead.
idf_vectorizer = TfidfVectorizer(
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2',
)
idf = idf_vectorizer.fit_transform(raw_documents=sentences)
idf_array = idf.toarray()


In [46]:
# Show the weight vector of each sentence from the "IDF" model.
for row_index, label in enumerate(
    ("IDF model for s1:", "IDF model for s2:", "IDF model for s3:")
):
    print(label, idf_array[row_index])


IDF model for s1: [0.         0.50689001 0.         0.         0.         0.
 0.         0.38550292 0.77100584]
IDF model for s2: [0.67070313 0.         0.         0.51008702 0.33535157 0.33535157
 0.25504351 0.         0.        ]
IDF model for s3: [0.         0.         0.54935123 0.41779577 0.         0.
 0.41779577 0.41779577 0.41779577]


In [47]:
# TF-IDF: TfidfVectorizer with norm='l2' and smoothed inverse document
# frequency; no other option is overridden.
# `tf_idf_array` is the dense form, indexed by sentence position.
tf_idf_vectorizer = TfidfVectorizer(smooth_idf=True, norm='l2')
tf_idf = tf_idf_vectorizer.fit_transform(raw_documents=sentences)
tf_idf_array = tf_idf.toarray()

In [48]:
# Show the TF-IDF weight vector of each sentence, one labelled line per row.
for row_index, label in enumerate(
    ("TF-IDF model for s1:", "TF-IDF model for s2:", "TF-IDF model for s3:")
):
    print(label, tf_idf_array[row_index])



TF-IDF model for s1: [0.         0.50689001 0.         0.         0.         0.
 0.         0.38550292 0.77100584]
TF-IDF model for s2: [0.67070313 0.         0.         0.51008702 0.33535157 0.33535157
 0.25504351 0.         0.        ]
TF-IDF model for s3: [0.         0.         0.54935123 0.41779577 0.         0.
 0.41779577 0.41779577 0.41779577]


In [49]:
# Side-by-side comparison of the three representations for s2 (row 1) and
# s3 (row 2). Each group is preceded by a blank line.
for row_index, (bow_label, tf_label, tf_idf_label) in (
    (1, ("BoW model for s2:", "TF model for s2:", "TF-IDF model for s2:")),
    (2, ("BoW model for s3:", "TF model for s3:", "TF-IDF model for s3:")),
):
    print()
    print(bow_label, bow_array[row_index])
    print(tf_label, tf_array[row_index])
    print(tf_idf_label, tf_idf_array[row_index])


BoW model for s2: [2 0 0 2 1 1 1 0 0]
TF model for s2: [2 0 0 2 1 1 1 0 0]
TF-IDF model for s2: [0.67070313 0.         0.         0.51008702 0.33535157 0.33535157
 0.25504351 0.         0.        ]

BoW model for s3: [0 0 1 1 0 0 1 1 1]
TF model for s3: [0 0 1 1 0 0 1 1 1]
TF-IDF model for s3: [0.         0.         0.54935123 0.41779577 0.         0.
 0.41779577 0.41779577 0.41779577]


In [53]:
# Cosine similarity between two hand-entered count vectors; the values match
# the recorded BoW rows for s1 and s3.
S1 = np.array([0, 1, 0, 0, 0, 0, 0, 1, 2])
S3 = np.array([0, 0, 1, 1, 0, 0, 1, 1, 1])

print("S1:", S1)
print("S3:", S3)

# cos(theta) = (S1 . S3) / (||S1|| * ||S3||)
cosine = (S1 @ S3) / (norm(S1) * norm(S3))
print("\n Cosine Similarity:", cosine)
     

S1: [0 1 0 0 0 0 0 1 2]
S3: [0 0 1 1 0 0 1 1 1]

 Cosine Similarity: 0.5477225575051661
