### importing libraries

In [256]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Toy corpus used throughout the notebook: three short lowercase sentences,
# each treated as one document.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

### vocabulary

In [257]:
import re
def tokenize(sentence):
    """Split ``sentence`` into its runs of word characters.

    Tokens are returned in order of appearance with their original casing;
    punctuation and whitespace act as separators and are discarded.
    Single-character tokens are kept here.
    """
    return [match.group(0) for match in re.finditer(r'\b\w+\b', sentence)]

# Collect the distinct lowercase tokens of the whole corpus, skipping
# single-character tokens (e.g. "a").
vocabulary = list({
    word.lower()
    for doc in corpus
    for word in tokenize(doc)
    if len(word) > 1
})

print(vocabulary)
print(len(vocabulary))


['sun', 'is', 'star', 'celestial', 'and', 'bodies', 'are', 'the', 'moon', 'satellite']
10


In [258]:
# Sort the vocabulary in place so that column positions are deterministic,
# then map each word to its position in the sorted list.
vocabulary.sort()
word2index = {word: position for position, word in enumerate(vocabulary)}

print(word2index)
print(vocabulary)

{'and': 0, 'are': 1, 'bodies': 2, 'celestial': 3, 'is': 4, 'moon': 5, 'satellite': 6, 'star': 7, 'sun': 8, 'the': 9}
['and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite', 'star', 'sun', 'the']


## TF-IDF calculation

### term frequency

In [259]:
# Term-frequency matrix: one row per document, one column per vocabulary word.
tf = np.zeros((len(corpus), len(vocabulary)))
print(tf.shape)

for row, document in enumerate(corpus):
    for token in tokenize(document):
        column = word2index.get(token.lower())
        # Tokens that are not in the vocabulary (e.g. single characters) are skipped.
        if column is not None:
            tf[row, column] += 1    # raw count

print("Term Frequency Matrix:")
print(tf)

(3, 10)
Term Frequency Matrix:
[[0. 0. 0. 0. 1. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 1.]
 [1. 1. 1. 1. 0. 1. 0. 0. 1. 1.]]


### inverse document frequency

In [260]:
# Tokenise and lowercase every document exactly once (as a set, since only
# presence matters for document frequency) instead of redoing that work for
# each vocabulary word.
doc_token_sets = [{token.lower() for token in tokenize(doc)} for doc in corpus]
n_docs = len(corpus)

idf = np.zeros(len(vocabulary))
for i, word in enumerate(vocabulary):
    # Document frequency: number of documents that contain `word`.
    count = sum(word in doc_tokens for doc_tokens in doc_token_sets)
    # Smoothed IDF: +1 in numerator and denominator avoids division by zero,
    # and the trailing +1 keeps words present in every document from scoring 0.
    idf[i] = np.log((n_docs + 1) / (count + 1)) + 1

print("IDF values:")
print(idf)

IDF values:
[1.69314718 1.69314718 1.69314718 1.69314718 1.28768207 1.28768207
 1.69314718 1.69314718 1.28768207 1.        ]


### tf-idf

In [261]:
from sklearn.preprocessing import normalize

# Scale every term count by its IDF weight (broadcast across the document rows),
# then rescale each document row to unit L2 norm.
weighted_counts = tf * idf[np.newaxis, :]
tf_idf = normalize(weighted_counts, 'l2', axis=1)

print("TF-IDF matrix:")
print(tf_idf)


TF-IDF matrix:
[[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]


### CountVectorizer

In [262]:
# Fit scikit-learn's CountVectorizer on the corpus and show what it learned.
vect1 = CountVectorizer()
X1 = vect1.fit_transform(corpus)

for heading, value in (
    ("Vocabulary from CountVectorizer:", vect1.vocabulary_),
    ("features from CountVectorizer:", vect1.get_feature_names_out()),
    ("CountVectorizer values:", X1.toarray()),
):
    print(heading)
    print(value)

Vocabulary from CountVectorizer:
{'the': 9, 'sun': 8, 'is': 4, 'star': 7, 'moon': 5, 'satellite': 6, 'and': 0, 'are': 1, 'celestial': 3, 'bodies': 2}
features from CountVectorizer:
['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
CountVectorizer values:
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]


### TfidfVectorizer

In [263]:
# Fit scikit-learn's TfidfVectorizer with L2 normalisation, smoothed IDF and
# raw (non-sublinear) term frequency, then show what it learned.
vect2 = TfidfVectorizer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)
X2 = vect2.fit_transform(corpus)

for heading, value in (
    ("Vocabulary from TfidfVectorizer:", vect2.vocabulary_),
    ("features from TfidfVectorizer:", vect2.get_feature_names_out()),
    ("TfidfVectorizer:", X2.toarray()),
):
    print(heading)
    print(value)

Vocabulary from TfidfVectorizer:
{'the': 9, 'sun': 8, 'is': 4, 'star': 7, 'moon': 5, 'satellite': 6, 'and': 0, 'are': 1, 'celestial': 3, 'bodies': 2}
features from TfidfVectorizer:
['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
TfidfVectorizer:
[[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]


### TF and CountVectorizer

In [264]:
# Show the hand-computed term counts next to CountVectorizer's output.
counts_sklearn = X1.toarray()

print("TF values calculate :")
print(tf)
print("Count Vectoriser values :")
print(counts_sklearn)

# Check the match programmatically instead of relying on a visual comparison;
# this raises AssertionError if any count differs.
np.testing.assert_array_equal(tf, counts_sklearn)

TF values calculate :
[[0. 0. 0. 0. 1. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 1.]
 [1. 1. 1. 1. 0. 1. 0. 0. 1. 1.]]
Count Vectoriser values :
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]


### TF-IDF and TfidfVectorizer

In [265]:
# Show the hand-computed TF-IDF matrix next to TfidfVectorizer's output.
tfidf_sklearn = X2.toarray()

print("TF-IDF values calculated :")
print(tf_idf)
print("TfidfVectorizer values :")
print(tfidf_sklearn)

# Check the match programmatically instead of relying on a visual comparison.
# A tolerance-based check is used because these are floating-point values;
# this raises AssertionError if the matrices differ beyond the default tolerance.
np.testing.assert_allclose(tf_idf, tfidf_sklearn)

TF-IDF values calculated :
[[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]
TfidfVectorizer values :
[[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]
