In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words (BoW) model in scikit-learn

In [27]:
# Two-sentence toy corpus used to demonstrate the bag-of-words representation.
corpus = ['You and I would have understood that sentence in a fraction of a second.',
          'But machines simply cannot process text data in raw form.']

# Word-level bag-of-words vectorizer. Every argument except max_features is
# spelled out at its scikit-learn default value purely for illustration.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    tokenizer=None,
    stop_words=None,
    preprocessor=None,
    max_features=5000,
)

# Learn the vocabulary and convert the documents into a sparse document-term matrix.
wm = vectorizer.fit_transform(corpus)
print(wm.toarray())

# Shape of the count matrix: (number of docs, number of unique words).
# A bare `wm.shape` in the middle of a cell is evaluated and discarded,
# so it has to be printed explicitly to show up in the output.
print(wm.shape)

# Show the resulting vocabulary; the numbers are not counts, they are each
# word's column position in the document-term matrix.
vocabulary = vectorizer.vocabulary_
print(vocabulary)

# Feature names ordered by column position.
tokens = vectorizer.get_feature_names_out()
print(tokens)

# Label rows Doc0, Doc1, ... from the row count instead of iterating the sparse matrix.
doc_name = ['Doc{:d}'.format(i) for i in range(wm.shape[0])]
df = pd.DataFrame(data=wm.toarray(), columns=tokens, index=doc_name)
print(df)

[[1 0 0 0 0 1 1 1 0 1 0 0 1 1 0 0 1 1 1 1]
 [0 1 1 1 1 0 0 1 1 0 1 1 0 0 1 1 0 0 0 0]]
{'you': 19, 'and': 0, 'would': 18, 'have': 6, 'understood': 17, 'that': 16, 'sentence': 13, 'in': 7, 'fraction': 5, 'of': 9, 'second': 12, 'but': 1, 'machines': 8, 'simply': 14, 'cannot': 2, 'process': 10, 'text': 15, 'data': 3, 'raw': 11, 'form': 4}
['and' 'but' 'cannot' 'data' 'form' 'fraction' 'have' 'in' 'machines' 'of'
 'process' 'raw' 'second' 'sentence' 'simply' 'text' 'that' 'understood'
 'would' 'you']
      and  but  cannot  data  form  fraction  have  in  machines  of  process  \
Doc0    1    0       0     0     0         1     1   1         0   1        0   
Doc1    0    1       1     1     1         0     0   1         1   0        1   

      raw  second  sentence  simply  text  that  understood  would  you  
Doc0    0       1         1       0     0     1           1      1    1  
Doc1    1       0         0       1     1     0           0      0    0  


# Term Frequency
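For the term frequency of a term in a document, tf(t,d), the simplest choice is to use the raw count of the term in the document, i.e., the number of times term $t$ occurs in document $d$, denoted $f_{t,d}$. A common variant, shown below, is log normalization, which dampens the effect of very frequent terms:
### $\mathrm{tf}(t,d) = \log(1 + f_{t,d})$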

# Inverse Document Frequency
The inverse document frequency is a measure of how much information a word provides, i.e., whether it is common or rare across all the documents. It determines the weight of rare words across all documents in the corpus.
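### $\mathrm{idf}(t,D) = \log\dfrac{N}{|\{d \in D : t \in d\}|}$

Here $N$ is the total number of documents in the corpus $D$, and the denominator is the number of documents that contain the term $t$. Note that scikit-learn's `TfidfTransformer(smooth_idf=True)`, used below, computes the smoothed variant $\ln\dfrac{1 + N}{1 + \mathrm{df}(t)} + 1$, which is why a term that occurs in every document gets a weight of exactly 1.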

In [32]:
# Five short documents that form the toy corpus for the TF-IDF walkthrough.
doc = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

# Default-configured vectorizer; fitting it yields the per-document word counts.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(doc)

# (rows, columns) of the count matrix = (documents, distinct words).
print(word_count_vector.shape)

# Vocabulary terms in column order.
tokens = cv.get_feature_names_out()
print(tokens)

# Dense view of the document-term matrix.
print(word_count_vector.toarray())

# The same counts as a labelled table: one row per document, one column per term.
doc_names = list(map("Doc{:d}".format, range(word_count_vector.shape[0])))
df = pd.DataFrame(word_count_vector.toarray(), columns=tokens, index=doc_names)
print(df)

(5, 16)
['ate' 'away' 'cat' 'end' 'finally' 'from' 'had' 'house' 'little' 'mouse'
 'of' 'ran' 'saw' 'story' 'the' 'tiny']
[[0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 0 1 0 2 0]
 [0 1 0 0 0 1 0 1 0 1 0 1 0 0 2 0]
 [1 0 1 0 1 0 0 0 0 1 0 0 0 0 2 0]
 [0 0 0 1 0 0 0 0 0 1 1 0 0 1 2 0]]
      ate  away  cat  end  finally  from  had  house  little  mouse  of  ran  \
Doc0    0     0    0    0        0     0    1      1       1      1   0    0   
Doc1    0     0    1    0        0     0    0      0       0      1   0    0   
Doc2    0     1    0    0        0     1    0      1       0      1   0    1   
Doc3    1     0    1    0        1     0    0      0       0      1   0    0   
Doc4    0     0    0    1        0     0    0      0       0      1   1    0   

      saw  story  the  tiny  
Doc0    0      0    1     1  
Doc1    1      0    2     0  
Doc2    0      0    2     0  
Doc3    0      0    2     0  
Doc4    0      1    2     0  


In [33]:
# Learn the smoothed inverse-document-frequency weights from the count matrix.
# fit() hands back the fitted transformer, so construction and fitting are chained.
Tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True).fit(word_count_vector)

# Tabulate the learned IDF weight of every vocabulary term.
df_idf = pd.DataFrame({"idf_weights": Tfidf_transformer.idf_}, index=tokens)
print(df_idf)

         idf_weights
ate         2.098612
away        2.098612
cat         1.693147
end         2.098612
finally     2.098612
from        2.098612
had         2.098612
house       1.693147
little      2.098612
mouse       1.000000
of          2.098612
ran         2.098612
saw         2.098612
story       2.098612
the         1.000000
tiny        2.098612


In [34]:
# count matrix
count_vector=cv.transform(doc)
# tf-idf scores
tf_idf_vector=Tfidf_transformer.transform(count_vector)

#get tfidf vector for first document
first_document_vector=tf_idf_vector[0]
 
#print the scores
df = pd.DataFrame(first_document_vector.T.todense(), index=tokens, columns=["tf-idf"])
df.sort_values(by=["tf-idf"],ascending=False)

Unnamed: 0,tf-idf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


Reference: https://mmuratarat.github.io/2020-04-03/bow_model_tf_idf