#### Count Vectorization

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
# Toy corpus: every string is treated as one document.
documents = [
    "My cat and dog sat on the mat",
    "strange cat jumped over the mat",
    "His cat ate the mouse ran across the mat",
    "Her dog run after my cat and mouse",
]

# Upper bound on the vocabulary size handed to the vectorizer.
max_features = 6

vectorizer = CountVectorizer(max_features=max_features)

# Fit on the corpus and transform it in one step, then convert the result
# with toarray() so it can be printed and summed directly.
vectors = vectorizer.fit_transform(documents)
vectors_array = vectors.toarray()

# Vocabulary words; used further down as the column labels of `vectors_array`.
feature_names = vectorizer.get_feature_names_out()

# One call, newline-separated: the vocabulary first, then the count matrix.
print(feature_names, vectors_array, sep="\n")

['cat' 'dog' 'mat' 'mouse' 'my' 'the']
[[1 1 1 0 1 1]
 [1 0 1 0 0 1]
 [1 0 1 1 0 2]
 [1 1 0 1 1 0]]


Each column corresponds to one of the `max_features` (6) most frequent words in the corpus.

Each value is the number of times that word occurs in the corresponding sentence (one row per sentence).

For example, in the third sentence the words "cat", "mat" and "mouse" each occur once, while the word "the" occurs twice.

In [29]:
import pandas as pd

# Tabular view of the count matrix: one row per document, one column per
# vocabulary word.
# NOTE(review): `vactors_data` looks like a typo for `vectors_data`; the
# spelling is kept because other cells may refer to this name.
vactors_data = pd.DataFrame(vectors_array, columns=feature_names)

# Bare last expression so the notebook renders the frame as a table.
vactors_data

Unnamed: 0,cat,dog,mat,mouse,my,the
0,1,1,1,0,1,1
1,1,0,1,0,0,1
2,1,0,1,1,0,2
3,1,1,0,1,1,0


In [32]:
# Summing along axis 0 collapses the document rows, leaving one total per
# vocabulary word (column).
frequencies = vectors_array.sum(axis=0)

# Words on the first line, their totals on the second.
print(feature_names, frequencies, sep="\n")

['cat' 'dog' 'mat' 'mouse' 'my' 'the']
[4 2 3 2 2 4]


Sort the words by total frequency, in descending order

In [40]:
# argsort() gives the positions that would sort `frequencies` in ascending
# order; reversing that index array turns it into descending order.
# NOTE(review): words with equal frequency come out in whatever order the
# reversed argsort leaves them - presumably not something to rely on.
sorted_indices = frequencies.argsort()[::-1]

# Frequencies rearranged from highest to lowest.
sorted_frequencies = frequencies[sorted_indices]

# The matching words in that same order, collected into a list.
sorted_words = list(feature_names[sorted_indices])

print(sorted_words, sorted_frequencies, sep="\n")

['the', 'cat', 'mat', 'my', 'mouse', 'dog']
[4 4 3 2 2 2]


#### TF-IDF Vectorization

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Same corpus (`documents`) and vocabulary cap (`max_features`) as the count
# vectorization cell, this time run through TfidfVectorizer.
# Note: this rebinds `vectorizer`, `vectors`, `vectors_array` and
# `feature_names`, replacing the count-based values assigned earlier.
vectorizer = TfidfVectorizer(max_features=max_features)

# Fit on the corpus and transform it in one step, then convert the result
# with toarray() so it can be printed.
vectors = vectorizer.fit_transform(documents)
vectors_array = vectors.toarray()

# Vocabulary words; used further down as the column labels of `vectors_array`.
feature_names = vectorizer.get_feature_names_out()

# One call, newline-separated: the vocabulary first, then the weight matrix.
print(feature_names, vectors_array, sep="\n")

['cat' 'dog' 'mat' 'mouse' 'my' 'the']
[[0.34184591 0.51646957 0.41812662 0.         0.51646957 0.41812662]
 [0.5004907  0.         0.61217198 0.         0.         0.61217198]
 [0.30481296 0.         0.37283001 0.46051924 0.         0.74566002]
 [0.35696573 0.53931298 0.         0.53931298 0.53931298 0.        ]]


In [43]:
# Tabular view of the current `vectors_array`, with the vocabulary words as
# column labels. This rebinds `vactors_data`, replacing any earlier frame
# stored under that name.
# NOTE(review): `vactors_data` looks like a typo for `vectors_data`; the
# spelling is kept because other cells may refer to this name.
vactors_data = pd.DataFrame(vectors_array, columns=feature_names)

# Bare last expression so the notebook renders the frame as a table.
vactors_data

Unnamed: 0,cat,dog,mat,mouse,my,the
0,0.341846,0.51647,0.418127,0.0,0.51647,0.418127
1,0.500491,0.0,0.612172,0.0,0.0,0.612172
2,0.304813,0.0,0.37283,0.460519,0.0,0.74566
3,0.356966,0.539313,0.0,0.539313,0.539313,0.0
