# Feature extraction using CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Two one-sentence corpora: `text` is used to learn the vocabulary,
# `text1` is encoded later with that same vocabulary.
text = ["AI is the future of technology"]
text1 = ["Football is a popular sport"]

# CountVectorizer is our "model" for converting text into token-count vectors
vectorizer = CountVectorizer()

# fit_transform() does two things in one call: it learns the vocabulary from `text`
# and returns the numerical representation (count vector) of `text` under that vocabulary.
vectorized_text = vectorizer.fit_transform(text)

# vocabulary_ maps each learned (lowercased) word to its column index in the vectors
print("Vocabulary:\n", vectorizer.vocabulary_)

Vocabulary:
 {'ai': 0, 'is': 2, 'the': 5, 'future': 1, 'of': 3, 'technology': 4}


In [4]:
# Encode the second document with the vocabulary already learned from `text`;
# transform() applies the fitted vocabulary without changing it.
new_vector = vectorizer.transform(text1)

# Convert both results to dense arrays so they print as plain rows of counts
dense_text = vectorized_text.toarray()
dense_text1 = new_vector.toarray()

# Vector of the document the vocabulary was learned from
print("\nVectorized form of 'text':\n", dense_text)

# Vector of the new document, expressed in the same vocabulary columns
print("\nVectorized form of 'text1':\n", dense_text1)



Vectorized form of 'text':
 [[1 1 1 1 1 1]]

Vectorized form of 'text1':
 [[0 0 1 0 0 0]]


fit_transform() is used when you are creating the vocabulary and transforming the text for the first time.

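transform() is used when the vocabulary has already been created (fitted), and you want to apply that vocabulary to new data without changing it. Words that are not in the fitted vocabulary are ignored, which is why 'text1' is encoded as all zeros except for "is" (index 2), the only word it shares with 'text'.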

# TF-IDF
TfidfVectorizer is used to convert text data into TF-IDF (Term Frequency-Inverse Document Frequency) features.

This method combines term frequency (how often a word appears) and inverse document frequency (how unique the word is across documents) to score words.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Two-document corpus for the TF-IDF example.
# NOTE: this rebinds `text`, which held a single document in the CountVectorizer section.
text = [
    "AI is the future of technology",
    "Football is a popular sport",
]


In [4]:
# Create the TF-IDF vectorizer.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer from the previous section.
vectorizer = TfidfVectorizer()
# fit() only learns from the corpus: it builds the vocabulary and the per-word IDF weights.
# No document is converted to a TF-IDF vector until transform() is called.
vectorizer.fit(text)


In [5]:
# Print the vocabulary: each learned word mapped to its column index
print("Vocabulary:\n", vectorizer.vocabulary_)

# Print the IDF values; position i holds the weight of the word whose vocabulary index is i
print("IDF values:\n", vectorizer.idf_)

Vocabulary:
 {'ai': 0, 'is': 3, 'the': 8, 'future': 2, 'of': 4, 'technology': 7, 'football': 1, 'popular': 5, 'sport': 6}
IDF values:
 [1.40546511 1.40546511 1.40546511 1.         1.40546511 1.40546511
 1.40546511 1.40546511 1.40546511]


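Index 3 ("is"): IDF = 1, the lowest value, because "is" appears in every document. Each of the other words appears in only one of the two documents and therefore gets the higher IDF of about 1.405. These numbers follow from scikit-learn's default smoothed formula $\mathrm{idf}(t) = \ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1$ with $n = 2$ documents: $\ln(3/3) + 1 = 1$ for "is" and $\ln(3/2) + 1 \approx 1.405$ for the rest.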


In [6]:
# Pick the second document (index 1) of the TF-IDF corpus as the sample to encode
text_as_input = text[1]
print(text_as_input)


Football is a popular sport


In [7]:

# Encode the sample with the fitted TF-IDF vectorizer.
# The string is wrapped in a list because transform() takes a collection of documents, not a bare string.
vector = vectorizer.transform([text_as_input])


In [8]:
# `vector` is a sparse matrix with one row; toarray() converts it to a dense array for printing.
# Each column corresponds to the word with that index in the vocabulary.
print(vector.toarray())


[[0.         0.53404633 0.         0.37997836 0.         0.53404633
  0.53404633 0.         0.        ]]
