In [1]:
# Importing necessary libraries
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Toy corpus of four short sentences used by every cell below.
# Documents 1 and 4 contain the same words in a different order.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [3]:
# ---- Bag of Words (BoW) ----
# Fit a CountVectorizer on the corpus and transform it in the same call.
# The result is densified with .toarray() purely so it can be printed;
# each row of the printed matrix corresponds to one document in `corpus`.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
bow_dense = X.toarray()
print("Bag of Words (BoW) Representation:\n", bow_dense)

Bag of Words (BoW) Representation:
 [[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [4]:
# ---- TF-IDF ----
# Fit a TfidfVectorizer on the corpus and transform it in the same call.
# The result is densified with .toarray() purely so it can be printed;
# each row of the printed matrix corresponds to one document in `corpus`.
tfidfvectorizer = TfidfVectorizer()
X_tfidf = tfidfvectorizer.fit_transform(corpus)
tfidf_dense = X_tfidf.toarray()
print("\nTF-IDF Representation:\n", tfidf_dense)


TF-IDF Representation:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [5]:
# ---- Feature Engineering ----
# 1. Text Length
# Length is measured in characters (len() of the raw string), so spaces and
# punctuation are included in the count.
text_length = list(map(len, corpus))
print("\nText Length Feature:", text_length)



Text Length Feature: [27, 37, 26, 27]


In [6]:
# 2. Number of Unique Words
num_unique_words = [len(set(doc.split())) for doc in corpus]
print("\nNumber of Unique Words Feature:", num_unique_words)



Number of Unique Words Feature: [5, 6, 6, 5]
