In [8]:
# This cell runs before the notebook's main import cell, so it must import
# NLTK itself; otherwise a fresh kernel (Restart & Run All) fails with
# NameError on the first download call.
import nltk
from nltk.tokenize import word_tokenize

# Resources used later: the punkt tokenizer models (both the legacy and the
# "_tab" packaging) and the stop-word corpus. force=True re-fetches the
# package even when a copy is already present locally.
for package in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(package, force=True)

# Smoke test: fail right here, with a readable message, if the tokenizer
# data did not install correctly.
assert word_tokenize("Test sentence.") == ['Test', 'sentence', '.'], (
    "NLTK tokenizer resources are not working as expected"
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
import nltk
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
# Toy corpus: four short English sentences, one document per list entry.
data = [
    "Cats are running around the house!",
    "The dog barked loudly at the stranger.",
    "He is reading a book on machine learning.",
    "The quick brown fox jumps over the lazy dog.",
]

In [11]:
# One row per sentence, held in a single "Text" column.
df = pd.DataFrame({"Text": data})

In [12]:
# Single stemmer instance shared by every preprocess() call.
stemmer = PorterStemmer()
# English stop words as a set, so the per-token membership test is cheap.
stop_words = {*stopwords.words("english")}

In [13]:
def preprocess(text: str) -> str:
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not a-z or whitespace with a space;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. stem the remaining tokens with the module-level ``stemmer``.

    Args:
        text: Raw document text.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives).
    """
    text = text.lower()
    # Substitute a space rather than deleting the character: deletion fuses
    # neighbouring words ("hello,world" -> "helloworld") and turns
    # contractions into tokens the stop-word list cannot match
    # ("don't" -> "dont"). With a space the pieces stay separate tokens.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered before stemming, so they are compared in their
    # unstemmed form.
    return " ".join(
        stemmer.stem(word) for word in tokens if word not in stop_words
    )

In [14]:
# Run every raw sentence through preprocess() and keep the result next to
# the original text.
df["Processed_Text"] = df["Text"].map(preprocess)

In [15]:
# Bag of words: learn the vocabulary from the processed text and build the
# document-term matrix in one pass.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(
    raw_documents=df["Processed_Text"],
)

In [16]:
# TF-IDF: same processed text, weighted by TfidfVectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    raw_documents=df["Processed_Text"],
)

In [17]:
# Convert each matrix to a dense array and label its columns with the
# vectorizer's feature names so it can be inspected as a table.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [18]:
# Heading and table go out on consecutive lines; the TF-IDF heading keeps its
# leading newline so a blank line separates the two tables.
print("=== Bag of Words Matrix ===", bow_df, sep="\n")
print("\n=== TF-IDF Matrix ===", tfidf_df, sep="\n")

=== Bag of Words Matrix ===
   around  bark  book  brown  cat  dog  fox  hous  jump  lazi  learn  loudli  \
0       1     0     0      0    1    0    0     1     0     0      0       0   
1       0     1     0      0    0    1    0     0     0     0      0       1   
2       0     0     1      0    0    0    0     0     0     0      1       0   
3       0     0     0      1    0    1    1     0     1     1      0       0   

   machin  quick  read  run  stranger  
0       0      0     0    1         0  
1       0      0     0    0         1  
2       1      0     1    0         0  
3       0      1     0    0         0  

=== TF-IDF Matrix ===
   around      bark  book     brown  cat       dog       fox  hous      jump  \
0     0.5  0.000000   0.0  0.000000  0.5  0.000000  0.000000   0.5  0.000000   
1     0.0  0.525473   0.0  0.000000  0.0  0.414289  0.000000   0.0  0.000000   
2     0.0  0.000000   0.5  0.000000  0.0  0.000000  0.000000   0.0  0.000000   
3     0.0  0.000000   0.0  0