#  **Step 1: Import Necessary Libraries**

In [5]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [9]:
# Fetch the NLTK data packages used by the preprocessing step below.
# Packages that are already installed are reported as up-to-date rather
# than downloaded again.

# Tokenizer models.
nltk.download('punkt')
# Stop-word lists, read later through stopwords.words('english').
nltk.download('stopwords')
# WordNet data for WordNetLemmatizer.
nltk.download('wordnet')
# Tokenizer tables (unzipped under tokenizers/); presumably what newer NLTK
# releases load for word_tokenize -- confirm against the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# **Step 2: Define a Sample Dataset**

In [10]:
# Five short example sentences that make up the toy corpus.
sentences = [
    "Cats are playing in the garden.",
    "Dogs bark loudly at strangers.",
    "Birds are flying in the sky.",
    "Cats and dogs are friendly pets.",
    "The garden has many beautiful flowers.",
]

# Wrap the corpus in a single-column frame keyed by 'Text'.
data = {'Text': sentences}
df = pd.DataFrame(data)
print(df)

                                     Text
0         Cats are playing in the garden.
1          Dogs bark loudly at strangers.
2            Birds are flying in the sky.
3        Cats and dogs are friendly pets.
4  The garden has many beautiful flowers.


# **Step 3: Text Preprocessing**

In [11]:
# Module-level resources built once and reused by every preprocess() call.

# Word normalizers: the lemmatizer is the one preprocess() applies; the
# stemmer is kept available as the alternative normalizer.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize a raw sentence into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, replace every character that is not
    a-z or whitespace with a space, tokenize with ``nltk.word_tokenize``,
    drop tokens found in the module-level ``stop_words`` set, and lemmatize
    what remains with the module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.

    Note:
        ``lemmatize`` is called without a part-of-speech argument, so
        inflected verb forms such as "playing" come back unchanged.
    """
    # Lowercase first so the a-z character class below covers every letter.
    text = text.lower()

    # Replace punctuation, digits and other special characters with a space
    # instead of deleting them: deletion would fuse words that are separated
    # only by punctuation ("cats,dogs" -> "catsdogs", "well-known" ->
    # "wellknown"). The extra whitespace is absorbed by tokenization.
    text = re.sub(r'[^a-z\s]', ' ', text)

    tokens = nltk.word_tokenize(text)

    # Drop stop words, then lemmatize the survivors. To stem instead, swap
    # lemmatizer.lemmatize(word) for stemmer.stem(word).
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# Run every raw sentence through preprocess() and store the result as a new
# column next to the original text.
clean_text = df['Text'].apply(preprocess)
df['Clean_Text'] = clean_text
print(df)

                                     Text                    Clean_Text
0         Cats are playing in the garden.            cat playing garden
1          Dogs bark loudly at strangers.      dog bark loudly stranger
2            Birds are flying in the sky.               bird flying sky
3        Cats and dogs are friendly pets.          cat dog friendly pet
4  The garden has many beautiful flowers.  garden many beautiful flower


# **Step 4a: Bag-of-Words Vectorization**

In [12]:
# Learn the vocabulary from the cleaned sentences and count each term's
# occurrences per sentence.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(df['Clean_Text'])

# Convert to a dense array and label each column with its vocabulary term.
bow_terms = bow_vectorizer.get_feature_names_out()
bow_table = pd.DataFrame(bow_matrix.toarray(), columns=bow_terms)

print("Bag of Words Feature Matrix:")
print(bow_table)

Bag of Words Feature Matrix:
   bark  beautiful  bird  cat  dog  flower  flying  friendly  garden  loudly  \
0     0          0     0    1    0       0       0         0       1       0   
1     1          0     0    0    1       0       0         0       0       1   
2     0          0     1    0    0       0       1         0       0       0   
3     0          0     0    1    1       0       0         1       0       0   
4     0          1     0    0    0       1       0         0       1       0   

   many  pet  playing  sky  stranger  
0     0    0        1    0         0  
1     0    0        0    0         1  
2     0    0        0    1         0  
3     0    1        0    0         0  
4     1    0        0    0         0  


# **Step 4b: TF-IDF Vectorization**

In [13]:
# Learn the vocabulary and document frequencies from the cleaned sentences
# and compute each term's TF-IDF weight per sentence.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Clean_Text'])

# Convert to a dense array and label each column with its vocabulary term.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_table = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms)

print("TF-IDF Feature Matrix:")
print(tfidf_table)

TF-IDF Feature Matrix:
       bark  beautiful     bird       cat       dog    flower   flying  \
0  0.000000   0.000000  0.00000  0.531772  0.000000  0.000000  0.00000   
1  0.523358   0.000000  0.00000  0.000000  0.422242  0.000000  0.00000   
2  0.000000   0.000000  0.57735  0.000000  0.000000  0.000000  0.57735   
3  0.000000   0.000000  0.00000  0.444002  0.444002  0.000000  0.00000   
4  0.000000   0.523358  0.00000  0.000000  0.000000  0.523358  0.00000   

   friendly    garden    loudly      many       pet   playing      sky  \
0  0.000000  0.531772  0.000000  0.000000  0.000000  0.659118  0.00000   
1  0.000000  0.000000  0.523358  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.57735   
3  0.550329  0.000000  0.000000  0.000000  0.550329  0.000000  0.00000   
4  0.000000  0.422242  0.000000  0.523358  0.000000  0.000000  0.00000   

   stranger  
0  0.000000  
1  0.523358  
2  0.000000  
3  0.000000  
4  0.000000  


# **Step 5: Compare and Interpret Outputs**

* **BoW Matrix:**

    Each cell indicates the number of times a word appears in a sentence.

    Simple frequency count; does not account for word importance across the corpus.

* **TF-IDF Matrix:**

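    Each cell contains a TF-IDF weight rather than a raw count. With scikit-learn's default settings each row is L2-normalized, which is why the three terms in row 2 all share the value 0.57735 (= 1/√3).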

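    Words that occur in more documents get lower scores: in row 0, `cat` and `garden` (each found in two sentences) score 0.53, while `playing` (found only in that sentence) scores 0.66.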

    Helps emphasize unique, informative words.