In [1]:
# Bag of Words and TF-IDF text vectorization example using NLTK and scikit-learn
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

In [2]:
# Two-sentence toy corpus. The second sentence keeps its eight leading spaces,
# so the string value is unchanged from a triple-quoted, indented literal.
text = (
    "Pizza like to eat Pizza.\n"
    "        I love to eat burger."
)

In [3]:
# Echo the raw string; its repr shows the embedded newline and indentation.
text

'Pizza like to eat Pizza.\n        I love to eat burger.'

In [4]:
# 1. Tokenisation: split the raw text into a list of sentences
sentences = sent_tokenize(text)
print(sentences)

['Pizza like to eat Pizza.', 'I love to eat burger.']


In [5]:
# Cleaning step: for every sentence keep only alphabetic, non-stop-word tokens,
# lowercase and stem them, then glue the survivors back into one string.
stop_words = set(stopwords.words("english"))
snowball_stemmer = SnowballStemmer("english")

bow_sentences = [
    " ".join(
        snowball_stemmer.stem(token.lower())
        for token in word_tokenize(sentence)
        # Punctuation/number tokens fail isalpha(); stop words are matched
        # case-insensitively.
        if token.isalpha() and token.lower() not in stop_words
    )
    for sentence in sentences
]

print(bow_sentences)

['pizza like eat pizza', 'love eat burger']


In [6]:
# Bag of Words: turn each cleaned sentence into a vector of term counts.
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams only; the vocabulary is learned from the cleaned sentences.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(bow_sentences)

# First the vocabulary (one name per column), then the dense count matrix.
print(vectorizer.get_feature_names_out(), X.toarray(), sep="\n")


['burger' 'eat' 'like' 'love' 'pizza']
[[0 1 1 0 2]
 [1 1 0 1 0]]


# Pipeline: tokenise --> remove stop words --> stem --> Bag of Words vectorisation

In [7]:
# Implementing TF-IDF
# Fit a TfidfVectorizer (constructed with no arguments) on the same cleaned
# sentences that were fed to the CountVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# X_tfidf holds the result of fitting and transforming bow_sentences.
X_tfidf = tfidf_vectorizer.fit_transform(bow_sentences)

In [8]:
# Vocabulary learned by the TF-IDF vectorizer; these names label the columns
# of X_tfidf. Query tfidf_vectorizer itself rather than the CountVectorizer so
# the labels stay correct even if the two vectorizers are configured differently.
print(tfidf_vectorizer.get_feature_names_out())

['burger' 'eat' 'like' 'love' 'pizza']


In [9]:
# Dense view of the TF-IDF matrix: one row per sentence, one column per vocabulary term.
print(X_tfidf.toarray())

[[0.         0.30321606 0.4261596  0.         0.8523192 ]
 [0.6316672  0.44943642 0.         0.6316672  0.        ]]


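`pizza` appears twice in sentence 1, so its term frequency (TF) is higher: its weight (0.852) is exactly double that of `like` (0.426), which appears once in the same sentence.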

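`eat` appears in both sentences, so its inverse document frequency (IDF) is lower: it receives the smallest non-zero weight in each row (0.303 and 0.449).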

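`burger` and `love` each appear in only one sentence, so their IDF is higher, and their weight (0.632) is larger than that of `eat` in the same row.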

TF-IDF assigns higher weight to words that are frequent in a document but rare across documents.