## Bag of Words

In [15]:
import nltk

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
text = """Natural Language Processing is a subfield of Artificial Intelligence.
Machine learning techniques are widely used in NLP tasks such as text classification.
Deep learning models like RNNs and Transformers have improved NLP performance.
Bag of Words is one of the simplest text vectorization techniques.
TF-IDF and word embeddings are more advanced alternatives to Bag of Words."""

In [18]:
# Download the tokenizer
nltk.download('punkt')
corpus = nltk.sent_tokenize(text)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### The Punkt tokenizer is a pre-trained, unsupervised sentence tokenizer that comes with NLTK. It breaks text into sentences.

In [19]:
# Sentence splitting
corpus = nltk.sent_tokenize(text)

In [20]:
print("Documents (after splitting):")
for i, doc in enumerate(corpus, 1):
    print(f"{i}: {doc}")

Documents (after splitting):
1: Natural Language Processing is a subfield of Artificial Intelligence.
2: Machine learning techniques are widely used in NLP tasks such as text classification.
3: Deep learning models like RNNs and Transformers have improved NLP performance.
4: Bag of Words is one of the simplest text vectorization techniques.
5: TF-IDF and word embeddings are more advanced alternatives to Bag of Words.


In [21]:
# Bag of words
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)

In [23]:
print(" Vocabulary:", vectorizer.get_feature_names_out())

 Vocabulary: ['advanced' 'alternatives' 'and' 'are' 'artificial' 'as' 'bag'
 'classification' 'deep' 'embeddings' 'have' 'idf' 'improved' 'in'
 'intelligence' 'is' 'language' 'learning' 'like' 'machine' 'models'
 'more' 'natural' 'nlp' 'of' 'one' 'performance' 'processing' 'rnns'
 'simplest' 'subfield' 'such' 'tasks' 'techniques' 'text' 'tf' 'the' 'to'
 'transformers' 'used' 'vectorization' 'widely' 'word' 'words']


In [24]:
print(" BoW Matrix:\n", bow.toarray())

 BoW Matrix:
 [[0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0
  0 0 0 1 0 1 0 0]
 [0 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 1 0 0 0 1 0 0 0 1 1 0
  1 0 0 0 1 0 0 1]
 [1 1 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1
  0 1 0 0 0 0 1 1]]


---

## Unigram

In [25]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
text = """Natural Language Processing is a subfield of Artificial Intelligence.
Machine learning techniques are widely used in NLP tasks such as text classification.
Bag of Words is one of the simplest text vectorization techniques."""

In [27]:
# split text into sentences
corpus = nltk.sent_tokenize(text)

In [29]:
print("Documents :", corpus)

Documents : ['Natural Language Processing is a subfield of Artificial Intelligence.', 'Machine learning techniques are widely used in NLP tasks such as text classification.', 'Bag of Words is one of the simplest text vectorization techniques.']


In [30]:
unigram_vectorizer = CountVectorizer(ngram_range=(1,1))  # (1,1) = unigrams

In [34]:
unigram_bow = unigram_vectorizer.fit_transform(corpus)

In [35]:
print("\n Unigram Vocabulary:", unigram_vectorizer.get_feature_names_out())


 Unigram Vocabulary: ['are' 'artificial' 'as' 'bag' 'classification' 'in' 'intelligence' 'is'
 'language' 'learning' 'machine' 'natural' 'nlp' 'of' 'one' 'processing'
 'simplest' 'subfield' 'such' 'tasks' 'techniques' 'text' 'the' 'used'
 'vectorization' 'widely' 'words']


In [37]:
print(" Unigram BoW Matrix:\n", unigram_bow.toarray())

 Unigram BoW Matrix:
 [[0 1 0 0 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 1 0]
 [0 0 0 1 0 0 0 1 0 0 0 0 0 2 1 0 1 0 0 0 1 1 1 0 1 0 1]]


## Bigram

In [38]:
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))  # (2,2) = bigrams

In [39]:
bigram_bow = bigram_vectorizer.fit_transform(corpus)

In [40]:
print("\n Bigram Vocabulary:", bigram_vectorizer.get_feature_names_out())


 Bigram Vocabulary: ['are widely' 'artificial intelligence' 'as text' 'bag of' 'in nlp'
 'is one' 'is subfield' 'language processing' 'learning techniques'
 'machine learning' 'natural language' 'nlp tasks' 'of artificial'
 'of the' 'of words' 'one of' 'processing is' 'simplest text'
 'subfield of' 'such as' 'tasks such' 'techniques are'
 'text classification' 'text vectorization' 'the simplest' 'used in'
 'vectorization techniques' 'widely used' 'words is']


In [41]:
print(" Bigram BoW Matrix:\n", bigram_bow.toarray())

 Bigram BoW Matrix:
 [[0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0]
 [0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 1 1 0 1 0 1]]


## Unigram+Bigram

In [43]:
vectorizer = CountVectorizer(ngram_range=(1,2))  # (1,2) = unigrams + bigrams
bow = vectorizer.fit_transform(corpus)

print("\n Vocabulary (Unigrams + Bigrams):")
print(vectorizer.get_feature_names_out())


 Vocabulary (Unigrams + Bigrams):
['are' 'are widely' 'artificial' 'artificial intelligence' 'as' 'as text'
 'bag' 'bag of' 'classification' 'in' 'in nlp' 'intelligence' 'is'
 'is one' 'is subfield' 'language' 'language processing' 'learning'
 'learning techniques' 'machine' 'machine learning' 'natural'
 'natural language' 'nlp' 'nlp tasks' 'of' 'of artificial' 'of the'
 'of words' 'one' 'one of' 'processing' 'processing is' 'simplest'
 'simplest text' 'subfield' 'subfield of' 'such' 'such as' 'tasks'
 'tasks such' 'techniques' 'techniques are' 'text' 'text classification'
 'text vectorization' 'the' 'the simplest' 'used' 'used in'
 'vectorization' 'vectorization techniques' 'widely' 'widely used' 'words'
 'words is']


In [44]:
print("\n BoW Matrix:\n", bow.toarray())


 BoW Matrix:
 [[0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1
  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
  0 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 1 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 2 0 1 1 1 1 0 0 1 1 0
  0 0 0 0 0 1 0 1 0 1 1 1 0 0 1 1 0 0 1 1]]


---

In [47]:
import pandas as pd

In [48]:
text = """Natural Language Processing is a subfield of Artificial Intelligence.
Machine learning techniques are widely used in NLP tasks such as text classification.
Bag of Words is one of the simplest text vectorization techniques."""

In [49]:
#  Split text into sentences
corpus = nltk.sent_tokenize(text)
print(" Documents (sentences):", corpus)

 Documents (sentences): ['Natural Language Processing is a subfield of Artificial Intelligence.', 'Machine learning techniques are widely used in NLP tasks such as text classification.', 'Bag of Words is one of the simplest text vectorization techniques.']


In [50]:
vectorizer = CountVectorizer(ngram_range=(1,2))  # unigrams + bigrams
bow = vectorizer.fit_transform(corpus)

In [51]:
#  Convert BoW to DataFrame for readability
bow_df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
bow_df.index = [f"Sentence {i+1}" for i in range(len(corpus))]

In [53]:
print("\n Bag of Words Table (Unigrams + Bigrams):")
print(bow_df)


 Bag of Words Table (Unigrams + Bigrams):
            are  are widely  artificial  artificial intelligence  as  as text  \
Sentence 1    0           0           1                        1   0        0   
Sentence 2    1           1           0                        0   1        1   
Sentence 3    0           0           0                        0   0        0   

            bag  bag of  classification  in  ...  the  the simplest  used  \
Sentence 1    0       0               0   0  ...    0             0     0   
Sentence 2    0       0               1   1  ...    0             0     1   
Sentence 3    1       1               0   0  ...    1             1     0   

            used in  vectorization  vectorization techniques  widely  \
Sentence 1        0              0                         0       0   
Sentence 2        1              0                         0       1   
Sentence 3        0              1                         1       0   

            widely used  words  wo