## Import libraries

In [None]:
import nltk
import numpy as np

## Step 1: Sample Text Data
### Assume you have a small corpus of documents (sentences or paragraphs) for demonstration.

In [21]:
# Toy corpus for the walkthrough: one lowercase sentence per document.
documents = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "dogs and cats living together",
]


## Step 2: Tokenization
### Split each document into individual words. Each document is lowercased first for consistency and then split on whitespace; punctuation is not stripped, so any punctuation would stay attached to its word.

In [22]:
# Lowercase each document, then split it on whitespace to get its list of tokens.
tokenized_documents = list(map(lambda text: text.lower().split(), documents))


## Step 3: Build a Vocabulary
### Create a sorted list of the unique words used across all documents. Each word's position in this list is its column index in the BoW model.

In [23]:
# Union the tokens of every document into one set of distinct words, then sort
# so that each word gets a stable position in the vocabulary.
vocabulary = sorted(set().union(*tokenized_documents))


## Step 4: Generate BoW Vectors
### For each document, create a vector with one element per vocabulary word; each element counts how many times that word appears in the document.


In [24]:
# Initialize a matrix of zeros with dimensions (number of documents) x (vocabulary size)
bow_vectors = np.zeros((len(documents), len(vocabulary)), dtype=int)

# Map every vocabulary word to its column index once, up front. Looking a word
# up in this dict replaces a `vocabulary.index(word)` call, which would rescan
# the vocabulary list for every single token.
word_to_column = {word: column for column, word in enumerate(vocabulary)}

# Populate the matrix with word counts: row i is document i, and each token
# increments the column that belongs to its vocabulary word.
for i, doc in enumerate(tokenized_documents):
    for word in doc:
        bow_vectors[i, word_to_column[word]] += 1


## Step 5: Display the BoW Model
### You can display the BoW model to see the vectors or use them for further analysis, such as training a machine learning model.

In [25]:
# Show the vocabulary (the column labels) followed by the count matrix
# (one row per document).
for label, value in (("Vocabulary:", vocabulary), ("BoW Vectors:\n", bow_vectors)):
    print(label, value)


Vocabulary: ['and', 'cat', 'cats', 'dog', 'dogs', 'living', 'log', 'mat', 'on', 'sat', 'the', 'together']
BoW Vectors:
 [[0 1 0 0 0 0 0 1 1 1 2 0]
 [0 0 0 1 0 0 1 0 1 1 2 0]
 [1 0 1 0 1 1 0 0 0 0 0 1]]
