<a href="https://colab.research.google.com/github/Kavya-sree/machinelearningbrain_code_samples/blob/main/Bag_of_words_model_from_scratch_and_scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk



In [2]:
import nltk
# Download the 'punkt' package into the local nltk_data directory (the log
# below shows it unpacks to tokenizers/punkt). Returns True on success.
# NOTE(review): no cell visible in this notebook calls nltk afterwards —
# confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Implementing Bag of Words from scratch

In [18]:
from collections import Counter

class BoW:
    """Minimal bag-of-words vectorizer built on whitespace tokenization.

    Tokens come from ``str.split()``, so they are case-sensitive and keep any
    attached punctuation (``"dancing,"``, ``"dancing"`` and ``"Dancing"`` are
    three distinct vocabulary entries).

    Attributes:
        vocabulary: List of unique tokens in first-seen order. Position ``i``
            in this list is column ``i`` of every vector returned by
            ``fit_transform``. Empty until ``fit_transform`` has been called.
        word_counts: One ``Counter`` per document from the most recent
            ``fit_transform`` call.
    """

    def __init__(self):
        # Kept as a list at all times so the attribute has one type before
        # and after fitting.
        self.vocabulary = []
        self.word_counts = []

    def fit_transform(self, documents):
        """Learn the vocabulary of ``documents`` and return their count vectors.

        Every call starts from a clean state: the vocabulary and the stored
        word counts describe only the corpus passed to this call, so the
        method can safely be invoked more than once on the same instance.

        Args:
            documents: Iterable of strings, one per document. It is consumed
                exactly once.

        Returns:
            A list with one vector (list of ints) per document. Each vector
            has ``len(self.vocabulary)`` entries; entry ``i`` is the number of
            times ``self.vocabulary[i]`` occurs in that document. An empty
            corpus yields an empty list.
        """
        # Tokenize on whitespace and count tokens, one Counter per document.
        self.word_counts = [Counter(doc.split()) for doc in documents]

        # Assign each new token the next free column. A dict both
        # de-duplicates and preserves first-seen order, and it gives O(1)
        # column lookup instead of a list.index() scan per token.
        column_of = {}
        for word_count in self.word_counts:
            for word in word_count:
                column_of.setdefault(word, len(column_of))
        self.vocabulary = list(column_of)

        # Scatter each document's counts into a zero-initialised vector.
        bow_vectors = []
        for word_count in self.word_counts:
            vector = [0] * len(column_of)
            for word, count in word_count.items():
                vector[column_of[word]] = count
            bow_vectors.append(vector)

        return bow_vectors

# Example usage
# Example usage: a three-sentence toy corpus. `documents` is also the input
# to the scikit-learn cells further down.
documents = [
    "I love dancing, love to dance on stage.",
    "Dancing is a way to express yourself.",
    "He loves dancing to random beats."
]

bow_model = BoW()
# One count vector per document; columns follow bow_model.vocabulary order.
bow_representation = bow_model.fit_transform(documents)

# Print the vocabulary (whitespace tokens, so case and punctuation are kept)
print("Vocabulary:", bow_model.vocabulary)
# Print the BoW representation of each document, numbered from 1
for i, vector in enumerate(bow_representation):
    print(f"Document {i + 1}: {vector}")


Vocabulary: ['I', 'love', 'dancing,', 'to', 'dance', 'on', 'stage.', 'Dancing', 'is', 'a', 'way', 'express', 'yourself.', 'He', 'loves', 'dancing', 'random', 'beats.']
Document 1: [1, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Document 2: [0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
Document 3: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]


# Implementing Bag of Words in scikit-learn

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer with default settings. As the vocabulary printed below
# shows, the defaults lowercase the text, strip punctuation and drop
# single-character tokens such as "I" and "a".
vectorizer = CountVectorizer()
# Learn the vocabulary from the same `documents` corpus used above.
vectorizer.fit(documents)

In [10]:
# Learned mapping of token -> column index; in the output below the indices
# follow the alphabetical order of the tokens.
vectorizer.vocabulary_

{'love': 6,
 'dancing': 2,
 'to': 11,
 'dance': 1,
 'on': 8,
 'stage': 10,
 'is': 5,
 'way': 12,
 'express': 3,
 'yourself': 13,
 'he': 4,
 'loves': 7,
 'random': 9,
 'beats': 0}

In [11]:
# Encode the corpus with the fitted vocabulary: one row per document,
# one column per vocabulary term.
bag_of_words_model = vectorizer.transform(documents)
# (number of documents, vocabulary size)
bag_of_words_model.shape

(3, 14)

In [12]:
# Show the matrix summary: the result is stored as a sparse (CSR) matrix,
# so only the non-zero counts are kept.
bag_of_words_model

<3x14 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [15]:
import pandas as pd

# Densify the sparse count matrix so it can be shown as a labelled table.
# (The former bare `doc_array` line was dropped: only the last expression of
# a cell is displayed, so it had no effect.)
doc_array = bag_of_words_model.toarray()

# One row per document, one column per vocabulary term, labelled with the
# vectorizer's feature names.
pd.DataFrame(doc_array, columns=vectorizer.get_feature_names_out())


Unnamed: 0,beats,dance,dancing,express,he,is,love,loves,on,random,stage,to,way,yourself
0,0,1,1,0,0,0,2,0,1,0,1,1,0,0
1,0,0,1,1,0,1,0,0,0,0,0,1,1,1
2,1,0,1,0,1,0,0,1,0,1,0,1,0,0
