In [None]:
!pip install lorem-text

In [321]:
import re
import numpy as np
import pandas as pd

from typing import Optional
from lorem_text import lorem

In [322]:
class CountVectorizer():
    def __init__(self,) -> None:
        self.bag_of_words: list = []
    
    def fit(self, dataset: list[str]) -> None:
        """
        Builds the Bag of Words.

        Args:
            dataset (str): Dataset that contains the main words.
        """
        dataset = " ".join(dataset).lower()
        self.bag_of_words = dict(zip(*self._get_unique_words(dataset, counts = True)))

    def _get_unique_words(self, data: list[str], counts: Optional[bool] = False) -> np.ndarray | tuple[np.ndarray]:
        words = re.findall(r'\b\w+\b', data.lower())
        return np.unique(words, return_counts = counts)
    
    @property
    def vocabulary_(self) -> dict:
        return {k: v for k, v in sorted(self.bag_of_words.items(), key=lambda item: item[1], reverse = True)}
    
    def transform(self, data: list[str]) -> list:
        vector = []
        
        for sentence in data:
            unique_words, counts = self._get_unique_words(sentence, counts = True)
            transformed_words = dict(zip(unique_words, counts))
            print(transformed_words)
            transformed_counts = [transformed_words.get(word, 0) for word, _ in self.bag_of_words.items()]

            vector.append(transformed_counts)
            
        return np.array(vector)
        

In [323]:
# Training corpus: a one-document list of generated lorem-ipsum text
# (presumably 10 paragraphs -- confirm against the lorem_text API).
X_train = [lorem.paragraphs(10)]
# Test corpus: a single short sentence to encode against the fitted vocabulary.
X_test = ["Lorem ipsum is dolor"]

In [325]:
# Fit the hand-written vectorizer on the training corpus.
countvec = CountVectorizer()
countvec.fit(X_train)

# vocabulary_ maps each word to its occurrence count, highest count first.
vocab = countvec.vocabulary_
print("Vocabulary:", vocab)

# Encode the test documents as per-word count vectors.
vector = countvec.transform(X_test)
print("Encoded Document is:")

# One row of counts per test document.
for counts in vector:
    print(counts)

Vocabulary: {'totam': 9, 'consectetur': 8, 'ex': 8, 'non': 8, 'ut': 8, 'corrupti': 7, 'dolorum': 7, 'minus': 7, 'numquam': 7, 'perferendis': 7, 'quod': 7, 'voluptas': 7, 'voluptates': 7, 'aut': 6, 'deleniti': 6, 'enim': 6, 'esse': 6, 'est': 6, 'incidunt': 6, 'iusto': 6, 'nisi': 6, 'praesentium': 6, 'quaerat': 6, 'qui': 6, 'quidem': 6, 'voluptatibus': 6, 'a': 5, 'accusamus': 5, 'amet': 5, 'animi': 5, 'aperiam': 5, 'architecto': 5, 'asperiores': 5, 'autem': 5, 'cum': 5, 'dicta': 5, 'dolore': 5, 'doloribus': 5, 'earum': 5, 'eligendi': 5, 'eum': 5, 'excepturi': 5, 'exercitationem': 5, 'fugiat': 5, 'harum': 5, 'in': 5, 'laborum': 5, 'maxime': 5, 'molestiae': 5, 'nam': 5, 'natus': 5, 'perspiciatis': 5, 'quasi': 5, 'quia': 5, 'quo': 5, 'reprehenderit': 5, 'sed': 5, 'sequi': 5, 'sint': 5, 'vel': 5, 'voluptate': 5, 'voluptatum': 5, 'at': 4, 'corporis': 4, 'culpa': 4, 'deserunt': 4, 'ducimus': 4, 'eius': 4, 'error': 4, 'expedita': 4, 'facilis': 4, 'fugit': 4, 'ipsam': 4, 'iste': 4, 'labore': 4, 

In [326]:
# Reference run with scikit-learn's implementation, for comparison with the
# hand-written class defined earlier in this notebook.  The import is aliased
# so it no longer rebinds the name ``CountVectorizer``; without the alias,
# re-running the earlier demo cell would silently use scikit-learn's class.
from sklearn.feature_extraction.text import CountVectorizer as SklearnCountVectorizer

countvec = SklearnCountVectorizer()
countvec.fit(X_train)

# In scikit-learn, vocabulary_ maps each term to its column index in the
# encoded matrix (not to an occurrence count).
vocab = countvec.vocabulary_
print("Vocabulary:", vocab)

# transform() returns a sparse matrix; convert it to a dense array to print.
vector = countvec.transform(X_test)
print("Encoded Document is:")
print(vector.toarray())

Vocabulary: {'lorem': 105, 'ipsum': 93, 'dolor': 45, 'sit': 177, 'amet': 11, 'consectetur': 28, 'adipisicing': 5, 'elit': 59, 'sed': 173, 'do': 44, 'eiusmod': 57, 'tempor': 181, 'incididunt': 88, 'ut': 190, 'labore': 99, 'et': 65, 'dolore': 46, 'magna': 106, 'aliqua': 7, 'enim': 60, 'ad': 3, 'minim': 111, 'veniam': 193, 'quis': 157, 'nostrud': 128, 'exercitation': 72, 'ullamco': 188, 'laboris': 101, 'nisi': 125, 'aliquip': 10, 'ex': 69, 'ea': 53, 'commodo': 27, 'consequat': 29, 'duis': 52, 'aute': 21, 'irure': 94, 'in': 87, 'reprehenderit': 168, 'voluptate': 198, 'velit': 192, 'esse': 63, 'cillum': 25, 'eu': 66, 'fugiat': 79, 'nulla': 130, 'pariatur': 140, 'excepteur': 70, 'sint': 176, 'occaecat': 133, 'cupidatat': 36, 'non': 127, 'proident': 147, 'sunt': 179, 'culpa': 33, 'qui': 153, 'officia': 136, 'deserunt': 40, 'mollit': 116, 'anim': 12, 'id': 83, 'est': 64, 'laborum': 102, 'architecto': 15, 'neque': 122, 'numquam': 131, 'cum': 34, 'perspiciatis': 142, 'molestiae': 114, 'tenetur':