In [None]:
!pip install lorem-text

In [1]:
import re
import numpy as np
import pandas as pd

from typing import Optional
from lorem_text import lorem

In [17]:
class CountVectorizer():
    def __init__(self,) -> None:
        self.bag_of_words: list = []
    
    def fit(self, dataset: list[str]) -> None:
        """
        Builds the Bag of Words.

        Args:
            dataset (str): Dataset that contains the main words.
        """
        dataset = " ".join(dataset).lower()
        self.bag_of_words = dict(zip(*self._get_unique_words(dataset, counts = True)))

    def _get_unique_words(self, data: list[str], counts: Optional[bool] = False) -> np.ndarray | tuple[np.ndarray]:
        words = re.findall(r'\b\w+\b', data.lower())
        return np.unique(words, return_counts = counts)
    
    @property
    def vocabulary_(self) -> dict:
        return {k: v for k, v in sorted(self.bag_of_words.items(), key=lambda item: item[1], reverse = True)}

    def transform(self, data: list[str], normalize: Optional[str] = None) -> list:
        vector = []
        
        for sentence in data:
            unique_words, counts = self._get_unique_words(sentence, counts = True)
            transformed_words = dict(zip(unique_words, counts))
            print(transformed_words)
            transformed_counts = [transformed_words.get(word, 0) for word, _ in self.bag_of_words.items()]

            vector.append(transformed_counts)

        matrix = np.array(vector)
        if normalize:
            matrix = self._normalize(matrix, normalize)
        return matrix
    
    def _normalize(self, matrix: np.ndarray, norm: str) -> np.ndarray:
        """Normalize the matrix with L1 or L2 norm."""
        if not norm:
            return matrix 
        
        if norm == 'l1':
            return matrix / np.sum(np.abs(matrix), axis=1, keepdims=True)
        elif norm == 'l2':
            return matrix / np.sqrt(np.sum(matrix**2, axis=1, keepdims=True))
        return matrix
        

In [18]:
# Training corpus: a list holding one document of generated lorem-ipsum text.
# NOTE(review): presumably 10 paragraphs of random filler, so the vocabulary
# may differ between runs — confirm against the lorem_text documentation.
X_train = [lorem.paragraphs(10)]
# Single short document that is encoded against the fitted vocabulary below.
X_test = ["Lorem ipsum is dolor"]

In [19]:
# Demo of the CountVectorizer class defined earlier in this notebook.
# NOTE(review): a later cell imports scikit-learn's CountVectorizer under the
# same name; re-running this cell after that one would use scikit-learn's
# class instead.
countvec = CountVectorizer()
countvec.fit(X_train)

# Learned vocabulary: word -> count in the training corpus, most frequent first.
vocab = countvec.vocabulary_
print("Vocabulary:", vocab)

# Encode the test document; each row holds the counts for one document.
vector = countvec.transform(X_test)
print("Encoded Document is:")

# Print one row of counts per encoded document.
for counts in vector:
    print(counts)

Vocabulary: {'culpa': 9, 'modi': 9, 'accusantium': 8, 'inventore': 8, 'cum': 7, 'dolore': 7, 'id': 7, 'iste': 7, 'itaque': 7, 'numquam': 7, 'officiis': 7, 'sit': 7, 'alias': 6, 'consequuntur': 6, 'corrupti': 6, 'deserunt': 6, 'enim': 6, 'eos': 6, 'esse': 6, 'fugiat': 6, 'fugit': 6, 'illo': 6, 'in': 6, 'laborum': 6, 'magnam': 6, 'molestiae': 6, 'nulla': 6, 'officia': 6, 'provident': 6, 'reprehenderit': 6, 'ullam': 6, 'vero': 6, 'at': 5, 'commodi': 5, 'dolor': 5, 'doloremque': 5, 'earum': 5, 'ex': 5, 'labore': 5, 'maiores': 5, 'maxime': 5, 'minus': 5, 'neque': 5, 'omnis': 5, 'possimus': 5, 'quod': 5, 'quos': 5, 'recusandae': 5, 'rem': 5, 'similique': 5, 'tenetur': 5, 'voluptas': 5, 'voluptatum': 5, 'aliquid': 4, 'aut': 4, 'beatae': 4, 'consectetur': 4, 'cumque': 4, 'dignissimos': 4, 'dolores': 4, 'error': 4, 'est': 4, 'et': 4, 'excepturi': 4, 'illum': 4, 'incidunt': 4, 'ipsam': 4, 'ipsum': 4, 'molestias': 4, 'nihil': 4, 'non': 4, 'odit': 4, 'optio': 4, 'pariatur': 4, 'perferendis': 4, 'p

In [20]:
# Reference run with scikit-learn's implementation on the same data.
# NOTE: this import rebinds the name `CountVectorizer`, so from here on it
# refers to scikit-learn's class rather than the one defined in this notebook.
from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn's `fit` returns the fitted estimator, so it can be chained.
countvec = CountVectorizer().fit(X_train)

# Here the vocabulary maps each word to its column index in the encoded matrix.
vocab = countvec.vocabulary_
print("Vocabulary:", vocab)

# `transform` yields a sparse matrix; convert it to a dense array for display.
vector = countvec.transform(X_test)
print("Encoded Document is:", vector.toarray(), sep="\n")

Vocabulary: {'lorem': 107, 'ipsum': 95, 'dolor': 47, 'sit': 178, 'amet': 10, 'consectetur': 28, 'adipisicing': 4, 'elit': 62, 'sed': 174, 'do': 46, 'eiusmod': 60, 'tempor': 182, 'incididunt': 90, 'ut': 191, 'labore': 101, 'et': 68, 'dolore': 48, 'magna': 108, 'aliqua': 6, 'enim': 63, 'ad': 2, 'minim': 113, 'veniam': 194, 'quis': 159, 'nostrud': 131, 'exercitation': 75, 'ullamco': 189, 'laboris': 103, 'nisi': 128, 'aliquip': 9, 'ex': 72, 'ea': 56, 'commodo': 27, 'consequat': 29, 'duis': 55, 'aute': 21, 'irure': 96, 'in': 89, 'reprehenderit': 169, 'voluptate': 199, 'velit': 193, 'esse': 66, 'cillum': 25, 'eu': 69, 'fugiat': 82, 'nulla': 133, 'pariatur': 143, 'excepteur': 73, 'sint': 177, 'occaecat': 136, 'cupidatat': 37, 'non': 130, 'proident': 150, 'sunt': 180, 'culpa': 34, 'qui': 157, 'officia': 139, 'deserunt': 42, 'mollit': 119, 'anim': 11, 'id': 86, 'est': 67, 'laborum': 104, 'tenetur': 186, 'accusantium': 1, 'reiciendis': 165, 'soluta': 179, 'minus': 115, 'quisquam': 160, 'nobis': 