In [2]:
# Relative path to the parquet file that a later cell passes to
# FirstEssaySetDataset.
DATASET = "../artifacts/derived/first_essay_set/dataset.parquet"

In [3]:
import sys
sys.path.append("../")

from scorer.data import FirstEssaySetDataset, TextCleaner
from torch.utils.data import DataLoader, Dataset

In [4]:
# Build the dataset from the parquet file at DATASET, handing it a TextCleaner
# instance. What the cleaner does to the text is defined in scorer.data and is
# not visible in this notebook.
dataset = FirstEssaySetDataset(DATASET, TextCleaner())

In [9]:
# Iterate the dataset one sample per batch, in shuffled order.
# NOTE(review): no seed or generator is passed, so the order presumably differs
# between runs -- confirm whether anything downstream needs a reproducible order.
loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [28]:
import math

class TfidfVectorizer:
    """Per-token inverse-document-frequency encoder over whitespace-split text.

    ``fit`` learns ``idf(term) = log(n_documents / document_frequency(term))``
    for every term in the training documents. ``encode`` then replaces each
    token of a document with its learned IDF weight.

    Note: despite the class name, no term-frequency factor is applied, and the
    encoded documents are variable-length (one weight per token, in token
    order) rather than fixed-size vocabulary vectors.

    Attributes:
        word_inverse_document_frequency: mapping term -> IDF weight. Empty
            until ``fit`` has been called.
    """

    def __init__(self) -> None:
        self.word_inverse_document_frequency = {}

    def fit(self, documents: list[str]) -> None:
        """Learn IDF weights from ``documents``, replacing any previous fit.

        Args:
            documents: training texts; each is tokenised with ``str.split()``.
                An empty list leaves the vocabulary empty.
        """
        n_documents = len(documents)
        # Start from a clean slate: without this, a second fit() would add
        # document counts on top of the IDF floats stored by the previous
        # fit and produce meaningless weights.
        self.word_inverse_document_frequency.clear()
        self._init_word_inverse_document_frequency(documents)
        # Only values are reassigned (no keys added or removed), so updating
        # the dict while iterating over it is safe.
        for term, count in self.word_inverse_document_frequency.items():
            self.word_inverse_document_frequency[term] = math.log(n_documents / count)

    def _init_word_inverse_document_frequency(self, documents: list[str]) -> None:
        """Add each term's document frequency to the mapping.

        A term is counted at most once per document, no matter how often it
        occurs inside that document.
        """
        frequencies = self.word_inverse_document_frequency
        for document in documents:
            # dict.fromkeys de-duplicates while keeping first-occurrence order,
            # so the vocabulary's insertion order stays deterministic.
            for term in dict.fromkeys(document.split()):
                frequencies[term] = frequencies.get(term, 0) + 1

    def encode(self, documents: list[str]) -> list[list[float]]:
        """Map every token of every document to its learned IDF weight.

        Args:
            documents: texts to encode; each is tokenised with ``str.split()``.

        Returns:
            One list per document containing one float per token, in token
            order. A document with no tokens yields an empty list.

        Raises:
            KeyError: if a token was not seen during ``fit``.
        """
        weights = self.word_inverse_document_frequency
        return [
            [weights[term] for term in document.split()]
            for document in documents
        ]

In [29]:
# Collect element [0][0] of every batch yielded by the loader.
# NOTE(review): presumably this is the essay text of the single sample in each
# batch (batch_size=1) -- verify against FirstEssaySetDataset.__getitem__.
corpus = [data[0][0] for data in loader]
# Show how many documents were collected.
len(corpus)

1783

In [30]:
# Learn per-term IDF weights from every document in the corpus.
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [33]:
# Encode the first corpus document; the result is a one-element list holding
# that document's per-token IDF weights.
output = vectorizer.encode([corpus[0]])