In [None]:
import numpy as np
from datasets import Dataset
from models import load_models
from rag_configuration import CFG
from typing import Any

In [None]:
# load_models() yields two values; only the first is kept (as the embedding
# model used by the cells below) and the second is discarded via `_`.
embedding_model, _ = load_models()

In [None]:
# Load the preprocessed RAG chunks from disk (the "no_embedding" variant, per the path).
dataset = Dataset.load_from_disk('data/preprocessed_rag_chunks_no_embedding')

# Text that will be embedded for each chunk: the headings wrapped in Markdown
# bold markers, a blank line, then the chunk's text content.
dataset = dataset.add_column(
    'embedded_text',
    [f"**{row['headings']}**\n\n{row['text_content']}" for row in dataset],
)

In [None]:
def calculate_embedding(batch: Any) -> Any:
    """
    Adds an 'embedding' entry to a batch, computed from the batch's 'embedded_text' entry.
    This function is specifically made to work with datasets.Dataset.map(batched=True).

    The texts are encoded with the module-level ``embedding_model`` using the task
    configured in ``CFG.task``, so both names must be defined before this is called.

    :param batch: A dictionary-like batch containing the key 'embedded_text', holding the texts
                  to encode. It is modified in place: the encoder output is stored under the
                  new key 'embedding'.
    :return: The same ``batch`` object that was passed in (not a bare array), now with the added
             'embedding' key. The stored value is whatever ``embedding_model.encode`` returns
             (presumably a numpy array of embedding vectors -- confirm against the model in use).
    """
    batch['embedding'] = embedding_model.encode(batch['embedded_text'], task=CFG.task)
    return batch

In [None]:
# Compute embeddings batch-wise (32 rows per call); calculate_embedding adds
# an 'embedding' column to each batch.
dataset = dataset.map(
    function=calculate_embedding,
    batched=True,
    batch_size=32,
)

# Persist the dataset, now including the embeddings, to disk.
dataset.save_to_disk('data/preprocessed_rag_chunks_text_and_heading')