# Install and Import Required Libraries

Install the `datasets` and `transformers` libraries if they are not already installed, then import the necessary modules.


In [11]:
# Install the datasets library if not already installed
# !pip install datasets

# Import necessary modules
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import os

# Load the Dataset

Use the `datasets` library to fetch and load the dataset from the Hugging Face Hub.


In [12]:
# Repository id on the Hugging Face Hub (swap in another id to use different data)
dataset_name = "enelpol/rag-mini-bioasq"
# Fetch the "question-answer-passages" configuration of that repository
dataset = load_dataset(path=dataset_name, name="question-answer-passages")

# Split the Dataset into Train, Test, and Validation

Split the original training split into training and validation subsets with the `train_test_split` method, and keep the dataset's existing test split unchanged.


In [13]:
# The test split that ships with the dataset is used as-is
test_dataset = dataset["test"]

# Carve a validation subset (test_size=0.1) out of the original train split;
# the seed is fixed so the split is the same on every run
train_validation_split = dataset["train"].train_test_split(seed=42, test_size=0.1)
validation_dataset = train_validation_split["test"]
train_dataset = train_validation_split["train"]

# Create a DatasetDict

Combine the train, test, and validation subsets into a `DatasetDict` object for further processing.


In [14]:
# Bundle the three subsets under the keys "train", "test" and "validation"
dataset_dict = DatasetDict(
    train=train_dataset,
    test=test_dataset,
    validation=validation_dataset,
)

# Show each split with its features and row count
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'id', 'relevant_passage_ids'],
        num_rows: 3610
    })
    test: Dataset({
        features: ['question', 'answer', 'id', 'relevant_passage_ids'],
        num_rows: 707
    })
    validation: Dataset({
        features: ['question', 'answer', 'id', 'relevant_passage_ids'],
        num_rows: 402
    })
})


# Save the Raw DatasetDict

Save the unprocessed `DatasetDict` to disk for future use.


In [15]:
# Keep the output location in one variable so the directory creation,
# the save call and the log message cannot drift apart
raw_dataset_path = "data/prepared_data"
os.makedirs(raw_dataset_path, exist_ok=True)  # Create directory if it doesn't exist

# Write the untokenized DatasetDict to disk
dataset_dict.save_to_disk(raw_dataset_path)
print(f"Raw dataset saved to {raw_dataset_path}")

Saving the dataset (1/1 shards): 100%|██████████| 3610/3610 [00:00<00:00, 166237.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 707/707 [00:00<00:00, 95285.27 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 402/402 [00:00<00:00, 55772.37 examples/s]

Raw dataset saved to data/prepared_data





# Preprocess the Dataset for Model Training

Tokenize the dataset for use with the T5-family model (`hmbyt5/byt5-small-english`). This step is required for fine-tuning because it converts the text into the token IDs the model expects.


In [16]:
# Checkpoint whose tokenizer is used for preprocessing
# (this should match the model in .env file)
model_name = "hmbyt5/byt5-small-english"

# Instantiate the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
# Define the preprocessing function
def preprocess_function(examples):
    """Turn a batch of question/answer pairs into tokenized inputs with labels.

    Args:
        examples: Batch mapping with "question" and "answer" entries; each
            question is concatenated onto a string prompt, so the entries are
            expected to be sequences of strings.

    Returns:
        The tokenizer output for the prompted questions, with an added
        "labels" entry holding the "input_ids" of the tokenized answers.
    """
    # Instruction text placed in front of every question.
    # NOTE(review): the line break inside the triple-quoted literal also embeds
    # the four spaces of indentation in the prompt; confirm this matches the
    # prompt used at inference time before changing it.
    system_prompt = """You are a helpful reading assistant who answers questions.
    Be concise. If you're unsure, just say that you don't know. \n\nQuestion: """

    prompted_questions = []
    for question in examples["question"]:
        prompted_questions.append(system_prompt + question)

    # Encode the prompted questions, truncating at max_length=512.
    # No padding is applied at this stage.
    model_inputs = tokenizer(prompted_questions, truncation=True, max_length=512)

    # Encode the answers as targets under the same length limit, again unpadded
    target_encoding = tokenizer(
        text_target=examples["answer"], truncation=True, max_length=512
    )

    # The answer token ids become the training labels
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [18]:
# Tokenize every split in batches and drop the original columns in the same
# pass, so only the tokenizer outputs and the labels remain.
# The column names are read from the train split rather than hard-coded;
# this assumes all splits share the same columns.
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
)

# Display the structure of the tokenized dataset
print(tokenized_datasets)

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

Map: 100%|██████████| 3610/3610 [00:00<00:00, 3716.28 examples/s]
Map: 100%|██████████| 707/707 [00:00<00:00, 4241.81 examples/s]
Map: 100%|██████████| 402/402 [00:00<00:00, 4164.63 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3610
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 707
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 402
    })
})





# Save the Tokenized Dataset

Save the preprocessed and tokenized `DatasetDict` to disk for use with the model training script.


In [19]:
# Destination folder for the tokenized splits
tokenized_dataset_path = "data/tokenized_data"
os.makedirs(tokenized_dataset_path, exist_ok=True)

# Switch the output format to PyTorch tensors, then write the splits to disk
tokenized_datasets.set_format(type="torch")
tokenized_datasets.save_to_disk(tokenized_dataset_path)
print(f"Tokenized dataset saved to {tokenized_dataset_path}")

Saving the dataset (1/1 shards): 100%|██████████| 3610/3610 [00:00<00:00, 533957.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 707/707 [00:00<00:00, 175299.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 402/402 [00:00<00:00, 181810.46 examples/s]

Tokenized dataset saved to data/tokenized_data



