# Install and Import Required Libraries
Install the `datasets` library if not already installed and import the necessary modules.

In [15]:
# Install the datasets library if not already installed
# (use %pip rather than !pip so the install targets this kernel's environment)
# %pip install datasets

# Import necessary modules
from datasets import load_dataset, DatasetDict
import os

# Load the Dataset
Use the `datasets` library to fetch and load the dataset from the Hugging Face Hub.

In [3]:
# Fetch the dataset from the Hugging Face Hub
dataset_name = "enelpol/rag-mini-bioasq"  # Hub repository ID; swap in another dataset if needed
dataset = load_dataset(
    dataset_name,
    # Second positional argument selects the dataset configuration to load.
    # NOTE(review): presumably specific to this repository -- adjust it if dataset_name changes.
    'question-answer-passages',
)

# Split the Dataset into Train, Test, and Validation
Hold out 20% of the original `train` split as a validation set using the `train_test_split` method (with a fixed seed so the split is reproducible), and keep the original `test` split unchanged.

In [9]:
# The original test split is used as-is
test_dataset = dataset["test"]

# Carve a validation set out of the original train split:
# 20% goes to validation, and the fixed seed makes the split repeatable across runs
train_validation_split = dataset["train"].train_test_split(seed=42, test_size=0.2)
validation_dataset = train_validation_split["test"]  # the held-out 20%
train_dataset = train_validation_split["train"]  # the remaining 80%


# Create a DatasetDict
Combine the train, test, and validation subsets into a `DatasetDict` object for further processing.

In [13]:
# Gather the three subsets under their split names (train, test, validation)
splits = {
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset,
}
dataset_dict = DatasetDict(splits)

# Print a summary of the combined DatasetDict
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'id', 'relevant_passage_ids'],
        num_rows: 3209
    })
    test: Dataset({
        features: ['question', 'answer', 'id', 'relevant_passage_ids'],
        num_rows: 707
    })
    validation: Dataset({
        features: ['question', 'answer', 'id', 'relevant_passage_ids'],
        num_rows: 803
    })
})


In [14]:
# Persist all splits under a single output directory
output_dir = "data/prepared_data"
os.makedirs(output_dir, exist_ok=True)  # no error if the directory is already there
dataset_dict.save_to_disk(output_dir)  # writes every split in dataset_dict to output_dir

Saving the dataset (1/1 shards): 100%|██████████| 3209/3209 [00:00<00:00, 36925.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 707/707 [00:00<00:00, 135491.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 803/803 [00:00<00:00, 161963.27 examples/s]
