In [None]:
# - 'datasets' from Hugging Face to load and manipulate datasets.
# - 'tabulate' for formatting and displaying tables in a readable format.
!pip install datasets tabulate

In [None]:
import ast

import pandas as pd
from datasets import load_dataset
from tabulate import tabulate


In [None]:
# Both configurations are loaded from the same Hugging Face dataset repository.
BIOASQ_REPO = "rag-datasets/rag-mini-bioasq"

# Question/answer configuration.
dataset_Q_A = load_dataset(BIOASQ_REPO, name="question-answer-passages")
# Text-corpus configuration.
dataset_testi = load_dataset(BIOASQ_REPO, name="text-corpus")

In [None]:
# Convert the loaded splits into pandas DataFrames: the 'test' split of the
# question/answer dataset and the 'passages' split of the text corpus.
# Nothing is written to disk in this cell.
df_Q_A = pd.DataFrame(dataset_Q_A['test'])
df_testi = pd.DataFrame(dataset_testi['passages'])

In [None]:
# Number of question/answer rows (samples) to keep. (CHANGE THE NUMBER OF ROWS AS NEEDED)
N_QA_ROWS = 500

# Keep only the first N_QA_ROWS rows of the 'df_Q_A' dataframe.
df_Q_A_small = df_Q_A.iloc[:N_QA_ROWS]

In [None]:
def _parse_passage_ids(raw_ids):
    """Return the passage IDs of one row as a Python list.

    String values (assumed to encode a list literal such as "[123, 456]") are
    parsed with ast.literal_eval, which only accepts Python literals and so
    cannot execute code embedded in the downloaded data the way eval() could.
    Values that are already list-like are copied into a list unchanged.
    """
    if isinstance(raw_ids, str):
        return ast.literal_eval(raw_ids)
    return list(raw_ids)


# Extract the relevant_passage_ids column as a Series holding one list per row
relevant_passage_ids = df_Q_A_small['relevant_passage_ids'].apply(_parse_passage_ids)

# Save the reduced question/answer dataset to a new Parquet file
df_Q_A_small.to_parquet('dataset_Q_A_small.parquet')


In [None]:
# Convert each ID in the lists to a string to avoid scientific notation
relevant_passage_ids = relevant_passage_ids.apply(lambda ids: [str(i) for i in ids])

# Create a new DataFrame where each ID of a row is placed in a separate column.
# Rows with fewer IDs than the longest list are padded with missing values.
df_relevant_passage_ids = pd.DataFrame(relevant_passage_ids.tolist(), dtype=object)

# Rename the columns to 'contesto 1', 'contesto 2', etc.
df_relevant_passage_ids.columns = [
    f'contesto {i+1}' for i in range(df_relevant_passage_ids.shape[1])
]

# 1. Extract all unique IDs from df_relevant_passage_ids.
# The table is flattened in one pass instead of being walked row by row with
# iterrows(); padding values are skipped and every ID is normalised to a
# stripped string before it is added to the set.
id_to_search = {
    str(value).strip()
    for value in df_relevant_passage_ids.to_numpy().ravel()
    if pd.notna(value)
}

# 2. Filter the text dataframe by checking if 'id' values are in the set of IDs to search.
# The 'id' column is normalised the same way as the IDs above so both sides compare as text.
corpus_ids_as_text = df_testi['id'].astype(str).str.strip()
filtered_df = df_testi[corpus_ids_as_text.isin(id_to_search)]

# 3. Remove any duplicates based on the 'id' column
filtered_df = filtered_df.drop_duplicates(subset=['id'])

In [None]:
# 4. Display the results
# Render the first rows of 'df_Q_A_small' as a 'psql'-style table, then print it.
qa_preview = tabulate(df_Q_A_small.head(5), headers='keys', tablefmt='psql')
print(qa_preview)

# Same for 'filtered_df', this time leaving the index out of the table.
passages_preview = tabulate(
    filtered_df.head(5),
    headers='keys',
    tablefmt='psql',
    showindex=False,
)
print(passages_preview)

# 5. Save the new file
# Write the filtered passages ('filtered_df') to a Parquet file.
filtered_df.to_parquet('dataset_CONTESTI_small.parquet')