In [None]:
# Install into the environment of the running kernel: %pip targets the kernel's
# interpreter, whereas !pip runs whichever pip is first on the shell PATH.
%pip install tabulate

In [None]:
import pandas as pd
import ast
from tabulate import tabulate

In [None]:
# Locations of the input Parquet files
Q_A_PATH = "dataset_Q_A_small.parquet"
TESTI_PATH = "dataset_CONTESTI_small.parquet"

# Read both datasets into DataFrames: the Q&A pairs and the passages
Q_A_DataSet = pd.read_parquet(Q_A_PATH)
TESTI_DataSet = pd.read_parquet(TESTI_PATH)

# Lookup table mapping each passage 'id' to its 'passage' text.
# Built column-wise with zip instead of iterrows(), which constructs a Series
# per row and is much slower; if an id appears more than once the last
# occurrence wins, exactly as with the row-by-row version.
Testi_dizionario = dict(zip(TESTI_DataSet["id"], TESTI_DataSet["passage"]))


In [None]:
def add_context_to_qa_full(qa_dataset, passage_dict):
    """
    Build the model input text for every row of the Q&A dataset.

    Each row's question is combined with the passages referenced by its
    "relevant_passage_ids" entry. The resulting text has two lines:

        Question: <question>
        Context: <passage1> ; <passage2> ; ...

    (the first line ends with a single space before the line break).

    Args:
        qa_dataset: DataFrame with a "question" column and a
            "relevant_passage_ids" column. Each passage-id entry is either an
            iterable of ids or a string holding a Python literal of one
            (parsed with ast.literal_eval). A None entry is treated as
            "no passages".
        passage_dict: Mapping from passage id to passage text. Ids that are
            not in the mapping contribute an empty string to the context.

    Returns:
        A new DataFrame equal to qa_dataset plus an "input_text" column.
        The DataFrame passed in is left unmodified.
    """

    def build_input_text(question, passage_ids):
        # Normalise the id entry into something iterable
        if passage_ids is None:
            passage_ids = []
        elif isinstance(passage_ids, str):
            passage_ids = ast.literal_eval(passage_ids)  # Convert string to list

        # Look up every passage; unknown ids become empty strings
        passages = [passage_dict.get(pid, "") for pid in passage_ids]
        context_str = " ; ".join(passages)

        return f"Question: {question} \nContext: {context_str}"

    # Iterate over the two needed columns directly. Unlike apply(axis=1), this
    # also works on an empty DataFrame (where apply returns a DataFrame that
    # cannot be assigned to a single column) and avoids building a Series per row.
    input_texts = [
        build_input_text(question, passage_ids)
        for question, passage_ids in zip(
            qa_dataset["question"], qa_dataset["relevant_passage_ids"]
        )
    ]

    # assign() returns a copy, so the caller's DataFrame is not mutated
    return qa_dataset.assign(input_text=input_texts)


# Enrich the Q&A dataset with the formatted "input_text" column (question + passages)
Q_A_DataSet = add_context_to_qa_full(
    qa_dataset=Q_A_DataSet,
    passage_dict=Testi_dizionario,
)


In [None]:
# Keep only the model input ("input_text": question + context) and the target "answer"
final_dataset = Q_A_DataSet.loc[:, ["input_text", "answer"]]

# Show the first three rows as a psql-style table, without the index column
preview_rows = final_dataset.head(3)
preview_table = tabulate(preview_rows, headers="keys", tablefmt="psql", showindex=False)
print(preview_table)

# Persist the resulting dataset as a Parquet file (index not written)
final_dataset.to_parquet("DB_QC_A_da_utilizzare.parquet", index=False)
