In [None]:
!pip install -q transformers datasets evaluate scikit-learn pyarrow accelerate sentencepiece bitsandbytes

In [None]:
import ast
import inspect

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
)

In [None]:
# Path to the QA dataset in Parquet format
QA = "Per_Creare_Dataset_Piccoli/DB_QC_A_da_utilizzare.parquet"

# Split configuration. Both fractions are expressed relative to the FULL
# dataset, so the resulting split is roughly 70% train / 20% validation / 10% test.
TEST_FRACTION = 0.1
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

# Load the dataset into a pandas DataFrame
qa_dataset = pd.read_parquet(QA)

# Fail early, with a clear message, if the columns consumed by the tokenization
# step ("input_text" and "answer") are not present in the file.
missing_columns = {"input_text", "answer"} - set(qa_dataset.columns)
if missing_columns:
    raise KeyError(f"Dataset {QA!r} is missing required columns: {sorted(missing_columns)}")

# 1. Hold out the test set (TEST_FRACTION of all rows).
train_val, TEST_df = train_test_split(
    qa_dataset,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# 2. Carve the validation set out of the remaining rows. The fraction is
#    rescaled by the size of the remainder so that the validation set ends up
#    being VALIDATION_FRACTION of the ORIGINAL dataset (0.2 / 0.9 of train_val),
#    not 20% of the remaining 90%.
TRAIN_df, VALIDATION_df = train_test_split(
    train_val,
    test_size=VALIDATION_FRACTION / (1 - TEST_FRACTION),
    random_state=SPLIT_SEED,
)

In [None]:
# Hub checkpoint shared by the tokenizer and the seq2seq model (BART base).
BASE_CHECKPOINT = "facebook/bart-base"

# The tokenizer is capped at 512 tokens; the model weights come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

In [None]:
def tokenize_function(examples, tokenizer, max_input_length=512, max_target_length=128):
    """Tokenize inputs and targets for seq2seq training.

    Args:
        examples: Mapping with an "input_text" entry (question + context) and
            an "answer" entry. Works on a batch (lists of strings) or on a
            single example (plain strings).
        tokenizer: Callable tokenizer; it is called once with the inputs and
            once with ``text_target=`` for the answers. Its ``pad_token_id``
            attribute, when present, identifies padding in the labels.
        max_input_length: Length the inputs are truncated/padded to.
        max_target_length: Length the targets are truncated/padded to.

    Returns:
        The tokenizer output for the inputs, with an extra "labels" entry that
        holds the target token ids. Padding positions in the labels are set to
        -100 so that they are ignored by the loss.
    """
    # Inputs are padded to a fixed length so every row has the same shape.
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # Targets are tokenized through the `text_target` path of the tokenizer.
    labels = tokenizer(
        text_target=examples["answer"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    label_ids = labels["input_ids"]
    pad_id = getattr(tokenizer, "pad_token_id", None)

    if pad_id is not None:
        # Because the labels are padded here (and not by the data collator),
        # the pad ids must be replaced by -100, the index the cross-entropy
        # loss ignores; otherwise the model is also trained to predict padding.
        def _mask_padding(sequence):
            return [-100 if token == pad_id else token for token in sequence]

        if label_ids and isinstance(label_ids[0], (list, tuple)):
            label_ids = [_mask_padding(sequence) for sequence in label_ids]  # batched call
        else:
            label_ids = _mask_padding(label_ids)  # single-example call

    model_inputs["labels"] = label_ids
    return model_inputs



def _tokenize_split(frame):
    """Convert one pandas split into a tokenized `Dataset`.

    `preserve_index=False` keeps the shuffled pandas index produced by the
    train/validation/test split from being stored as an extra
    `__index_level_0__` column in the dataset.
    """
    return Dataset.from_pandas(frame, preserve_index=False).map(
        tokenize_function,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )


# Prepare datasets by applying the tokenization function to each split
train_dataset = _tokenize_split(TRAIN_df)
val_dataset = _tokenize_split(VALIDATION_df)
test_dataset = _tokenize_split(TEST_df)

# Data collator to manage batching for sequence-to-sequence tasks; passing the
# model lets the collator ask it to build the decoder inputs from the labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Define training arguments for the Seq2Seq model.
#
# The argument that schedules evaluation is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Since the
# install cell does not pin a version, pick whichever name the installed
# release actually accepts instead of hard-coding one of them.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir='fine_tuned_model',  # Directory for checkpoints and the final model
    save_strategy="epoch",  # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,  # Reload the best checkpoint once training finishes
    metric_for_best_model="eval_loss",  # Metric used to rank checkpoints
    greater_is_better=False,  # Lower eval_loss is better
    save_total_limit=2,  # Keep at most 2 checkpoints on disk
    num_train_epochs=10,  # Upper bound on epochs (early stopping may end sooner)
    per_device_train_batch_size=16,  # Training batch size per device
    fp16=torch.cuda.is_available(),  # Mixed precision only when CUDA is available
    logging_steps=50,  # Log training information every 50 steps
    report_to="none",  # Do not report to external trackers (WandB, TensorBoard, ...)
    **{_eval_strategy_arg: "epoch"},  # Evaluate at the end of each epoch (must match save_strategy)
)

# Initialize the Seq2SeqTrainer with the model, training arguments, datasets,
# tokenizer, and callbacks.
#
# Newer transformers releases take the tokenizer through `processing_class`,
# older ones through `tokenizer`. Use the name the installed release exposes so
# the cell works with whatever version the unpinned install provided.
_trainer_arg_names = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_arg = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,  # The model to train
    args=training_args,  # The training arguments defined above
    train_dataset=train_dataset,  # The training dataset
    eval_dataset=val_dataset,  # The validation dataset
    data_collator=data_collator,  # The collator that handles batching of data
    # Stop when the monitored metric has not improved for 2 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_arg: tokenizer},  # The tokenizer used during training
)

# Start training the model
trainer.train()

# Save the fine-tuned model and tokenizer to disk
trainer.save_model('fine_tuned_model')  # Save the trained model
tokenizer.save_pretrained('fine_tuned_model')  # Save the tokenizer


In [None]:
# Load the tokenizer and model from the fine-tuned directory
tokenizer = AutoTokenizer.from_pretrained('fine_tuned_model')
model = AutoModelForSeq2SeqLM.from_pretrained('fine_tuned_model')

# Set the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the chosen device (GPU/CPU)

# Set the model to evaluation mode (turn off dropout, etc.)
model.eval()

# Number of test examples sent through `generate` at once. Generating for the
# whole test split in a single batch needs memory proportional to its size and
# easily runs out of GPU/CPU memory, so the split is processed in chunks.
GENERATION_BATCH_SIZE = 16

generated_answers = []

# Generate predictions (no gradients required during inference)
with torch.no_grad():
    for start in range(0, len(test_dataset), GENERATION_BATCH_SIZE):
        # Slicing a Dataset returns a dict of column name -> list of row values
        batch = test_dataset[start:start + GENERATION_BATCH_SIZE]
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,  # Maximum length of the generated response
            num_beams=4,  # Beam width: number of hypotheses kept at each step
            early_stopping=True  # Stop once enough finished beam candidates exist
        )

        # Decode the generated responses of this batch from token IDs to text
        generated_answers.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Maximum number of test examples printed for manual inspection
MAX_EXAMPLES_TO_PRINT = 10

# Get the original questions/contexts and reference answers from the test dataset
questions_contexts = test_dataset["input_text"]
real_answers = test_dataset["answer"]

# Print the test results for inspection
print("\nModel Test Results:\n")
for i, (qc, gen, real) in enumerate(zip(questions_contexts, generated_answers, real_answers)):
    if i >= MAX_EXAMPLES_TO_PRINT:  # Limit the output to the first few examples
        break

    # Print each example's question, generated answer, and real answer
    print(f"### Example {i + 1} ###")
    print(f"[Question + Context]:\n{qc}")
    print(f"\n[Generated Answer]:\n{gen}")
    print(f"\n[Real Answer]:\n{real}")
    print("\n" + "-" * 50 + "\n")