In [None]:
!pip install -q transformers datasets evaluate scikit-learn pyarrow accelerate sentencepiece bitsandbytes

In [None]:
import ast
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
)
import torch

In [None]:
# Path to the QA dataset in Parquet format
QA = "Per_Creare_Dataset_Piccoli/DB_QC_A_da_utilizzare.parquet"

# Split proportions, each expressed as a fraction of the FULL dataset.
# The resulting split is 70% train / 20% validation / 10% test.
TEST_FRACTION = 0.1
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42  # fixed seed so both splits are reproducible across runs

# Load the dataset into a pandas DataFrame
qa_dataset = pd.read_parquet(QA)

# 1. Hold out the test set (10% of the full data); the remaining 90% is train+validation.
train_val, TEST_df = train_test_split(
    qa_dataset,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# 2. Carve the validation set out of the remaining 90%.
#    `test_size` is relative to `train_val`, so the validation fraction is rescaled:
#    0.2 / 0.9 ≈ 22.2% of `train_val`, which is 20% of the full dataset.
TRAIN_df, VALIDATION_df = train_test_split(
    train_val,
    test_size=VALIDATION_FRACTION / (1 - TEST_FRACTION),
    random_state=SPLIT_SEED,
)

In [None]:
# Name of the pretrained checkpoint that both the tokenizer and the model are loaded from.
BASE_CHECKPOINT = "t5-small"

# Instantiate the tokenizer and the seq2seq base model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

In [None]:
def tokenize_function(examples, tokenizer, max_input_length=512, max_target_length=128):
    """Tokenize inputs and targets and attach loss-ready labels.

    Args:
        examples: Mapping with an "input_text" entry (model inputs) and an
            "answer" entry (targets). With `Dataset.map(..., batched=True)`
            each entry is a list of strings.
        tokenizer: Callable tokenizer that accepts `text_target=` and exposes
            `pad_token_id`.
        max_input_length: Length the inputs are truncated/padded to.
        max_target_length: Length the targets are truncated/padded to.

    Returns:
        The tokenizer output for the inputs, with an additional "labels" entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so the cross-entropy loss ignores them.
    """
    inputs = examples["input_text"]
    targets = examples["answer"]

    # Inputs are truncated and padded to a fixed length.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # Targets go through `text_target=` so the tokenizer treats them as labels.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding="max_length")

    # Because the labels are padded here, the data collator adds no further
    # padding and therefore never inserts -100 itself. Without this masking the
    # loss would also be computed on the padding positions.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        return [-100 if token == pad_id else token for token in sequence]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single-example call: a flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)

    return model_inputs

def _build_split(frame):
    """Wrap one pandas split in a `Dataset` and tokenize it in batched mode."""
    return Dataset.from_pandas(frame).map(
        lambda batch: tokenize_function(batch, tokenizer),
        batched=True,
    )


# Tokenize the three splits in the same order as before: train, validation, test.
train_dataset, val_dataset, test_dataset = (
    _build_split(split_df) for split_df in (TRAIN_df, VALIDATION_df, TEST_df)
)

# Collator that assembles seq2seq batches; it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Directory used both for training checkpoints and for the final saved model/tokenizer.
OUTPUT_DIR = 'fine_tuned_model'

# Mixed-precision selection.
# T5 checkpoints are known to overflow (NaN/inf loss) under fp16, so bf16 is
# preferred whenever the GPU supports it; fp16 is kept only as a fallback for
# older GPUs. On CPU both flags stay False.
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.is_bf16_supported()
_use_fp16 = _cuda_available and not _use_bf16

# Training arguments for the Seq2Seq model (e.g., T5)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,  # Where checkpoints are written
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed (unpinned) version.
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Checkpoint at the end of each epoch (must match the evaluation strategy for load_best_model_at_end)
    learning_rate=3e-4,  # Peak learning rate reached after warm-up
    per_device_train_batch_size=8,  # Training batch size per device
    per_device_eval_batch_size=16,  # Evaluation batch size per device
    weight_decay=0.01,  # Weight-decay regularization
    num_train_epochs=20,  # Upper bound on epochs; early stopping may end training sooner
    predict_with_generate=True,  # Use `generate` when predictions (not just the loss) are requested
    generation_max_length=128,  # Maximum length of generated sequences
    bf16=_use_bf16,  # Preferred mixed-precision mode for T5 (see note above)
    fp16=_use_fp16,  # Fallback mixed-precision mode when bf16 is unavailable
    logging_steps=50,  # Log training metrics every 50 steps
    save_total_limit=2,  # Keep at most 2 checkpoints on disk
    load_best_model_at_end=True,  # Reload the best checkpoint once training finishes
    metric_for_best_model="eval_loss",  # Metric used to rank checkpoints
    greater_is_better=False,  # Lower eval_loss is better
    report_to="none",  # Disable reporting to external platforms like WandB
    optim="adafactor",  # Adafactor optimizer
    gradient_accumulation_steps=2,  # Effective train batch = 8 * 2 per device
    # Inputs are padded to a fixed length during tokenization, so grouping by
    # length has little effect on batch composition here.
    group_by_length=True,
    warmup_steps=100  # Number of warm-up steps for the learning rate scheduler
)

# Initialize the Seq2Seq trainer with the model, training arguments, datasets, tokenizer, and data collator
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Stop when eval_loss has not improved by at least 0.001 for 4 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=0.001)]
)

# Start the training process
trainer.train()

# Save the fine-tuned model and tokenizer to the output directory
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


In [None]:
# Directory the fine-tuned model and tokenizer were saved to.
MODEL_DIR = 'fine_tuned_model'
# Number of test examples sent through `generate` at once. Running the whole
# test set as a single batch with beam search can exhaust GPU/CPU memory.
GENERATION_BATCH_SIZE = 16
# Number of examples shown in the qualitative comparison below.
MAX_EXAMPLES_TO_PRINT = 10

# Load the tokenizer and the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use the GPU when available
model.to(device)
model.eval()  # Evaluation mode (disables dropout)

# Generate predictions for the whole test set, one mini-batch at a time
generated_answers = []
with torch.no_grad():  # No gradients are needed for inference
    for start in range(0, len(test_dataset), GENERATION_BATCH_SIZE):
        # Slicing a Dataset returns a dict of column -> list for those rows.
        batch = test_dataset[start:start + GENERATION_BATCH_SIZE]
        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,  # Maximum length of each generated sequence
            num_beams=4,  # Beam search with 4 beams
            early_stopping=True,  # Stop the beam search once `num_beams` finished candidates exist
            # Exponent applied to the sequence length that divides the beam score.
            # Positive values promote longer sequences; 0.6 normalizes less
            # strongly than the default of 1.0.
            length_penalty=0.6,
            no_repeat_ngram_size=2,  # Never repeat the same 2-gram within an output
            # `temperature` is intentionally not passed: it only applies when
            # sampling (`do_sample=True`) and is ignored by beam search.
        )

        # Decode the generated ids of this batch into text
        generated_answers.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Strip the "answer the question: " task prefix from the inputs for display
questions_contexts = [text.replace("answer the question: ", "") for text in test_dataset["input_text"]]
real_answers = test_dataset["answer"]  # Reference answers from the test set

# Display the model's output alongside the real answers for evaluation
print("\nTest of the T5_Model:\n")
for i, (qc, gen, real) in enumerate(zip(questions_contexts, generated_answers, real_answers)):
    if i >= MAX_EXAMPLES_TO_PRINT:  # Only show the first few examples
        break

    print(f"### Example {i + 1} ###")
    print(f"[Input]:\n{qc}")
    print(f"\n[Generated Answer]:\n{gen}")
    print(f"\n[Real Answer]:\n{real}")
    print("\n" + "=" * 60 + "\n")  # Separator between examples
