In [None]:
!pip install -q transformers datasets evaluate scikit-learn pyarrow accelerate sentencepiece bitsandbytes peft wandb nltk rouge_score


In [None]:
import ast
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
)
import torch
# Ask PyTorch to release any cached, currently unused GPU memory before models are loaded.
torch.cuda.empty_cache()
from peft import LoraConfig, TaskType, get_peft_model, PeftModel, PeftConfig
import evaluate
import wandb
import nltk
# Download the NLTK 'punkt' tokenizer data.
# NOTE(review): presumably required by the ROUGE metric's tokenization - confirm it is still needed.
nltk.download('punkt')

# Start the Weights & Biases run that receives the training metrics (the trainer reports to
# "wandb") and the test metrics logged at the end of the notebook.
wandb.init(project="QA-FineTuning", name="BART_File-LoRA-Experiment")


In [None]:
# Location of the QA table (Parquet) and the DataFrame loaded from it
QA = "Per_Creare_Dataset_Piccoli/DB_QC_A_da_utilizzare.parquet"
qa_dataset = pd.read_parquet(QA)

# Two-stage split with a fixed seed so the partitions are reproducible.
# Stage 1: hold out 10% of all rows as the test set; the other 90% go on to stage 2.
train_val, TEST_df = train_test_split(qa_dataset, random_state=42, test_size=0.1)

# Stage 2: carve the validation set out of the remaining 90%.
# The fraction 0.2 / 0.9 (about 22.2% of the remainder) corresponds to 20% of the FULL
# dataset, so the overall proportions are roughly 70% train / 20% validation / 10% test.
validation_share = 0.2 / (1 - 0.1)
TRAIN_df, VALIDATION_df = train_test_split(
    train_val,
    random_state=42,
    test_size=validation_share,
)


In [None]:
# Checkpoint shared by the tokenizer and the seq2seq base model
BASE_CHECKPOINT = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

# LoRA (Low-Rank Adaptation) hyperparameters, collected in one place
lora_settings = dict(
    task_type=TaskType.SEQ_2_SEQ_LM,                  # sequence-to-sequence language modeling
    r=4,                                              # rank of the low-rank update matrices
    lora_alpha=32,                                    # scaling factor applied to the LoRA update
    lora_dropout=0.1,                                 # dropout on the LoRA layers
    target_modules=["q_proj", "v_proj"],              # attention query/value projections get adapters
    bias="none",                                      # bias terms are not trained
    modules_to_save=["lm_head", "final_layer_norm"],  # extra modules kept trainable and stored with the adapter
)
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters, then report how many parameters are trainable
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


In [None]:
def _mask_label_padding(token_ids, pad_token_id, ignore_index=-100):
    """Return a copy of `token_ids` with every `pad_token_id` replaced by `ignore_index`.

    Accepts either one sequence of ids or a batch (list of sequences).
    """
    if token_ids and isinstance(token_ids[0], (list, tuple)):
        return [_mask_label_padding(row, pad_token_id, ignore_index) for row in token_ids]
    return [ignore_index if token == pad_token_id else token for token in token_ids]


def tokenize_function(examples, tokenizer, max_input_length=512, max_target_length=128):
    """
    Tokenize a batch of QA examples for seq2seq training.

    Args:
        examples: Mapping with an "input_text" column (model inputs) and an "answer"
            column (targets).
        tokenizer: Tokenizer called on both columns; must expose `pad_token_id`.
        max_input_length: Inputs are truncated from the LEFT and padded to this length.
        max_target_length: Targets are truncated from the RIGHT and padded to this length.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry. Padding
        positions in "labels" are set to -100 so they are ignored by the loss; leaving
        the real pad id there would make the model train on predicting padding.

    The tokenizer's `truncation_side` is changed only for the duration of the call and
    is put back to its previous value afterwards.
    """
    # Remember the caller's setting; fall back to 'right' if the attribute is absent
    previous_side = getattr(tokenizer, "truncation_side", "right")
    try:
        # Inputs: drop tokens from the left when the text is too long
        tokenizer.truncation_side = 'left'
        model_inputs = tokenizer(
            examples["input_text"],
            max_length=max_input_length,
            truncation=True,
            padding="max_length",
        )

        # Targets: drop tokens from the right when the answer is too long
        tokenizer.truncation_side = 'right'
        labels = tokenizer(
            text_target=examples["answer"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
        )
    finally:
        tokenizer.truncation_side = previous_side

    # Mask padded label positions so the loss skips them
    model_inputs["labels"] = _mask_label_padding(labels["input_ids"], tokenizer.pad_token_id)
    return model_inputs

def _tokenize_split(frame):
    """Turn a pandas split into a `Dataset` and tokenize it in batches with `tokenize_function`."""
    return Dataset.from_pandas(frame).map(
        lambda batch: tokenize_function(batch, tokenizer),
        batched=True,
    )


# Tokenized train / validation / test datasets, built the same way for every split
train_dataset, val_dataset, test_dataset = (
    _tokenize_split(frame) for frame in (TRAIN_df, VALIDATION_df, TEST_df)
)

# Collator that assembles seq2seq batches for the trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [None]:
# Metric objects are loaded once and reused; loading them inside every evaluation call
# repeats the same setup work each epoch.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


# Function to calculate metrics during evaluation
def compute_metrics(eval_pred):
    """
    Compute BLEU and ROUGE for one evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. `predictions` may
            also be a tuple whose first element holds the generated token ids.

    Returns:
        dict with the keys "bleu", "rouge1", "rouge2" and "rougeL".
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an "ignore" marker, not a vocabulary id, and cannot be decoded. It is
    # present in the labels and may also appear in the gathered predictions, so map
    # it back to the pad token in both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of reference lists (one list of references per prediction)
    try:
        bleu_score = _get_metric("bleu").compute(
            predictions=decoded_preds,
            references=[[label] for label in decoded_labels],
        )["bleu"]
    except ZeroDivisionError:
        # Raised when every prediction is empty (can happen early in training);
        # score it as 0 instead of aborting the training run.
        bleu_score = 0.0

    # ROUGE takes one reference string per prediction
    rouge_results = _get_metric("rouge").compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )

    return {
        "bleu": bleu_score,
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"]
    }

# Training arguments for the Seq2Seq model.
# The evaluation-schedule keyword is `evaluation_strategy` in older transformers releases
# and `eval_strategy` in newer ones; use whichever name the installed version declares.
_EVAL_STRATEGY_KWARG = (
    "eval_strategy"
    if "eval_strategy" in getattr(Seq2SeqTrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir='fine_tuned_model',              # Directory where checkpoints are written
    save_strategy="epoch",                      # Save a checkpoint every epoch
    learning_rate=2e-4,                         # Learning rate for training
    per_device_train_batch_size=2,              # Batch size for training
    per_device_eval_batch_size=4,               # Batch size for evaluation
    num_train_epochs=20,                        # Maximum number of training epochs
    weight_decay=0.01,                          # Weight decay for regularization
    save_total_limit=2,                         # Limit the number of saved checkpoints
    predict_with_generate=True,                 # Evaluate on generated text (needed for BLEU/ROUGE)
    # Let evaluation generate up to the 128-token target length used at tokenization time.
    # Without this, generation falls back to the model's default maximum length, which can
    # be much shorter than the references and would distort the BLEU used for model selection.
    generation_max_length=128,
    fp16=torch.cuda.is_available(),             # Mixed-precision training whenever a CUDA device is present
    load_best_model_at_end=True,                # Reload the best checkpoint when training ends
    metric_for_best_model="bleu",               # Metric used to pick the best checkpoint
    greater_is_better=True,                     # Higher BLEU is better
    report_to="wandb",                          # Log metrics to Weights & Biases
    **{_EVAL_STRATEGY_KWARG: "epoch"},          # Evaluate every epoch (must match save_strategy)
)

# Stop training once the monitored metric has failed to improve by at least 0.01
# for 10 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.01,
)

# Assemble the trainer from the model, arguments, datasets and helpers defined above
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Run training and time it with a wall clock (this can take an hour or more,
# depending on the model and dataset size).
start_time = time.time()
trainer.train()
training_duration = time.time() - start_time
print(f"Training duration: {training_duration/60:.2f} minutes")

# Persist the fine-tuned (PEFT-wrapped) model and the tokenizer side by side in the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('fine_tuned_model')


In [None]:
# Directory written at the end of training (adapter weights, adapter config, tokenizer)
ADAPTER_DIR = 'fine_tuned_model'

# The adapter config records which base checkpoint the adapter was trained on
config = PeftConfig.from_pretrained(ADAPTER_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)

# Attach the trained adapter, then fold it into the base weights so inference
# runs on a plain seq2seq model without the PEFT wrapper.
model = PeftModel.from_pretrained(model, ADAPTER_DIR).merge_and_unload()

# Tokenizer saved alongside the adapter
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Keep the full test tensors on the CPU and move one batch at a time to the device.
# Sending the whole test set through 8-beam search in a single call can exhaust GPU memory.
input_ids = torch.tensor(test_dataset["input_ids"])
attention_mask = torch.tensor(test_dataset["attention_mask"])

GENERATION_BATCH_SIZE = 8  # examples per generate() call; lower this if memory is tight

model.eval()  # Evaluation mode (disables dropout)
outputs = []  # generated token ids, one list per test example
with torch.no_grad():  # No gradients are needed for inference
    for start in range(0, input_ids.shape[0], GENERATION_BATCH_SIZE):
        stop = start + GENERATION_BATCH_SIZE
        batch_outputs = model.generate(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_mask[start:stop].to(device),
            max_length=200,          # Maximum length of the generated output
            min_length=5,            # Minimum length, so answers are not trivially short
            num_beams=8,             # Beam search width (number of hypotheses kept per step)
            length_penalty=0.8,      # Exponent on sequence length when scoring beams; positive values
                                     # favor longer outputs, 0.8 slightly less so than the default 1.0
            no_repeat_ngram_size=2,  # Never repeat the same bigram within an answer
            early_stopping=True,     # Stop as soon as num_beams finished candidates exist
            do_sample=False,         # Deterministic beam search; sampling-only options
                                     # (temperature / top_k / top_p) have no effect and are omitted
        )
        outputs.extend(batch_outputs.cpu().tolist())

# Decode the generated answers (token ids -> text); padding and other special tokens are dropped
generated_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Raw text columns of the test split: model inputs (question + context) and gold answers
questions_contexts = test_dataset["input_text"]
real_answers = test_dataset["answer"]

# Load both metrics from the 'evaluate' library
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# BLEU takes a list of reference lists, so each gold answer is wrapped in its own list
bleu_references = [[real] for real in real_answers]
test_bleu = bleu_metric.compute(predictions=generated_answers, references=bleu_references)

# ROUGE takes one reference string per prediction
rouge_results = rouge_metric.compute(predictions=generated_answers, references=real_answers)

# Report the headline scores
print(f"\nTest BLEU: {test_bleu['bleu']}")
print(f"Test ROUGE-L: {rouge_results['rougeL']}")

# Send the test metrics to the Weights & Biases run for experiment tracking
test_metrics = {
    "test_bleu": test_bleu["bleu"],
    "test_rougeL": rouge_results["rougeL"],
    "test_rouge1": rouge_results["rouge1"],
    "test_rouge2": rouge_results["rouge2"]
}
wandb.log(test_metrics)

# Print some example inputs and outputs for manual inspection
MAX_DISPLAY_EXAMPLES = 20  # set to None to print every test example

print("\nModel Test Examples:\n")
for i, (qc, gen, real) in enumerate(zip(questions_contexts, generated_answers, real_answers)):
    # Stop once the display limit is reached (no limit when MAX_DISPLAY_EXAMPLES is None)
    if MAX_DISPLAY_EXAMPLES is not None and i >= MAX_DISPLAY_EXAMPLES:
        break
    print(f"### Example {i + 1} ###")
    print(f"[Question + Context]:\n{qc}")
    print(f"\n[Generated Answer]:\n{gen}")
    print(f"\n[Real Answer]:\n{real}")
    print("\n" + "-" * 50 + "\n")  # Separator line for readability
