# Environment Setup with PEFT

In [1]:
# Install advanced fine-tuning libraries
# !pip install transformers datasets peft accelerate evaluate rouge-score -q

import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from peft import LoraConfig, get_peft_model, TaskType
import warnings
warnings.filterwarnings('ignore')

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only'}")


Using device: cuda
GPU: Tesla T4


# Advanced Dataset Loading and Preprocessing

In [2]:
# Load SQuAD 2.0 dataset - includes unanswerable questions
print("Loading SQuAD 2.0 dataset...")
dataset = load_dataset("squad_v2")

# Create smaller subset for efficient training
def create_qa_subset(dataset, train_size=5000, val_size=1000):
    """Create balanced subset with answerable and unanswerable questions"""
    # Shuffle and select based on indices
    train_indices = list(range(len(dataset['train'])))
    np.random.seed(42)
    np.random.shuffle(train_indices)
    train_indices = train_indices[:train_size]

    val_indices = list(range(len(dataset['validation'])))
    np.random.seed(42)
    np.random.shuffle(val_indices)
    val_indices = val_indices[:val_size]

    train_dataset = dataset['train'].select(train_indices)
    val_dataset = dataset['validation'].select(val_indices)

    return train_dataset, val_dataset

train_dataset, val_dataset = create_qa_subset(dataset)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Analyze dataset complexity
answerable_train = sum(1 for ex in train_dataset if len(ex['answers']['text']) > 0)
print(f"Answerable questions in training: {answerable_train} ({answerable_train/len(train_dataset)*100:.1f}%)")

# Display sample data
sample = train_dataset[0]
print(f"\nSample Question: {sample['question']}")
print(f"Context (first 200 chars): {sample['context'][:200]}...")
print(f"Answer: {sample['answers']['text'][0] if sample['answers']['text'] else 'No answer'}")

Loading SQuAD 2.0 dataset...
Training samples: 5000
Validation samples: 1000
Answerable questions in training: 3379 (67.6%)

Sample Question: What year did the global recession that followed the financial crisis of 2007 end?
Context (first 200 chars): It threatened the collapse of large financial institutions, which was prevented by the bailout of banks by national governments, but stock markets still dropped worldwide. In many areas, the housing m...
Answer: 2012


# Advanced Tokenization for QA

In [3]:
# Initialize tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_qa_data(examples):
    """Advanced preprocessing for question-answering tasks"""
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0] if answer["answer_start"] else 0
        end_char = start_char + len(answer["text"][0]) if answer["text"] else 0

        # Find token positions
        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Handle unanswerable questions
        if not answer["text"]:
            start_positions.append(context_start)
            end_positions.append(context_start)
        else:
            # Find start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply preprocessing
print("Preprocessing datasets...")
train_dataset = train_dataset.map(
    preprocess_qa_data,
    batched=True,
    remove_columns=train_dataset.column_names
)

val_dataset = val_dataset.map(
    preprocess_qa_data,
    batched=True,
    remove_columns=val_dataset.column_names
)

print("Preprocessing completed!")


Preprocessing datasets...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Preprocessing completed!


# LoRA Configuration and Model Setup

In [4]:
# Load base model
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Configure LoRA - this is the key innovation
lora_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    inference_mode=False,
    r=16,                    # Rank - higher = more parameters but better performance
    lora_alpha=32,           # Scaling parameter
    lora_dropout=0.1,        # Dropout for regularization
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"]  # DistilBERT specific
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Move to GPU
model.to(device)

print("LoRA model configuration:")
print(f"Base model parameters: {sum(p.numel() for p in model.base_model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 591,362 || all params: 66,955,780 || trainable%: 0.8832127711752443
LoRA model configuration:
Base model parameters: 66,955,780
Trainable parameters: 591,362


# Advanced Training Configuration

In [5]:
# Training arguments optimized for LoRA
training_args = TrainingArguments(
    output_dir="./qa_lora_results",
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=200,
    learning_rate=3e-4,          # Higher LR for LoRA
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=True,
    dataloader_num_workers=2,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

# Custom metrics for QA evaluation
def compute_qa_metrics(eval_pred):
    """Compute F1 and Exact Match scores for QA"""
    predictions, labels = eval_pred
    start_predictions, end_predictions = predictions
    start_labels, end_labels = labels

    # Simplified evaluation - exact match for start/end positions
    start_accuracy = (start_predictions.argmax(-1) == start_labels).mean()
    end_accuracy = (end_predictions.argmax(-1) == end_labels).mean()
    exact_match = ((start_predictions.argmax(-1) == start_labels) &
                   (end_predictions.argmax(-1) == end_labels)).mean()

    return {
        "start_accuracy": start_accuracy,
        "end_accuracy": end_accuracy,
        "exact_match": exact_match
    }

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_qa_metrics,
    tokenizer=tokenizer
)

print("Training configuration complete!")


Training configuration complete!


# LoRA Training Process

In [6]:
import time

print("Starting LoRA fine-tuning for Question Answering...")
start_time = time.time()

# Train the model
trainer.train()

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time/60:.2f} minutes")

# Save LoRA adapter
model.save_pretrained("./qa_lora_adapter")
print("LoRA adapter saved!")


Starting LoRA fine-tuning for Question Answering...


Step,Training Loss,Validation Loss,Start Accuracy,End Accuracy,Exact Match
200,3.4327,2.795097,0.547783,0.521182,0.515271
400,2.969,2.27473,0.551724,0.549754,0.543842
600,2.6429,2.103418,0.458128,0.430542,0.328079
800,2.1996,1.892962,0.457143,0.465025,0.352709
1000,1.9988,1.84906,0.498522,0.430542,0.335961
1200,2.0083,1.748764,0.497537,0.49064,0.373399
1400,1.9066,1.780151,0.477833,0.473892,0.361576
1600,1.9658,1.728884,0.478818,0.478818,0.348768
1800,1.7282,1.675193,0.504433,0.497537,0.387192



Training completed in 4.34 minutes
LoRA adapter saved!


# Advanced Evaluation and Testing

In [7]:
# Comprehensive evaluation
print("Evaluating LoRA-tuned model...")
eval_results = trainer.evaluate()

print("\nFinal Evaluation Results:")
print("-" * 40)
for key, value in eval_results.items():
    if key.startswith('eval_'):
        metric_name = key.replace('eval_', '').title()
        print(f"{metric_name}: {value:.4f}")

# Advanced inference function
def answer_question(question, context, model, tokenizer):
    """Extract answer from context using fine-tuned QA model"""
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        truncation="only_second",
        return_tensors="pt",
        padding=True
    ).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

        start_idx = torch.argmax(start_scores)
        end_idx = torch.argmax(end_scores)

        # Get confidence scores
        start_confidence = torch.softmax(start_scores, dim=-1)[0][start_idx].item()
        end_confidence = torch.softmax(end_scores, dim=-1)[0][end_idx].item()

        if start_idx <= end_idx:
            answer_tokens = inputs['input_ids'][0][start_idx:end_idx+1]
            answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
        else:
            answer = "No answer found"

        return answer, start_confidence, end_confidence

# Test with complex examples
test_examples = [
    {
        "context": "The Amazon rainforest, also known as Amazonia, is a moist broadleaf tropical rainforest in the Amazon biome that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 km2, of which 5,500,000 km2 are covered by the rainforest. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana.",
        "question": "Which country contains the most Amazon rainforest?"
    },
    {
        "context": "Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.",
        "question": "What can machine learning systems do with minimal human intervention?"
    },
    {
        "context": "The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials, generally built along an east-to-west line across the historical northern borders of China.",
        "question": "What materials were used to build the Great Wall?"
    }
]

print("\nTesting LoRA-tuned QA model:")
print("=" * 60)

for i, example in enumerate(test_examples, 1):
    answer, start_conf, end_conf = answer_question(
        example["question"],
        example["context"],
        model,
        tokenizer
    )

    print(f"Example {i}:")
    print(f"Question: {example['question']}")
    print(f"Answer: {answer}")
    print(f"Confidence: Start={start_conf:.3f}, End={end_conf:.3f}")
    print("-" * 60)


Evaluating LoRA-tuned model...



Final Evaluation Results:
----------------------------------------
Loss: 1.6752
Start_Accuracy: 0.5044
End_Accuracy: 0.4975
Exact_Match: 0.3872
Runtime: 5.1360
Samples_Per_Second: 197.6260
Steps_Per_Second: 24.7280

Testing LoRA-tuned QA model:
Example 1:
Question: Which country contains the most Amazon rainforest?
Answer: the
Confidence: Start=0.434, End=0.514
------------------------------------------------------------
Example 2:
Question: What can machine learning systems do with minimal human intervention?
Answer: machine
Confidence: Start=0.713, End=0.546
------------------------------------------------------------
Example 3:
Question: What materials were used to build the Great Wall?
Answer: the
Confidence: Start=0.429, End=0.326
------------------------------------------------------------


# Performance Analysis and Comparison

In [8]:
# Memory efficiency analysis
def analyze_lora_efficiency():
    """Compare LoRA vs full fine-tuning efficiency"""
    base_params = sum(p.numel() for p in model.base_model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("LoRA Efficiency Analysis:")
    print(f"Base model parameters: {base_params:,}")
    print(f"LoRA trainable parameters: {trainable_params:,}")
    print(f"Parameter reduction: {(1 - trainable_params/base_params)*100:.2f}%")
    print(f"Memory savings: ~{(base_params - trainable_params) * 4 / 1024**2:.1f} MB")

analyze_lora_efficiency()

# Advanced debugging for QA
def debug_qa_prediction(question, context, model, tokenizer):
    """Debug QA model predictions with attention analysis"""
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        truncation="only_second",
        return_tensors="pt",
        return_attention_mask=True
    ).to(device)

    # Get tokens for visualization
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
        start_scores = outputs.start_logits[0]
        end_scores = outputs.end_logits[0]

    print(f"Question: {question}")
    print(f"Context length: {len(context)} characters")
    print(f"Tokenized length: {len(tokens)} tokens")
    print(f"Top 3 start positions: {torch.topk(start_scores, 3).indices.tolist()}")
    print(f"Top 3 end positions: {torch.topk(end_scores, 3).indices.tolist()}")

# Debug a challenging example
debug_qa_prediction(
    "What is the percentage of Amazon rainforest in Brazil?",
    test_examples[0]["context"],
    model,
    tokenizer
)


LoRA Efficiency Analysis:
Base model parameters: 66,955,780
LoRA trainable parameters: 591,362
Parameter reduction: 99.12%
Memory savings: ~253.2 MB
Question: What is the percentage of Amazon rainforest in Brazil?
Context length: 474 characters
Tokenized length: 119 tokens
Top 3 start positions: [12, 82, 92]
Top 3 end positions: [12, 98, 83]
