In [1]:
!pip install datasets



In [2]:
!pip install peft



In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from tqdm import tqdm
from peft import get_peft_model, LoraConfig, TaskType

# Load dataset and tokenizer
squad_dataset = load_dataset("squad_v2")

# Split the validation set into validation and test
squad_dataset["test"] = squad_dataset["validation"].select(range(len(squad_dataset["validation"]) // 2))
squad_dataset["validation"] = squad_dataset["validation"].select(range(len(squad_dataset["validation"]) // 2, len(squad_dataset["validation"])))

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-distilled-squad')

Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [35]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]

        # Handle unanswerable questions
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label is (0, 0)
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [36]:
processed_datasets = squad_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=squad_dataset["train"].column_names,
)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/5937 [00:00<?, ? examples/s]

Map:   0%|          | 0/5936 [00:00<?, ? examples/s]

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [38]:
# Dataset class
class SquadDataset(Dataset):
    def __init__(self, encodings, device):
        self.encodings = encodings
        self.device = device

    def __getitem__(self, idx):
        return {key: torch.tensor(self.encodings[idx][key]).to(self.device) for key in self.encodings.column_names}

    def __len__(self):
        return len(self.encodings)

train_dataset = SquadDataset(processed_datasets["train"], device)
eval_dataset = SquadDataset(processed_datasets["validation"], device)
test_dataset = SquadDataset(processed_datasets["test"], device)

In [39]:
# Setup

model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

In [40]:
# Apply LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    inference_mode=False,
    task_type="QUESTION_ANS",
    target_modules=["q_lin", "k_lin", "v_lin"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

model.to(device)

trainable params: 222,722 || all params: 66,587,140 || trainable%: 0.3345


PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): DistilBertForQuestionAnswering(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_feat

In [52]:
# Training parameters
batch_size = 16
accumulation_steps = 4  # Gradient Accumulation
num_epochs = 10
lr = 5e-5
max_steps_per_epoch = 8000

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [53]:
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * max_steps_per_epoch
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [54]:
# Mixed Precision
scaler = torch.cuda.amp.GradScaler()

In [55]:
# Get a single batch from your dataloader
batch = next(iter(train_loader))

input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
start_positions = batch['start_positions'].to(device)
end_positions = batch['end_positions'].to(device)

# Try a single forward pass
try:
    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        start_positions=start_positions,
        end_positions=end_positions
    )
    print("Forward pass successful")
    print("Loss:", outputs.loss.item())
except Exception as e:
    print("Error during forward pass:", str(e))

Forward pass successful
Loss: 1.5857070684432983


In [56]:
from tqdm import tqdm
import torch
import numpy as np
from transformers import EvalPrediction

def evaluate(model, data_loader, tokenizer):
    model.eval()
    all_start_logits = []
    all_end_logits = []
    all_start_positions = []
    all_end_positions = []

    for batch in tqdm(data_loader, desc="Evaluating"):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        all_start_logits.append(start_logits.cpu().numpy())
        all_end_logits.append(end_logits.cpu().numpy())
        all_start_positions.append(batch["start_positions"].cpu().numpy())
        all_end_positions.append(batch["end_positions"].cpu().numpy())

    all_start_logits = np.concatenate(all_start_logits)
    all_end_logits = np.concatenate(all_end_logits)
    all_start_positions = np.concatenate(all_start_positions)
    all_end_positions = np.concatenate(all_end_positions)

    eval_prediction = EvalPrediction(
        predictions=(all_start_logits, all_end_logits),
        label_ids=(all_start_positions, all_end_positions)
    )

    metrics = compute_metrics(eval_prediction)
    return metrics

def compute_metrics(eval_prediction: EvalPrediction):
    start_logits, end_logits = eval_prediction.predictions
    start_positions, end_positions = eval_prediction.label_ids

    start_pred = np.argmax(start_logits, axis=1)
    end_pred = np.argmax(end_logits, axis=1)

    exact_match = ((start_pred == start_positions) & (end_pred == end_positions)).mean()
    f1_scores = []

    for i in range(len(start_positions)):
        pred_range = set(range(start_pred[i], end_pred[i] + 1))
        true_range = set(range(start_positions[i], end_positions[i] + 1))

        overlap = len(pred_range.intersection(true_range))
        precision = overlap / len(pred_range) if len(pred_range) > 0 else 0
        recall = overlap / len(true_range) if len(true_range) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

    return {
        "exact_match": exact_match,
        "f1": np.mean(f1_scores)
    }

In [57]:
# Training loop
model.train()
for epoch in range(num_epochs):
    epoch_iterator = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", total=min(len(train_loader), max_steps_per_epoch))
    for step, batch in enumerate(epoch_iterator):
        if step >= max_steps_per_epoch:
            break

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        # Update progress bar
        epoch_iterator.set_postfix({"loss": loss.item()})

    # Evaluate after each epoch
    eval_results = evaluate(model, eval_loader, tokenizer)
    print(f"Epoch {epoch+1}/{num_epochs} - Validation Results:", eval_results)

# Final evaluation on test set
test_results = evaluate(model, test_loader, tokenizer)
print("Final Test Results:", test_results)


Epoch 1/10: 100%|██████████| 8000/8000 [28:34<00:00,  4.67it/s, loss=0.416]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.09it/s]


Epoch 1/10 - Validation Results: {'exact_match': 0.47811504276370953, 'f1': 0.5550553090635221}


Epoch 2/10: 100%|██████████| 8000/8000 [25:43<00:00,  5.18it/s, loss=1.08]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.08it/s]


Epoch 2/10 - Validation Results: {'exact_match': 0.5254066744927051, 'f1': 0.5969202142120136}


Epoch 3/10: 100%|██████████| 8000/8000 [25:31<00:00,  5.22it/s, loss=0.705]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.10it/s]


Epoch 3/10 - Validation Results: {'exact_match': 0.5426798591313098, 'f1': 0.6114725415158976}


Epoch 4/10: 100%|██████████| 8000/8000 [25:33<00:00,  5.22it/s, loss=1.14]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.09it/s]


Epoch 4/10 - Validation Results: {'exact_match': 0.5359718262619487, 'f1': 0.6070155628458944}


Epoch 5/10: 100%|██████████| 8000/8000 [25:38<00:00,  5.20it/s, loss=0.649]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.08it/s]


Epoch 5/10 - Validation Results: {'exact_match': 0.5430152607747778, 'f1': 0.6140512470762648}


Epoch 6/10: 100%|██████████| 8000/8000 [25:39<00:00,  5.20it/s, loss=1.34]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.08it/s]


Epoch 6/10 - Validation Results: {'exact_match': 0.5517357035049472, 'f1': 0.6241292349406403}


Epoch 7/10: 100%|██████████| 8000/8000 [25:27<00:00,  5.24it/s, loss=0.625]
Evaluating: 100%|██████████| 373/373 [01:30<00:00,  4.10it/s]


Epoch 7/10 - Validation Results: {'exact_match': 0.5455307731007882, 'f1': 0.6193094910043733}


Epoch 8/10: 100%|██████████| 8000/8000 [25:29<00:00,  5.23it/s, loss=0.708]
Evaluating: 100%|██████████| 373/373 [01:30<00:00,  4.11it/s]


Epoch 8/10 - Validation Results: {'exact_match': 0.5478785846050646, 'f1': 0.6193464283385617}


Epoch 9/10: 100%|██████████| 8000/8000 [25:29<00:00,  5.23it/s, loss=0.803]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.10it/s]


Epoch 9/10 - Validation Results: {'exact_match': 0.5376488344792889, 'f1': 0.6146965573155986}


Epoch 10/10: 100%|██████████| 8000/8000 [25:27<00:00,  5.24it/s, loss=0.598]
Evaluating: 100%|██████████| 373/373 [01:31<00:00,  4.09it/s]


Epoch 10/10 - Validation Results: {'exact_match': 0.5415059533791715, 'f1': 0.6162325044315368}


Evaluating: 100%|██████████| 386/386 [01:33<00:00,  4.11it/s]

Final Test Results: {'exact_match': 0.561011181332037, 'f1': 0.631016040892558}





In [58]:
import os

output_dir = "./fine_tuned_model"
os.makedirs(output_dir, exist_ok=True)

In [59]:
model.save_pretrained(output_dir)



In [60]:
tokenizer.save_pretrained(output_dir)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [65]:
import json
import os
from peft import LoraConfig

def lora_config_to_dict(config):
    return {
        key: getattr(config, key)
        for key in dir(config)
        if not key.startswith('__') and not callable(getattr(config, key))
    }

# Convert LoraConfig to a dictionary
peft_config_dict = lora_config_to_dict(model.peft_config)

# Save the PEFT config as a JSON file
with open(os.path.join(output_dir, "peft_config.json"), "w") as f:
    json.dump(peft_config_dict, f, indent=2)

Evaluation

In [66]:
# Final evaluation on test set
test_results = evaluate(model, test_loader, tokenizer)
print("Final Test Results:", test_results)

Evaluating: 100%|██████████| 386/386 [01:33<00:00,  4.15it/s]

Final Test Results: {'exact_match': 0.561011181332037, 'f1': 0.631016040892558}





In [70]:
!pip install scikit-learn rouge torch tqdm

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [71]:
!pip install rouge



In [72]:
from sklearn.metrics import accuracy_score, f1_score
from rouge import Rouge
import torch
from tqdm import tqdm

In [75]:
from sklearn.metrics import accuracy_score
from rouge import Rouge
import torch
from tqdm import tqdm

def calculate_f1(prediction, ground_truth):
    prediction_tokens = set(prediction.split())
    ground_truth_tokens = set(ground_truth.split())
    common = prediction_tokens & ground_truth_tokens
    if not common:
        return 0
    precision = len(common) / len(prediction_tokens)
    recall = len(common) / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def evaluate(model, data_loader, tokenizer):
    model.eval()
    total_eval_loss = 0
    all_predictions = []
    all_true_answers = []
    all_contexts = []
    all_questions = []

    for batch in tqdm(data_loader, desc="Evaluating"):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)

            loss = outputs.loss
            total_eval_loss += loss.item()

            # Get predictions
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            for i in range(input_ids.shape[0]):
                start_index = torch.argmax(start_logits[i])
                end_index = torch.argmax(end_logits[i])

                pred_answer = tokenizer.decode(input_ids[i][start_index:end_index+1]).strip()
                true_answer = tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i]+1]).strip()

                all_predictions.append(pred_answer)
                all_true_answers.append(true_answer)

                context = tokenizer.decode(input_ids[i]).strip()
                all_contexts.append(context)

                question = tokenizer.decode(input_ids[i][:start_positions[i]]).strip()
                all_questions.append(question)

    print(f"Number of predictions: {len(all_predictions)}")
    print(f"Number of true answers: {len(all_true_answers)}")
    print(f"Sample prediction: '{all_predictions[0]}'")
    print(f"Sample true answer: '{all_true_answers[0]}'")

    # Ensure predictions and true answers are not empty
    all_predictions = [p if p else "empty" for p in all_predictions]
    all_true_answers = [t if t else "empty" for t in all_true_answers]

    # Calculate metrics
    exact_match = accuracy_score(all_true_answers, all_predictions)
    f1 = sum(calculate_f1(pred, true) for pred, true in zip(all_predictions, all_true_answers)) / len(all_predictions)

    # Calculate ROUGE scores
    rouge = Rouge()
    rouge_scores = rouge.get_scores(all_predictions, all_true_answers, avg=True)

    # Calculate context recall
    context_recall = sum([1 for pred, context in zip(all_predictions, all_contexts) if pred in context]) / len(all_predictions)

    # Calculate faithfulness (simple version: check if prediction is subset of context)
    faithfulness = sum([1 for pred, context in zip(all_predictions, all_contexts) if pred in context]) / len(all_predictions)

    return {
        'eval_loss': total_eval_loss / len(data_loader),
        'exact_match': exact_match,
        'f1_score': f1,
        'rouge1_f': rouge_scores['rouge-1']['f'],
        'rouge2_f': rouge_scores['rouge-2']['f'],
        'rougeL_f': rouge_scores['rouge-l']['f'],
        'context_recall': context_recall,
        'faithfulness': faithfulness
    }

In [76]:
# Evaluate the model on validation set
eval_results = evaluate(model, eval_loader, tokenizer)
print("Validation Results:", eval_results)

# Evaluate the model on test set
test_results = evaluate(model, test_loader, tokenizer)
print("Test Results:", test_results)

# Quantize the model
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Save the quantized model
model_version = "v1.0"
save_directory = f"./fine_tuned_bert_squad_lora_quantized_{model_version}"

# Ensure the directory exists
os.makedirs(save_directory, exist_ok=True)

# Save the quantized model
torch.save(quantized_model.state_dict(), os.path.join(save_directory, "quantized_model.pth"))

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

# Save the PEFT config
peft_config.save_pretrained(save_directory)

print(f"Quantized model saved in {save_directory}")

Evaluating: 100%|██████████| 373/373 [01:44<00:00,  3.57it/s]


Number of predictions: 5963
Number of true answers: 5963
Sample prediction: '[CLS]'
Sample true answer: '[CLS]'
Validation Results: {'eval_loss': 1.322047110936597, 'exact_match': 0.5431829615965118, 'f1_score': 0.6186853746228796, 'rouge1_f': 0.6193684959836186, 'rouge2_f': 0.18184614973367125, 'rougeL_f': 0.6191075243249131, 'context_recall': 0.9656213315445246, 'faithfulness': 0.9656213315445246}


Evaluating: 100%|██████████| 386/386 [01:42<00:00,  3.76it/s]


Number of predictions: 6171
Number of true answers: 6171
Sample prediction: 'france'
Sample true answer: 'france'
Test Results: {'eval_loss': 1.260658800331731, 'exact_match': 0.5636039539782856, 'f1_score': 0.6354601248722953, 'rouge1_f': 0.6360292509195259, 'rouge2_f': 0.17893439680653334, 'rougeL_f': 0.6358564854974489, 'context_recall': 0.9596499756927565, 'faithfulness': 0.9596499756927565}
Quantized model saved in ./fine_tuned_bert_squad_lora_quantized_v1.0
