# Question Answering Task

In [24]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install evaluate

In [8]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write

In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
  from datasets import load_dataset, DatasetDict, Dataset
except:
  !pip install datasets
  from datasets import load_dataset, DatasetDict, Dataset


from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

try:
  import evaluate
except:
  !pip install evaluate
  import evaluate

import pandas as pd
import numpy as np

#### <i>Model : Llama 3.2</i>

In [9]:
# Load the Llama 3.2-1B model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of LlamaForQuestionAnswering were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_proj.weight', 'layers.10.mlp.gate_proj.weight', 'layers.10.mlp.up_proj.weight', 'layers.10.post_attention_layernorm.weight', 'layers.10.s

#### <i>pre-trained Model</i>

In [5]:
trainable_params = 0
non_trainable_params = 0


for p in model.parameters():
    if p.requires_grad == True:
        trainable_params += p.numel()
    else:
        non_trainable_params += p.numel()

billions = 10**9
vgg16_fc_params = (trainable_params + non_trainable_params)/billions

print("Pre-trained Model")
print("-------------------------")
print("Total Parameters:",vgg16_fc_params,"billions")
print("Trainable:", trainable_params/billions,"billions")
print("Not Trainable:", non_trainable_params/billions,"billions")

Pre-trained Model
-------------------------
Total Parameters: 1.235818498 billions
Trainable: 1.235818498 billions
Not Trainable: 0.0 billions


#### <i>zero-shot</i>

In [10]:
# Load the SQuAD dataset
dataset = load_dataset("rajpurkar/squad_v2")

# Take a subset of 1,000 rows for training
small_train_dataset = dataset["train"].select(range(8000))
small_test_dataset = dataset["validation"].select(range(2000))

In [11]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from tqdm import tqdm
# from datasets import load_dataset
# try:
#   import evaluate
# except:
#   !pip install evaluate
#   import evaluate

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


predictions = []
references = []

for example in tqdm(small_test_dataset):
    context = example["context"]
    question = example["question"]
    ground_truths = example["answers"]["text"]


    result = qa_pipeline({"context": context, "question": question})
    predicted_answer = result["answer"]

    predictions.append({"id": example["id"], "prediction_text": predicted_answer})
    references.append({"id": example["id"], "answers": {"text": ground_truths}})


results_df = pd.DataFrame({
        "Prediction": predictions,
        "Reference": references
    })
results_df.to_csv('./prednrefs_baseline.csv', index=False)
# squad_metric = evaluate.load("squad_v2")
# meteor_metric = evaluate.load("meteor")
# bleu_metric = evaluate.load("bleu")
# try:
#   rouge_metric = evaluate.load("rouge")
# except:
#   !pip install rouge_score
#   rouge_metric = evaluate.load("rouge")

# squad_results = squad_metric.compute(predictions=predictions, references=references)

# flat_predictions = [pred["prediction_text"] for pred in predictions]
# flat_references = [ref["answers"]["text"] for ref in references]

# meteor_results = meteor_metric.compute(predictions=flat_predictions, references=flat_references)
# bleu_results = bleu_metric.compute(predictions=flat_predictions, references=flat_references)
# rouge_results = rouge_metric.compute(predictions=flat_predictions, references=flat_references)

# print("SQuAD v2 Metric (Exact Match & F1):", squad_results)
# print("METEOR:", meteor_results["meteor"])
# print("BLEU:", bleu_results["bleu"])
# print("ROUGE:", rouge_results)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
100%|██████████| 2000/2000 [04:06<00:00,  8.10it/s]


#### <i>Fine-tunning</i>

In [8]:
peft_config = LoraConfig(task_type="QUESTION_ANSWERING",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01   #target_modules = ['query']
                        )

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 425,984 || all params: 1,236,244,482 || trainable%: 0.0345


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    sample_map = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    # Initialize start and end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answers = examples["answers"][sample_idx]
        if len(answers["answer_start"]) > 0:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Find the token start and end indices
            token_start = 0
            token_end = 0
            for idx, (start, end) in enumerate(offsets):
                if start <= start_char < end:
                    token_start = idx
                if start < end_char <= end:
                    token_end = idx
            start_positions.append(token_start)
            end_positions.append(token_end)
        else:
            # No answer case
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:

# Tokenize the subsets
train_dataset = small_train_dataset.map(preprocess_function, batched=True, remove_columns=small_train_dataset.column_names)
test_dataset = small_test_dataset.map(preprocess_function, batched=True, remove_columns= small_test_dataset.column_names)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs1",
    logging_steps=10,
    save_steps=500,
    seed=1,
    save_total_limit=2,
    report_to=["none"]
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Push the model to Hugging Face
# trainer.push_to_hub("fine-tuned-llama3.2-1b-squad")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,4.7621,No log


TrainOutput(global_step=1016, training_loss=4.7962178902363215, metrics={'train_runtime': 4041.5402, 'train_samples_per_second': 2.01, 'train_steps_per_second': 0.251, 'total_flos': 1.8223103232129024e+16, 'train_loss': 4.7962178902363215, 'epoch': 1.0})

In [22]:
trainable_params = 0
non_trainable_params = 0


for p in model.parameters():
    if p.requires_grad == True:
        trainable_params += p.numel()
    else:
        non_trainable_params += p.numel()

billions = 10**9
vgg16_fc_params = (trainable_params + non_trainable_params)/billions

print("Fine-Tunned Model")
print("-----------------------")
print("Total Parameters:",vgg16_fc_params,"billions")
print("Trainable:", trainable_params,)
print("Not Trainable:", non_trainable_params/billions,"billions")

Fine-Tunned Model
-----------------------
Total Parameters: 1.236244482 billions
Trainable: 425984
Not Trainable: 1.235818498 billions


In [30]:
from transformers import pipeline
from tqdm import tqdm

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


predictions = []
references = []

test_dataset = dataset["validation"]
small_test_dataset = test_dataset.select(range(2000))

for example in tqdm(small_test_dataset):
    context = example["context"]
    question = example["question"]
    ground_truths = example["answers"]["text"]


    result = qa_pipeline({"context": context, "question": question})
    predicted_answer = result["answer"]

    predictions.append({"id": example["id"], "prediction_text": predicted_answer})
    references.append({"id": example["id"], "answers": {"text": ground_truths}})


results_df = pd.DataFrame({
        "Prediction": predictions,
        "Reference": references
    })
results_df.to_csv('./prednrefs_finetuneed.csv', index=False)


# squad_metric = evaluate.load("squad_v2")
# meteor_metric = evaluate.load("meteor")
# bleu_metric = evaluate.load("bleu")
# try:
#   rouge_metric = evaluate.load("rouge")
# except:
#   !pip install rouge_score
#   rouge_metric = evaluate.load("rouge")

# squad_results = squad_metric.compute(predictions=predictions, references=references)

# flat_predictions = [pred["prediction_text"] for pred in predictions]
# flat_references = [ref["answers"]["text"] for ref in references]

# meteor_results = meteor_metric.compute(predictions=flat_predictions, references=flat_references)
# bleu_results = bleu_metric.compute(predictions=flat_predictions, references=flat_references)
# rouge_results = rouge_metric.compute(predictions=flat_predictions, references=flat_references)

# print("SQuAD v2 Metric (Exact Match & F1):", squad_results)
# print("METEOR:", meteor_results["meteor"])
# print("BLEU:", bleu_results["bleu"])
# print("ROUGE:", rouge_results)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'PeftModel' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswer

In [None]:
# predictions

In [None]:

# # squad_results = squad_metric.compute(predictions=predictions, references=references)

# flat_predictions = [pred["prediction_text"] for pred in predictions]
# flat_references = [ref["answers"]["text"] for ref in references]

# meteor_results = meteor_metric.compute(predictions=flat_predictions, references=flat_references)
# bleu_results = bleu_metric.compute(predictions=flat_predictions, references=flat_references)
# rouge_results = rouge_metric.compute(predictions=flat_predictions, references=flat_references)

# print("SQuAD v2 Metric (Exact Match & F1):", squad_results)
# print("METEOR:", meteor_results["meteor"])
# print("BLEU:", bleu_results["bleu"])
# print("ROUGE:", rouge_results)

