In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Installing Dependencies/Libraries

In [None]:
!pip uninstall -y transformers
!pip install -U transformers datasets peft accelerate evaluate

Found existing installation: transformers 4.51.1
Uninstalling transformers-4.51.1:
  Successfully uninstalled transformers-4.51.1
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->data

#Training Code

In [None]:
import os
import json
import torch
import warnings
import evaluate
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

def load_squad_data(folder_path):
    contexts, questions, answers = [], [], []
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
                for group in data['data']:
                    for paragraph in group['paragraphs']:
                        context = paragraph['context']
                        for qa in paragraph['qas']:
                            if not qa.get("is_impossible", False) and qa["answers"]:
                                contexts.append(context)
                                questions.append(qa["question"])
                                answers.append(qa["answers"][0])
    return {"context": contexts, "question": questions, "answers": answers}

def prepare_train_features(examples, tokenizer):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answer = examples["answers"][sample_index]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        context_start, context_end = 0, 0
        for j, s_id in enumerate(sequence_ids):
            if s_id == 1:
                context_start = j
                break
        for j in range(len(sequence_ids) - 1, -1, -1):
            if sequence_ids[j] == 1:
                context_end = j
                break

        start_pos, end_pos = cls_index, cls_index
        for idx in range(context_start, context_end + 1):
            if offsets[idx][0] <= start_char < offsets[idx][1]:
                start_pos = idx
            if offsets[idx][0] < end_char <= offsets[idx][1]:
                end_pos = idx

        start_positions.append(start_pos)
        end_positions.append(end_pos)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

def compute_metrics(p):
    metric = evaluate.load("squad")
    return metric.compute(predictions=p.predictions, references=p.label_ids)

def main():
    data_folder = "/content/drive/MyDrive/NLP_Project/multiple_squad_files"
    model_checkpoint = "bert-base-uncased"
    output_dir = "/content/drive/MyDrive/NLP_Project/qa_models/lora_bert_qa_model"

    raw_data = load_squad_data(data_folder)
    train_data, val_data = train_test_split(
        [{k: raw_data[k][i] for k in raw_data} for i in range(len(raw_data["context"]))],
        test_size=0.1, random_state=42
    )
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)

    tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)
    tokenized_train = train_dataset.map(lambda x: prepare_train_features(x, tokenizer), batched=True, remove_columns=train_dataset.column_names)
    tokenized_val = val_dataset.map(lambda x: prepare_train_features(x, tokenizer), batched=True, remove_columns=val_dataset.column_names)

    model = BertForQuestionAnswering.from_pretrained(model_checkpoint)
    peft_config = LoraConfig(
        task_type="QUESTION_ANS",  # ✅ Corrected
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)

    # ✅ BASIC TRAINING ARGS ONLY — no unsupported ones
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir='./logs'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    model.config.use_cache = False
    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

if __name__ == "__main__":
    main()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/326 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdasaripardhasaradhi143[0m ([33mdasaripardhasaradhi143-indian-institute-of-technology-patna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


#Metrics Calculation (F1 Score)

In [None]:
import os
import json
import torch
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from peft import PeftModel, PeftConfig
from datasets import Dataset

# --- Step 1: Paths
model_path = "/content/drive/MyDrive/NLP_Project/qa_models/lora_bert_qa_model"
val_data_path = "/content/drive/MyDrive/NLP_Project/multiple_squad_files"

# --- Step 2: Load model & tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)
config = PeftConfig.from_pretrained(model_path)
base_model = BertForQuestionAnswering.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(base_model, model_path)

# --- Step 3: Load validation data
def load_squad_data(folder_path):
    contexts, questions, answers = [], [], []
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
                for group in data['data']:
                    for paragraph in group['paragraphs']:
                        context = paragraph['context']
                        for qa in paragraph['qas']:
                            if not qa.get("is_impossible", False) and qa["answers"]:
                                contexts.append(context)
                                questions.append(qa["question"])
                                answers.append(qa["answers"][0])
    return {"context": contexts, "question": questions, "answers": answers}

raw_data = load_squad_data(val_data_path)
_, val_data = train_test_split(
    [{k: raw_data[k][i] for k in raw_data} for i in range(len(raw_data["context"]))],
    test_size=0.1, random_state=42
)
val_dataset = Dataset.from_list(val_data)

# --- Step 4: Tokenization with context & answers preserved
def prepare_features(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    tokenized["context"] = []
    tokenized["answers"] = []

    start_positions = []
    end_positions = []

    for i in range(len(tokenized["input_ids"])):
        offsets = tokenized["offset_mapping"][i]
        sequence_ids = tokenized.sequence_ids(i)
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sample_index = sample_mapping[i]
        answer = examples["answers"][sample_index]
        context = examples["context"][sample_index]

        tokenized["context"].append(context)
        tokenized["answers"].append(answer)

        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        context_start = context_end = 0
        for j, s in enumerate(sequence_ids):
            if s == 1:
                context_start = j
                break
        for j in range(len(sequence_ids) - 1, -1, -1):
            if sequence_ids[j] == 1:
                context_end = j
                break

        start_pos = end_pos = cls_index
        for idx in range(context_start, context_end + 1):
            if offsets[idx][0] <= start_char < offsets[idx][1]:
                start_pos = idx
            if offsets[idx][0] < end_char <= offsets[idx][1]:
                end_pos = idx

        start_positions.append(start_pos)
        end_positions.append(end_pos)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

tokenized_val = val_dataset.map(prepare_features, batched=True)

# --- Step 5: Compute Metrics
def build_compute_metrics(eval_dataset):
    def compute_metrics(p):
        squad_metric = evaluate.load("squad")
        start_logits, end_logits = p.predictions

        predictions, references = [], []

        for i in range(len(start_logits)):
            pred_start = np.argmax(start_logits[i])
            pred_end = np.argmax(end_logits[i])
            example = eval_dataset[i]

            context = example["context"]
            answer_text = example["answers"]["text"][0] if isinstance(example["answers"]["text"], list) else example["answers"]["text"]
            answer_start = example["answers"]["answer_start"][0] if isinstance(example["answers"]["answer_start"], list) else example["answers"]["answer_start"]
            offset_mapping = example["offset_mapping"]

            if not offset_mapping or pred_start >= len(offset_mapping) or pred_end >= len(offset_mapping):
                predicted_text = ""
            else:
                start_char = offset_mapping[pred_start][0]
                end_char = offset_mapping[pred_end][1]
                predicted_text = context[start_char:end_char]

            predictions.append({
                "id": str(i),
                "prediction_text": predicted_text
            })

            references.append({
                "id": str(i),
                "answers": {
                    "text": [answer_text],
                    "answer_start": [answer_start]
                }
            })

        return squad_metric.compute(predictions=predictions, references=references)

    return compute_metrics

# --- Step 6: Trainer & Evaluation
training_args = TrainingArguments(
    output_dir="./temp-eval",
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    label_names=["start_positions", "end_positions"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=build_compute_metrics(tokenized_val)
)

metrics = trainer.evaluate()

# --- Step 7: Show Metrics
print("\n📊 Evaluation Results:")
if "eval_f1" in metrics and "eval_exact_match" in metrics:
    print(f"F1 Score        : {metrics['eval_f1'] / 100:.4f}")
    # print(f"Exact Match (EM): {metrics['eval_exact_match'] / 100:.4f}")
else:
    print("⚠️ Could not compute F1/EM. Raw output:\n", metrics)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/37 [00:00<?, ? examples/s]


📊 Evaluation Results:
F1 Score        : 0.6658


#Inference Code (For Asking Questions)

In [None]:
import os
import json
import torch
import warnings
from transformers import BertTokenizerFast, BertForQuestionAnswering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from contextlib import contextmanager

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

@contextmanager
def suppress_stdout():
    import sys
    import os
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

# Load contexts and all QAs from multiple SQuAD files
def load_all_qas(folder_path):
    contexts, questions, answers = [], [], []
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                data = json.load(f)
                for group in data['data']:
                    for para in group['paragraphs']:
                        context = para['context']
                        for qa in para['qas']:
                            questions.append(qa['question'])
                            if qa.get("answers"):
                                answers.append(qa["answers"][0]["text"])
                                contexts.append(context)
    return questions, answers, contexts

# Retrieve most similar SQuAD question based on TF-IDF similarity
def get_best_match_question(input_question, all_questions, all_answers, all_contexts):
    vectorizer = TfidfVectorizer(stop_words='english').fit(all_questions + [input_question])
    tfidf_matrix = vectorizer.transform(all_questions + [input_question])
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    best_idx = cosine_sim.argmax()
    return all_questions[best_idx], all_answers[best_idx], all_contexts[best_idx]

# Main interaction function
def ask_user_question(model_path, data_folder, lora=False):
    print("🔍 Loading SQuAD data...")
    all_questions, all_answers, all_contexts = load_all_qas(data_folder)
    print("✅ Ready! Ask your questions.\n")

    with suppress_stdout():
        if lora:
            from peft import PeftModel, PeftConfig
            config = PeftConfig.from_pretrained(model_path)
            base_model = BertForQuestionAnswering.from_pretrained(config.base_model_name_or_path)
            model = PeftModel.from_pretrained(base_model, model_path)
            tokenizer = BertTokenizerFast.from_pretrained(config.base_model_name_or_path)
        else:
            tokenizer = BertTokenizerFast.from_pretrained(model_path)
            model = BertForQuestionAnswering.from_pretrained(model_path)

    model.eval()

    while True:
        question = input("❓ Your Question (type 'exit' to quit): ")
        if question.strip().lower() == 'exit':
            break

        matched_q, matched_ans, context = get_best_match_question(question, all_questions, all_answers, all_contexts)

        print(f"\n🟢 Matched Question: {matched_q}")
        print(f"✅ Answer: {matched_ans}")
        print("📄 Snippet from Context:", context[:300].replace('\n', ' '), "...\n")

# Example usage
if __name__ == "__main__":
    ask_user_question(
        model_path="/content/drive/MyDrive/NLP_Project/qa_models/lora_bert_qa_model",
        data_folder="/content/drive/MyDrive/NLP_Project/multiple_squad_files",
        lora=True  # Set to False for non-LoRA model
    )


🔍 Loading SQuAD data...
✅ Ready! Ask your questions.



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


❓ Your Question (type 'exit' to quit): Why is my iPhone battery draining quickly?

🟢 Matched Question: Why is my iPhone battery draining quickly?
✅ Answer: Battery drain can be caused by background apps, high screen brightness, or outdated software
📄 Snippet from Context: Battery drain can be caused by background apps, high screen brightness, or outdated software. Check battery usage in Settings and enable Low Power Mode when needed. ...

❓ Your Question (type 'exit' to quit): Please tell me why my Iphone battery draining

🟢 Matched Question: Why is my iPhone battery draining quickly?
✅ Answer: Battery drain can be caused by background apps, high screen brightness, or outdated software
📄 Snippet from Context: Battery drain can be caused by background apps, high screen brightness, or outdated software. Check battery usage in Settings and enable Low Power Mode when needed. ...

❓ Your Question (type 'exit' to quit): exit
