***Libraries***

In [1]:
import transformers
import datasets
from datasets import Dataset,load_dataset,DatasetDict
import peft
import bitsandbytes
import accelerate
import evaluate
from evaluate import evaluator
import seqeval
from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments,DataCollatorWithPadding
from transformers import AutoModelForQuestionAnswering,BitsAndBytesConfig,AutoModelForTokenClassification,default_data_collator
from peft import LoraConfig,get_peft_model,TaskType,PeftModel,prepare_model_for_kbit_training
import numpy as np
import pandas as pd 
import torch
import os
from tqdm import tqdm
import scipy.stats
# Load every evaluation metric used by the notebook in one pass; the names on
# the left line up one-to-one with the metric identifiers on the right.
metric_accuracy, metric_f1, metric_seqeval, metric_squad = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "f1", "seqeval", "squad")
)
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

  from .autonotebook import tqdm as notebook_tqdm
The 8-bit optimizer is not available on your device, only available on CUDA for now.


In [2]:
from transformers import DataCollatorForTokenClassification

In [3]:
import random

In [4]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    # Host-side generators, seeded in the same order as before.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Device-side generators only exist when CUDA is usable.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Turn off cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


In [5]:
# Seed every RNG before any data selection or model initialisation happens.
set_seed(42)

***BERT-PEFT-LORA***

In [6]:
# Experiment configuration for the question-answering run.
dataset_name = "squad"  # dataset identifier
model_name = "bert-base-uncased"  # base checkpoint identifier
# Presumably the size of the reduced ("low resource") training subset;
# TODO confirm it matches the subset actually selected further down.
low_resource_samples = 512

In [7]:
def simple_preprocess_qa_function(examples):
    """Tokenize a batch of SQuAD-style examples and label the answer span.

    Each (question, context) pair is encoded as one sequence, padded or
    truncated to 384 tokens (only the context is truncated). The first
    annotated answer of every example is mapped from character offsets to
    token indices *within that paired encoding*, using the tokenizer's offset
    mapping. This requires a fast tokenizer (offset mapping and
    ``sequence_ids`` are used).

    Args:
        examples: Batch dict with "question" and "context" lists of strings
            and an "answers" list whose items hold parallel "text" and
            "answer_start" lists.

    Returns:
        The tokenizer output with two extra keys, "start_positions" and
        "end_positions" (lists of token indices, one per example). Both are 0
        (the first token) when an example has no answer or when the answer
        is not fully inside the truncated context.
    """
    questions = [q.strip() for q in examples["question"]]
    # Keep the contexts untouched: "answer_start" is a character offset into
    # the raw context, so stripping would shift every answer that follows
    # leading whitespace.
    contexts = list(examples["context"])

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt",
    )
    # The offsets are only needed to build labels; they are not a model input.
    offset_mapping = inputs.pop("offset_mapping").tolist()

    start_positions = []
    end_positions = []

    for i, (answer, offsets) in enumerate(zip(examples["answers"], offset_mapping)):
        start_token, end_token = 0, 0  # fallback label: first token

        if answer["text"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # sequence id 1 marks context tokens (0 = question, None = special/pad)
            sequence_ids = inputs.sequence_ids(i)
            context_tokens = [k for k, sid in enumerate(sequence_ids) if sid == 1]

            if context_tokens:
                ctx_first, ctx_last = context_tokens[0], context_tokens[-1]
                answer_is_visible = (
                    offsets[ctx_first][0] <= start_char
                    and offsets[ctx_last][1] >= end_char
                )
                if answer_is_visible:
                    # Last context token that starts at or before the answer start.
                    tok = ctx_first
                    while tok <= ctx_last and offsets[tok][0] <= start_char:
                        tok += 1
                    start_token = tok - 1

                    # First context token that ends at or after the answer end.
                    tok = ctx_last
                    while tok >= ctx_first and offsets[tok][1] >= end_char:
                        tok -= 1
                    end_token = tok + 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [8]:
# Load the dataset named in the configuration cell instead of repeating the
# literal here, so the experiment is controlled from one place.
dataset = load_dataset(dataset_name)

# Low-resource training subset: the first `low_resource_samples` examples,
# clamped so a larger setting cannot index past the end of the split.
train_dataset_small = dataset["train"].select(
    range(min(low_resource_samples, len(dataset["train"])))
)

# Tokenizer for the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def simple_preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer-span labels.

    Each (question, context) pair is encoded as one sequence, padded or
    truncated to 384 tokens (only the context is truncated). Labels are the
    token indices of the first annotated answer inside that encoding, derived
    from the tokenizer's offset mapping (a fast tokenizer is required).
    Constant all-zero labels would instead teach the model that the first
    token is always the answer and make the reported loss meaningless.

    Args:
        examples: Batch dict with "question" and "context" lists of strings
            and an "answers" list whose items hold parallel "text" and
            "answer_start" lists.

    Returns:
        The tokenizer output plus "start_positions" and "end_positions"
        (lists of token indices, one per example). Both are 0 when an example
        has no answer or the answer was truncated away.
    """
    questions = [q.strip() for q in examples["question"]]
    # Contexts stay unmodified because "answer_start" indexes the raw text.
    contexts = list(examples["context"])

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt",
    )
    # Offsets are used for labelling only; drop them from the model inputs.
    offset_mapping = inputs.pop("offset_mapping").tolist()

    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples["answers"]):
        offsets = offset_mapping[i]
        span = (0, 0)  # fallback label: first token

        if answer["text"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Context tokens carry sequence id 1 in a (question, context) pair.
            context_tokens = [
                k for k, sid in enumerate(inputs.sequence_ids(i)) if sid == 1
            ]
            if (
                context_tokens
                and offsets[context_tokens[0]][0] <= start_char
                and offsets[context_tokens[-1]][1] >= end_char
            ):
                # Last context token starting at or before the answer start.
                first = max(k for k in context_tokens if offsets[k][0] <= start_char)
                # First context token ending at or after the answer end.
                last = min(k for k in context_tokens if offsets[k][1] >= end_char)
                span = (first, last)

        start_positions.append(span[0])
        end_positions.append(span[1])

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


# Tokenize both splits batch-wise with the preprocessing function defined above.
tokenized_train = train_dataset_small.map(simple_preprocess_function, batched=True)
tokenized_eval = dataset["validation"].map(simple_preprocess_function, batched=True)

# 4-bit NF4 weight quantization with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base checkpoint named in the configuration cell (rather than a
# repeated literal) so the model cannot drift away from the configured name.
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

# Prepare the quantized model for training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "key", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.QUESTION_ANS,
)

model = get_peft_model(model, lora_config)




# Trainer hyper-parameters, grouped by concern.
training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="./qa_lora_simple",
    logging_dir="./logs_qa_lora_simple",
    logging_steps=10,
    report_to="none",
    seed=42,
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # --- evaluate and checkpoint once per epoch, then reload the best
    #     checkpoint as ranked by eval_loss ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Wire the LoRA-wrapped model, the tokenized splits and the tokenizer together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)





Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Run fine-tuning with the arguments configured above (eval_strategy="epoch",
# so the evaluation split is scored at the end of every epoch).
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the dataset that was passed to the Trainer as eval_dataset.
# NOTE(review): no compute_metrics callback is passed to the Trainer in the
# visible cells, so this presumably reports loss-style metrics only — confirm.
results = trainer.evaluate()
print("LoRA", results)

In [None]:
# Persist the trained model to a local directory.
trainer.save_model("./Bert_QA_Qlora-MODEL")