In [None]:
!pip install transformers datasets peft accelerate trl

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.11.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.11-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets)
  Downloading mult

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import torch

In [None]:
# AutoModelForQuestionAnswering--------> For extractive QA.
# AutoModel--------> For base models without task-specific heads.
# AutoModelForSequenceClassification--------> For text classification tasks.
# AutoModelForTokenClassification--------> For token-level classification like NER.
# AutoModelForSeq2SeqLM--------> For sequence-to-sequence tasks (translation, summarization).
# AutoModelForMultipleChoice--------> For multiple-choice tasks.
# AutoModelForImageClassification--------> For image classification tasks.
# AutoModelForVision2Seq--------> For vision-to-text tasks like image captioning.
# AutoModelForSpeechSeq2Seq--------> For speech-to-text conversion.

In [None]:
 # Load the TinyBERT model and tokenizer
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load a QA dataset (using SQuAD for this example)
dataset = load_dataset("squad")

# dataset["train"] = dataset["train"].select(range(int(len(dataset["train"]) * 0.01)))
# dataset["validation"] = dataset["validation"].select(range(int(len(dataset["validation"]) * 0.01)))


In [None]:
dataset,dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['id', 'title', 'context', 'question', 'answers'],
         num_rows: 87599
     })
     validation: Dataset({
         features: ['id', 'title', 'context', 'question', 'answers'],
         num_rows: 10570
     })
 }),
 {'id': '5733be284776f41900661182',
  'title': 'University_of_Notre_Dame',
  'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues a

In [None]:
s ="Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to"
len(s)

514

In [None]:
def answer_question(question, context, model):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get the most likely start and end of the answer
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer

In [None]:
# Define some sample questions from the dataset
sample_questions = [
    {"question": "What is the capital of France?", "context": "Paris is the capital and most populous city of France."},
    {"question": "Who wrote 'Pride and Prejudice'?", "context": "Jane Austen wrote the novel 'Pride and Prejudice'."},
    {"question": "What is the largest planet in our solar system?", "context": "Jupiter is the largest planet in the solar system."}
]

# Evaluate the model before fine-tuning
print("Answers from the model before fine-tuning:\n")
for sample in sample_questions:
    answer = answer_question(sample["question"], sample["context"], model)
    print(f"Question: {sample['question']}")
    print(f"Answer: {answer}\n")



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Answers from the model before fine-tuning:

Question: What is the capital of France?
Answer: 

Question: Who wrote 'Pride and Prejudice'?
Answer: 

Question: What is the largest planet in our solar system?
Answer: 



In [None]:
# Preprocess the dataset
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_tensors="pt",
        # Add this line to return offset_mapping
        return_offsets_mapping=True
    )

    # Extract offset_mapping from the inputs
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]["text"][0]
        start_char = answers[i]["answer_start"][0]
        end_char = start_char + len(answer)
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end token indices in the tokenized version
        token_start_index = sequence_ids.index(1)
        token_end_index = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Offset mappings: map token index to character index in the original context
        start_positions.append(token_start_index)
        end_positions.append(token_end_index)


    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

# Tokenize and format the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10570
    })
})

In [None]:
# Intermediate Results:

# Example 1
# Question: What is the capital of France?
# Context: Paris is the capital and most populous city of France.
# Tokenized input_ids: [101, 2054, 2003, 1996, 3007, 1997, 2605, 1029, 102, 3000, 2003, 1996, 3007, 1998, 2087, 19278, 2103, 1997, 2605, 102, ...]
# Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...]
# Offset mapping: [[0, 0], [0, 4], [5, 7], [8, 11], ...]
# Answer: Paris
# Answer character start: 0, end: 5
# Token start index: 9, Token end index: 9


In [None]:
# Original Question: What is the capital of France?
# Original Context: Paris is the capital and most populous city of France.
# Original Answer: Paris

# Tokenized Input IDs: tensor([[101, 2054, 2003, 1996, 3007, 1997, 2605, 1029, 102, 3000, 2003, 1996, 3007, 1997, 2605, 102]])
# Tokenized Tokens: ['[CLS]', 'what', 'is', 'the', 'capital', 'of', 'france', '?', '[SEP]', 'paris', 'is', 'the', 'capital', 'of', 'france', '[SEP]']

# Offset Mapping: [[(0, 0), (0, 4), (5, 7), (8, 11), (12, 19), (20, 22), (23, 29), (30, 31), (0, 0), (0, 5), (6, 8), (9, 12), (13, 20), (21, 23), (24, 30), (31, 32)]]
# Start Token Position: [9]
# End Token Position: [9]

# ================================================================================


In [None]:
lora_config = LoraConfig(
    r=4,  # Rank of the decomposition matrix
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA
    target_modules=["query", "key"],  # LoRA applied to these layers in attention mechanism
)

# Transformer Block:
#   - Self-Attention:
#       - Query
#       - Key
#       - Value
#   - Feedforward:
#       - Linear Layer

LoRA Weight Update Formula:
𝑊′ = 𝑊 + Δ𝑊

LoRA Decomposition:
Δ𝑊 = (𝛼/𝑟) *(𝐵*𝐴)

𝑟 - r is the rank of the low-rank matrices.
𝛼 - α is the scaling factor.

So the final updated weight matrix becomes:

W′ =W + (α/r)*(B*A)


1. If alpha / r becomes extremely large:

Effect: The LoRA matrices will have a much larger influence on the model's activations. The low-rank adaptation (LoRA updates) will dominate over the original model's behavior.

Example:
If alpha = 1000 and r = 1, the scaling factor is 1000, meaning the low-rank matrices would have 1000 times the influence of the regular model weights.

2. If alpha / r becomes extremely small:

Effect: The LoRA matrices will have very little impact on the model's activations. Essentially, the original model's pretrained weights will dominate, and the LoRA fine-tuning will be negligible.

Example:
If alpha = 1 and r = 1000, the scaling factor is 0.001, meaning the influence of LoRA would be extremely weak, and the model would perform almost like the original pretrained model.

In [None]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 19,968 || all params: 14,273,186 || trainable%: 0.1399


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=False,
    save_strategy="epoch",
    # metric_for_best_model="eval_loss"  # Set the metric to monitor for best model
)



In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels) # Calculate and return relevant metrics

In [None]:
trainer = Trainer( #Using Trainer instead of SFTTrainer as mentioned in the code.
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("./lora_tinybert_final")
tokenizer.save_pretrained("./lora_tinybert_final")

Epoch,Training Loss,Validation Loss
1,5.2994,No log
2,5.0881,No log


('./lora_tinybert_final/tokenizer_config.json',
 './lora_tinybert_final/special_tokens_map.json',
 './lora_tinybert_final/vocab.txt',
 './lora_tinybert_final/added_tokens.json',
 './lora_tinybert_final/tokenizer.json')

In [None]:
# Load the TinyBERT model and tokenizer
model_name = "./lora_tinybert_final"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


print("Answers from the model before fine-tuning:\n")
for sample in sample_questions:
    answer = answer_question(sample["question"], sample["context"], model)
    print(f"Question: {sample['question']}")
    print(f"Answer: {answer}\n")



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Answers from the model before fine-tuning:

Question: What is the capital of France?
Answer: what is the capital of france? paris is the capital and most populous city of france

Question: Who wrote 'Pride and Prejudice'?
Answer: who wrote'pride and prejudice '? jane austen wrote the novel'pride and prejudice '

Question: What is the largest planet in our solar system?
Answer: what is the largest planet in our solar system? jupiter is the largest planet in the



In [None]:
ss

In [None]:
# Install necessary packages (if not already installed)
!pip install transformers datasets peft accelerate trl

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer # Import the regular Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import torch


# Load the TinyBERT model and tokenizer
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Load a QA dataset (using SQuAD for this example)
dataset = load_dataset("squad")

dataset["train"] = dataset["train"].select(range(int(len(dataset["train"]) * 0.01)))
dataset["validation"] = dataset["validation"].select(range(int(len(dataset["validation"]) * 0.01)))

def answer_question(question, context, model):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the start and end logits
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Get the most likely start and end of the answer
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    answer_tokens = inputs["input_ids"][0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer

# Define some sample questions from the dataset
sample_questions = [
    {"question": "What is the capital of France?", "context": "Paris is the capital and most populous city of France."},
    {"question": "Who wrote 'Pride and Prejudice'?", "context": "Jane Austen wrote the novel 'Pride and Prejudice'."},
    {"question": "What is the largest planet in our solar system?", "context": "Jupiter is the largest planet in the solar system."}
]

# Evaluate the model before fine-tuning
print("Answers from the model before fine-tuning:\n")
for sample in sample_questions:
    answer = answer_question(sample["question"], sample["context"], model)
    print(f"Question: {sample['question']}")
    print(f"Answer: {answer}\n")


# Preprocess the dataset
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_tensors="pt",
        # Add this line to return offset_mapping
        return_offsets_mapping=True
    )

    # Extract offset_mapping from the inputs
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]["text"][0]
        start_char = answers[i]["answer_start"][0]
        end_char = start_char + len(answer)
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end token indices in the tokenized version
        token_start_index = sequence_ids.index(1)
        token_end_index = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Offset mappings: map token index to character index in the original context
        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Tokenize and format the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Configure LoRA
lora_config = LoraConfig(
    r=2,  # Rank of the decomposition matrix
    lora_alpha=4,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA
    target_modules=["query", "key"],  # LoRA applied to these layers in attention mechanism
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=False,
    save_strategy="epoch",
    # metric_for_best_model="eval_loss"  # Set the metric to monitor for best model
)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Answers from the model before fine-tuning:

Question: What is the capital of France?
Answer: capital of france? paris is the capital and most populous city of france

Question: Who wrote 'Pride and Prejudice'?
Answer: and prejudice '? jane austen wrote the

Question: What is the largest planet in our solar system?
Answer: what is the largest planet in our solar system? jupiter is the largest planet in the solar system



Map:   0%|          | 0/105 [00:00<?, ? examples/s]



In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels) # Calculate and return relevant metrics

In [None]:
#  Use SFTTrainer for fine-tuning with LoRA
trainer = Trainer( #Using Trainer instead of SFTTrainer as mentioned in the code.
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("./lora_tinybert_fast")
tokenizer.save_pretrained("./lora_tinybert_fast")

Epoch,Training Loss,Validation Loss
1,5.946,No log


('./lora_tinybert_fast/tokenizer_config.json',
 './lora_tinybert_fast/special_tokens_map.json',
 './lora_tinybert_fast/vocab.txt',
 './lora_tinybert_fast/added_tokens.json',
 './lora_tinybert_fast/tokenizer.json')

In [None]:
# Load the TinyBERT model and tokenizer
model_name = "./lora_tinybert_fast"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


print("Answers from the model before fine-tuning:\n")
for sample in sample_questions:
    answer = answer_question(sample["question"], sample["context"], model)
    print(f"Question: {sample['question']}")
    print(f"Answer: {answer}\n")



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Answers from the model before fine-tuning:

Question: What is the capital of France?
Answer: 

Question: Who wrote 'Pride and Prejudice'?
Answer: austen

Question: What is the largest planet in our solar system?
Answer: our solar system? jupiter is the largest

