In [None]:
!pip install --upgrade torch transformers datasets peft accelerate evaluate bitsandbytes pandas
!pip install --upgrade torch
!pip install evaluate
!pip install peft
!pip install -U accelerate
!pip install -U transformers

In [1]:
import torch
import os
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, pipeline
from evaluate import load
import pandas as pd

from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
from transformers import DataCollatorForLanguageModeling

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
from huggingface_hub import login

login("hf_SECRET_TOKEN")
os.environ["CUDA_VISIBLE_DEVICES"] = "1" 
torch.cuda.set_device(0)
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# --- Configuration ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
DATASET_NAME = "tatsu-lab/alpaca"
ADAPTER_PATH = "./lora_adapter"
MERGED_MODEL_PATH = "./merged_lora_model"

In [4]:
# --- 1. Prepare Dataset and Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

def format_prompt(sample):
    instruction = sample.get('instruction', '')
    input_text = sample.get('input', '')
    response = sample.get('output', '')
    
    if input_text:
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
        
    full_text = prompt + response
    return {"prompt": prompt, "full_text": full_text}

# Load, format, and tokenize a small subset of the data
dataset = load_dataset(DATASET_NAME, split="train[:500]")
formatted_dataset = dataset.map(format_prompt)

def tokenize_function(examples):
    tokenized = tokenizer(examples["full_text"], truncation=True, max_length=256, padding="max_length")
    # Convert to tensors and set labels for causal language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['instruction', 'input', 'output', 'text', 'prompt', 'full_text'])

# Create a separate validation set for perplexity calculation
eval_dataset = load_dataset(DATASET_NAME, split="train[500:1000]")
eval_dataset = eval_dataset.map(format_prompt).map(tokenize_function, batched=True)
eval_dataset = eval_dataset.remove_columns(['instruction', 'input', 'output', 'text', 'prompt', 'full_text'])

## LORA

In [6]:
# --- Load Model and Tokenizer ---

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)

Step 1: Loading Model and Tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# --- Setup LoRA
# Simply enable training mode
base_model.train()

# Enable gradient computation for input embeddings (important for LoRA)
base_model.get_input_embeddings().weight.requires_grad_(True)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(base_model, lora_config)


Step 2: Setting up LoRA


In [8]:
# --- Setup LoRA - See Trainable Params

# Check that LoRA parameters require gradients
has_trainable = False
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(f"✓ Trainable: {name}")
        has_trainable = True
        break

trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940
✓ Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight


In [9]:
# --- Training Setup ---

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,

    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=False,
    optim="adamw_torch",
    warmup_steps=10,
    save_total_limit=1,
    dataloader_pin_memory=False,
    eval_strategy="steps",
    eval_steps=50

)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# --- Evaluation of untrained model ---

# Quantitative Evaluation: Perplexity
print("\n--- Quantitative Evaluation (Perplexity) ---")
perplexity = load("perplexity", module_type="metric")
eval_results = trainer.evaluate()
print(f"Perplexity on eval set: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


Step 5: Evaluating the Model

--- Quantitative Evaluation (Perplexity) ---


Perplexity on eval set: 6.36


In [11]:
# --- Start Training ---
print("Starting training...")
trainer.train()
print("✅ Training complete!")

Starting training...


Step,Training Loss,Validation Loss,Model Preparation Time
50,1.1179,1.150921,0.0144
100,1.1227,1.13684,0.0144


✅ Training complete!


In [12]:
# --- Save the Adapter ---
print(f"\nSaving adapter to {ADAPTER_PATH}")
peft_model.save_pretrained(ADAPTER_PATH)


Step 4: Saving adapter to ./lora_adapter


In [13]:
# ---Evaluation on Trained Model---

# Quantitative Evaluation: Perplexity
print("\n--- Quantitative Evaluation (Perplexity) ---")
perplexity = load("perplexity", module_type="metric")
eval_results = trainer.evaluate()
print(f"Perplexity on eval set: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


Step 5: Evaluating the Model

--- Quantitative Evaluation (Perplexity) ---


Perplexity on eval set: 3.11


In [14]:
# ---Qualitative Evaluation: Side-by-Side Comparison---

# Use a few challenging prompts for qualitative assessment
eval_prompts = [
    {"instruction": "What are the three primary colors?", "input": ""},
    {"instruction": "Explain the concept of quantum entanglement in simple terms.", "input": ""}
]

base_generator = pipeline('text-generation', model=base_model, tokenizer=tokenizer)
fine_tuned_generator = pipeline('text-generation', model=peft_model, tokenizer=tokenizer)

results_list = []
for sample in eval_prompts:
    prompt_text = format_prompt(sample)["prompt"]
    base_output = base_generator(prompt_text, max_new_tokens=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    lora_output = fine_tuned_generator(prompt_text, max_new_tokens=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    
    results_list.append({
        "Instruction": sample["instruction"],
        "Input": sample["input"],
        "Base Model Output": base_output[0]['generated_text'].replace(prompt_text, "").strip(),
        "Fine-Tuned Output": lora_output[0]['generated_text'].replace(prompt_text, "").strip()
    })

results_df = pd.DataFrame(results_list)
results_df

Device set to use cuda:0
Device set to use cuda:0



--- Qualitative Evaluation (Side-by-Side) ---


Unnamed: 0,Instruction,Input,Base Model Output,Fine-Tuned Output
0,What are the three primary colors?,,"The three primary colors are red, blue, and ye...","The three primary colors are red, blue, and ye..."
1,Explain the concept of quantum entanglement in...,,Quantum entanglement is a phenomenon in which ...,Quantum entanglement is a phenomenon in which ...


In [15]:
# --- Merging for Zero Latency ---
# Note: Merging requires enough memory to hold the full model in FP16
merged_model = peft_model.merge_and_unload()
print("✅ Merge complete! The model now behaves like a standard transformer model.")

✅ Merge complete! The model now behaves like a standard transformer model.


In [16]:
print(f"Saving merged model to {MERGED_MODEL_PATH}")
merged_model.save_pretrained(MERGED_MODEL_PATH)
tokenizer.save_pretrained(MERGED_MODEL_PATH)


💾 Saving merged model to ./merged_lora_model


('./merged_lora_model/tokenizer_config.json',
 './merged_lora_model/special_tokens_map.json',
 './merged_lora_model/tokenizer.json')

In [18]:
merged_model = AutoModelForCausalLM.from_pretrained("./merged_lora_model")
merged_tokenizer = AutoTokenizer.from_pretrained("./merged_lora_model")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## QLORA

In [5]:
class QLoRATrainer:
    def __init__(self, model_name, device_map="auto"):
        self.model_name = model_name
        self.device_map = device_map

    def setup_quantization_config(self):
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

    def setup_production_qlora(self, r=16, lora_alpha=32, lora_dropout=0.05):
        bnb_config = self.setup_quantization_config()
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            device_map=self.device_map,
            trust_remote_code=True
        )
        model = prepare_model_for_kbit_training(model)
        lora_config = LoraConfig(
            r=r, lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=lora_dropout, bias="none", task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
        return model

In [7]:
qLoRATrainer = QLoRATrainer(MODEL_NAME)

In [8]:
qlora_model = qLoRATrainer.setup_production_qlora()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879


In [9]:
# --- Training Setup ---

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,

    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=False,
    optim="adamw_torch",
    warmup_steps=10,
    save_total_limit=1,
    dataloader_pin_memory=False,
    eval_strategy="steps",
    eval_steps=50

)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=qlora_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# --- Start Training ---
print("Starting training...")
trainer.train()
print("✅ Training complete!")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,1.1324,1.162442
100,1.1243,1.138778


✅ Training complete!


In [11]:
# ---Evaluation on Trained Model---

# Quantitative Evaluation: Perplexity
print("\n--- Quantitative Evaluation (Perplexity) ---")
perplexity = load("perplexity", module_type="metric")
eval_results = trainer.evaluate()
print(f"Perplexity on eval set: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


--- Quantitative Evaluation (Perplexity) ---


Perplexity on eval set: 3.11


## DORA

In [14]:
from peft import LoraConfig

class DoRATrainer:
    def __init__(self, model_name, device_map="auto"):
        self.model_name = model_name
        self.device_map = device_map

    def setup_dora(self, r=16, lora_alpha=32, lora_dropout=0.1):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map=self.device_map, torch_dtype=torch.float16
        )
        
        # DoRA is enabled by setting use_dora=True in LoraConfig
        dora_config = LoraConfig(
            r=r,
            lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
            use_dora=True,  # This is the key difference!
        )
        
        model = get_peft_model(model, dora_config)
        model.print_trainable_parameters()
        print("✅ DoRA setup complete! This adapter will learn both directional and magnitude changes.")
        return model

    def setup_dora_with_quantization(self, r=16, lora_alpha=32):
        """Combine DoRA with 4-bit quantization for maximum efficiency"""
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            device_map=self.device_map,
            trust_remote_code=True
        )
        model = prepare_model_for_kbit_training(model)
        
        dora_config = LoraConfig(
            r=r, lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
            use_dora=True,  # Enable DoRA
        )
        
        model = get_peft_model(model, dora_config)
        model.print_trainable_parameters()
        return model


In [15]:
doRATrainer = DoRATrainer(MODEL_NAME)

In [16]:
dora_model = doRATrainer.setup_dora_with_quantization()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 13,959,168 || all params: 7,255,691,264 || trainable%: 0.1924


In [17]:
# --- Training Setup ---

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,

    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=False,
    optim="adamw_torch",
    warmup_steps=10,
    save_total_limit=1,
    dataloader_pin_memory=False,
    eval_strategy="steps",
    eval_steps=50

)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=dora_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
# --- Start Training ---
print("Starting training...")
trainer.train()
print("✅ Training complete!")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
50,1.1332,1.16412
100,1.1272,1.141404


✅ Training complete!


In [19]:
# ---Evaluation on Trained Model---

# Quantitative Evaluation: Perplexity
print("\n--- Quantitative Evaluation (Perplexity) ---")
perplexity = load("perplexity", module_type="metric")
eval_results = trainer.evaluate()
print(f"Perplexity on eval set: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


--- Quantitative Evaluation (Perplexity) ---


Perplexity on eval set: 3.12


## Adalora

In [5]:
from peft import AdaLoraConfig

In [6]:
class AdaLoRATrainer:
    def __init__(self, model_name, device_map="auto"):
        self.model_name = model_name
        self.device_map = device_map

    def setup_adalora(self, actual_training_steps, device_map="auto"):
        """Setup AdaLoRA with proper total_step matching your training"""

        model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map=self.device_map, torch_dtype=torch.float16
        )

        # Configure AdaLoRA for your actual training length
        adalora_config = AdaLoraConfig(
            peft_type="ADALORA", 
            task_type="CAUSAL_LM",
            init_r=32,
            lora_alpha=32, 
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            orth_reg_weight=0.5,
            total_step=actual_training_steps,
            deltaT=max(5, actual_training_steps // 20),
            beta1=0.85,
            beta2=0.85,
        )

        print(f"🔧 AdaLoRA Configuration:")
        print(f"   total_step: {adalora_config.total_step}")
        print(f"   deltaT: {adalora_config.deltaT}")
        print(f"   Expected adaptations: {actual_training_steps // adalora_config.deltaT}")

        model = get_peft_model(model, adalora_config)
        model.print_trainable_parameters()
        return model


In [7]:
adaloRATrainer = AdaLoRATrainer(MODEL_NAME)

In [8]:
adalora_model = adaloRATrainer.setup_adalora(actual_training_steps=125)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🔧 AdaLoRA Configuration:
   total_step: 125
   deltaT: 6
   Expected adaptations: 20
trainable params: 27,267,072 || all params: 7,268,999,296 || trainable%: 0.3751


In [9]:
# --- Training Setup ---

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,

    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=False,
    optim="adamw_torch",
    warmup_steps=10,
    save_total_limit=1,
    dataloader_pin_memory=False,
    eval_strategy="steps",
    eval_steps=50

)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=adalora_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# --- Start Training ---
print("Starting training...")
trainer.train()
print("✅ Training complete!")

Starting training...


Step,Training Loss,Validation Loss
50,2.8756,2.064665
100,1.6718,1.587517


✅ Training complete!


In [14]:
# ---Evaluation on Trained Model---

# Quantitative Evaluation: Perplexity
print("\n--- Quantitative Evaluation (Perplexity) ---")
perplexity = load("perplexity", module_type="metric")
eval_results = trainer.evaluate()
print(f"Perplexity on eval set: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


--- Quantitative Evaluation (Perplexity) ---


Perplexity on eval set: 4.60


## IA3

In [5]:
from peft import IA3Config

class IA3Trainer:
    def __init__(self, model_name, device_map="auto"):
        self.model_name = model_name
        self.device_map = device_map

    def setup_ia3(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map=self.device_map, torch_dtype=torch.float16
        )
        # IA³ targets key/value projections and the feed-forward network
        ia3_config = IA3Config(
            peft_type="IA3", task_type="CAUSAL_LM",
            target_modules=["k_proj", "v_proj", "down_proj"],
            feedforward_modules=["down_proj"],
        )
        model = get_peft_model(model, ia3_config)
        model.print_trainable_parameters()
        return model


In [6]:
ia3Trainer = IA3Trainer(MODEL_NAME)

In [7]:
ia3_model = ia3Trainer.setup_ia3()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 524,288 || all params: 7,242,256,384 || trainable%: 0.0072


In [8]:
# --- Training Setup ---

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,

    report_to="none",
    remove_unused_columns=False,
    gradient_checkpointing=False,
    optim="adamw_torch",
    warmup_steps=10,
    save_total_limit=1,
    dataloader_pin_memory=False,
    eval_strategy="steps",
    eval_steps=50

)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=ia3_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
# --- Start Training ---
print("Starting training...")
trainer.train()
print("✅ Training complete!")

Starting training...


Step,Training Loss,Validation Loss
50,1.7069,1.751249
100,1.5986,1.698713


✅ Training complete!


In [10]:
# ---Evaluation on Trained Model---

# Quantitative Evaluation: Perplexity
print("\n--- Quantitative Evaluation (Perplexity) ---")
perplexity = load("perplexity", module_type="metric")
eval_results = trainer.evaluate()
print(f"Perplexity on eval set: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


--- Quantitative Evaluation (Perplexity) ---


Perplexity on eval set: 5.43


In [16]:
!pip freeze > requirements.txt