In [3]:
%%capture
# Environment setup cell. The %%capture magic above must remain the first line
# of the cell; it keeps the output of everything below out of the notebook.
import torch
# Query the CUDA compute capability of the current GPU. Neither value is read
# again in the visible notebook.
# NOTE(review): presumably left over from a version-dependent install step —
# confirm before removing.
major_version, minor_version = torch.cuda.get_device_capability()
# Install the fine-tuning stack. --no-deps tells pip not to install these
# packages' own dependencies. No versions are pinned here.
!pip install --no-deps xformers trl peft accelerate bitsandbytes


In [6]:
!pip install "unsloth[colab-new]"
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)


Collecting unsloth[colab-new]
  Downloading unsloth-2025.9.10-py3-none-any.whl.metadata (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.9.13 (from unsloth[colab-new])
  Downloading unsloth_zoo-2025.9.13-py3-none-any.whl.metadata (31 kB)
Collecting tyro (from unsloth[colab-new])
  Downloading tyro-0.9.32-py3-none-any.whl.metadata (11 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth[colab-new])
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets!=4.0.*,!=4.1.0,>=3.4.1->unsloth[colab-new])
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting cut_cross_entropy (from unsloth_zoo>=2025.9.13->unsloth[colab-new])
  Downloading cut_cross_entropy-25.1.1-py3-none-an

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [7]:
from datasets import load_dataset
# Prompt template with two positional {} slots: the first receives the question,
# the second the answer. Training fills both slots; the inference cells pass ""
# for the answer so the model generates the text after "### Answer:".
expert_constitution_prompt = """You are an expert on the Constitution of India. Your task is to answer the following question directly and accurately based on constitutional principles.

### Question:
{}

### Answer:
{}"""

# End-of-sequence token string taken from the loaded tokenizer;
# formatting_prompts_func appends it to every training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs into training strings.

    Args:
        examples: Mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict with a single "text" key holding one string per pair: the
        prompt template filled with the question and answer, followed by
        EOS_TOKEN.
    """
    qa_pairs = zip(examples["question"], examples["answer"])
    return {
        "text": [
            expert_constitution_prompt.format(q, a) + EOS_TOKEN
            for q, a in qa_pairs
        ],
    }

# Read the Q&A JSON file as the "train" split, then run formatting_prompts_func
# over it in batches to produce the "text" field the trainer consumes.
dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/Colab Notebooks/finetuning_dataset/constitution_qa.json",
    split="train",
).map(formatting_prompts_func, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4082 [00:00<?, ? examples/s]

In [8]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",    # attention (QKV and O) projections
    "gate_proj", "up_proj", "down_proj",        # MLP projections
]

# Attach the adapters; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth 2025.9.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [9]:
# Record the peak reserved CUDA memory before training starts; the stats cell
# after training subtracts this baseline.
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / 1024**3, 3)
print(f"Baseline GPU memory usage: {start_gpu_memory} GB")

Baseline GPU memory usage: 6.967 GB


In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise, so exactly
# one of the two flags is enabled.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step per device
    warmup_steps=5,
    num_train_epochs=2,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the pre-formatted "text" field, one example per
# sequence (packing disabled).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/4082 [00:00<?, ? examples/s]

In [11]:
# Run the fine-tuning loop. The returned object's `.metrics` dict is read by the
# training-stats cell that follows (metrics['train_runtime']).
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,082 | Num Epochs = 2 | Total steps = 1,022
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkushk41001[0m ([33mkushk41001-personal-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1901
2,2.1287
3,2.1692
4,2.2593
5,1.9587
6,1.6277
7,1.2932
8,1.2014
9,1.0526
10,1.0711


Step,Training Loss
1,2.1901
2,2.1287
3,2.1692
4,2.2593
5,1.9587
6,1.6277
7,1.2932
8,1.2014
9,1.0526
10,1.0711


In [12]:

def _bytes_to_gb(num_bytes):
    """Convert a byte count to 1024**3-based GB, rounded to three decimals."""
    return round(num_bytes / 1024**3, 3)


# Total memory of GPU 0 and the peak amount reserved so far.
max_memory = _bytes_to_gb(torch.cuda.get_device_properties(0).total_memory)
used_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())

# Training-only share: peak after training minus the pre-training baseline.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures expressed as a percentage of the card's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Report runtime and memory figures.
print(f"--- Training Stats ---")
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

--- Training Stats ---
4395.7381 seconds used for training.
73.26 minutes used for training.
Peak reserved memory = 7.324 GB.
Peak reserved memory for training = 0.357 GB.
Peak reserved memory % of max memory = 49.685 %.
Peak reserved memory for training % of max memory = 2.422 %.


In [13]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill only the question slot; the empty answer slot is what the model completes.
question = "What does the Right to Equality state in the Constitution of India?"
prompt = expert_constitution_prompt.format(question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=210, use_cache=True)

# Decode the first (and only) sequence: prompt plus generated answer, with
# special tokens left in the printout.
print(tokenizer.batch_decode(outputs)[0])

<|begin_of_text|>You are an expert on the Constitution of India. Your task is to answer the following question directly and accurately based on constitutional principles.

### Question:
What does the Right to Equality state in the Constitution of India?

### Answer:
The Right to Equality states that the State shall not deny to any person equality before the law or the equal protection of the laws within the territory of India.<|end_of_text|>


In [14]:
# Persist the fine-tuned (PEFT-wrapped) model and its tokenizer to Google Drive
# so a later session can reload both from one directory.
# NOTE(review): the directory name spells "consitution" (sic). It is kept
# unchanged because the reload cell reads from this exact path.
save_directory = "/content/drive/MyDrive/Colab Notebooks/Lora_finetuned_model/consitution_lora_adapter"
model.save_pretrained(save_directory)
# The message below reports the tokenizer as saved, so write it alongside the
# model; previously only the model was saved.
tokenizer.save_pretrained(save_directory)
print(f"Model and tokenizer saved successfully to: {save_directory}")

Model and tokenizer saved successfully to: /content/drive/MyDrive/Colab Notebooks/Lora_finetuned_model/consitution_lora_adapter


In [None]:
# Toggle: when True, replace the in-memory model/tokenizer with the copy saved
# on Drive; when False, this cell leaves the existing `model`/`tokenizer` untouched.
reload_saved_adapter = True

if reload_saved_adapter:
    from unsloth import FastLanguageModel

    adapter_dir = "/content/drive/MyDrive/Colab Notebooks/Lora_finetuned_model/consitution_lora_adapter"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=adapter_dir,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)


# Build the prompt with an empty answer slot for the model to complete.
question = "Who has the power to grant pardons, reprieves, or remissions of punishment?"
prompt = expert_constitution_prompt.format(question, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate the response (at most 128 new tokens) and print the first decoded
# sequence, i.e. the prompt followed by the model's answer.
outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
print(tokenizer.batch_decode(outputs)[0])

==((====))==  Unsloth 2025.9.9: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
<|begin_of_text|>You are an expert on the Constitution of India. Your task is to answer the following question directly and accurately based on constitutional principles.

### Question:
Who has the power to grant pardons, reprieves, or remissions of punishment?

### Answer:
The President<|end_of_text|>


In [None]:
# Optional export: set the flag below to True to save the model combined with
# the LoRA adapters (save_method "merged_16bit") into final_dir.
final_dir = "/content/drive/MyDrive/Colab Notebooks/Merged_model/constitution_model"
save_merged_model = False

if save_merged_model:
    model.save_pretrained_merged(
        final_dir,
        tokenizer,
        save_method="merged_16bit",
    )

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [11:48<00:00, 177.25s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [14:48<00:00, 222.15s/it]


Unsloth: Merge process complete.
