In [24]:
## Importing necessary libraries
import torch
# Release cached, currently-unused GPU memory held by PyTorch before any model is loaded in this session.
torch.cuda.empty_cache()


In [25]:
## Hugging Face login --> needed for downloading the Llama model used below
## The CLI prompts for the access token interactively, so no token is stored in the notebook source.
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read)

In [26]:
# Install required libraries
!pip install transformers datasets bitsandbytes peft

# Import necessary libraries
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model

# Read the local JSON file as a "train" split and keep only that split
dataset = load_dataset(
    "json",
    data_files={"train": "augmented_llama_finetune_chat_data.json"},
)["train"]

# Preprocess your dataset: combine the fields into one text prompt.
def preprocess(example):
    """Build the single training prompt string for one dataset record.

    Args:
        example: Mapping for one record. It must contain the keys
            ``"instruction"``, ``"question"`` and ``"response"``.

    Returns:
        A dict with one key, ``"text"``, holding the three fields rendered as
        ``Instruction: ...``, ``Question: ...`` and ``Response: ...`` lines,
        each terminated by a newline.

    Raises:
        KeyError: If any required field is absent. The message lists the
            missing fields and the fields that are actually present, so a
            schema mismatch in the JSON file is easy to diagnose.
    """
    required_fields = ("instruction", "question", "response")
    missing = [field for field in required_fields if field not in example]
    if missing:
        # Fail early with a descriptive message instead of a bare KeyError
        # naming only the first missing key.
        raise KeyError(
            f"Dataset example is missing required field(s) {missing}; "
            f"available fields: {list(example.keys())}"
        )

    return {
        "text": (
            f"Instruction: {example['instruction']}\n"
            f"Question: {example['question']}\n"
            f"Response: {example['response']}\n"
        )
    }
# Run preprocess over the dataset; it returns a dict with a single "text" key for each example.
# NOTE: this rebinds `dataset`, so the cell expects a freshly loaded dataset each time it runs.
dataset = dataset.map(preprocess)
print(dataset[0])  # Optional: check a sample

# 4-bit quantization settings used when loading the model for training
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # NF4 quantization type ("fp4" is the alternative to experiment with)
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # computations run in FP16
)

# Load Llama 3.2 1B model and tokenizer using 4-bit quantization
model_name = "meta-llama/Llama-3.2-1B"
# The quantization settings come from quant_config above; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)
model.config.use_cache = False  # Disable cache for training

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The EOS token is reused as the padding token.
# NOTE(review): DataCollatorForLanguageModeling (used later in this cell) masks labels equal to pad_token_id,
# so with pad == EOS any end-of-sequence positions are presumably excluded from the loss — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

# Wrap the quantized model with LoRA so that far fewer parameters are trainable
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, peft_config)

# Tokenize the dataset
def tokenize_fn(examples):
    """Tokenize the "text" field of a batch, truncating to at most 512 tokens.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever the module-level `tokenizer` returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, truncation=True)

# Tokenize in batches and drop every pre-existing column (the combined "text" as well as
# the raw source fields), so the result contains only what the tokenizer produced and the
# data collator never receives raw string columns.
tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=dataset.column_names
)

# Batch collator for causal language modeling; it takes care of padding.
# mlm=False switches off masked-LM behaviour, which does not apply to causal models.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training hyperparameters (tune as needed)
training_args = TrainingArguments(
    # Output location, checkpointing and logging
    output_dir="./llama3.2-1B-quant-finetuned",
    save_steps=100,
    logging_steps=10,
    report_to=[],  # no external loggers such as wandb
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    # Batch size of 1 per device keeps memory low; gradients are accumulated over 8 steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Mixed precision, if the hardware supports it
    fp16=True,
)

# Initialize the Trainer
# Only a training set is supplied; no eval_dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Start fine-tuning (this should now use significantly less GPU memory)
trainer.train()

# Save the fine-tuned model
# The target is the same directory as output_dir in training_args.
# NOTE(review): `model` is the PEFT-wrapped model, so this presumably writes the LoRA adapter rather than full weights — confirm.
# The tokenizer is not saved here; a later cell saves it to the same directory.
trainer.save_model("./llama3.2-1B-quant-finetuned")




Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

KeyError: 'instruction'

In [None]:
# Show the current GPU status as reported by the NVIDIA driver tool.
!nvidia-smi


In [None]:
# Save the fine-tuned model (same directory the training cell saved to)
trainer.save_model("./llama3.2-1B-quant-finetuned")

# Save the tokenizer
# Keeping it next to the model lets later cells load both from one path.
tokenizer.save_pretrained("./llama3.2-1B-quant-finetuned") # Save the tokenizer to the same directory


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Directory the fine-tuned model and tokenizer were saved to
model_path = "./llama3.2-1B-quant-finetuned"

# Device placement.
#   Single GPU:    "auto" lets Transformers choose the device automatically.
#   Multiple GPUs: an explicit map can be supplied instead, for example
#                  device_map = {"": 0, "lm_head": 1}  (two-GPU setup; adjust IDs to your system)
device_map = "auto"

# Quantization config for this reload. Compared with the training-time settings it
# computes in bfloat16 and enables fp32 CPU offload for specific layers.
# To reload with the original settings instead, use:
#   BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_compute_dtype=torch.float16,
#       bnb_4bit_use_double_quant=True,
#       bnb_4bit_quant_type="nf4"
#   )
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the model with the device map and quantization config chosen above
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quant_config,
    device_map=device_map,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

print("Model loaded in 4-bit successfully!")

In [None]:
# `torch` is imported here because this cell reads torch.float16; previously it only
# worked if an earlier cell had already imported torch into the kernel.
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the fine-tuned model and tokenizer
model_path = "./llama3.2-1B-quant-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Use the same quantization config used during training when loading the model
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # Matches the compute dtype of the training cell
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quant_config
)

# Create a text-generation pipeline.
# do_sample=True is set explicitly: `temperature` only has an effect in sampling mode,
# so without it the 0.7 setting may be ignored. This also matches the other
# text-generation pipeline defined later in this notebook.
generate_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
    do_sample=True,
    temperature=0.7,
)



In [None]:
# Fixed instruction placed at the top of every prompt (could be made dynamic as well)
instruction = "Act as a subject expert in Stack Data Structure. Answer the following question."

# Read the question interactively from the user
user_question = input("Ask your question: ")

# Assemble the prompt in the Instruction / Question / Response layout used for fine-tuning
prompt = f"Instruction: {instruction}\nQuestion: {user_question}\nResponse:"

# Generate a single completion and show its text
outputs = generate_pipeline(prompt, num_return_sequences=1)
print(outputs[0]["generated_text"])


Sample output 1 (question: "What is hashing?"):

Response: Hashing techniques are used to transform the data to a smaller fixed-size representation that can be easily stored and retrieved. There are four main types: 1. Direct Methods: 2. Hashing: 3. Collision Resolution: 4. Collision Resolution Methods 1. Direct Methods: Direct methods use a fixed-size array to store the data, and a collision resolution algorithm to identify the duplicate entries. The simplest direct method is linear probing, where each entry is assigned an index and the array is searched from left to right. 2. Hashing: Hashing is a technique where a fixed-size array is divided into buckets and each entry is hashed to the bucket number. The hash function is used to map the data to the bucket number, and the array is searched from the bucket number to retrieve the data.

In [None]:
# Importing  bits and byte module for qunatisating modell into smaller model.
!pip install transformers bitsandbytes peft


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel


In [None]:
# Base Llama checkpoint that was fine-tuned
base_model_name = "meta-llama/Llama-3.2-1B"

# 4-bit NF4 settings, same values as in the training cell
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=quant_config,
)
# NOTE(review): the cache was disabled "for training" in the fine-tuning cell; this cell only
# prepares the model for inference, so confirm that disabling it here is intended.
base_model.config.use_cache = False


In [None]:
# Directory the fine-tuned model was saved to by the training cells
lora_path = "./llama3.2-1B-quant-finetuned"
# Attach the saved adapter to the quantized base model loaded in the previous cell
model = PeftModel.from_pretrained(base_model, lora_path)
model.eval()  # put model in evaluation mode


This cell tests the fine-tuned model.

It loads the base model along with the fine-tuned LoRA adapter, merges them, and then uses the combined model in a text-generation pipeline to see how it responds to a given prompt.

In [None]:
from peft import PeftModel, PeftConfig

# Directory the fine-tuned model was saved to by the training cells
lora_path = "./llama3.2-1B-quant-finetuned"

# 1. Load base model + config
# The adapter config records which base checkpoint it belongs to (base_model_name_or_path, used below).
config = PeftConfig.from_pretrained(lora_path)

# Same 4-bit NF4 / float16 settings as in the training cell
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=quant_config,
    device_map="auto"
)

# 2. Load LoRA adapter
model = PeftModel.from_pretrained(base_model, lora_path)

# 3. Merge LoRA layers into the base model
# NOTE(review): the base model is loaded in 4-bit here, so the merge happens on quantized weights — confirm the result is acceptable.
model = model.merge_and_unload()   # <--- merges LoRA => LlamaForCausalLM

# 4. Load the tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)

# 5. After merging, 'model' is a plain LlamaForCausalLM and can be handed to a pipeline
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,  # sample, using the temperature below
    temperature=0.7,
    max_length=256,
)

# Test prompt in the Instruction / Question / Response layout used for fine-tuning
prompt = (
    "Instruction: Provide a concise summary on tree data structures.\n"
    "Question: What are the key properties and common applications of trees in computer science?\n"
    "Response:"
)

# Generate one completion and print its text
outputs = generator(prompt, num_return_sequences=1)
print(outputs[0]["generated_text"])


Running inference here.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load from local path
# Unlike the earlier loading cells, no quantization_config or device_map is passed here.
tokenizer = AutoTokenizer.from_pretrained("./llama3.2-1B-quant-finetuned")
model = AutoModelForCausalLM.from_pretrained("./llama3.2-1B-quant-finetuned")

# Set model to eval mode
model.eval()


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load your model (either local path or Hugging Face repo).
# A path relative to the working directory is used, matching the location every other
# cell in this notebook saves to and loads from, so the cell is not tied to a
# "/content" working directory.
model_path = "./llama3.2-1B-quant-finetuned"  # Change this to your model folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# ✅ Set model to eval mode
model.eval()

# 🧠 Questions to run through the model
prompts = [
    "What is a Linked List? Give an example.",
    "Explain insert(), delete(), and find() in a linked list.",
    "What are the advantages and disadvantages of using a Linked List?",
    "When should you use a linked list instead of an array?",
    "Explain the difference between singly and doubly linked list.",
    "What is a circular linked list?",
    "Explain a stack with a real-life example.",
    "How does push and pop work in a stack?",
    "What is a queue and how is it different from a stack?",
]

# 🔁 Generate and print a response for each prompt, one at a time
for idx, prompt in enumerate(prompts, start=1):
    print(f"\n🔹 Prompt {idx}: {prompt}\n")

    # Encode the prompt as PyTorch tensors on the same device as the model
    inputs = tokenizer(prompt, padding=True, return_tensors="pt").to(model.device)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        output = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            pad_token_id=tokenizer.eos_token_id,
            max_length=300,
        )

    # Decode the first returned sequence, dropping special tokens
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"🧠 Response:\n{generated_text}")
