In [24]:
## Importing necessary libararies
import torch
torch.cuda.empty_cache()


In [42]:
## Hugging face login --> for importing llama model
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: read)

In [26]:
# Install required libraries
!pip install transformers datasets bitsandbytes peft

# Import necessary libraries
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model

# Load your dataset from the local JSON file
dataset = load_dataset("json", data_files={"train": "new.json"})["train"]

# Preprocess your dataset: combine the fields into one text prompt.
def preprocess(example):
    return {
        "text": (
            f"Instruction: {example['instruction']}\n"
            f"Question: {example['question']}\n"
            f"Response: {example['response']}\n"
        )
    }
dataset = dataset.map(preprocess)
print(dataset[0])  # Optional: check a sample

# Set up quantization configuration for 4-bit training
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # Compute in FP16
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"  # Use NF4 quantization type (you can experiment with "fp4" as well)
)

# Load Llama 3.2 1B model and tokenizer using 4-bit quantization
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)
model.config.use_cache = False  # Disable cache for training

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

# Set up LoRA configuration to further reduce the number of trainable parameters
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

# Tokenize the dataset
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512
    )

tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

# Set up the data collator for causal LM (handles padding)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Disable masked LM for causal models
)

# Define training arguments (adjust hyperparameters as needed)
training_args = TrainingArguments(
    output_dir="./llama3.2-1B-quant-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Use a smaller batch size to reduce memory usage
    gradient_accumulation_steps=8,  # Adjust to achieve desired effective batch size
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    warmup_ratio=0.1,
    fp16=True,  # Mixed precision can still be enabled if supported
    report_to=[]  # Disable wandb logging if not needed
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Start fine-tuning (this should now use significantly less GPU memory)
trainer.train()

# Save the fine-tuned model
trainer.save_model("./llama3.2-1B-quant-finetuned")


{'instruction': 'Act as a subject expert in Data Structures and Algorithms.', 'question': 'What are variables in programming, and why are they important?', 'response': 'Variables are symbolic names for storage locations that hold data in a computer program. They allow us to label and manipulate data, similar to how algebra uses letters to represent numbers in equations. For example, in a simple math equation, variables x and y might hold values 10 and 20, respectively. In programming, variables let us store user input, process information, and keep track of state. Without variables, every operation would have to be expressed in raw binary or machine code, making programs extremely difficult to write and maintain. Their importance lies in making code more readable, maintainable, and efficient by abstracting data storage.', 'text': 'Instruction: Act as a subject expert in Data Structures and Algorithms.\nQuestion: What are variables in programming, and why are they important?\nResponse: 

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,1.9734
20,1.7829
30,1.5843
40,1.5344
50,1.4956
60,1.4336
70,1.3533
80,1.4464
90,1.3695
100,1.3252


In [27]:
trainer.save_model("./llama3.2-1B-quant-finetuned")

# Save the tokenizer
tokenizer.save_pretrained("./llama3.2-1B-quant-finetuned") # Save the tokenizer to the same directory


('./llama3.2-1B-quant-finetuned/tokenizer_config.json',
 './llama3.2-1B-quant-finetuned/special_tokens_map.json',
 './llama3.2-1B-quant-finetuned/tokenizer.json')

In [28]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_path = "./llama3.2-1B-quant-finetuned"

# Specify device_map to load the model onto the appropriate device(s)
# Adjust the device IDs as needed for your system

# If you have multiple GPUs, you can try to load the model across them:
# device_map = {"": 0, "lm_head": 1}  # Example for a two-GPU setup

# If you only have one GPU:
device_map = "auto"  # Let Transformers automatically choose the device

# Use the original quantization config or create a new one with offloading enabled
# original_quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4"
# )
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for compute dtype
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading for specific layers
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device_map,  # Apply the device map
    quantization_config=quant_config, # Use the specified quantization config
    # or quantization_config=original_quant_config,  # Use the original config
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

print("Model loaded in 4-bit successfully!")

Model loaded in 4-bit successfully!


In [31]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the fine-tuned model and tokenizer
model_path = "./llama3.2-1B-quant-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Use the same quantization config used during training when loading the model
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # Assuming you used float16 during training
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quant_config # This is the key change
)

# Create a text-generation pipeline
generate_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
    temperature=0.7,
)

# ... (Rest of your code for inference)
# Define a prompt for testing
prompt = (
    "Instruction: Act as a subject expert in Stack Data Structure. Answer the following question.\n"
    "Question: What is Stack Data Structure?\n"
    "Response:"
)

# Generate text
outputs = generate_pipeline(prompt, num_return_sequences=1)
print(outputs[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Instruction: Act as a subject expert in Stack Data Structure. Answer the following question.
Question: What is Stack Data Structure?
Response: Stack Data Structure: Stack is a linear data structure that consists of a sequence of elements, with the first element at the beginning and the last element at the end. Operations: 1. Push: Add an element at the top of the stack. 2. Pop: Remove an element from the top of the stack. 3. Top: Return the top element without removing it. 4. Is Empty: Check if the stack is empty. 5. Size: Count the number of elements in the stack. 6. Rearrange: Move elements from the bottom of the stack to the top. 7. Balance: Check if the stack is balanced (i.e., the number of left and right parentheses is equal). 8. Implement Stack using Linked List: Linked List can be used to represent the stack by storing the elements in a


Sample Output 1{Question was What is Hashing}:

Response: Hashing techniques are used to transform the data to a smaller fixed-size representation that can be easily stored and retrieved. There are four main types: 1. Direct Methods: 2. Hashing: 3. Collision Resolution: 4. Collision Resolution Methods 1. Direct Methods: Direct methods use a fixed-size array to store the data, and a collision resolution algorithm to identify the duplicate entries. The simplest direct method is linear probing, where each entry is assigned an index and the array is searched from left to right. 2. Hashing: Hashing is a technique where a fixed-size array is divided into buckets and each entry is hashed to the bucket number. The hash function is used to map the data to the bucket number, and the array is searched from the bucket number to retrieve the data.

In [32]:
# Importing  bits and byte module for qunatisating modell into smaller model.
!pip install transformers bitsandbytes peft




In [33]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel


In [34]:
base_model_name = "meta-llama/Llama-3.2-1B"  # The base Llama model you used
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto",
)
base_model.config.use_cache = False


In [35]:
lora_path = "./llama3.2-1B-quant-finetuned"
model = PeftModel.from_pretrained(base_model, lora_path)
model.eval()  # put model in evaluation mode


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

This is basically testing out fine tunned model.

It loads the base model along with the fine-tuned LoRA adapter, merges them, and then uses the combined model in a text-generation pipeline to see how it responds to a given prompt.

In [37]:
from peft import PeftModel, PeftConfig

lora_path = "./llama3.2-1B-quant-finetuned"

# 1. Load base model + config
config = PeftConfig.from_pretrained(lora_path)

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=quant_config,
    device_map="auto"
)

# 2. Load LoRA adapter
model = PeftModel.from_pretrained(base_model, lora_path)

# 3. Merge LoRA layers into the base model
model = model.merge_and_unload()   # <--- merges LoRA => LlamaForCausalLM

# 4. Load the tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)

# 5. Now 'model' is a standard LlamaForCausalLM class
from transformers import pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,
    temperature=0.7,
    do_sample=True
)

prompt = (
    "Instruction: Provide a concise summary on tree data structures.\n"
    "Question: What are the key properties and common applications of trees in computer science?\n"
    "Response:"
)

outputs = generator(prompt, num_return_sequences=1)
print(outputs[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Instruction: Provide a concise summary on tree data structures.
Question: What are the key properties and common applications of trees in computer science?
Response: Trees are a fundamental data structure in computer science. They allow us to organize data in a hierarchical manner, which is essential for many applications such as search and retrieval. Trees are used in many different contexts, including graph theory, data structures, and algorithms.
Trees can be implemented using different data structures, such as binary trees, binary search trees, AVL trees, and more. Binary trees are a popular choice because they are efficient and easy to implement. Binary search trees are a specialized case of binary trees, where each node has a key value and a pointer to its left and right children. AVL trees are another popular choice, which are also efficient and easy to implement.
Trees are used for many different applications, including searching and retrieval. Search algorithms are designed to