In [None]:
!pip install --upgrade bitsandbytes
!pip install -r requirements.txt

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from huggingface_hub import login
import json

with open("config.json", "r") as config_file:
    config = json.load(config_file)
    access_token = config["HF_ACCESS_TOKEN"]

login(token=access_token)

In [None]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset

def format_sample(sample):
    """ Helper function to format a single input sample"""
    instruction=sample['instruction']
    input_text=sample['input']
    output_text=sample['output']

    if input_text is None or input_text=="":
        formatted_prompt=(
            f"<|start_header_id|>user<|end_header_id|>\n\n"
            f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{instruction}\n\n"
            f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{output_text}<|eot_id|>"
        )
    else:
        formatted_prompt=(
            f"<|start_header_id|>user<|end_header_id|>\n\n"
            f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n"
            f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{output_text}<|eot_id|>"
        )
    formatted_prompt="".join(formatted_prompt) # exclude trailing white spaces
    return formatted_prompt                    # stream text into the dataloader, one by one



def gen_train_input():
    """ Format all data input in alpaca style
        Return:
            A generator on train data "train_gen"
    """
    # load data
    ds=load_dataset("iamtarun/python_code_instructions_18k_alpaca",streaming=True, split="train")
    # datata set has 18.6k samples, we use 16.8k (90%) for training + 1.8k for validation
    num_samples=16800
    counter=0
    for sample in iter(ds):
        if counter>=num_samples:
            break
        formatted_prompt=format_sample(sample)
        yield {'text': formatted_prompt}
        counter+=1


def gen_val_input():
    """ Format all data input in alpaca style
        Return:
            A generator on val data "val_gen"
    """
    # load data
    ds=load_dataset("iamtarun/python_code_instructions_18k_alpaca",streaming=True, split="train")
    # datata set has 18.6k samples, we use 16.8k (90%) for training + 1.8k for validation
    num_samples=16800
    counter=0
    for sample in iter(ds):
        if counter<num_samples:
            counter+=1
            continue

        formatted_prompt=format_sample(sample)
        yield {'text': formatted_prompt}
        counter+=1

dataset_train = Dataset.from_generator(gen_train_input)
dataset_val=Dataset.from_generator(gen_val_input)

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/905 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
print(f"Train dataset size: {len(dataset_train)}")
print(f"Validation dataset size: {len(dataset_val)}")

print(f"Sample train:\n{dataset_train[0]}")


Train dataset size: 16800
Validation dataset size: 1812
Sample train:
{'text': '<|start_header_id|>user<|end_header_id|>\n\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a function to calculate the sum of a sequence of integers.\n\n### Input:\n[1, 2, 3, 4, 5]\n\n### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum<|eot_id|>'}


In [None]:
!pip install trl
!pip install torch transformers accelerate peft bitsandbytes datasets



In [None]:
!pip install -q transformers accelerate torch peft bitsandbytes trl datasets huggingface_hub

In [None]:
import torch
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import warnings

# Suppress specific warnings if needed, but be cautious
warnings.filterwarnings("ignore", message=".*padding_side` right should be used.*")

model_name = "meta-llama/Llama-3.2-1B-Instruct"

def create_and_prepare_model(hf_token=None):
    """Loads and prepares the quantized model and tokenizer for Colab GPU."""
    if not torch.cuda.is_available():
        print("--------------------------------------------------")
        print("ERROR: No GPU detected. This code requires a GPU.")
        print("Go to 'Runtime' -> 'Change runtime type' and select a GPU.")
        print("--------------------------------------------------")
        return None, None, None
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")

    compute_dtype = torch.bfloat16
    print(f"Using compute dtype: {compute_dtype}")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
    )

    print(f"Loading model: {model_name} with 4-bit quantization...")
    # Use a try-except block for robustness during model loading
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            torch_dtype=compute_dtype,
            device_map="auto",
            token=hf_token # Ensure hf_token is valid if needed, or rely on login()
        )
        print("Model loaded successfully onto GPU(s).")
    except Exception as e:
        print(f"!!! ERROR loading model: {e}")
        return None, None, None # Exit if model loading fails

    peft_config = LoraConfig(
        lora_alpha=16,
        r=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
    )
    print("LoRA config created.")

    print("Loading tokenizer...")
    # Use a try-except block for robustness during tokenizer loading
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        # *** ADDED DEBUG PRINT ***
        print(f"DEBUG: Tokenizer object loaded: {tokenizer}")
        # *************************
    except Exception as e:
         print(f"!!! ERROR loading tokenizer: {e}")
         # Decide how to handle tokenizer failure - here we exit
         return model, peft_config, None # Return model/peft but None for tokenizer

    # --- Configure Padding ---
    # Check if tokenizer loaded successfully before proceeding
    if tokenizer is not None:
        if tokenizer.pad_token is None:
            print("Warning: No pad token found. Setting pad_token to eos_token.")
            # Important: Ensure the model's embeddings are resized if adding a *new* special token
            # Using eos_token usually avoids this, but good practice to be aware of.
            tokenizer.pad_token = tokenizer.eos_token
            # model.resize_token_embeddings(len(tokenizer)) # Only needed if adding a truly new token

        tokenizer.padding_side = "right" # Ensure padding side is set

        # --- Final Print Statement ---
        # This print statement now only runs if tokenizer is not None
        print(f"Tokenizer loaded. Pad token set to: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id}). Padding side: {tokenizer.padding_side}")
    else:
        print("Skipping final tokenizer print statement as tokenizer failed to load.")


    return model, peft_config, tokenizer


In [None]:
!pip install -q -U transformers accelerate torch peft bitsandbytes trl datasets huggingface_hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import TrainingArguments # Make sure this import ran after restart
from trl import SFTTrainer
import os # Import os for path joining later if needed

# --- Define General Training Arguments ---
print("Defining TrainingArguments...")

# *** IMPORTANT: Define where to save checkpoints - Preferably Google Drive ***
# Ensure Google Drive is mounted first (e.g., using 'from google.colab import drive; drive.mount("/content/drive")')
output_dir_gdrive = "/content/drive/MyDrive/colab_training/llama32-python-save15steps"
# Create the directory if it doesn't exist
os.makedirs(output_dir_gdrive, exist_ok=True)
print(f"Checkpoints will be saved to: {output_dir_gdrive}")
# --------------------------------------------------------------------------

training_args = TrainingArguments(
    output_dir=output_dir_gdrive, # <<< Point to your desired save location (ideally Drive)
    num_train_epochs=3,           # <<< Using 1 epoch as discussed for faster run, adjust if needed (e.g., 2 or 3)
    per_device_train_batch_size=4,  # Keep lower for Colab T4
    per_device_eval_batch_size=4,   # Keep lower for Colab T4
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=15,              # <<< Log around the same time as saving
    # --- Checkpoint Strategy Change ---
    save_strategy="steps",         # <<< CHANGE: Save based on steps
    save_steps=15,                 # <<< ADDED: Save every 15 steps
    save_total_limit=3,            # <<< ADDED: Keep only the last 3 checkpoints (prevents filling storage)
    # ----------------------------------
    # evaluation_strategy defaults to "epoch" (will evaluate once at the end if eval_dataset is provided)
    learning_rate=2e-4,
    bf16=True,
    # tf32=True, # Keep disabled unless on Ampere+ GPU
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    gradient_checkpointing_kwargs={"use_reentrant": False},
    remove_unused_columns=False,    # Important for custom datasets
)
print("TrainingArguments defined successfully (set to save every 15 steps).")

# Ensure model, tokenizer, peft_config, dataset_train, dataset_val are loaded/defined correctly before this
# Example: Assuming these were loaded in a previous cell
# Call the create_and_prepare_model function to initialize the model, peft_config, and tokenizer
model, peft_config, tokenizer = create_and_prepare_model()
# dataset_train, dataset_val = load_and_prepare_datasets(...) # Make sure datasets are loaded

print("Defining SFTTrainer...")
# Using the minimal arguments compatible with trl 0.17.0
trainer = SFTTrainer(
    model=model,                    # Your prepared model
    args=training_args,             # Pass the modified TrainingArguments
    train_dataset=dataset_train,    # Training data
    eval_dataset=dataset_val,       # Evaluation data
    peft_config=peft_config,        # LoRA config
    # Removed: tokenizer, dataset_text_field, max_seq_length, packing
)
print("SFTTrainer defined successfully.")

# --- Start Training ---
print("\nAttempting to start training...")
try:
    # *** CHANGE HERE: Set resume_from_checkpoint=True ***
    # This tells the trainer to automatically look for the latest checkpoint
    # in the 'output_dir' specified in TrainingArguments and resume if found.
    print(f"Attempting to resume training from latest checkpoint in {training_args.output_dir}...")
    train_result = trainer.train(resume_from_checkpoint=True)

    print("Training finished successfully.")

    # --- Optional: Save the final adapter ---
    print("Saving final adapter model (after training completion)...")
    final_save_path = os.path.join(output_dir_gdrive, "final_adapter")
    trainer.save_model(final_save_path)
    print(f"Final adapter saved to: {final_save_path}")

    # You might also want to save metrics and state explicitly at the end
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print("Final trainer state and metrics saved.")

except Exception as e:
    print(f"\n--- TRAINING FAILED ---")
    print(f"Error during training: {e}")
    import traceback
    traceback.print_exc()
    print("-----------------------")

Defining TrainingArguments...
Checkpoints will be saved to: /content/drive/MyDrive/colab_training/llama32-python-save15steps
TrainingArguments defined successfully (set to save every 15 steps).
GPU detected: Tesla T4
Using compute dtype: torch.bfloat16
Loading model: meta-llama/Llama-3.2-1B-Instruct with 4-bit quantization...


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Model loaded successfully onto GPU(s).
LoRA config created.
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

DEBUG: Tokenizer object loaded: PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-1B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("

Converting train dataset to ChatML:   0%|          | 0/16800 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/16800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16800 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1812 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/1812 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1812 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1812 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


SFTTrainer defined successfully.

Attempting to start training...
Attempting to resume training from latest checkpoint in /content/drive/MyDrive/colab_training/llama32-python-save15steps...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1440,0.6156
1455,0.6798
1470,0.5953
1485,0.5876
1500,0.6327
1515,0.6239
1530,0.5663
1545,0.6268
1560,0.6045
1575,0.5946


Training finished successfully.
Saving final adapter model (after training completion)...
Final adapter saved to: /content/drive/MyDrive/colab_training/llama32-python-save15steps/final_adapter
***** train metrics *****
  total_flos               = 82619595GF
  train_loss               =     0.0584
  train_runtime            = 1:30:11.09
  train_samples_per_second =      9.314
  train_steps_per_second   =      0.291
Final trainer state and metrics saved.


In [None]:
!pip uninstall -y torch torchvision torchaudio transformers fastai accelerate peft bitsandbytes trl datasets huggingface_hub
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
!pip install -q -U transformers accelerate peft bitsandbytes trl datasets huggingface_hub

Found existing installation: torch 2.7.0
Uninstalling torch-2.7.0:
  Successfully uninstalled torch-2.7.0
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Found existing installation: fastai 2.7.19
Uninstalling fastai-2.7.19:
  Successfully uninstalled fastai-2.7.19
Found existing installation: accelerate 1.6.0
Uninstalling accelerate-1.6.0:
  Successfully uninstalled accelerate-1.6.0
Found existing installation: peft 0.15.2
Uninstalling peft-0.15.2:
  Successfully uninstalled peft-0.15.2
Found existing installation: bitsandbytes 0.45.5
Uninstalling bitsandbytes-0.45.5:
  Successfully uninstalled bitsandbytes-0.45.5
Found exist

In [None]:
import torch
from peft import PeftModel # LoraConfig is not needed for loading here
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Added BitsAndBytesConfig
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings("ignore", message=".*padding_side` right should be used.*")

def load_quantized_lora_model(base_model_id, adapter_directory, hf_token=None):
    """
    Loads the base model with 4-bit quantization and then applies the LoRA adapter.

    Args:
        base_model_id (str): Identifier for the base model (e.g., "meta-llama/Llama-3.2-1B-Instruct").
        adapter_directory (str): Path to the directory containing adapter_config.json and adapter weights.
        hf_token (str, optional): Hugging Face token if needed for base model. Defaults to None.

    Returns:
        tuple: (model_with_adapters, tokenizer) or (None, None) if loading fails.
    """
    print(f"Loading base model: {base_model_id} with 4-bit quantization...")

    # --- Load Base Model with Quantization (Recommended for Colab) ---
    if not torch.cuda.is_available():
        print("Warning: CUDA not available, loading model on CPU in float32. May be slow/require lots of RAM.")
        model_dtype = torch.float32
        quantization_config = None
        device_map = "cpu"
    else:
        print(f"GPU detected: {torch.cuda.get_device_name(0)}")
        model_dtype = torch.bfloat16 # Use bf16 for compute on compatible GPUs
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=model_dtype,
            bnb_4bit_use_double_quant=True, # Same as training
        )
        device_map = "auto"

    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            quantization_config=quantization_config,
            torch_dtype=model_dtype,
            device_map=device_map,
            token=hf_token,
            # trust_remote_code=True # Only if needed by model
        )
        print("Base model loaded.")
    except Exception as e:
        print(f"!!! ERROR loading base model: {e}")
        return None, None

    # --- Load Tokenizer ---
    print("Loading tokenizer...")
    try:
        # Load tokenizer associated with the base model or from where it was saved
        # Using base_model_id is usually safe if tokenizer wasn't modified heavily
        tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=hf_token)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right" # Consistent setting
        print("Tokenizer loaded.")
    except Exception as e:
        print(f"!!! ERROR loading tokenizer: {e}")
        # Clean up loaded base model if tokenizer fails
        del base_model
        if torch.cuda.is_available(): torch.cuda.empty_cache()
        return None, None

    # --- Load LoRA Adapters ---
    print(f"Loading LoRA adapters from: {adapter_directory}")
    try:
        # This automatically loads adapter_config.json and adapter_model.safetensors
        # and attaches the adapters to the base_model
        model_with_adapters = PeftModel.from_pretrained(base_model, adapter_directory)
        print("LoRA adapters loaded successfully.")
    except Exception as e:
        print(f"!!! ERROR loading LoRA adapters: {e}")
        # Clean up loaded base model/tokenizer if adapter loading fails
        del base_model, tokenizer
        if torch.cuda.is_available(): torch.cuda.empty_cache()
        return None, None

    # Set to evaluation mode (PeftModel.from_pretrained might do this, but explicit is fine)
    model_with_adapters.eval()
    print("Model set to evaluation mode.")

    return model_with_adapters, tokenizer

# --- Define Paths ---
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"
# *** IMPORTANT: Use the DIRECTORY where adapters were saved ***
adapter_directory = "/content/drive/MyDrive/colab_training/llama32-python-save15steps/final_adapter" # Or path to a specific checkpoint folder like ".../checkpoint-1575"

# --- Load Model ---
print("Loading fine-tuned model using PeftModel.from_pretrained...")
# Make sure you have logged in or provide token if needed
model_ft, tokenizer = load_quantized_lora_model(base_model_id, adapter_directory)

# --- Check if loading was successful ---
if model_ft is not None and tokenizer is not None:
    print("\nModel and tokenizer loaded successfully!")
    # Count parameters (Optional)
    total_params = sum(p.numel() for p in model_ft.parameters())
    trainable_params = sum(p.numel() for p in model_ft.parameters() if p.requires_grad)
    print(f"Total parameters (including base model): {total_params:,}")
    print(f"Trainable parameters (LoRA adapters): {trainable_params:,}")

    # --- Ready for Inference ---
    # Example:
    # prompt = "Your prompt here"
    # inputs = tokenizer(prompt, return_tensors="pt").to(model_ft.device)
    # outputs = model_ft.generate(**inputs, max_new_tokens=100)
    # print(tokenizer.decode(outputs[0], skip_special_tokens=True))
else:
    print("\nFailed to load the model or tokenizer.")

Loading fine-tuned model using PeftModel.from_pretrained...
Loading base model: meta-llama/Llama-3.2-1B-Instruct with 4-bit quantization...
GPU detected: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Base model loaded.
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Tokenizer loaded.
Loading LoRA adapters from: /content/drive/MyDrive/colab_training/llama32-python-save15steps/final_adapter
LoRA adapters loaded successfully.
Model set to evaluation mode.

Model and tokenizer loaded successfully!
Total parameters (including base model): 760,547,328
Trainable parameters (LoRA adapters): 0


In [None]:
import torch
from transformers import AutoTokenizer # Assuming tokenizer and model are loaded elsewhere

# Example EOS token IDs for Llama 3 (replace if needed)
# You can often get this from tokenizer.eos_token_id or tokenizer.convert_tokens_to_ids(...)
LLAMA3_EOS_IDS = [
    128001, # <|eot_id|>
    128009, # <|end_of_text|>
]

def generate_with_hf(model, tokenizer, prompt, max_new_tokens=256, temperature=0.6, top_k=50, top_p=0.9, eos_token_id=LLAMA3_EOS_IDS):
    """
    Generates text using the Hugging Face model.generate() method with Llama 3 chat template.

    Args:
        model: The loaded Hugging Face model (e.g., PeftModelForCausalLM).
        tokenizer: The loaded Hugging Face tokenizer.
        prompt (str): The user's instruction or question.
        max_new_tokens (int): Maximum number of new tokens to generate.
        temperature (float): Controls randomness. Higher values (e.g., 0.7) make output more random,
                           lower values (e.g., 0.2) make it more deterministic. 0 disables sampling (greedy).
        top_k (int): Filters predictions to the top K most likely tokens. 0 disables top-k.
        top_p (float): Filters predictions using nucleus sampling (cumulative probability). 1.0 disables top-p.
        eos_token_id (int or list[int]): The token ID(s) that signify the end of generation.

    Returns:
        str: The generated text response.
    """
    # Ensure model and tokenizer are loaded
    if model is None or tokenizer is None:
        raise ValueError("Model and tokenizer must be loaded before calling generate.")

    # Get the device the model is on
    model_device = next(model.parameters()).device

    # --- Apply the Llama 3 Chat Template ---
    # Create the message structure expected by the template
    messages = [
        {"role": "user", "content": prompt},
        # Add previous conversation turns here if needed, e.g.:
        # {"role": "assistant", "content": "Previous response..."},
        # {"role": "user", "content": "Follow-up question..."}
    ]

    # Apply the template. add_generation_prompt=True adds the required tokens
    # for the model to start its response (e.g., '<|start_header_id|>assistant<|end_header_id|>\n\n')
    try:
        # Note: Some tokenizers might not require add_generation_prompt=True if the template handles it
        inputs_templated = tokenizer.apply_chat_template(
            messages,
            tokenize=True,          # Return token IDs
            add_generation_prompt=True, # Add the prompt for the assistant's turn
            return_tensors="pt"     # Return PyTorch tensors
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        print("Ensure your tokenizer supports chat templates or format the prompt manually.")
        # Fallback manual formatting (less ideal)
        formatted_prompt = (
            f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        inputs_templated = tokenizer(formatted_prompt, return_tensors="pt")


    # Move inputs to the same device as the model
    inputs = inputs_templated.to(model_device)
    input_length = inputs.input_ids.shape[1] # Get length of the input prompt tokens

    # --- Set Sampling Parameters ---
    do_sample = True
    if temperature <= 0.0 or temperature == 1.0 and top_k == 1 and top_p == 1.0:
        # If temperature is 0 or effectively greedy, disable sampling
        do_sample = False
        temperature = 1.0 # Set temp to 1.0 for greedy decoding
        top_k = 1         # Set top_k to 1 for greedy decoding
        top_p = 1.0       # Set top_p to 1.0 for greedy decoding
        print("Using greedy decoding (temperature=0 or equivalent).")
    else:
        print(f"Using sampling: temp={temperature}, top_k={top_k}, top_p={top_p}")

    # --- Generate ---
    print(f"Generating response (max_new_tokens={max_new_tokens})...")
    with torch.no_grad(): # Ensure no gradients are calculated during inference
        # Generate output tokens
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=max_new_tokens,
            eos_token_id=eos_token_id,
            do_sample=do_sample,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id # Use pad token ID if set, otherwise EOS
            # attention_mask=inputs.attention_mask # Pass attention mask if your input needs it (usually handled by generate)
        )

    # --- Decode the Output ---
    # Get only the generated tokens (excluding the input prompt)
    generated_ids = outputs[0, input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    print("Generation complete.")
    return generated_text

# --- Example Usage ---
# Assuming 'model_ft' and 'tokenizer' are loaded from the previous step

# if model_ft is not None and tokenizer is not None:
#     user_prompt = "Write a short Python function to calculate the factorial of a number."
#     print(f"\nUser Prompt:\n{user_prompt}\n")
#
#     response = generate_with_hf(
#         model=model_ft,
#         tokenizer=tokenizer,
#         prompt=user_prompt,
#         max_new_tokens=150,
#         temperature=0.2, # Lower temp for more factual code
#         top_k=10,
#         top_p=1.0 # Disable top-p for this example
#     )
#
#     print(f"Generated Response:\n{response}")
# else:
#     print("Model or tokenizer not loaded, cannot generate.")



In [None]:
import torch
from transformers import AutoTokenizer # Assuming tokenizer and model are loaded elsewhere

# Example EOS token IDs for Llama 3 (replace if needed)
LLAMA3_EOS_IDS = [
    128001, # <|eot_id|>
    128009, # <|end_of_text|>
]

def generate_with_hf(model, tokenizer, prompt, max_new_tokens=256, temperature=0.6, top_k=50, top_p=0.9, eos_token_id=LLAMA3_EOS_IDS):
    """
    Generates text using the Hugging Face model.generate() method with Llama 3 chat template.
    Corrected to handle tokenizer output structure properly.

    Args:
        model: The loaded Hugging Face model (e.g., PeftModelForCausalLM).
        tokenizer: The loaded Hugging Face tokenizer.
        prompt (str): The user's instruction or question.
        max_new_tokens (int): Maximum number of new tokens to generate.
        temperature (float): Controls randomness. Higher values (e.g., 0.7) make output more random,
                           lower values (e.g., 0.2) make it more deterministic. 0 disables sampling (greedy).
        top_k (int): Filters predictions to the top K most likely tokens. 0 disables top-k.
        top_p (float): Filters predictions using nucleus sampling (cumulative probability). 1.0 disables top-p.
        eos_token_id (int or list[int]): The token ID(s) that signify the end of generation.

    Returns:
        str: The generated text response.
    """
    # Ensure model and tokenizer are loaded
    if model is None or tokenizer is None:
        raise ValueError("Model and tokenizer must be loaded before calling generate.")

    # Get the device the model is on
    model_device = next(model.parameters()).device

    # --- Apply the Llama 3 Chat Template ---
    messages = [
        {"role": "user", "content": prompt},
    ]
    try:
        inputs_dict = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        )
        # *** Correction: Move the specific tensor to device ***
        input_ids = inputs_dict['input_ids'].to(model_device)
        # attention_mask might also be present and needed
        attention_mask = inputs_dict.get('attention_mask') # Use .get() for safety
        if attention_mask is not None:
            attention_mask = attention_mask.to(model_device)

    except Exception as e:
        print(f"Error applying chat template: {e}")
        print("Falling back to manual formatting (less ideal).")
        formatted_prompt = (
            f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        inputs_dict = tokenizer(formatted_prompt, return_tensors="pt")
        # *** Correction: Move the specific tensor to device ***
        input_ids = inputs_dict['input_ids'].to(model_device)
        attention_mask = inputs_dict.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.to(model_device)


    # *** Correction: Get length from the input_ids tensor ***
    input_length = input_ids.shape[1]

    # --- Set Sampling Parameters ---
    do_sample = True
    if temperature <= 0.0 or temperature == 1.0 and top_k == 1 and top_p == 1.0:
        do_sample = False
        temperature = 1.0
        top_k = 1
        top_p = 1.0
        print("Using greedy decoding (temperature=0 or equivalent).")
    else:
        print(f"Using sampling: temp={temperature}, top_k={top_k}, top_p={top_p}")

    # --- Generate ---
    print(f"Generating response (max_new_tokens={max_new_tokens})...")
    with torch.no_grad():
        # *** Correction: Pass input_ids and optionally attention_mask ***
        generate_kwargs = {
            "input_ids": input_ids,
            "max_new_tokens": max_new_tokens,
            "eos_token_id": eos_token_id,
            "do_sample": do_sample,
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
            "pad_token_id": tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
        }
        # Only add attention_mask if it exists
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask

        outputs = model.generate(**generate_kwargs)


    # --- Decode the Output ---
    generated_ids = outputs[0, input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    print("Generation complete.")
    return generated_text

# --- Example Usage ---
# Assuming 'model_ft' and 'tokenizer' are loaded from the previous step

if 'model_ft' in globals() and model_ft is not None and 'tokenizer' in globals() and tokenizer is not None:

    user_prompt = "Write a function that computes fibonacci numbers."
    print(f"\nUser Prompt:\n{user_prompt}\n")

    response = generate_with_hf(
        model=model_ft,
        tokenizer=tokenizer,
        prompt=user_prompt,
        max_new_tokens=150,
        temperature=0.2,
        top_k=10,
        top_p=1.0
    )

    print(f"Generated Response:\n-------------------\n{response}\n-------------------")

else:
    print("Model 'model_ft' or 'tokenizer' is not loaded. Please run the loading cell first.")



User Prompt:
Write a function that computes fibonacci numbers.

Error applying chat template: too many indices for tensor of dimension 2
Falling back to manual formatting (less ideal).
Using sampling: temp=0.2, top_k=10, top_p=1.0
Generating response (max_new_tokens=150)...
Generation complete.
Generated Response:
-------------------
# Python program to compute Fibonacci numbers

def fibonacci(n):
    a, b = 0, 1
    if n < 0:
        print("Incorrect input")
    elif n == 0:
        return a
    elif n == 1:
        return b
    else:
        for i in range(2, n+1):
            c = a + b
            a = b
            b = c
        return c

# Driver code
print(fibonacci(9))
-------------------
