In [1]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    get_linear_schedule_with_warmup,
)
from peft import PeftModel, LoraConfig, TaskType, get_peft_model
import os
from tqdm import tqdm
from torch.utils.data import DataLoader
import bitsandbytes as bnb

def prepare_dataset(file_path, tokenizer, max_length=256):
    """Loads a JSONL file and tokenizes it for causal language modeling."""
    dataset = load_dataset("json", data_files=file_path)["train"]

    def tokenize_function(examples):
        if "prompt" not in examples or "completion" not in examples:
            raise KeyError("Dataset must have 'prompt' and 'completion' fields.")

        # Combine prompt and completion for Causal LM fine-tuning
        full_text = [
            f"Human: {p}\nBart:{c}{tokenizer.eos_token}"
            for p, c in zip(examples["prompt"], examples["completion"])
        ]
        model_inputs = tokenizer(
            full_text, truncation=True, max_length=max_length, padding="max_length"
        )

        # Create labels and mask the prompt portion
        labels = torch.tensor(model_inputs["input_ids"])
        prompts_only = [f"Human: {p}\nBart:" for p in examples["prompt"]]
        prompt_toks = tokenizer(prompts_only, add_special_tokens=False)
        prompt_lengths = [len(p) for p in prompt_toks["input_ids"]]

        for i, length in enumerate(prompt_lengths):
            labels[i, :length] = -100  # Mask prompt tokens

        model_inputs["labels"] = labels
        return model_inputs

    tokenized_dataset = dataset.map(
        tokenize_function, batched=True, remove_columns=dataset.column_names
    )
    tokenized_dataset.set_format(type="torch")
    return tokenized_dataset


def load_model(model_name, peft_config, peft_model_path=None):
    """Loads the base model with 4-bit quantization and applies PEFT adapters."""
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=bnb_config, device_map="auto"
    )

    if peft_model_path and os.path.exists(os.path.join(peft_model_path, "adapter_config.json")):
        print(f"Loading PEFT adapters from {peft_model_path}")
        model = PeftModel.from_pretrained(base_model, peft_model_path)
    else:
        print("Creating new PEFT model")
        model = get_peft_model(base_model, peft_config)

    model.print_trainable_parameters()
    return model


def fine_tune(model, dataset, output_dir, num_epochs, batch_size=1, learning_rate=1e-4):
    """Performs the training loop for the PEFT model."""
    model.train()
    optimizer = bnb.optim.PagedAdamW8bit(model.parameters(), lr=learning_rate)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=len(dataloader) * num_epochs
    )

    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            # Data is on CPU, must be moved to the model's device (GPU)
            batch = {k: v.to("cuda") for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            progress_bar.set_postfix({"loss": loss.item()})

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

        model.save_pretrained(os.path.join(output_dir, f"checkpoint-epoch-{epoch+1}"))

    return model


def main():
    """Main function to run the two-stage fine-tuning process."""
    model_name = "/home/jj/Llama-3.2-11B-Vision-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
    )

    # Stage 1: Informal language fine-tuning
    informal_model_path = "./informal_finetuned"
    if not os.path.exists(os.path.join(informal_model_path, "adapter_model.safetensors")):
        print("--- Stage 1: Informal language fine-tuning ---")
        informal_dataset = prepare_dataset("informal.jsonl", tokenizer)
        model = load_model(model_name, peft_config)
        model = fine_tune(model, informal_dataset, informal_model_path, num_epochs=3)
    else:
        print("Skipping Stage 1: Informal fine-tuned model already exists")

    # Stage 2: Bart Simpson-specific fine-tuning
    print("\n--- Stage 2: Bart Simpson-specific fine-tuning ---")
    bart_dataset = prepare_dataset("bart.jsonl", tokenizer)
    bart_model_path = "./bart_finetuned"
    model = load_model(model_name, peft_config, informal_model_path)
    model = fine_tune(model, bart_dataset, bart_model_path, num_epochs=5)

    model.save_pretrained(bart_model_path)
    tokenizer.save_pretrained(bart_model_path)
    print("\nFine-tuning completed. Final model saved to:", bart_model_path)


if __name__ == "__main__":
    main()

--- Stage 1: Informal language fine-tuning ---


Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating new PEFT model
trainable params: 4,259,840 || all params: 9,779,451,920 || trainable%: 0.0436


Epoch 1/3: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:09<00:00,  2.91it/s, loss=1.35]


Epoch 1/3, Average Loss: 2.4602


Epoch 2/3: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:09<00:00,  3.04it/s, loss=0.272]


Epoch 2/3, Average Loss: 0.3527


Epoch 3/3: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:09<00:00,  2.98it/s, loss=0.272]


Epoch 3/3, Average Loss: 0.3014

--- Stage 2: Bart Simpson-specific fine-tuning ---


Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating new PEFT model
trainable params: 4,259,840 || all params: 9,779,451,920 || trainable%: 0.0436


Epoch 1/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:29<00:00,  3.02it/s, loss=0.409]


Epoch 1/5, Average Loss: 1.2458


Epoch 2/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:29<00:00,  3.00it/s, loss=0.427]


Epoch 2/5, Average Loss: 0.3611


Epoch 3/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:29<00:00,  3.03it/s, loss=0.277]


Epoch 3/5, Average Loss: 0.2697


Epoch 4/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:29<00:00,  3.03it/s, loss=0.217]


Epoch 4/5, Average Loss: 0.2275


Epoch 5/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:29<00:00,  3.02it/s, loss=0.216]


Epoch 5/5, Average Loss: 0.2011

Fine-tuning completed. Final model saved to: ./bart_finetuned


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import os

def load_model_and_tokenizer(model_path):
    """Loads the 4-bit quantized model and tokenizer from a PEFT checkpoint."""
    # Step 1: Load the PEFT config to get the base model name
    config = PeftConfig.from_pretrained(model_path)
    base_model_name = config.base_model_name_or_path

    # Step 2: Setup 4-bit quantization config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Step 3: Load the base model with quantization
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # Step 4: Load the PEFT model by combining the base model and adapters
    model = PeftModel.from_pretrained(base_model, model_path)
    model.eval()  # Set the model to evaluation mode

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def generate_response(model, tokenizer, prompt, max_new_tokens=100):
    """Generates a response from the model based on a user prompt."""
    # Format the prompt to match the training data
    full_prompt = f"Human: {prompt}\nBart:"
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(model.device)

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode and extract only the new, generated tokens
    response = tokenizer.decode(output_ids[0][len(input_ids[0]):], skip_special_tokens=True)
    return response.strip()

def main():
    """Main function to load the model and start a chat session."""
    model_path = "./bart_finetuned"
    
    if not os.path.exists(model_path):
        print(f"Error: Model path not found at {model_path}")
        return

    print("Loading fine-tuned model...")
    model, tokenizer = load_model_and_tokenizer(model_path)
    
    print("\nAy, caramba! Model loaded. Ask me anything, man. Type 'exit' to quit.")
    
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() == 'exit':
            break
        
        response = generate_response(model, tokenizer, user_input)
        print(f"Bart: {response}")

if __name__ == "__main__":
    main()

Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]


Ay, caramba! Model loaded. Ask me anything, man. Type 'exit' to quit.



You:  what is storage?


Bart: Storage? It's like... having a really big treehouse for your stuff, dude. You can hide anything in there and forget about it. But, you know, for things instead of toys. Unless you're me, then it's all about toys... and chocolate... in storage. D'oh!
