# This is the colab link where I ran this: https://colab.research.google.com/drive/1oAtbXx3sf1z1UrW8eiDKC-jk-uSXF_7m?usp=sharing

### Installation for Colab

In [None]:
!pip install unsloth[colab-new]
!pip install --no-deps "xformers<0.0.29" "trl<0.13.0" peft accelerate bitsandbytes

## Load model unsloth/Llama-3.2-1B-Instruct-bnb-4bit

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to load. Kept in one variable so the load call and the log
# message below cannot drift apart.
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
max_seq_length = 2048 # Also passed to the trainer as its max_seq_length
dtype = None # Auto-detection
load_in_4bit = True # Forwarded to from_pretrained as load_in_4bit

print("Load the base model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
print(f"Base model: '{model_name}' loaded locally")

### Test the loaded model

In [None]:
# Enable fast inference mode
# Called on the freshly loaded base model, before the LoRA adapters are
# attached, so the generation cells that follow exercise the untuned model.
FastLanguageModel.for_inference(model) 

In [None]:
# Single-turn conversation used to sanity-check the base model.
# When probing a fine-tuned model, include speaker names the same way the
# training data does.
messages = [
    {
        "role": "user",
        "content": "Está avanzando mucho la IA últimamente? responde en plain text y respuesta muy corta",
    },
]

# Render the conversation through the chat template and tokenize it.
# add_generation_prompt leaves the prompt ending on an open assistant turn,
# which tells the model it is its turn to talk.
prompt_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Move the prompt tensor to the CUDA device.
inputs = prompt_ids.to("cuda")

In [None]:
# Generate the response.
# Sampling is enabled explicitly: temperature and top_p only influence
# decoding when do_sample is True. Without it, whether they apply depends on
# the defaults stored in the model's generation config.
outputs = model.generate(
    input_ids = inputs,
    max_new_tokens = 64, # Keep it short for chat responses
    use_cache = True,
    do_sample = True,    # Required for temperature / top_p to take effect
    temperature = 0.8,   # Higher = more creative/random
    top_p = 0.9,
)

In [None]:
# Decode the generated ids back to text and print only the assistant's reply.
response = tokenizer.batch_decode(outputs)

assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
end_of_turn = "<|eot_id|>"

# Keep what follows the last assistant header, then cut at the first
# end-of-turn marker.
after_header = response[0].rpartition(assistant_header)[2]
print(after_header.partition(end_of_turn)[0])

In [None]:
print("Apply LoRA adapters to train specific layers only")

# Module names that receive LoRA adapters: the q/k/v/o projections plus the
# gate/up/down projections.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the base model with the adapters; `model` is rebound to the result.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,  # Rank: increase for more complex patterns (e.g., 32)
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print("applied")

In [None]:
from datasets import load_dataset

# Load the custom JSONL chat export as the "train" split.
# NOTE(review): the path is relative ("content/..."), so it resolves against
# the current working directory, not the absolute /content/ folder. Confirm
# this matches where the file actually lives in the runtime you use.
dataset = load_dataset("json", data_files={"train": "content/whatsapp_chat_cleaned.jsonl"}, split="train")

### Format with chat templates

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into a chat-template string.

    Reads the ``"messages"`` column of ``examples`` and returns a dict with a
    single ``"text"`` column holding one untokenized string per conversation,
    produced by the module-level ``tokenizer``.
    """
    rendered = []
    for conversation in examples["messages"]:
        # tokenize=False keeps the result as plain text; no generation prompt
        # is appended to the rendered conversation.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add the rendered "text" column to the dataset; batched=True hands the
# formatter a batch of rows per call.
dataset = dataset.map(formatting_prompts_func, batched = True)

## Configure and start training (fine tuning)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for the run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,  # Increase this to 300+ for better "humor" learning
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the "text" column built from the chat template.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

trainer.train()

## Test fine-tuned model

In [None]:
# Switch the model to fast inference mode before generating.
FastLanguageModel.for_inference(model)

# Test prompt written in a "Name: message" shape.
messages = [
    {
        "role": "user",
        "content": "Jeter: Muchachos, que framework de backend me recomiendan?",
    },
]

# Tokenize through the chat template, ending on an open assistant turn.
prompt_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = prompt_ids.to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=64)

# Keep the decoded text of the first sequence for the extraction step.
decoded_batch = tokenizer.batch_decode(outputs)
response_string = decoded_batch[0]

### Extract response

In [None]:
def _extract_between(text: str, start_tag: str, end_tag: str) -> str:
    """Return the stripped text between ``start_tag`` and the next ``end_tag``.

    Returns an empty string when ``start_tag`` does not occur in ``text``.
    When ``end_tag`` is missing after the start tag (for example because
    generation stopped at ``max_new_tokens`` before emitting it), everything
    after ``start_tag`` is returned. Using the ``-1`` from ``str.find`` as a
    slice bound would instead silently drop the last character.
    """
    start = text.find(start_tag)
    if start == -1:
        return ""
    begin = start + len(start_tag)
    end = text.find(end_tag, begin)
    if end == -1:
        end = len(text)
    return text[begin:end].strip()


# Markers that delimit the user and assistant turns in the decoded text.
user_start_tag = '<|start_header_id|>user<|end_header_id|>\n\n'
assistant_start_tag = '<|start_header_id|>assistant<|end_header_id|>\n\n'
eot_tag = '<|eot_id|>'

# Extract user input
original_input = _extract_between(response_string, user_start_tag, eot_tag)

# Extract model output
model_output = _extract_between(response_string, assistant_start_tag, eot_tag)

print(f"> {original_input}")
print(f"AI bot: {model_output}")

## Export to GGUF (for Ollama)

In [None]:
# This will create the GGUF file in your Colab file system
# The call passes "model" as the first argument (presumably the output
# directory/name — confirm against the Unsloth docs), the tokenizer, and
# "q4_k_m" as the quantization method.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")