# Fine-tuning Llama 3.1 8B for IT Ticket Classification

Fine-tuning do **Meta-Llama-3.1-8B-Instruct** para classificação de tickets. Dados: `dataset_with_id.csv` **excluindo** os IDs em `sample_200.csv` (mesmo protocolo de `labs/architecture-comparison.ipynb`). O modelo base é o mesmo do `./models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf`; o treino usa o modelo completo via HuggingFace (LoRA); ao final pode-se exportar para GGUF.

Contents:
1. Setup e carregamento (excl. sample_200)
2. Split estratificado e dataset de instrução
3. LoRA fine-tuning (Transformers + PEFT)
4. Salvamento do adapter e nota sobre export para GGUF

# 1. Setup e carregamento dos dados (excl. sample_200)

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Project root: the current directory when it contains `data/`, otherwise its parent.
ROOT = Path(".").resolve()
if not ROOT.joinpath("data").exists():
    ROOT = ROOT.parent
dataset_path = ROOT.joinpath("data", "processed", "dataset_with_id.csv")
sample_path = ROOT.joinpath("data", "processed", "sample_200.csv")

# Read the full dataset and collect the ids listed in sample_200 (as strings).
df_full = pd.read_csv(dataset_path)
sample_ids = {*pd.read_csv(sample_path)["id"].astype(str)}

# Drop every row whose id (compared as a string) appears in sample_200.
in_sample = df_full["id"].astype(str).isin(sample_ids)
df = df_full.loc[~in_sample].copy().reset_index(drop=True)
tickets_df = df
print(f"Loaded {len(tickets_df)} rows (excluding {len(sample_ids)} sample_200 ids)")
tickets_df.head()

In [None]:
# Stratified split (architecture-comparison protocol): hold out 5% as test,
# then divide the remaining 95% into 80% train / 20% validation.
RANDOM_STATE = 28
label_col = "Topic_group"
tickets_df_train_val, tickets_df_test = train_test_split(
    tickets_df,
    test_size=0.05,
    stratify=tickets_df[label_col],
    random_state=RANDOM_STATE,
)
tickets_df_train, tickets_df_val = train_test_split(
    tickets_df_train_val,
    test_size=0.2,
    stratify=tickets_df_train_val[label_col],
    random_state=RANDOM_STATE,
)
print("Train:", len(tickets_df_train), "Val:", len(tickets_df_val), "Test:", len(tickets_df_test))
# Sorted list of distinct labels found in the (sample_200-free) dataset.
classes = tickets_df[label_col].unique().tolist()
classes.sort()
print("Classes:", classes)

# 2. Dataset de instrução para classificação

Formato: instrução com o texto do ticket → resposta = categoria (exatamente um dos nomes de classe).

In [None]:
# Prompt template: `{classes}` receives the comma-separated category names and
# `{text}` the (truncated) ticket body.
CLASSIFICATION_INSTRUCTION = (
    "Classify the following IT support ticket into exactly one of these categories: {classes}. "
    "Reply with only the category name.\n\nTicket:\n{text}"
)

def build_instruction(row, text_col="Document", label_col="Topic_group", classes_list=None):
    """Build the (instruction, label) pair for a single ticket row.

    Args:
        row: Mapping-like record (e.g. a pandas row or a dict) indexed by column name.
        text_col: Column holding the ticket text.
        label_col: Column holding the target category.
        classes_list: Category names to list in the prompt; when empty or None,
            the module-level `classes` list is used.

    Returns:
        Tuple `(instruction, label)` where `instruction` is the filled-in
        `CLASSIFICATION_INSTRUCTION` and `label` is `row[label_col]` unchanged.
    """
    if not classes_list:
        classes_list = classes
    classes_str = ", ".join(classes_list)

    raw_text = row[text_col]
    # A missing CSV cell reaches us as float NaN, which is truthy and has no
    # .strip(); treat missing (and falsy) values as an empty ticket body.
    if pd.isna(raw_text) or not raw_text:
        text = ""
    else:
        # str() guards against non-string cells; cap the body at 2000 characters.
        text = str(raw_text).strip()[:2000]

    instruction = CLASSIFICATION_INSTRUCTION.format(classes=classes_str, text=text)
    return instruction, row[label_col]

def df_to_instruction_data(df, text_col="Document", label_col="Topic_group"):
    """Convert a ticket DataFrame into an instruction/response DataFrame.

    Each row of `df` is passed to `build_instruction`; the resulting prompt goes
    into the "instruction" column and the label into the "response" column.
    """
    pairs = [
        build_instruction(record, text_col=text_col, label_col=label_col)
        for _, record in df.iterrows()
    ]
    return pd.DataFrame(
        {
            "instruction": [prompt for prompt, _ in pairs],
            "response": [target for _, target in pairs],
        }
    )

In [None]:
# Turn the train and validation splits into instruction/response frames.
train_inst, val_inst = (
    df_to_instruction_data(split) for split in (tickets_df_train, tickets_df_val)
)
print("Train samples:", len(train_inst))
print(train_inst.head(2))

# 3. LoRA fine-tuning

Requer: `pip install transformers peft datasets accelerate` (e, para 4-bit, `bitsandbytes`). Modelo base: **meta-llama/Meta-Llama-3.1-8B-Instruct** (mesma família do GGUF em `./models/`).

In [None]:
# Uncomment to install the fine-tuning dependencies:
# !pip install transformers peft datasets accelerate
# # Optional (QLoRA): !pip install bitsandbytes

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Hugging Face model id plus the output locations for checkpoints and the adapter.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
OUTPUT_DIR = str(ROOT.joinpath("outputs", "artifacts", "llama_finetune_classification"))
ADAPTER_SAVE = str(ROOT.joinpath("outputs", "artifacts", "llama_classification_adapter"))

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Pad on the right, reusing the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def format_chat(instruction: str, response: str) -> list[dict]:
    """Return a two-turn chat: the user instruction followed by the assistant reply."""
    user_turn = dict(role="user", content=instruction)
    assistant_turn = dict(role="assistant", content=response)
    return [user_turn, assistant_turn]

def tokenize_with_labels(examples, max_length=512):
    """Tokenize a batch of instruction/response pairs and build loss labels.

    Each pair is rendered with the tokenizer's chat template, tokenized, and
    truncated/padded to `max_length`. Labels equal the input ids on the
    assistant response and are -100 on the prompt and on padding, so only the
    response contributes to the loss.

    Args:
        examples: Batch mapping with parallel "instruction" and "response" lists.
        max_length: Fixed sequence length after truncation and padding.

    Returns:
        The tokenizer output (input_ids, attention_mask, ...) with an added
        "labels" entry: one list of ints per example.

    Note:
        Truncation cuts from the right, so an example whose prompt alone reaches
        `max_length` ends up with every label set to -100.
    """
    full_texts, prompt_lengths = [], []
    for instr, resp in zip(examples["instruction"], examples["response"]):
        messages = format_chat(instr, resp)
        full_texts.append(
            tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
        )
        # The prompt is the user turn plus the generation header only. Rendering
        # an empty assistant turn as well would append an extra end-of-turn and
        # header, overstating the prompt length and masking the real answer.
        prompt_only = tokenizer.apply_chat_template(
            messages[:-1], tokenize=False, add_generation_prompt=True
        )
        # The rendered template text already contains the BOS token, so special
        # tokens must not be added again when tokenizing it.
        prompt_ids = tokenizer(
            prompt_only, add_special_tokens=False, return_tensors=None
        )["input_ids"]
        prompt_lengths.append(len(prompt_ids))

    out = tokenizer(
        full_texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        add_special_tokens=False,
        return_tensors=None,
    )

    labels = []
    for input_ids, attention, plen in zip(
        out["input_ids"], out["attention_mask"], prompt_lengths
    ):
        # Keep a token as a label only when it lies past the prompt AND is a real
        # (attended) token; masking by position keeps a genuine end-of-sequence
        # token trainable even though the pad token shares its id.
        labels.append(
            [
                token if (pos >= plen and attended) else -100
                for pos, (token, attended) in enumerate(zip(input_ids, attention))
            ]
        )
    out["labels"] = labels
    return out

MAX_SEQ_LENGTH = 512


def _tokenize_frame(frame):
    """Wrap an instruction/response DataFrame in a `Dataset` and tokenize it in batches."""
    return Dataset.from_pandas(frame).map(
        lambda batch: tokenize_with_labels(batch, max_length=MAX_SEQ_LENGTH),
        batched=True,
        # Drop the raw text columns so only the tokenizer outputs remain.
        remove_columns=frame.columns.tolist(),
    )


train_ds = _tokenize_frame(train_inst)
val_ds = _tokenize_frame(val_inst)

In [None]:
# Load the base model: bfloat16 with device_map="auto" when CUDA is available,
# float32 with no device map otherwise.
cuda_available = torch.cuda.is_available()
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto" if cuda_available else None,
    torch_dtype=torch.bfloat16 if cuda_available else torch.float32,
)

# LoRA adapter on the four attention projections (rank 16, alpha 32).
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# bf16 training is enabled only when a CUDA device is present.
use_bf16 = torch.cuda.is_available()
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.03,
    bf16=use_bf16,
    # Batching: 2 examples per device, accumulated over 8 steps.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Logging, evaluation and checkpointing cadence (in steps).
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
)

def data_collator_fn(features):
    """Collate tokenized examples into a batch of tensors for the Trainer.

    Args:
        features: List of examples, each with "input_ids", "attention_mask" and
            "labels" lists. The label lists must all have the same length as the
            padded inputs (they are stacked into one tensor).

    Returns:
        The padded batch from `tokenizer.pad` with a "labels" tensor (dtype long)
        in which every padding position is set to -100.
    """
    batch = tokenizer.pad(
        {
            "input_ids": [f["input_ids"] for f in features],
            "attention_mask": [f["attention_mask"] for f in features],
        },
        padding=True,
        return_tensors="pt",
    )
    labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
    # Mask padding by position (attention_mask == 0) instead of by token id: the
    # pad token is aliased to EOS, so an id-based mask would also hide any real
    # end-of-sequence token in the response from the loss.
    batch["labels"] = labels.masked_fill(batch["attention_mask"] == 0, -100)
    return batch

# Assemble the Trainer from the PEFT model, the tokenized splits and the custom collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
import os
# Make sure the target directory exists, then write the model and tokenizer into it.
Path(ADAPTER_SAVE).mkdir(parents=True, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(ADAPTER_SAVE)
print("Adapter and tokenizer saved to:", ADAPTER_SAVE)