# IT Ticket Classifier — DHAUZ Challenge

Notebook executável: amostragem da base, preparo, fluxo LangGraph, inferência e métricas.

In [1]:
import os
import sys
import numpy as np
from pathlib import Path

# Resolve the current working directory and put it at the front of sys.path so
# the project-local imports used by this notebook (`config`, `src.*`) resolve.
ROOT = Path(".").resolve()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Load variables from the project's .env file; this runs before `import config`.
# NOTE(review): presumably config reads those variables at import time - confirm.
from dotenv import load_dotenv
load_dotenv(ROOT / ".env")

# Seed NumPy's global random generator with the project-wide seed.
import config
np.random.seed(config.SEED)

## 0. Baixar dataset do Kaggle (obrigatório)

Execute esta célula uma vez. É necessário ter `~/.kaggle/kaggle.json` configurado e ter aceitado as regras do dataset no site do Kaggle.

In [2]:
from src.prep import download_from_kaggle

# Download the dataset through the project's Kaggle helper and print the value
# it returns (reported to the user as the dataset location).
path = download_from_kaggle()
print(f"Dataset em: {path}")

You must authenticate before you can call the Kaggle API.
Follow the instructions to authenticate at: https://github.com/Kaggle/kaggle-cli/blob/main/docs/README.md#authentication


RuntimeError: Falha ao baixar o dataset do Kaggle. Configure ~/.kaggle/kaggle.json com suas credenciais e aceite as regras do dataset em https://www.kaggle.com/datasets/aniketg11/supportticketsclassification

## 1. Carregar dataset e identificar coluna de rótulo

In [None]:
from src.prep import get_text_and_label_columns, load_dataset

# Load the ticket table and let the prep module decide which columns carry the
# free text and which single column carries the label.
df = load_dataset()
text_cols, label_col = get_text_and_label_columns(df)

print("Colunas de texto:", text_cols)
print("Coluna de rótulo:", label_col)

# Labels are shown as strings, de-duplicated and sorted.
distinct_labels = set(df[label_col].astype(str))
print("Classes:", sorted(distinct_labels))
print("Shape:", df.shape)

## 2. Amostragem estratificada (200 tickets)

In [None]:
from src.prep import stratified_sample

# Never ask for more rows than the dataset actually has.
n_sample = min(config.SAMPLE_SIZE, len(df))
df_sample = stratified_sample(df, label_col, n=n_sample)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
config.DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# NOTE(review): the file name is fixed to "sample_200.csv" even when n_sample
# differs from 200; kept unchanged so existing consumers still find the file.
df_sample.to_csv(config.DATA_PROCESSED / "sample_200.csv", index=False)

print(f"Amostra: {len(df_sample)} tickets")
print(df_sample[label_col].value_counts())

## 3. Montar texto do ticket e preparar RAG (embeddings + FAISS)


In [None]:
def make_ticket_text(row, text_cols):
    """Join the text fields of one ticket row into a single string.

    Args:
        row: One DataFrame row as a pandas Series (as yielded by
            ``DataFrame.iterrows``).
        text_cols: Column names to concatenate, in the order given. Names that
            are not present in ``row.index`` are skipped.

    Returns:
        The usable field values joined by single spaces, or ``""`` when no
        field has content. Missing cells (None/NaN/NA) and blank or
        whitespace-only values are left out.
    """
    # Local import keeps the cell self-contained; the notebook never imports
    # pandas at top level even though `row` is a pandas object.
    import pandas as pd

    parts = []
    for col in text_cols:
        if col not in row.index:
            continue
        value = row[col]
        # A missing cell would otherwise be stringified to "nan"/"None" and end
        # up inside the ticket text. The is_scalar guard keeps pd.isna from
        # returning an array for non-scalar cell values.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value)
        if text.strip():
            parts.append(text)
    return " ".join(parts)

from src.rag import Embedder, VectorStore

# One text per sampled ticket, in the sample's row order.
texts = [make_ticket_text(ticket, text_cols) for _, ticket in df_sample.iterrows()]

# Labels are handled as strings; the class list is their sorted distinct set.
label_series = df_sample[label_col].astype(str)
labels = label_series.tolist()
classes = sorted(set(labels))

# Build the store from the ticket texts and their labels.
embedder = Embedder()
store = VectorStore(embedder)
store.build(texts, labels)

print("Vector store built.")
print("Classes:", classes)

## 4. Construir o grafo LangGraph e rodar a inferência em até 3 exemplos

In [None]:
from src.graph import build_pipeline, run_pipeline

# build_pipeline returns four values; only the first (the compiled graph) is
# used in this notebook.
compiled, _, _, _ = build_pipeline(store, classes)

# Smoke-test the pipeline on the first tickets of the sample (at most three).
for i, ticket_text in enumerate(texts[:3]):
    out = run_pipeline(compiled, ticket_text, classes)
    print(f"--- Ticket {i+1} ---")
    print("Texto (trecho):", ticket_text[:150], "...")
    print("Saída:", out)
    print()

## 5. Rodar o pipeline em toda a amostra (até 200 tickets) e salvar os resultados

In [None]:
from src.logging_utils import log_result

# Remove any results file left by a previous run so reruns start clean.
# NOTE(review): log_result is called without a path; presumably it writes to
# this same file - confirm against src.logging_utils.
results_path = config.OUTPUTS / "results_sample.jsonl"
if results_path.exists():
    results_path.unlink()

# Ground-truth labels normalised exactly as they are everywhere else in the
# notebook (astype(str)), so the logged "true" and "pred" values share a type.
true_labels = df_sample[label_col].astype(str).tolist()

predictions = []
for (i, row), true_label in zip(df_sample.iterrows(), true_labels):
    text = make_ticket_text(row, text_cols)
    out = run_pipeline(compiled, text, classes)
    pred = out["classe"]
    predictions.append(pred)
    log_result(
        {
            # `i` is the DataFrame index label of the ticket, not its position.
            "ticket_index": int(i),
            "true": true_label,
            "pred": pred,
            "justificativa": out["justificativa"],
        }
    )

print(f"Salvos {len(predictions)} resultados em {results_path}")

## 6. Métricas e relatório

In [None]:
from src.metrics import compute_metrics, save_metrics_report

# Compare string-normalised ground truth against the pipeline predictions.
y_true = df_sample[label_col].astype(str).tolist()
metrics = compute_metrics(y_true, predictions, labels=classes)
save_metrics_report(metrics)

print("Accuracy:", metrics["accuracy"])
print("F1 macro:", metrics["f1_macro"])
print("F1 weighted:", metrics["f1_weighted"])
print("\nClassification report:")
# Dict and non-dict report entries were printed by two identical branches;
# a single print produces the same output without the dead conditional.
for k, v in metrics["classification_report"].items():
    print(k, v)

## 7. Exemplo de saída JSON

In [None]:
import json

# Classify the first sampled ticket again and show the pipeline output as
# indented JSON, keeping non-ASCII characters readable.
ex = run_pipeline(compiled, texts[0], classes)
pretty = json.dumps(ex, ensure_ascii=False, indent=2)
print(pretty)