# IT Ticket Classifier — DHAUZ Challenge

Notebook executável: carrega o dataset, amostra 200 tickets, monta o RAG (embeddings + FAISS), executa o fluxo LangGraph em exemplos e na amostra, calcula métricas.

In [1]:
import os
import sys
import numpy as np
from pathlib import Path

# Absolute path of the current working directory, used as the project root.
# NOTE(review): this only points at the project when the kernel is started
# from the repository root — confirm how the notebook is meant to be launched.
ROOT = Path(".").resolve()
# Put ROOT at the front of sys.path (once) so the imports below can find
# project modules; presumably `config` and the `src` package live directly
# under ROOT — verify.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Load variables from ROOT/.env into the process environment. This runs
# before `import config`; presumably config reads some of them at import
# time — TODO confirm.
from dotenv import load_dotenv
load_dotenv(ROOT / ".env")

# Imported after the sys.path and .env setup above on purpose.
import config
# Seeds NumPy's global RNG only; other RNG sources are not seeded here.
np.random.seed(config.SEED)

## 0. Baixar dataset do Kaggle (só se ainda não tiver o CSV em data/raw)

In [2]:
from src.prep import download_from_kaggle

# Fetch the dataset through the project helper and report the returned path.
# NOTE(review): the section heading says the download is skipped when the CSV
# already exists in data/raw; that check is not visible in this cell and is
# presumably done inside download_from_kaggle — confirm.
path = download_from_kaggle()
print(f"Dataset em: {path}")

Dataset em: /Users/moises/Documents/ticket-classifier:/data/raw/all_tickets_processed_improved_v3.csv


## 1. Carregar dataset, amostrar 200 tickets e montar o vector store (RAG)

In [3]:
import pandas as pd
from src.prep import load_dataset, get_text_and_label_columns, stratified_sample
from src.rag import VectorStore

def make_ticket_text(row, text_cols):
    """Join the text fields of one ticket into a single space-separated string.

    Args:
        row: A pandas Series holding one ticket (column name -> value).
        text_cols: Column names to concatenate, in order. Names that are not
            present in ``row.index`` are ignored.

    Returns:
        The usable values, converted with ``str`` and joined by one space.
        Missing values (``None``, ``NaN``, ``pd.NA``) and blank or
        whitespace-only values are skipped. Returns ``""`` when nothing
        usable remains.
    """
    parts = []
    for col in text_cols:
        if col not in row.index:
            continue
        value = row[col]
        # A missing cell must not be stringified: str(float("nan")) is "nan"
        # and str(None) is "None", and either token would end up in the text
        # that is embedded and classified. pd.isna is only applied to scalars
        # so that a non-scalar value cannot produce an ambiguous truth value.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value)
        if text.strip():
            parts.append(text)
    return " ".join(parts)

# --- Load the full dataset and discover which columns hold text / label ---
df = load_dataset()
text_cols, label_col = get_text_and_label_columns(df)

# --- Draw the sample (never larger than the dataset) and persist it ---
n_sample = min(config.SAMPLE_SIZE, len(df))
df_sample = stratified_sample(df, label_col, n=n_sample)
sample_csv = config.DATA_PROCESSED / "sample_200.csv"
df_sample.to_csv(sample_csv, index=False)

# --- One text and one string label per sampled ticket, in sample order ---
texts = [
    make_ticket_text(ticket_row, text_cols)
    for _idx, ticket_row in df_sample.iterrows()
]
labels = list(df_sample[label_col].astype(str))
classes = sorted(set(labels))

# --- Vector store: reuse the saved index when present, else build and save ---
# NOTE(review): the cache check looks only at whether index.faiss exists, so
# an index saved for a different sample would be reused as-is — confirm that
# is acceptable or clear the artifacts directory when the sample changes.
# NOTE(review): the index is built from the same sampled tickets that are
# classified later in this notebook; if retrieval can return the query ticket
# itself the reported metrics would be optimistic — confirm how the pipeline
# handles self-matches.
artifact_path = config.ARTIFACTS_DIR
index_file = artifact_path / "index.faiss"
if not index_file.exists():
    vc = VectorStore()
    store = vc.build(texts, labels)
    store.save(artifact_path)
    status = "Vector store construído e salvo em:"
else:
    store = VectorStore.load(artifact_path)
    status = "Vector store carregado de:"
print(status, artifact_path)

print("Classes:", classes)
print("Amostra:", len(df_sample), "tickets")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Vector store carregado de: /Users/moises/Documents/ticket-classifier:/outputs/artifacts
Classes: ['Access', 'Administrative rights', 'HR Support', 'Hardware', 'Internal Project', 'Miscellaneous', 'Purchase', 'Storage']
Amostra: 200 tickets


In [4]:
# Show how many sampled tickets fall under each value of the label column.
print("Distribuição por classe:")
print(df_sample[label_col].value_counts())

Distribuição por classe:
Topic_group
Hardware                 25
Access                   25
Miscellaneous            25
HR Support               25
Purchase                 25
Administrative rights    25
Storage                  25
Internal Project         25
Name: count, dtype: int64


## 2. Inferência em exemplos


In [5]:
# `store`, `classes` and `texts` were created in the section 1 cell above.
from src.graph import build_pipeline, run_pipeline

# build_pipeline returns four values; only the first one is kept here.
compiled, _, _, _ = build_pipeline(store, classes)

# Run the pipeline on the first (at most) three sampled tickets and print a
# 150-character excerpt of each text next to the pipeline output.
for i, ticket_text in enumerate(texts[:3]):
    out = run_pipeline(compiled, ticket_text, classes)
    excerpt = ticket_text[:150]
    print(f"--- Ticket {i+1} ---")
    print("Texto (trecho):", excerpt, "...")
    print("Saída:", out)
    print()

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
{"event": "llm_usage", "source": "knn", "model": "n/a", "input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "requests": 0, "confidence": 0.8}
{"event": "llm_usage", "source": "knn", "model": "n/a", "input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "requests": 0, "confidence": 0.8}


--- Ticket 1 ---
Texto (trecho): monitor request vulcan friday october pm hello please log each user monitor allocation user vulcan thank weekend engineer friday october vulcan parte  ...
Saída: {'classe': 'Purchase', 'justificativa': 'Classe atribuída: Purchase. Termos do ticket: monitor request vulcan friday october pm hello please log each user monitor allocation user vulcan thank weekend engineer friday october vulcan parte ne va respective va.'}

--- Ticket 2 ---
Texto (trecho): stopped when docker start was executed sent wednesday february hi we having same problem we had few days ago server was stopped when executed docker s ...
Saída: {'classe': 'Hardware', 'justificativa': 'Classe atribuída: Hardware. Termos do ticket: stopped when docker start was executed sent wednesday february hi we having same problem we had few days ago server was stopped when executed docker start looks like if machine....'}



{"event": "llm_usage", "source": "classification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 175, "output_tokens": 50, "total_tokens": 225, "requests": 1}
{"event": "llm_usage", "source": "justification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 421, "output_tokens": 146, "total_tokens": 567, "requests": 1}


--- Ticket 3 ---
Texto (trecho): issue re access through for hello still work attached log error received during installation restarted machine disconnected tethered phone can connect ...
Saída: {'classe': 'Access', 'justificativa': "O ticket menciona 'issue re access through', 'can connect' e problemas de 'disconnected tethered phone', indicando dificuldades de acesso à rede ou ao serviço."}



## 3. Rodar na amostra de 200 e salvar resultados

In [6]:
from src.logging_utils import log_result

results_path = config.OUTPUTS / "results_sample.jsonl"
# Drop the file left by a previous run before logging new results.
# NOTE(review): log_result is not given this path; presumably it appends to
# the same file — confirm.
if results_path.exists():
    results_path.unlink()

# Classify every sampled ticket, keeping the predicted class in sample order
# and logging one record per ticket.
predictions = []
for ticket_index, ticket_row in df_sample.iterrows():
    out = run_pipeline(compiled, make_ticket_text(ticket_row, text_cols), classes)
    predicted = out["classe"]
    predictions.append(predicted)
    log_result(
        {
            "ticket_index": int(ticket_index),
            "true": ticket_row[label_col],
            "pred": predicted,
            "justificativa": out["justificativa"],
        }
    )

print(f"Salvos {len(predictions)} resultados em {results_path}")

{"event": "llm_usage", "source": "justification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 433, "output_tokens": 140, "total_tokens": 573, "requests": 1}
{"event": "llm_usage", "source": "classification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 175, "output_tokens": 50, "total_tokens": 225, "requests": 1}
{"event": "llm_usage", "source": "classification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 228, "output_tokens": 50, "total_tokens": 278, "requests": 1}
{"event": "llm_usage", "source": "classification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 356, "output_tokens": 50, "total_tokens": 406, "requests": 1}
{"event": "llm_usage", "source": "justification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 569, "output_tokens": 128, "total_tokens": 697, "requests": 1}
{"event": "llm_usage", "source": "classification", "model": "nvidia/nemotron-3-nano-30b-a3b:free", "input_tokens": 

## 4. Métricas e relatório

In [None]:
from src.metrics import compute_metrics, save_metrics_report

# Ground-truth labels as strings, in the same order the sample was iterated
# when `predictions` was filled.
y_true = df_sample[label_col].astype(str).tolist()

# Fail early with a clear message when the batch-run cell was skipped or
# interrupted part-way: metrics over misaligned lists would be meaningless.
if len(predictions) != len(y_true):
    raise ValueError(
        f"predictions has {len(predictions)} items but the sample has "
        f"{len(y_true)} tickets; re-run the batch inference cell first."
    )

metrics = compute_metrics(y_true, predictions, labels=classes)
save_metrics_report(metrics)

print("Accuracy:", metrics["accuracy"])
print("F1 macro:", metrics["f1_macro"])
print("F1 weighted:", metrics["f1_weighted"])
print("\nClassification report:")
# One line per report entry. The former isinstance(v, dict) check had two
# identical branches, so it is collapsed into a single print.
for k, v in metrics["classification_report"].items():
    print(k, v)

## 5. Exemplo de saída JSON

In [None]:
# Run the pipeline once more on the first sampled ticket and pretty-print the
# result as JSON. ensure_ascii=False keeps non-ASCII characters as-is instead
# of emitting \uXXXX escapes.
import json
ex = run_pipeline(compiled, texts[0], classes)
print(json.dumps(ex, indent=2, ensure_ascii=False))

---

In [None]:
# NOTE(review): this cell is an exact copy of the cell under
# "## 5. Exemplo de saída JSON" and repeats the same pipeline call on
# texts[0]; it looks like a leftover — consider deleting it.
import json
ex = run_pipeline(compiled, texts[0], classes)
print(json.dumps(ex, indent=2, ensure_ascii=False))