# Treino de modelos – Classificação de Contratação

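Este notebook treina dois classificadores (Logistic Regression e LightGBM) usando um split temporal de treino/teste, avalia ambos com ROC AUC e PR AUC e interpreta o modelo LightGBM com SHAP.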

In [None]:
!pip -q install lightgbm shap

In [None]:

from pathlib import Path
import json, sys
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, roc_curve, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb, shap, joblib

# Input / output data directories (relative to the notebook's working directory).
DATA_RAW = Path("../data/raw")
DATA_PROC = Path("../data/processed")
DATA_PROC.mkdir(parents=True, exist_ok=True)  # create the processed-data folder if missing

# Make the project root and its `src` folder importable, in that order.
sys.path.extend(str(Path(entry).resolve()) for entry in ("../", "../src"))
from src.data.prepare_data import build_dataset


In [None]:

# Build the modelling dataset from the raw files and write outputs to DATA_PROC.
# build_dataset is a project helper whose internals are not visible here; later cells use
# X / y as features / target, `meta` as a mapping with "num_cols", "cat_cols" and "id_cols",
# and `df_full` as the frame that holds the date column used for the temporal split.
# NOTE(review): the split cell indexes X and y with a mask built from df_full, so it
# assumes all three share the same index — confirm against build_dataset.
X, y, meta, df_full = build_dataset(DATA_RAW, out_dir=DATA_PROC)
# Last expression of the cell: display the metadata.
meta


In [None]:

# Temporal hold-out: rows dated at or before the 80th-percentile date go to train,
# strictly later rows go to test.
date_col = "ultima_atualizacao" if "ultima_atualizacao" in df_full.columns else "data_candidatura"
dates = pd.to_datetime(df_full[date_col], errors="coerce")

# errors="coerce" turns unparseable values into NaT. NaT compares False against `cut`
# in both directions, so such rows land in neither split — make that explicit instead
# of losing them silently.
n_undated = int(dates.isna().sum())
if n_undated == len(dates):
    raise ValueError(f"Column {date_col!r} has no parseable dates; cannot build a temporal split.")
if n_undated:
    print(f"Warning: {n_undated} rows have no valid {date_col!r} and are excluded from both train and test.")

cut = dates.quantile(0.8)
train_idx = dates <= cut; test_idx = dates > cut
if not test_idx.any():
    # Happens when every valid date is <= the 80th percentile (e.g. all dates identical).
    raise ValueError(f"Temporal split on {date_col!r} produced an empty test set (cut={cut}).")

# Boolean-mask selection with .loc aligns on the index.
# NOTE(review): assumes X and y share df_full's index — confirm against build_dataset.
X_train, X_test = X.loc[train_idx].reset_index(drop=True), X.loc[test_idx].reset_index(drop=True)
y_train, y_test = y.loc[train_idx].reset_index(drop=True), y.loc[test_idx].reset_index(drop=True)
print("Train:", X_train.shape, " Test:", X_test.shape, " cut:", cut)


In [None]:

num_cols, cat_cols, id_cols = meta["num_cols"], meta["cat_cols"], meta["id_cols"]


def make_preprocess():
    """Return a new, unfitted ColumnTransformer for the numeric and categorical columns.

    A fresh instance is built on every call so that each pipeline owns its own
    preprocessor: Pipeline.fit fits its steps in place, so sharing one transformer
    object between pipelines would let refitting one pipeline silently change the other.
    """
    return ColumnTransformer([
        # with_mean=False: scale without centering, which keeps sparse input sparse.
        ("num", StandardScaler(with_mean=False), num_cols),
        # Categories seen fewer than 10 times are grouped as infrequent; categories
        # unseen at fit time are ignored at transform time instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=10), cat_cols),
    ], remainder="drop", sparse_threshold=0.3)


# `preprocess` is kept as a module-level name; it is the instance owned by the LogReg pipeline.
preprocess = make_preprocess()
logreg = Pipeline([("prep", preprocess), ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))])
lgbm_clf = Pipeline([("prep", make_preprocess()), ("clf", lgb.LGBMClassifier(n_estimators=600, learning_rate=0.03, subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, random_state=42, class_weight="balanced"))])


In [None]:

def evaluate(model, Xtr, ytr, Xte, yte, name="model"):
    """Fit ``model`` on the training split, report metrics and pick an F1-optimal threshold.

    The model is fitted in place on ``(Xtr, ytr)``. ROC AUC and PR AUC (average precision)
    are printed for both splits, the decision threshold that maximises F1 on
    ``(Xte, yte)`` is selected, a classification report at that threshold is printed,
    and ROC and precision-recall curves for the evaluation split are plotted.

    Note: the threshold is tuned on the same split it is reported on, so the
    classification report is optimistic for that threshold.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    Xtr, ytr : training features and binary target.
    Xte, yte : evaluation features and binary target.
    name : label used in printed output and plot titles.

    Returns
    -------
    dict with keys ``"model"`` (the fitted estimator), ``"best_thr"`` (float) and
    ``"metrics"`` (train/test ROC AUC and PR AUC as floats).
    """
    model.fit(Xtr, ytr)
    p_tr = model.predict_proba(Xtr)[:, 1]
    p_te = model.predict_proba(Xte)[:, 1]

    roc_tr, roc_te = roc_auc_score(ytr, p_tr), roc_auc_score(yte, p_te)
    pr_tr, pr_te = average_precision_score(ytr, p_tr), average_precision_score(yte, p_te)
    print(f"{name} — ROC_AUC train={roc_tr:.3f} test={roc_te:.3f} | PR_AUC train={pr_tr:.3f} test={pr_te:.3f}")

    prec, rec, thr = precision_recall_curve(yte, p_te)
    f1s = 2*prec*rec/(prec+rec+1e-9)  # epsilon avoids 0/0 when precision and recall are both 0
    if len(thr) > 0:
        # precision[i] / recall[i] describe predictions with score >= thr[i]; the final
        # (precision=1, recall=0) point has no threshold, so it is excluded from the search.
        best_idx = int(f1s[:-1].argmax())
        best_thr = thr[best_idx]
    else:
        best_thr = 0.5
    y_pred = (p_te >= best_thr).astype(int)
    print(f"Best threshold (F1) = {best_thr:.3f}")
    print(classification_report(yte, y_pred, digits=3))

    fpr, tpr, _ = roc_curve(yte, p_te)
    plt.figure(figsize=(5, 4))
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], '--')  # chance diagonal
    plt.title(f"ROC — {name}")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.grid(True)
    plt.show()

    plt.figure(figsize=(5, 4))
    plt.plot(rec, prec)
    plt.title(f"PR — {name}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.grid(True)
    plt.show()

    metrics = {
        "roc_auc_train": float(roc_tr), "roc_auc_test": float(roc_te),
        "pr_auc_train": float(pr_tr), "pr_auc_test": float(pr_te),
    }
    return {"model": model, "best_thr": float(best_thr), "metrics": metrics}

# Fit and evaluate both pipelines on the same temporal split (LogReg first, then LightGBM).
res_lr = evaluate(
    model=logreg, Xtr=X_train, ytr=y_train, Xte=X_test, yte=y_test, name="LogReg",
)
res_lgb = evaluate(
    model=lgbm_clf, Xtr=X_train, ytr=y_train, Xte=X_test, yte=y_test, name="LightGBM",
)


In [None]:

# SHAP explanation of the fitted LightGBM pipeline on the test split.
prep = lgbm_clf.named_steps["prep"]; clf = lgbm_clf.named_steps["clf"]
Xte_trans = prep.transform(X_test)

# The ColumnTransformer may return a scipy sparse matrix, and the transformed matrix
# carries no column names. Densify it and recover the names so the summary plot can
# render and label features properly.
# NOTE(review): densifying can be memory-heavy for very wide one-hot encodings.
feature_names = list(prep.get_feature_names_out())
Xte_dense = Xte_trans.toarray() if hasattr(Xte_trans, "toarray") else np.asarray(Xte_trans)

explainer = shap.TreeExplainer(clf); shap_values = explainer.shap_values(Xte_dense)

# Depending on the shap version, a binary classifier yields either one array,
# a per-class list, or a 3-D array with a trailing class axis. Keep the positive class.
if isinstance(shap_values, list):
    shap_values_pos = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_pos = shap_values[:, :, 1]
else:
    shap_values_pos = shap_values

shap.summary_plot(shap_values_pos, Xte_dense, feature_names=feature_names, max_display=20, show=False); plt.show()

from pathlib import Path
import json, joblib
# Persist both fitted pipelines plus the metadata needed to score new data.
ARTIF_DIR = Path("../models"); ARTIF_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(logreg, ARTIF_DIR / "logreg_pipeline.joblib")
joblib.dump(lgbm_clf, ARTIF_DIR / "lgbm_pipeline.joblib")

# Both pipelines are saved, so store the tuned decision threshold of each one
# (the existing "threshold_lgbm" key is kept unchanged for current consumers).
artifact_metadata = {
    "id_cols": id_cols,
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "threshold_lgbm": res_lgb["best_thr"],
    "threshold_logreg": res_lr["best_thr"],
}
with open(ARTIF_DIR / "metadata.json","w",encoding="utf-8") as f:
    json.dump(artifact_metadata, f, ensure_ascii=False, indent=2)
print("Artefatos salvos em ../models")
