In [0]:
%pip install xgboost
%pip install shap
dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import mlflow.xgboost
from pyspark.sql.functions import col, lit
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, f1_score, accuracy_score
import shap
import json

In [0]:
# Components of the three-level table name (catalog.schema.table) that the
# notebook reads its input data from.
CATALOGO, SCHEMA, TABELA_GOLD = "workspace", "churn_zero", "history_gold"

In [0]:
# Read the configured table through Spark and convert it to a pandas DataFrame.
tabela_completa = f"{CATALOGO}.{SCHEMA}.{TABELA_GOLD}"
df = spark.table(tabela_completa).toPandas()

# Target column, and the feature matrix with the target plus two other
# columns ('customerID', 'MonthlyCharges') removed.
y = df['Churn']
X = df.drop(columns=['Churn', 'customerID', 'MonthlyCharges'])
feature_names = list(X.columns)

# Train/test split: 20% held out, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
    stratify=y,
)

In [0]:

# Class-imbalance weight passed to XGBoost: ratio of negative to positive
# training labels. Counting by comparison (instead of value_counts()[label])
# yields 0 rather than a KeyError when a class is absent from y_train.
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
if pos == 0:
    raise ValueError(
        "y_train has no positive (Churn == 1) samples; cannot compute scale_pos_weight."
    )
scale_pos_weight = neg / pos

# Decision-threshold search settings: candidates from 0.10 to 0.70 (inclusive)
# in steps of 0.05; a candidate is eligible only if precision stays above the floor.
# NOTE(review): the code previously iterated over np.arange(0.5, 0.5, 0.05), which
# is an empty range (the search never ran), and used a precision floor of 0.8,
# while the accompanying comments documented 0.1-0.7 and 0.3. The documented
# values are used here - confirm they are the intended ones.
THRESHOLD_MIN, THRESHOLD_MAX, THRESHOLD_STEP = 0.10, 0.70, 0.05
MIN_PRECISION = 0.3
# linspace (rather than arange) gives an exact, endpoint-inclusive grid without
# floating-point drift deciding whether the last candidate is included.
num_thresholds = int(round((THRESHOLD_MAX - THRESHOLD_MIN) / THRESHOLD_STEP)) + 1
threshold_grid = np.round(np.linspace(THRESHOLD_MIN, THRESHOLD_MAX, num_thresholds), 2)

with mlflow.start_run(run_name="XGBoost_Recall_Optimized") as run:
    # 1. Hyper-parameters (logged to the run as-is).
    # NOTE(review): recent XGBoost releases appear to ignore 'use_label_encoder';
    # it is kept so the logged parameter set is unchanged - confirm and drop.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'use_label_encoder': False,
        'n_estimators': 150,
        'max_depth': 5,
        'learning_rate': 0.1,
        'scale_pos_weight': scale_pos_weight,
        'random_state': 30
    }
    mlflow.log_params(params)

    # 2. Train the model.
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # 3. Predicted probability of the positive class on the held-out set.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # 4. Find the threshold that maximizes recall while keeping precision above
    #    MIN_PRECISION. Falls back to 0.5 when no candidate is eligible.
    # NOTE(review): the threshold is tuned on the same test set the final metrics
    # are computed on, so those metrics are optimistic; consider tuning on a
    # separate validation split.
    best_threshold = 0.5
    best_recall = 0

    for threshold in threshold_grid:
        y_pred = (y_pred_proba >= threshold).astype(int)

        current_recall = recall_score(y_test, y_pred, zero_division=0)
        current_precision = precision_score(y_test, y_pred, zero_division=0)

        # Strict '>' keeps the first (lowest) threshold among equal-recall candidates.
        if current_recall > best_recall and current_precision > MIN_PRECISION:
            best_recall = current_recall
            best_threshold = float(threshold)

    # 5. Final evaluation at the selected threshold.
    y_pred_final = (y_pred_proba >= best_threshold).astype(int)

    final_recall = recall_score(y_test, y_pred_final, zero_division=0)
    final_precision = precision_score(y_test, y_pred_final, zero_division=0)
    final_auc = roc_auc_score(y_test, y_pred_proba)
    final_accuracy = accuracy_score(y_test, y_pred_final)
    final_f1_score = f1_score(y_test, y_pred_final, zero_division=0)

    # 6. Log every metric to MLflow.
    mlflow.log_metric("optimal_threshold", best_threshold)
    mlflow.log_metric("final_recall", final_recall)
    mlflow.log_metric("final_precision", final_precision)
    mlflow.log_metric("final_auc", final_auc)
    mlflow.log_metric("final_accuracy", final_accuracy)
    mlflow.log_metric("final_f1_score", final_f1_score)

    # 7. Log the model and the feature names.
    # The signature is inferred from real model inputs and the model's own
    # predictions, not from the ground-truth labels.
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="churn_model",
        signature=mlflow.models.signature.infer_signature(X_train, model.predict(X_train))
    )
    with open("feature_names.json", "w") as f:
        json.dump(feature_names, f)
    mlflow.log_artifact("feature_names.json")

    # Use the handle of the run opened above instead of looking up the last active run.
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/churn_model"

print(f"\nMelhor Threshold Encontrado (Priorizando Recall): {best_threshold:.2f}")
print(f"=====================================================")
print(f"Recall Final (Identificação de Churners): {final_recall:.2f}")
print(f"Acurácia (Geralmente Não é Prioridade): {final_accuracy:.2f}")
print(f"F1-Score (Equilíbrio entre P e R): {final_f1_score:.2f}")
print(f"Modelo registrado no MLflow com URI: {model_uri}")

In [0]:
import mlflow.xgboost
# Load a previously logged model by its run ID; this overwrites the `model_uri`
# set by the training cell.
# URI of the model without customer feedback:
#   runs:/410e46af42934f4fa16a0337818c3292/churn_model
LOADED_RUN_ID = "4fb0ccc97fb34d51964951e5a25b3dea"
model_uri = f"runs:/{LOADED_RUN_ID}/churn_model"

loaded_model = mlflow.xgboost.load_model(model_uri)

print("Modelo XGBoost carregado com sucesso!")

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

# 1. 'weight' importance of each feature (number of times the feature was used),
#    sorted from highest to lowest.
feature_importance = (
    pd.Series(loaded_model.get_booster().get_score(importance_type='weight'))
    .sort_values(ascending=False)
)

# 2. Size the figure from the number of bars: 0.3 in per feature, at least 10 in.
num_features = len(feature_importance)
figure_height = max(10, int(num_features * 0.3))

# 3. Draw the horizontal bar chart on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(12, figure_height))
feature_importance.plot(kind='barh', ax=ax)

# 4. Title and axis labels.
ax.set_title("Importância das Features (XGBoost)", fontsize=16)
ax.set_xlabel("Weight Score (Número de vezes que a feature foi usada)", fontsize=12)
ax.set_ylabel("Feature", fontsize=12)

fig.tight_layout()

print(f"Total de features com importância: {num_features}")

In [0]:
import mlflow
from mlflow.entities.model_registry import ModelVersion

# Model name in the three-level Unity Catalog format: <catalog>.<schema>.<model>.
CATALOG = "workspace"
SCHEMA = "churn_zero"
MODEL_NAME = "churn_risk_model_xgboost"
UC_MODEL_PATH = ".".join([CATALOG, SCHEMA, MODEL_NAME])

# Run artifact to register.
# NOTE(review): this run ID differs from the one loaded for the feature-importance
# plot earlier in the notebook - confirm it is the intended model.
model_uri = "runs:/eaa17e01c7e642428f93672463856230/churn_model"

# Register the logged model under the Unity Catalog name, tagged with the project.
registration_kwargs = {
    "model_uri": model_uri,
    "name": UC_MODEL_PATH,
    "tags": {"project": "TerraSignal_Churn"},
}
registered_model: ModelVersion = mlflow.register_model(**registration_kwargs)

# Report the registered name and version.
print("\n✅ Modelo registrado no Unity Catalog:")
print(f"Nome do Modelo: {registered_model.name}")
print(f"Versão: {registered_model.version}")
print("Você pode agora acessar e servir o modelo a partir do seu workspace.")