# Projeto de Portfólio: Churn Prediction End-to-End

Este notebook demonstra um fluxo completo de ciência de dados para classificação de churn com boas práticas de ML: preparação, validação de dados, EDA, pipelines, validação cruzada, tuning, interpretabilidade e serving via API.

## 1) Importações e configuração de ambiente

Vamos importar bibliotecas, fixar seeds e inspecionar versões para reprodutibilidade.

In [None]:
import os, sys, json, random, pathlib, warnings
from datetime import datetime
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score, log_loss, f1_score
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

try:
    import xgboost as xgb
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except Exception:
    XGB_AVAILABLE = False

import optuna
import shap
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import pandera as pa
from pandera import Column, Check

# Pin every random number generator used here to a single seed so that
# reruns of the notebook produce the same results.
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Record the interpreter and library versions used for this run.
print("Versions:")
for _label, _version in (
    ("python", sys.version),
    ("numpy", np.__version__),
    ("pandas", pd.__version__),
    ("sklearn", sklearn.__version__),
    ("optuna", optuna.__version__),
    ("shap", shap.__version__),
):
    print(f"{_label}:", _version)


## 2) Download e carregamento do dataset público (Telco Churn)

Vamos tentar baixar um CSV público do Telco Customer Churn. Caso não haja internet ou a URL esteja indisponível, usamos o arquivo local `../data/sample_churn.csv`.

In [None]:
from pathlib import Path
import pandas as pd

# Directory reserved for raw data; created up front so later steps can rely on it.
RAW_DIR = Path("../data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

PUBLIC_URLS = [
    # IBM Telco Churn (public mirrors; they may go offline)
    "https://raw.githubusercontent.com/blastchar/telco-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv",
]

# Local sample used when none of the public URLs can be read.
fallback = Path("../data/sample_churn.csv")

# Start from an explicit "nothing loaded" state: with an empty PUBLIC_URLS the
# loop body never runs, and the check below must not hit an undefined name.
df = None
src = None

for url in PUBLIC_URLS:
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # Best-effort download: network, HTTP and parse errors all mean
        # "try the next source", but the reason is reported, not hidden.
        print(f"Falha ao ler {url}: {e}")
        continue
    src = url
    break

if df is None:
    print("Sem internet ou URL indisponível. Usando fallback sample_churn.csv")
    df = pd.read_csv(fallback)
    src = str(fallback)

print("Fonte:", src)
print(df.head())


## 3) Limpeza e tipagem de dados

Normaliza `Churn` em 0/1, converte numéricos e remove colunas irrelevantes como `customerID` (quando existir).

In [None]:
# Keep an untouched snapshot of the data exactly as it was loaded.
df_raw = df.copy()

# Normalize the target to 0/1 (accepts 'Yes'/'No' strings and booleans).
_target_map = {'Yes': 1, 'No': 0, True: 1, False: 0}
if 'Churn' in df.columns:
    df['Churn'] = df['Churn'].replace(_target_map).astype(int)
elif 'churn' in df.columns:
    df['churn'] = df['churn'].replace(_target_map).astype(int)

# Drop identifier columns: they carry no predictive signal.
_id_cols = [c for c in ['customerID', 'CustomerID', 'customer_id', 'id'] if c in df.columns]
df = df.drop(columns=_id_cols)

# Convert numeric columns that arrived as text.
# A column is converted when every non-blank entry parses as a number; blank
# or whitespace-only cells become NaN. The previous all-or-nothing conversion
# left such columns as text because a single blank cell aborted the parse.
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    raw = df[col]
    is_blank = raw.isna() | raw.astype(str).str.strip().eq('')
    parsed = pd.to_numeric(raw.mask(is_blank), errors='coerce')
    if (~is_blank).any() and parsed[~is_blank].notna().all():
        df[col] = parsed

print(df.dtypes.head(20))
print(df.isna().sum().sort_values(ascending=False).head(10))


## 4) Validação de dados com pandera

Definimos um esquema simples para validar colunas básicas (adaptável ao dataset disponível).

In [None]:
import pandera as pa
from pandera import Column, Check

# Minimal feature schema derived from the cleaned frame.
# Both spellings of the target are excluded so the schema never demands a
# column that is dropped before validation.
_target_names = {'Churn', 'churn'}
columns = list(df.columns)
num_cols = [
    c for c in columns
    if pd.api.types.is_numeric_dtype(df[c]) and c not in _target_names
]
cat_cols = [c for c in columns if c not in num_cols and c not in _target_names]

schema = pa.DataFrameSchema({
    # coerce=True: integer-typed numeric columns are valid as long as they
    # cast to float; a strict float dtype check would reject them.
    **{c: Column(float, nullable=True, coerce=True) for c in num_cols},
    **{c: Column(object, nullable=True) for c in cat_cols},
})

# lazy=True collects every violation before raising instead of stopping at the first.
_features = df.drop(columns=[c for c in _target_names if c in df.columns])
_ = schema.validate(_features, lazy=True)
print("Schema válido para features.")


## 5) EDA rápida

Distribuição do target, contagens de uma variável categórica de exemplo por classe e correlações entre as variáveis numéricas.

In [None]:
# Name of the target column: 'Churn' when present, otherwise 'churn'.
target = 'churn' if 'Churn' not in df.columns else 'Churn'

fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: relative frequency of each target class.
class_share = df[target].value_counts(normalize=True)
class_share.plot(kind='bar', ax=ax[0], title='Distribuição do target')
ax[0].set_xlabel('Classe')
ax[0].set_ylabel('Proporção')

# Right panel: stacked counts of the first object-typed column, if there is one.
object_cols = [c for c in df.columns if df[c].dtype == 'object']
any_cat = object_cols[0] if object_cols else None
if any_cat:
    counts_by_class = df.groupby([any_cat, target]).size().unstack(fill_value=0)
    counts_by_class.plot(kind='bar', stacked=True, ax=ax[1], title=f'{any_cat} x {target}')
plt.tight_layout()
plt.show()

# Correlation heatmap over the numeric columns only.
numeric_frame = df.select_dtypes(include=[np.number])
corr = numeric_frame.corr(numeric_only=True)
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title('Correlação numérica')
plt.show()


## 6) Split estratificado: treino/val/teste (60/20/20)

In [None]:
y = df[target]
X = df.drop(columns=[target])

# 60/20/20 stratified split: hold out 40% first, then cut the hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=SEED,
)

print(dict(train=len(X_train), val=len(X_val), test=len(X_test)))


## 7) Engenharia de atributos simples

In [None]:
def add_simple_features(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_in`` with simple derived features added.

    Added columns (each only when its source columns are present):

    * ``charges_per_tenure``: ``MonthlyCharges / tenure``, set to 0 where the
      ratio is undefined (tenure of 0 or missing inputs).
    * ``tenure_bin``: ``tenure`` bucketed into the labels ``'<=12'``,
      ``'13-24'``, ``'25-48'``, ``'49-72'`` and ``'>72'``, stored as plain
      strings (object dtype).

    The input frame is not modified.
    """
    df = df_in.copy()

    if {'MonthlyCharges', 'tenure'}.issubset(df.columns):
        # Replace 0 with NaN so the division yields NaN instead of inf.
        safe_tenure = df['tenure'].replace(0, np.nan)
        df['charges_per_tenure'] = (df['MonthlyCharges'] / safe_tenure).fillna(0)

    if 'tenure' in df.columns:
        binned = pd.cut(
            df['tenure'],
            bins=[-1, 12, 24, 48, 72, np.inf],
            labels=['<=12', '13-24', '25-48', '49-72', '>72'],
        )
        # pd.cut returns a categorical; code that selects columns by
        # "object" vs "numeric" dtype would match neither and silently skip
        # it, so store the labels as ordinary strings instead.
        df['tenure_bin'] = binned.astype(object)

    return df

# Apply the same feature engineering to each split independently.
X_train_fe, X_val_fe, X_test_fe = (
    add_simple_features(part) for part in (X_train, X_val, X_test)
)

print(X_train_fe.head(3))


## 8) Pré-processamento (ColumnTransformer) e 9) Balanceamento (via `class_weight='balanced'` nos modelos; SMOTE é importado, mas não aplicado)

In [None]:
def make_preprocessor(train_df: pd.DataFrame):
    """Build an unfitted ColumnTransformer for the columns of ``train_df``.

    Categorical columns (object or pandas ``category`` dtype) are imputed with
    the most frequent value and one-hot encoded; categories unseen during fit
    are ignored at transform time. Numeric columns are imputed with the median
    and standardized. Columns of any other dtype are not listed in either
    branch and are therefore dropped by the transformer.
    """
    # Include 'category' as well as 'object': checking for object dtype only
    # lets categorical columns match neither branch and vanish from the model.
    cats = list(train_df.select_dtypes(include=['object', 'category']).columns)
    nums = [c for c in train_df.columns if pd.api.types.is_numeric_dtype(train_df[c])]

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    return ColumnTransformer([
        ("cat", cat_pipe, cats),
        ("num", num_pipe, nums),
    ])

# Column lists are taken from the training split only.
preprocessor = make_preprocessor(X_train_fe)


## 10) Baseline com DummyClassifier e métricas

In [None]:
def evaluate_proba(y_true, y_proba):
    """Score positive-class probabilities against binary labels.

    Returns a dict with ROC AUC, average precision and log loss computed from
    the probabilities, plus F1 computed from predictions thresholded at 0.5.
    """
    # log_loss receives one column per class: [P(class 0), P(class 1)].
    class_probs = np.column_stack([1 - y_proba, y_proba])
    hard_labels = (y_proba >= 0.5).astype(int)
    return {
        "roc_auc": roc_auc_score(y_true, y_proba),
        "ap": average_precision_score(y_true, y_proba),
        "log_loss": log_loss(y_true, class_probs),
        "f1": f1_score(y_true, hard_labels),
    }

# Reference model: always predicts the majority class of the training split.
_dummy = DummyClassifier(strategy='most_frequent')
baseline = Pipeline(steps=[("pre", preprocessor), ("clf", _dummy)])
baseline.fit(X_train_fe, y_train)

# Positive-class probability on the validation split; fall back to all zeros
# if the probabilities cannot be obtained.
try:
    p_val = baseline.predict_proba(X_val_fe)[:, 1]
except Exception:
    p_val = np.zeros_like(y_val, dtype=float)

metrics_baseline = evaluate_proba(y_val.values, p_val)
metrics_baseline


## 11-12) Modelos: LogisticRegression, RandomForest (+ XGBoost se disponível) e CV

In [None]:
def make_pipeline(clf):
    """Return a Pipeline of a fresh copy of the shared preprocessor plus ``clf``.

    The preprocessor is cloned so each pipeline owns its own unfitted
    transformer. Reusing the single module-level instance would make every
    pipeline share fitted state: fitting one pipeline would silently refit
    the transformer inside all the others.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    return Pipeline([
        ("pre", clone(preprocessor)),
        ("clf", clf),
    ])

# Candidate classifiers; both use class weighting to handle the imbalanced target.
models = dict(
    logreg=LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED),
    rf=RandomForestClassifier(n_estimators=300, random_state=SEED, class_weight='balanced'),
)
if XGB_AVAILABLE:
    models.update(
        xgb=XGBClassifier(
            n_estimators=300, learning_rate=0.1, subsample=0.9, colsample_bytree=0.8,
            max_depth=5, random_state=SEED, eval_metric='logloss',
        )
    )

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = {}
for name, clf in models.items():
    fold_auc = cross_val_score(
        make_pipeline(clf), X_train_fe, y_train, cv=cv, scoring='roc_auc',
    )
    cv_results[name] = {"roc_auc_mean": fold_auc.mean(), "roc_auc_std": fold_auc.std()}
cv_results


## 13) Tuning com Optuna (exemplo rápido)

Atenção: pode ser demorado. Aqui um estudo pequeno só para demonstrar o fluxo.

In [None]:
def objective(trial: optuna.Trial):
    """Optuna objective: mean cross-validated ROC AUC of a logistic regression.

    Samples the inverse regularization strength ``C`` on a log scale in
    [1e-3, 10] and evaluates it with the shared ``cv`` splitter on the
    training split.
    """
    C = trial.suggest_float('C', 1e-3, 10.0, log=True)
    clf = LogisticRegression(max_iter=1000, class_weight='balanced', C=C, random_state=SEED)
    pipe = make_pipeline(clf)
    scores = cross_val_score(pipe, X_train_fe, y_train, cv=cv, scoring='roc_auc')
    return scores.mean()

# Seed the sampler: otherwise the suggested values (and therefore the selected
# C) differ on every run, even though every other random source is fixed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=10, show_progress_bar=False)
study.best_params, study.best_value


## 14) Avaliação no teste e curvas ROC/PR + matriz de confusão

In [None]:
# Refit the tuned logistic regression on train + validation, then score the
# held-out test split.
best_C = study.best_params.get('C', 1.0)
_final_clf = LogisticRegression(max_iter=1000, class_weight='balanced', C=best_C, random_state=SEED)
best_pipe = make_pipeline(_final_clf)
_X_fit = pd.concat([X_train_fe, X_val_fe])
_y_fit = pd.concat([y_train, y_val])
best_pipe.fit(_X_fit, _y_fit)

proba_test = best_pipe.predict_proba(X_test_fe)[:, 1]
metrics_test = evaluate_proba(y_test.values, proba_test)
metrics_test

# ROC and precision-recall curves side by side.
fpr, tpr, _ = roc_curve(y_test, proba_test)
prec, rec, _ = precision_recall_curve(y_test, proba_test)
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].plot(fpr, tpr)
ax[0].set_title('ROC')
ax[0].set_xlabel('FPR')
ax[0].set_ylabel('TPR')
ax[1].plot(rec, prec)
ax[1].set_title('Precision-Recall')
ax[1].set_xlabel('Recall')
ax[1].set_ylabel('Precision')
plt.tight_layout()
plt.show()

# Confusion matrix at the 0.5 threshold, normalized over the true labels (rows).
_pred_test = (proba_test >= 0.5).astype(int)
cm = confusion_matrix(y_test, _pred_test, normalize='true')
sns.heatmap(cm, annot=True, cmap='Blues')
plt.title('Matriz de Confusão (norm)')
plt.show()


## 15) Interpretabilidade: Permutation Importance e SHAP (se disponível)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance shuffles the pipeline's *input* columns, so each
# score maps one-to-one onto a column of X_val_fe and can be labeled with it.
# NOTE(review): best_pipe is fit on train + validation, so these validation
# importances are in-sample; consider scoring on the test split instead.
r = permutation_importance(best_pipe, X_val_fe, y_val, scoring='roc_auc', n_repeats=5, random_state=SEED)
imp = (
    pd.Series(r.importances_mean, index=X_val_fe.columns)
    .sort_values(ascending=False)
    .head(20)
)
imp.plot(kind='bar', title='Permutation Importance (top 20)')
plt.show()

# SHAP for the fitted classifier. This does not depend on xgboost being
# installed, so it is no longer gated on XGB_AVAILABLE.
try:
    fitted_pre = best_pipe.named_steps['pre']
    fitted_clf = best_pipe.named_steps['clf']
    # transform(), not fit_transform(): refitting here would change the
    # preprocessor inside best_pipe after the classifier was trained on its
    # output, leaving the two steps inconsistent.
    X_trans = fitted_pre.transform(X_train_fe)
    if hasattr(X_trans, 'toarray'):
        # The one-hot branch may yield a sparse matrix; densify it for SHAP.
        X_trans = X_trans.toarray()
    # Background data lets shap choose an explainer for the model and
    # compute expected values; feature names label the plot.
    explainer = shap.Explainer(
        fitted_clf, X_trans, feature_names=fitted_pre.get_feature_names_out()
    )
    shap_values = explainer(X_trans[:200])
    shap.plots.beeswarm(shap_values)
except Exception as e:
    # Best-effort visualization: never let plotting break the notebook run.
    print("SHAP plot skip:", e)


## 16) Persistência do pipeline e metadados

In [None]:
import joblib
import subprocess
from datetime import timezone
from pathlib import Path

# Output locations for the fitted pipeline and its metadata.
ART_DIR = Path('../models')
ART_DIR.mkdir(exist_ok=True, parents=True)
MODEL_PATH = ART_DIR / 'best_pipeline.joblib'
META_PATH = ART_DIR / 'metadata.json'

joblib.dump(best_pipe, MODEL_PATH)

# Record the git commit when running inside a repository. Only the expected
# failures are handled: git missing (OSError) or not a repository
# (CalledProcessError). stderr is silenced to keep git's message out of the output.
try:
    commit = subprocess.check_output(
        ['git', 'rev-parse', 'HEAD'], stderr=subprocess.DEVNULL
    ).decode().strip()
except (subprocess.CalledProcessError, OSError):
    commit = None

meta = {
    # Timezone-aware UTC timestamp; datetime.utcnow() returns a naive value
    # and is deprecated since Python 3.12.
    'created_at': datetime.now(timezone.utc).isoformat(),
    # Cast to built-in floats so JSON serialization does not depend on the
    # numeric type returned by the metric functions.
    'metrics_test': {k: float(v) for k, v in metrics_test.items()},
    'lib_versions': {
        'numpy': np.__version__, 'pandas': pd.__version__, 'sklearn': sklearn.__version__,
        'optuna': optuna.__version__, 'shap': shap.__version__
    },
    'git_commit': commit,
}
with open(META_PATH, 'w', encoding='utf-8') as f:
    json.dump(meta, f, indent=2)

MODEL_PATH, META_PATH


## 17) Função de inferência e contrato de I/O

In [None]:
# Inference contract derived from the engineered test frame.
# Numeric columns must be castable to float; every other column keeps the
# dtype it has in X_test_fe. coerce=True makes the schema accept compatible
# inputs (integer-typed numerics, JSON-decoded records) instead of rejecting
# them on an exact dtype mismatch.
_numeric_infer_cols = set(X_test_fe.select_dtypes(include=np.number).columns)
schema_infer = pa.DataFrameSchema({
    c: Column(
        float if c in _numeric_infer_cols else X_test_fe[c].dtype,
        nullable=True,
        coerce=True,
    )
    for c in X_test_fe.columns
})

def predict_proba_batch(df_in: pd.DataFrame):
    """Validate a feature frame and score it with ``best_pipe``.

    Parameters
    ----------
    df_in : feature frame that already includes the engineered columns.

    Returns
    -------
    DataFrame with ``proba`` (positive-class probability) and ``pred``
    (1 when ``proba >= 0.5``, else 0), one row per input row.

    Raises
    ------
    pandera.errors.SchemaErrors
        When the frame violates ``schema_infer``; ``lazy=True`` reports all
        violations together.
    """
    # Score the coerced frame returned by validate(), so the model sees the
    # dtypes the schema promises.
    validated = schema_infer.validate(df_in, lazy=True)
    proba = best_pipe.predict_proba(validated)[:, 1]
    return pd.DataFrame({"proba": proba, "pred": (proba >= 0.5).astype(int)})

predict_proba_batch(X_test_fe.head(5))


## 18) API mínima com FastAPI para servir o modelo

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

class Payload(BaseModel):
    """Request body: a list of raw feature records, one dict per row."""

    records: list[dict]

@app.post('/predict')
def predict_api(payload: Payload):
    """Score the posted records and return one prediction per record.

    Records that violate the inference schema are reported as HTTP 422 with
    the validation message, so callers see a client error instead of an
    unhandled 500.
    """
    df_in = pd.DataFrame(payload.records)
    df_in = add_simple_features(df_in)
    try:
        out = predict_proba_batch(df_in)
    except (pa.errors.SchemaError, pa.errors.SchemaErrors) as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    return {"predictions": out.to_dict(orient='records')}

print("Para rodar: uvicorn EDA:app --reload --port 8000 (no diretório deste notebook)")


## 19) Testes automatizados (asserts) no notebook

In [None]:
# Check that the tuned model beats the dummy baseline in ROC AUC by a minimal
# margin. Both are scored on the test split so the comparison is
# like-for-like; metrics_baseline was computed on the validation split.
baseline_test = evaluate_proba(y_test.values, baseline.predict_proba(X_test_fe)[:, 1])
assert metrics_test["roc_auc"] >= baseline_test["roc_auc"] + 1e-6

# Check the I/O contract of the inference function.
out = predict_proba_batch(X_test_fe.head(3))
assert set(out.columns) == {"proba", "pred"}
assert len(out) == 3
assert out["proba"].between(0, 1).all()
assert set(out["pred"].unique()) <= {0, 1}

print("Notebooks checks OK")


## 20) Reprodutibilidade e ambiente

Registramos `pip freeze` para um arquivo de artifacts e reimprimimos versões.

In [None]:
# Artifacts directory that receives the frozen requirements file.
ART_ENV = Path('../models')
ART_ENV.mkdir(parents=True, exist_ok=True)
req_path = ART_ENV / 'requirements.txt'

# Capture `pip freeze` from the interpreter running this notebook; a failure
# is reported but does not stop the run.
try:
    import subprocess
    pip_cmd = [sys.executable, '-m', 'pip', 'freeze']
    freeze = subprocess.check_output(pip_cmd).decode()
    req_path.write_text(freeze)
    print('Requisitos salvos em', req_path)
except Exception as e:
    print('Falha ao capturar pip freeze:', e)

print("Versions again:")
version_fields = (
    "numpy:", np.__version__,
    "pandas:", pd.__version__,
    "sklearn:", sklearn.__version__,
)
print(*version_fields)
