In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.under_sampling import RandomUnderSampler


In [2]:
# Location of the raw CSV. NOTE(review): this is an absolute path on one
# machine; the notebook will not run elsewhere until it is made configurable.
RUTA_DATOS = "C:/Users/hamga/Downloads/EarlyRetirementPrediction.csv"

df = pd.read_csv(RUTA_DATOS)

# Normalise the headers to snake_case: trim surrounding whitespace,
# lower-case, and turn inner spaces into underscores.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

In [3]:
# Label column the models are trained to predict.
COLUMNA_OBJETIVO = "retire_before_65_years_old"
# Columns kept out of the feature matrix: the label itself plus a second column.
# NOTE(review): presumably "probabilidades_de_retiro_temprano" is derived from
# the label and is excluded to avoid leakage -- confirm against the dataset.
COLUMNAS_EXCLUIDAS = [COLUMNA_OBJETIVO, "probabilidades_de_retiro_temprano"]

y = df[COLUMNA_OBJETIVO]
X = df.drop(columns=COLUMNAS_EXCLUIDAS)

In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, stratify=y, random_state=42
)

In [6]:
def ensamblado_modelos(X_train, y_train, X_full, n_modelos=15, random_state=42):
    """Train an under-sampled ensemble and score every row of ``X_full``.

    For each of ``n_modelos`` rounds the training data is re-balanced with
    ``RandomUnderSampler`` and three classifiers (logistic regression, random
    forest, SVM with probability estimates) are fitted on the resampled data.
    Each classifier's positive-class probabilities for ``X_full`` are averaged
    over the rounds, and the three averages are then averaged again into
    ``score_promedio``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``RandomUnderSampler.fit_resample``.
    X_full
        Feature matrix to score; must have the same columns as ``X_train``.
    n_modelos : int, default 15
        Number of resampling rounds. Must be at least 1.
    random_state : int, default 42
        Base seed. Round ``i`` uses ``random_state + i`` for the sampler, the
        random forest and the SVM, so a run is reproducible end to end.

    Returns
    -------
    df_preds : pandas.DataFrame
        One row per row of ``X_full`` with columns ``score_logit``,
        ``score_rf``, ``score_svm`` and ``score_promedio``.
    importancia_media : numpy.ndarray
        Random-forest feature importances averaged over the rounds.

    Raises
    ------
    ValueError
        If ``n_modelos`` is smaller than 1.
    """
    # Fail early with a clear message; with zero rounds the averages below
    # are taken over empty lists and the DataFrame construction fails with an
    # unrelated-looking error.
    if n_modelos < 1:
        raise ValueError(f"n_modelos must be >= 1, got {n_modelos}")

    preds_logit, preds_rf, preds_svm = [], [], []
    importancias = []

    for i in range(n_modelos):
        # A different seed per round so each round sees a different
        # under-sampled subset of the majority class.
        seed = random_state + i

        rus = RandomUnderSampler(random_state=seed)
        X_res, y_res = rus.fit_resample(X_train, y_train)

        logit = LogisticRegression(max_iter=1000)
        rf = RandomForestClassifier(n_estimators=100, random_state=seed)
        # probability=True relies on an internal, randomised cross-validation;
        # seeding it keeps the SVM scores reproducible like the other models.
        svm = SVC(probability=True, random_state=seed)

        logit.fit(X_res, y_res)
        rf.fit(X_res, y_res)
        svm.fit(X_res, y_res)

        # Column 1 of predict_proba is the probability of the second class
        # in ``classes_`` (the positive class for 0/1 labels).
        preds_logit.append(logit.predict_proba(X_full)[:, 1])
        preds_rf.append(rf.predict_proba(X_full)[:, 1])
        preds_svm.append(svm.predict_proba(X_full)[:, 1])

        importancias.append(rf.feature_importances_)

    # Average each model family across rounds (axis 0 = rounds).
    df_preds = pd.DataFrame({
        "score_logit": np.mean(preds_logit, axis=0),
        "score_rf": np.mean(preds_rf, axis=0),
        "score_svm": np.mean(preds_svm, axis=0)
    })

    # Final score: unweighted mean of the three model-family scores.
    df_preds["score_promedio"] = df_preds.mean(axis=1)
    importancia_media = np.mean(importancias, axis=0)

    return df_preds, importancia_media

In [7]:
# Train the ensemble on the training split and score every row of the
# scaled full dataset.
df_scores_all, importancia = ensamblado_modelos(
    X_train=X_train,
    y_train=y_train,
    X_full=X_scaled,
)

In [8]:
# Put the score columns next to the original data. The index of `df` is reset
# so both frames are aligned on a fresh 0..n-1 index when concatenated.
df_final = pd.concat(
    objs=[df.reset_index(drop=True), df_scores_all],
    axis="columns",
)

In [9]:
# Bucket the averaged score into three risk tiers:
#   [0, 0.4] -> "Bajo", (0.4, 0.7] -> "Medio", (0.7, 1] -> "Alto".
# include_lowest=True closes the first interval on the left; without it a
# score of exactly 0 falls outside every bin and is labelled NaN.
df_final["riesgo"] = pd.cut(
    df_final["score_promedio"],
    bins=[0, 0.4, 0.7, 1],
    labels=["Bajo", "Medio", "Alto"],
    include_lowest=True,
)

In [10]:
# Give every row a sequential client ID (CL0001, CL0002, ...) as the first column.
# The guard keeps the cell re-runnable: DataFrame.insert raises if the column
# already exists, which is what happens when this cell is executed twice.
if "ID" not in df_final.columns:
    df_final.insert(0, "ID", [f"CL{i:04d}" for i in range(1, len(df_final) + 1)])

In [17]:
# 1. Full table: every client with the model scores and risk tier
df_final.to_csv("clientes_con_score.csv", index=False)

# 2. Top 20 clients with the highest averaged score
df_top20 = df_final.sort_values("score_promedio", ascending=False).head(20)
df_top20.to_csv("top_20_clientes_riesgo_alto.csv", index=False)

# 3. Risk segmentation: number of clients per risk tier
df_riesgo = df_final["riesgo"].value_counts().reset_index()
df_riesgo.columns = ["riesgo", "cantidad"]
df_riesgo.to_csv("tabla_riesgos_segmentados.csv", index=False)

# 4. Feature importance
# Mapping from dataset column names to the display names used in the report.
# Keys must match the column names exactly as they appear in the data.
nombres_ejecutivos = {
    "dependants": "Personas a cargo",
    "credit_score": "Puntaje crediticio",
    "monthly_income": "Ingreso mensual del cliente",
    "unemployment_rate": "Tasa de desempleo nacional",
    "stock_market": "Estado del mercado bursatil",
    "government_bonds_return": "Rendimiento de bonos gubernamentales",
    "education_level": "Nivel educativo del cliente",
    "gender": "Genero del cliente",
    "desease": "Condiciones de salud",
    "marital_status": "Estado civil del cliente",
    "employee/employer": "Tipo de ocupacion (Empleado/Empleador)"
}

# Pair each feature column with its averaged random-forest importance
df_importancia = pd.DataFrame({
    "variable": X.columns,
    "importancia_rf": importancia
})

# Swap in the display names; columns without a mapping keep their original name
df_importancia["variable"] = df_importancia["variable"].replace(nombres_ejecutivos)

# Sort from most to least important and export (written once; the original
# cell wrote this same file twice in a row)
df_importancia = df_importancia.sort_values("importancia_rf", ascending=False)
df_importancia.to_csv("importancia_variables_rf.csv", index=False)

In [12]:
# === Model evaluation on the TEST split ===
# NOTE(review): this scores a single random forest fitted on the training
# split as-is, not the under-sampled ensemble whose averaged score is
# exported -- confirm that this is the model the reported metrics should
# describe.
modelo_eval = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Probability of the positive class, then a hard label at the 0.5 cut-off.
y_pred_prob = modelo_eval.predict_proba(X_test)[:, 1]
y_pred = np.where(y_pred_prob >= 0.5, 1, 0)

auc = roc_auc_score(y_test, y_pred_prob)
print(f"\n✅ AUC Score (test): {auc:.4f}")
print("\n📋 Classification Report (corte 0.5):")
print(classification_report(y_test, y_pred))


✅ AUC Score (test): 0.8278

📋 Classification Report (corte 0.5):
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       238
           1       0.79      0.79      0.79       212

    accuracy                           0.80       450
   macro avg       0.80      0.80      0.80       450
weighted avg       0.80      0.80      0.80       450

