# Importar Librerias

In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import OneHotEncoder
from pandasql import sqldf
import warnings
from itertools import combinations #Generar combinacion de variables
from optbinning import OptimalBinning
from sklearn.model_selection import StratifiedKFold, cross_val_score


warnings.filterwarnings("ignore")

# Importar datos

In [None]:
df = pd.read_excel("Tabla Trabajo Grupal N°2.xlsx", sheet_name= "Desarrollo")

In [None]:
df.info()

In [None]:
df.isnull().sum() #Confirmo que no tiene nulos

El gráfico anterior, junto con la tabla, nos confirma que no existen valores nulos.

In [None]:
df.shape #Data con 12.356 registros

In [None]:
df.nunique() #Id demuestra que la data no tiene registros repetidos

In [None]:
df["Default"].value_counts()

Como la cantidad de Id distintos es igual al total de registros, no hay registros duplicados.  
Además, vemos que Nivel_Educacional es la única variable realmente categórica, aparte de Default.

In [None]:
df.head()

In [None]:
# Hold out 30% of the rows. Predictors are every column between the first
# column and the last one; the target is the "Default" column.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:-1],
    df["Default"],
    train_size=0.7,
    random_state=7,
)

# Re-attach the target so each split is also available as a single frame.
df_train = x_train.join(y_train)
df_test = x_test.join(y_test)

In [None]:
df_test

# Analisis descriptivo

## Analisis variables

In [None]:
def graficar_histograma(df, variable):
    """Draw a histogram of one column on a new 7x4-inch figure.

    Args:
        df: DataFrame holding the data to plot.
        variable: Name of the column to plot; also used as title and x label.
    """
    plt.figure(figsize=(7, 4))
    # histplot draws on the axes of the figure just created and returns them.
    eje = sns.histplot(data=df, x=variable)
    eje.set_title(f"{variable}")
    eje.set_xlabel(f"{variable}")
    eje.set_ylabel("Cantidad")

In [None]:
graficar_histograma(df_train, df_train.columns[0])

In [None]:
graficar_histograma(df_train, df_train.columns[1])

In [None]:
graficar_histograma(df_train, df_train.columns[2])

In [None]:
graficar_histograma(df_train, df_train.columns[3])

In [None]:
graficar_histograma(df_train, df_train.columns[4])

In [None]:
graficar_histograma(df_train, df_train.columns[5])

In [None]:
graficar_histograma(df_train, df_train.columns[6])

In [None]:
graficar_histograma(df_train, df_train.columns[7])

In [None]:
plt.figure(figsize = (7,7))
sns.heatmap(df_train.select_dtypes(include=['number']).corr(), annot= True, cmap = "coolwarm")
plt.title("Matriz de correlacion de variables numericas")

In [None]:
def graficar_histograma_subplots(df, variable, ax):
    """Draw the histogram of one column on a given subplot.

    Args:
        df: DataFrame holding the data to plot.
        variable: Name of the column to plot; also used as title and x label.
        ax: Matplotlib axes that receives the histogram and its labels.
    """
    sns.histplot(df, x=variable, ax=ax)
    # Label the axes we were given. The pyplot functions (plt.title, ...) act on
    # the *current* axes, which in a grid is the last subplot created.
    ax.set_title(f"{variable}")
    ax.set_xlabel(f"{variable}")
    ax.set_ylabel("Cantidad")

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
# zip stops at the shorter sequence: at most 8 columns are drawn.
for ax, var in zip(axes, df_train.columns):
    graficar_histograma_subplots(df_train, var, ax=ax)

fig.suptitle("Histograma de Variables (Entrenamiento)", fontsize=20)
plt.show()

## Analisis variables vs objetivo

In [None]:
def graficar_histograma_subplots(df, variable, ax):
    """Draw the histogram of one column, split by "Default", on a given subplot.

    Args:
        df: DataFrame holding the data; must contain a "Default" column.
        variable: Name of the column to plot; also used as title and x label.
        ax: Matplotlib axes that receives the histogram and its labels.
    """
    sns.histplot(df, x=variable, hue="Default", ax=ax)
    # Label the axes we were given. The pyplot functions (plt.title, ...) act on
    # the *current* axes, which in a grid is the last subplot created.
    ax.set_title(f"{variable}")
    ax.set_xlabel(f"{variable}")
    ax.set_ylabel("Cantidad")

fig, axes = plt.subplots(2, 4, figsize=(35, 36))
axes = axes.flatten()
# zip stops at the shorter sequence: at most 8 columns are drawn.
for ax, var in zip(axes, df_train.columns):
    graficar_histograma_subplots(df_train, var, ax=ax)

fig.suptitle("Histograma de Variables (Entrenamiento)", fontsize=20)
plt.show()

In [None]:
sns.pairplot(df_train, hue = "Default")

#  Modelado Exploratorio

## Modelo Base - Sin agregados

### Simple

In [None]:
df_train_backup = df_train.copy()

In [None]:
# Number of distinct education levels (the call parentheses were missing, so
# the cell displayed the bound method instead of the count).
df_train["Nivel_Educacional"].nunique()

In [None]:
def _codificar_nivel_educacional(datos, niveles):
    """One-hot encode "Nivel_Educacional" using a fixed list of categories.

    Args:
        datos: DataFrame that still contains the raw "Nivel_Educacional" column.
        niveles: Ordered list of categories; the first one is the dropped baseline.

    Returns:
        A new DataFrame with the raw column replaced by its dummy columns.
    """
    codificado = datos.assign(
        Nivel_Educacional=pd.Categorical(datos["Nivel_Educacional"], categories=niveles)
    )
    # `columns=` must be passed by keyword: the second positional argument of
    # get_dummies is `prefix`, not the list of columns to encode.
    return pd.get_dummies(codificado, columns=["Nivel_Educacional"], drop_first=True)


# Categories come from TRAIN and are imposed on TEST, so both frames always end
# up with the same dummy columns even if a level is absent from one split.
# Re-running this cell raises KeyError because the raw column is already gone.
niveles_educacionales = sorted(df_train["Nivel_Educacional"].unique())
df_train = _codificar_nivel_educacional(df_train, niveles_educacionales)
df_test = _codificar_nivel_educacional(df_test, niveles_educacionales)

In [None]:
df_train.head()

In [None]:
variables = [col for col in df_train.columns if col not in ["Default"]]

In [None]:
# Rebuild the design matrices and targets from the dummy-encoded frames.
x_train = df_train[variables]
y_train = df_train["Default"]  # was misspelled `y_terain`, leaving y_train stale
x_test = df_test[variables]
y_test = df_test["Default"]

In [None]:
# Baseline: logistic regression on every predictor, default hyper-parameters.
RL = LogisticRegression()

RL.fit(x_train, y_train)
y_predic = RL.predict(x_test)

acc = accuracy_score(y_test, y_predic)
print(f"El Accuracy del modelo es {acc:.3f}")

# Fitted parameters, one coefficient per predictor in `variables`.
coef = RL.coef_[0]
intercepto = RL.intercept_[0]

df_coef = pd.DataFrame({
    "Variable": variables,
    "Coeficiente": coef
})

print("Intercepto:", intercepto)
print(df_coef)


In [None]:
y_scores = RL.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC (AUC = {roc_auc:.3f})")
plt.xlabel("Tasa de Falsos Positivos (1 - Especificidad)")
plt.ylabel("Tasa de Verdaderos Positivos (Sensibilidad)")
plt.title("Curva ROC - Regresión Logística")
plt.legend()
plt.grid(True)
plt.show()


In [None]:

# Sweep cut-offs from 0.00 to 1.00 and keep the one with the highest accuracy.
# NOTE(review): the cut-off is chosen on y_test, so the reported maximum is an
# optimistic estimate.
thresholds = np.arange(0, 1.01, 0.01)
accuracies = [
    accuracy_score(y_test, (y_scores >= corte).astype(int))
    for corte in thresholds
]

indice_mejor = np.argmax(accuracies)
best_t = thresholds[indice_mejor]
best_acc = accuracies[indice_mejor]

print(f"Mejor threshold = {best_t:.2f}")
print(f"Accuracy máximo = {best_acc:.4f}")

### Stepwise

In [None]:
y_test.shape

In [None]:
def modelo_stepwise(x_train, x_test, y_train, y_test, direccion):
    """Select features sequentially, refit a logistic regression and report it.

    The selector scores candidate subsets with 5-fold cross-validated accuracy
    on the training data. The number of features kept is left at the
    scikit-learn default (presumably half of the columns; confirm for the
    installed version).

    Args:
        x_train: Training predictors (DataFrame).
        x_test: Test predictors with the same columns as `x_train`.
        y_train: Training target.
        y_test: Test target.
        direccion: "forward" or "backward", passed to SequentialFeatureSelector.

    Prints the test accuracy, the selected columns, the intercept and the
    coefficient table. Returns None.
    """
    modelo = LogisticRegression()
    stepwise = SequentialFeatureSelector(
        modelo,
        direction=direccion,
        scoring="accuracy",
        cv=5,
    )

    stepwise.fit(x_train, y_train)
    col_stepwise = x_train.columns[stepwise.get_support()]

    # Refit on the selected columns only and evaluate on the hold-out set.
    modelo.fit(x_train[col_stepwise], y_train)
    y_pred = modelo.predict(x_test[col_stepwise])

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Valor de Accuracy: {accuracy}")
    # The selector is SequentialFeatureSelector, not RFE; label it accordingly.
    print(f"Variables seleccionadas con stepwise ({direccion}):")
    print(col_stepwise)

    coef = modelo.coef_[0]
    intercepto = modelo.intercept_[0]

    df_coef = pd.DataFrame({
        "Variable": col_stepwise,
        "Coeficiente": coef})

    print("Intercepto:", intercepto)
    print(df_coef)


modelo_stepwise(x_train, x_test, y_train, y_test, "backward")



In [None]:
x_train.shape[1]

In [None]:
modelo_stepwise(x_train, x_test, y_train, y_test, "forward")

In [None]:
x_train_nuevo = x_train.copy()
x_test_nuevo = x_test.copy()


In [None]:
x_train_nuevo.head()

In [None]:
variables_num = ["Edad", "Años_Trabajando","Ingresos", "Deuda_Comercial", "Deuda_Credito", "Otras_Deudas", "Ratio_Ingresos_Deudas"]

In [None]:
x_train_nuevo[variables_num] = np.log( x_train_nuevo[variables_num] +1)
x_test_nuevo[variables_num] = np.log( x_test_nuevo[variables_num] +1)

In [None]:
x_train_nuevo

In [None]:
def graficar_histograma_subplots(df, variable, ax):
    """Draw the histogram of one column on a given subplot.

    Args:
        df: DataFrame holding the data to plot.
        variable: Name of the column to plot; also used as title and x label.
        ax: Matplotlib axes that receives the histogram and its labels.
    """
    sns.histplot(df, x=variable, ax=ax)
    # Label the axes we were given. The pyplot functions (plt.title, ...) act on
    # the *current* axes, which in a grid is the last subplot created.
    ax.set_title(f"{variable}")
    ax.set_xlabel(f"{variable}")
    ax.set_ylabel("Cantidad")

# Histograms of the training frame on its original scale.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
# zip stops at the shorter sequence: at most 8 columns are drawn.
for ax, var in zip(axes, df_train.columns):
    graficar_histograma_subplots(df_train, var, ax=ax)

fig.suptitle("Histograma de Variables (Entrenamiento)", fontsize=20)
plt.show()

In [None]:
def graficar_histograma_subplots(df, variable, ax):
    """Draw the histogram of one column on a given subplot.

    Args:
        df: DataFrame holding the data to plot.
        variable: Name of the column to plot; also used as title and x label.
        ax: Matplotlib axes that receives the histogram and its labels.
    """
    sns.histplot(df, x=variable, ax=ax)
    # Label the axes we were given. The pyplot functions (plt.title, ...) act on
    # the *current* axes, which in a grid is the last subplot created.
    ax.set_title(f"{variable}")
    ax.set_xlabel(f"{variable}")
    ax.set_ylabel("Cantidad")

# Histograms after the log transform. The transformed frame is `x_train_nuevo`;
# `df_train_nuevo` was never defined and raised NameError.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()
# zip stops at the shorter sequence: at most 8 columns are drawn.
for ax, var in zip(axes, x_train_nuevo.columns):
    graficar_histograma_subplots(x_train_nuevo, var, ax=ax)

fig.suptitle("Histograma de Variables (Entrenamiento)", fontsize=20)
plt.show()

In [None]:
modelo_stepwise(x_train_nuevo, x_test_nuevo, y_train, y_test, "backward")

In [None]:
modelo_stepwise(x_train_nuevo, x_test_nuevo, y_train, y_test, "forward")

## Modelo con ponderacion de pesos

### Forma Profesor

In [None]:
media_train = y_train.mean()
peso = media_train/(1-media_train)
peso_train = np.where(y_train ==1, 1, peso)

In [None]:
# Logistic regression on every predictor, with each row weighted by `peso_train`.
RL = LogisticRegression()

RL.fit(x_train, y_train, sample_weight=peso_train)
y_predic = RL.predict(x_test)

acc = accuracy_score(y_test, y_predic)
print(f"El Accuracy del modelo es {acc:.3f}")

# Fitted parameters, one coefficient per predictor in `variables`.
coef = RL.coef_[0]
intercepto = RL.intercept_[0]

df_coef = pd.DataFrame({
    "Variable": variables,
    "Coeficiente": coef
})

print("Intercepto:", intercepto)
print(df_coef)


In [None]:
def modelo_stepwise(x_train, x_test, y_train, y_test, direccion, sample_weight=None):
    """Select features sequentially, refit a weighted logistic regression and report it.

    The selector scores candidate subsets with 5-fold cross-validated accuracy
    on the training data. Only the final refit uses `sample_weight`; the
    selection step itself is unweighted.

    Args:
        x_train: Training predictors (DataFrame).
        x_test: Test predictors with the same columns as `x_train`.
        y_train: Training target.
        y_test: Test target.
        direccion: "forward" or "backward", passed to SequentialFeatureSelector.
        sample_weight: Optional per-row weights for the final fit, aligned with
            `x_train`. None fits without weights.

    Returns:
        (modelo, col_stepwise): the fitted model and the selected column index.
    """
    modelo = LogisticRegression()
    stepwise = SequentialFeatureSelector(
        modelo,
        direction=direccion,
        scoring="accuracy",
        cv=5,
    )

    stepwise.fit(x_train, y_train)
    col_stepwise = x_train.columns[stepwise.get_support()]

    # Weights are an explicit argument instead of a notebook global, so the
    # function cannot silently pick up weights built for a different frame.
    modelo.fit(x_train[col_stepwise], y_train, sample_weight=sample_weight)
    y_pred = modelo.predict(x_test[col_stepwise])

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Valor de Accuracy: {accuracy}")
    # The selector is SequentialFeatureSelector, not RFE; label it accordingly.
    print(f"Variables seleccionadas con stepwise ({direccion}):")
    print(col_stepwise)

    coef = modelo.coef_[0]
    intercepto = modelo.intercept_[0]

    df_coef = pd.DataFrame({
        "Variable": col_stepwise,
        "Coeficiente": coef})

    print("Intercepto:", intercepto)
    print(df_coef)

    return modelo, col_stepwise


modelo, col_selecc = modelo_stepwise(
    x_train, x_test, y_train, y_test, "backward", sample_weight=peso_train
)


In [None]:
y_scores = modelo.predict_proba(x_test[col_selecc])[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC (AUC = {roc_auc:.3f})")
plt.xlabel("Tasa de Falsos Positivos (1 - Especificidad)")
plt.ylabel("Tasa de Verdaderos Positivos (Sensibilidad)")
plt.title("Curva ROC - Regresión Logística")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
#### Crear otro threshold

thresholds = np.arange(0, 1.01, 0.01)   # thresholds de 0.00 a 1.00
accuracies = []

for t in thresholds:
    y_pred_t = (y_scores >= t).astype(int)
    accuracies.append(accuracy_score(y_test, y_pred_t))

best_t = thresholds[np.argmax(accuracies)]
best_acc = max(accuracies)

print(f"Mejor threshold = {best_t:.2f}")
print(f"Accuracy máximo = {best_acc:.4f}")

### Forma balanceo recomendada Default

In [None]:
N = len(x_train)
N1 = y_train.sum()
N0 = N-N1

w0 = N/(2*N0)
w1 = N/(2*N1)

peso_train2 = np.where(y_train==1, w1,w0)

In [None]:
# Logistic regression on every predictor, with each row weighted by `peso_train2`.
RL = LogisticRegression()

RL.fit(x_train, y_train, sample_weight=peso_train2)
y_predic = RL.predict(x_test)

acc = accuracy_score(y_test, y_predic)
print(f"El Accuracy del modelo es {acc:.3f}")

# Fitted parameters, one coefficient per predictor in `variables`.
coef = RL.coef_[0]
intercepto = RL.intercept_[0]

df_coef = pd.DataFrame({
    "Variable": variables,
    "Coeficiente": coef
})

print("Intercepto:", intercepto)
print(df_coef)

In [None]:
def modelo_stepwise(x_train, x_test, y_train, y_test, direccion, sample_weight=None):
    """Select features sequentially, refit a weighted logistic regression and report it.

    The selector scores candidate subsets with 5-fold cross-validated accuracy
    on the training data. Only the final refit uses `sample_weight`; the
    selection step itself is unweighted.

    Args:
        x_train: Training predictors (DataFrame).
        x_test: Test predictors with the same columns as `x_train`.
        y_train: Training target.
        y_test: Test target.
        direccion: "forward" or "backward", passed to SequentialFeatureSelector.
        sample_weight: Optional per-row weights for the final fit, aligned with
            `x_train`. None fits without weights.

    Prints the test accuracy, the selected columns, the intercept and the
    coefficient table. Returns None.
    """
    modelo = LogisticRegression()
    stepwise = SequentialFeatureSelector(
        modelo,
        direction=direccion,
        scoring="accuracy",
        cv=5,
    )

    stepwise.fit(x_train, y_train)
    col_stepwise = x_train.columns[stepwise.get_support()]

    # Weights are an explicit argument instead of a notebook global, so the
    # function cannot silently pick up weights built for a different frame.
    modelo.fit(x_train[col_stepwise], y_train, sample_weight=sample_weight)
    y_pred = modelo.predict(x_test[col_stepwise])

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Valor de Accuracy: {accuracy}")
    # The selector is SequentialFeatureSelector, not RFE; label it accordingly.
    print(f"Variables seleccionadas con stepwise ({direccion}):")
    print(col_stepwise)

    coef = modelo.coef_[0]
    intercepto = modelo.intercept_[0]

    df_coef = pd.DataFrame({
        "Variable": col_stepwise,
        "Coeficiente": coef})

    print("Intercepto:", intercepto)
    print(df_coef)


modelo_stepwise(x_train, x_test, y_train, y_test, "backward", sample_weight=peso_train2)

In [None]:
x_train.info()

In [None]:
# --- 1. Columns to discretise and dummy columns carried along unchanged ---
cols_binning = [
    "Edad", "Años_Trabajando", "Ingresos",
    "Deuda_Comercial", "Deuda_Credito",
    "Otras_Deudas", "Ratio_Ingresos_Deudas"
]

cols_dummies = [
    "Nivel_Educacional_Med", "Nivel_Educacional_Posg",
    "Nivel_Educacional_SupCom", "Nivel_Educacional_SupInc"
]

# Number of quantile bins requested per variable
N_BINS = 5

# Bin edges learned on train, keyed by column name
bin_edges = {}

# --- 2. Learn the quantile edges on TRAIN only ---
for col in cols_binning:
    _, edges = pd.qcut(
        x_train[col],
        q=N_BINS,
        retbins=True,
        duplicates="drop"
    )
    # Open the outer edges so values outside the train range still fall in the
    # first or last bin instead of becoming NaN.
    edges = np.asarray(edges, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf
    bin_edges[col] = edges


# --- 3. Apply stored edges to any dataset ---
def aplicar_binning(df, cols_binning, bin_edges):
    """Add a "<col>_bin" interval column for each column, using stored edges.

    Args:
        df: DataFrame containing every column in `cols_binning`.
        cols_binning: Names of the columns to discretise.
        bin_edges: Mapping column name -> array of bin edges.

    Returns:
        A copy of `df` with one extra categorical "<col>_bin" column per input column.
    """
    df_binned = df.copy()
    for col in cols_binning:
        df_binned[col + "_bin"] = pd.cut(
            df[col],
            bins=bin_edges[col],
            include_lowest=True
        )
    return df_binned


# --- 4. Same function and edges for both splits, so interval categories
#        (and the dummy columns built from them later) match exactly ---
x_train_binned = aplicar_binning(x_train, cols_binning, bin_edges)
x_test_binned = aplicar_binning(x_test, cols_binning, bin_edges)

# --- 5. Keep only the columns used by the model ---
cols_final = [c + "_bin" for c in cols_binning] + cols_dummies

x_train_model = x_train_binned[cols_final]
x_test_model  = x_test_binned[cols_final]

print("Train binned:")
print(x_train_model.head())

print("\nTest binned:")
print(x_test_model.head())


In [None]:
x_test_model.columns

In [None]:
col_binn = x_test_model.columns[0:7]

In [None]:
x_test_model

In [None]:
x_train_model= pd.get_dummies(x_train_model[col_binn], drop_first= True)
x_test_model =pd.get_dummies(x_test_model[col_binn] , drop_first= True)

In [None]:
x_test_model

In [None]:
x_train_model

In [None]:
modelo_binned = LogisticRegression()

modelo_binned.fit(x_train_model, y_train)
y_pred_binned = modelo_binned.predict(x_test_model)
accuracy = accuracy_score(y_test, y_pred_binned)
print(accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score

# Base estimator handed to the sequential selector
logreg = LogisticRegression(max_iter=1000)

# Number of candidate columns
n_features = x_train_model.shape[1]

# Best test accuracy seen so far and the columns that produced it
mejor_acc = 0
mejores_vars = None

# Try every target subset size from 1 to n_features - 1.
# NOTE(review): the winning size is chosen by accuracy on y_test, so the
# reported best accuracy is an optimistic estimate.
for k in range(1, n_features):
    print(f"Probando con {k} variables...")

    # Backward elimination down to k columns, scored by 5-fold CV accuracy on train
    sbs = SequentialFeatureSelector(
        logreg,
        n_features_to_select=k,
        direction="backward",
        scoring="accuracy",  # "roc_auc" is an alternative
        cv=5,
        n_jobs=-1
    )

    sbs.fit(x_train_model, y_train)

    # Columns kept by the selector
    selected_mask = sbs.get_support()
    selected_features = x_train_model.columns[selected_mask]

    # Refit a fresh model on the kept columns only
    modelo_backward = LogisticRegression(max_iter=1000)
    modelo_backward.fit(x_train_model[selected_features], y_train)

    # Evaluate on the test split
    y_pred = modelo_backward.predict(x_test_model[selected_features])
    acc = accuracy_score(y_test, y_pred)

    print(f"Accuracy con {k} variables: {acc:.5f}")

    # Keep the best result (strict improvement, so ties keep the smaller k)
    if acc > mejor_acc:
        mejor_acc = acc
        mejores_vars = selected_features

print("\n====================================")
print("✅ Mejor resultado Backward Stepwise")
print("====================================")
print(f"Accuracy: {mejor_acc:.5f}")
print("Variables seleccionadas:")
print(list(mejores_vars))


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')


# Reload the raw sheet and discard the id, the pre-computed income/debt ratio
# (ratios are regenerated below) and the categorical education level.
df = pd.read_excel("Tabla Trabajo Grupal N°2.xlsx", sheet_name="Desarrollo").drop(
    columns=["Id_Cliente", "Ratio_Ingresos_Deudas", "Nivel_Educacional"]
)

# Aggregate debt across the three debt columns.
df["Total_Deudas"] = df["Deuda_Comercial"] + df["Deuda_Credito"] + df["Otras_Deudas"]
target = 'Default'

# Continuous predictors: every int64/float64 column except the target.
col_continuas = [
    col
    for col in df.select_dtypes(include=["int64", "float64"]).columns
    if col != target
]

def combinacion_de_variables(df, variables):
    """Return a copy of `df` extended with pairwise products and ratios.

    For every unordered pair (a, b) taken from `variables`, three columns are
    appended in this order: "Prod_a_b", "Ratio_a_b" and "Ratio_b_a". A small
    constant is added to each denominator to avoid division by zero.

    Args:
        df: Source DataFrame; it is not modified.
        variables: Column names to combine.

    Returns:
        A new DataFrame with the original columns followed by the derived ones.
    """
    eps = 1e-6
    derivadas = {}
    for a, b in combinations(variables, 2):
        derivadas[f"Prod_{a}_{b}"] = df[a] * df[b]
        derivadas[f"Ratio_{a}_{b}"] = df[a] / (df[b] + eps)
        derivadas[f"Ratio_{b}_{a}"] = df[b] / (df[a] + eps)
    # assign works on a copy and inserts the columns in dictionary order.
    return df.assign(**derivadas)

df = combinacion_de_variables(df, col_continuas)

# Aplicacion del split en crudo por ahroa, ya que posteriormente se pasaran a woe
x_en_bruto = df.drop(columns=[target])
y_en_bruto = df[target]
x_train_bruto, X_test_bruto, y_train, y_test = train_test_split(x_en_bruto, y_en_bruto, test_size=0.3, random_state=42, stratify=y_en_bruto)

# Aplicacion del woe e IV
def crear_woes(x_train, target):
    """Compute the weight of evidence (WOE) of one variable over decile bins.

    Bin edges come from the deciles of `x_train` (duplicate edges dropped) with
    the outer edges replaced by -inf / +inf. A bin with zero events or zero
    non-events counts 0.5 instead, so the logarithm stays finite.

    Args:
        x_train: Series with the variable values.
        target: Binary Series (1 = event) aligned with `x_train`.

    Returns:
        (woe_por_bin, iv_total, bins): dict mapping the bin label (string) to
        its WOE, the summed information value, and the array of bin edges.
    """
    bins = pd.qcut(x_train, q=10, duplicates='drop', retbins=True)[1]
    bins[0], bins[-1] = -np.inf, np.inf

    # Bin labels as strings so they can be used as dictionary keys.
    etiquetas = pd.cut(x_train, bins=bins).astype(str)

    # Per-bin row count and event count.
    resumen = (
        pd.DataFrame({'bin': etiquetas, 'target': target})
        .groupby('bin')['target']
        .agg(['count', 'sum'])
    )
    resumen['malos_pagadores'] = resumen['sum'].replace(0, 0.5)
    resumen['buenos_pagadores'] = (resumen['count'] - resumen['sum']).replace(0, 0.5)

    total_malos = target.sum()
    total_buenos = target.count() - total_malos

    pct_buenos = resumen['buenos_pagadores'] / total_buenos
    pct_malos = resumen['malos_pagadores'] / total_malos

    resumen['WOE'] = np.log(pct_buenos / pct_malos)
    resumen['IV'] = (pct_buenos - pct_malos) * resumen['WOE']

    return resumen['WOE'].to_dict(), resumen['IV'].sum(), bins

def transformar_a_woe(x_train, tabla_woes, bins):
    """Replace each value by the WOE of the bin it falls into.

    Args:
        x_train: Series of raw values (any split, despite the parameter name).
        tabla_woes: Dict mapping bin label (string) to WOE.
        bins: Bin edges used to build the labels.

    Returns:
        Series of WOE values; labels missing from `tabla_woes` map to 0.
    """
    etiquetas = pd.cut(x_train, bins=bins).astype(str)
    woe = etiquetas.map(tabla_woes)
    # Labels without an entry in the table (e.g. missing values) get a neutral 0.
    return woe.fillna(0)

# --- Apply the WOE transformation
print("Calculando WOE")

iv_resumen = {}
woe_train_cols = {}
woe_test_cols = {}

for col in x_train_bruto.columns:
    # The WOE table is fitted on train only, to avoid information leakage
    tabla_woes, iv, bins = crear_woes(x_train_bruto[col], y_train)
    iv_resumen[col] = iv

    # The same table and edges are then applied to both splits
    woe_train_cols[col] = transformar_a_woe(x_train_bruto[col], tabla_woes, bins)
    woe_test_cols[col] = transformar_a_woe(X_test_bruto[col], tabla_woes, bins)

# Build each frame in one shot instead of inserting column by column
X_train_woe = pd.DataFrame(woe_train_cols, index=x_train_bruto.index)
X_test_woe = pd.DataFrame(woe_test_cols, index=X_test_bruto.index)

print(f"Variables transformadas: {X_train_woe.shape[1]}")

# Discard variables whose information value is below the threshold
iv_umbral = 0.01
variables_fuertes_iv = [k for k, v in iv_resumen.items() if v >= iv_umbral]
# The filter is inclusive (>=), so the label says so too
print(f"Variables con IV >= {iv_umbral}: {len(variables_fuertes_iv)}")

X_train_woe = X_train_woe[variables_fuertes_iv]
X_test_woe = X_test_woe[variables_fuertes_iv]


#APLICACION DE ESTRATEGIAS

# --- A) FORWARD
def aplicar_forward(X_tr, y_tr, X_te, y_te):
    """Greedy forward selection driven by accuracy on (X_te, y_te).

    Starting from no variables, each round fits one model per remaining
    candidate and adds the candidate with the highest accuracy, as long as it
    strictly improves the best accuracy so far.

    NOTE(review): the evaluation set steers the selection, so the returned
    accuracy is optimistic if that set is also the final test set.

    Returns:
        (selected columns in the order added, best accuracy reached).
    """
    inicial = []
    restante = list(X_tr.columns)
    mejor_accuracy = 0
    print("\n----- Forward -----")
    while restante:
        candidatos = []
        for col in restante:
            columnas = inicial + [col]
            m = LogisticRegression(solver='liblinear', random_state=42)
            m.fit(X_tr[columnas], y_tr)
            candidatos.append((accuracy_score(y_te, m.predict(X_te[columnas])), col))

        # Highest accuracy wins; ties go to the later column name.
        acc_top, col_top = max(candidatos)
        if acc_top <= mejor_accuracy:
            break

        mejor_accuracy = acc_top
        inicial.append(col_top)
        restante.remove(col_top)
        print(f"Agregada: {col_top} (Acc: {mejor_accuracy:.4f})")
    return inicial, mejor_accuracy

# --- B) BACKWARD CON TOLERANCIA
def aplicar_backward(X_tr, y_tr, X_te, y_te, tolerancia=0.001):
    """Greedy backward elimination with a tolerance, driven by accuracy on (X_te, y_te).

    Starting from all variables, each round removes the variable whose removal
    leaves the highest accuracy, as long as that accuracy is not lower than the
    best accuracy seen so far minus `tolerancia`. Accuracy is a fraction, so
    tolerancia=0.001 accepts a loss of up to 0.1 percentage points.

    NOTE(review): the evaluation set steers the selection, so the returned
    accuracy is optimistic if that set is also the final test set.

    Args:
        X_tr, y_tr: Training predictors and target.
        X_te, y_te: Evaluation predictors and target.
        tolerancia: Maximum accepted accuracy drop relative to the best value seen.

    Returns:
        (remaining columns, accuracy of the model fitted on exactly those columns).
    """
    cols = list(X_tr.columns)
    m = LogisticRegression(solver='liblinear', random_state=42)
    m.fit(X_tr[cols], y_tr)
    # acc_actual follows the current variable set; mejor_accuracy is the
    # reference used by the tolerance test.
    acc_actual = accuracy_score(y_te, m.predict(X_te[cols]))
    mejor_accuracy = acc_actual

    print(f"\n--- Backward (Tolerancia: {tolerancia}) ---")
    print(f"Inicio: {len(cols)} vars | Acc: {mejor_accuracy:.4f}")

    while len(cols) > 1:
        cv_prob = []
        # Accuracy obtained when each variable is left out in turn
        for col in cols:
            temporal_col = [f for f in cols if f != col]
            m.fit(X_tr[temporal_col], y_tr)
            acc = accuracy_score(y_te, m.predict(X_te[temporal_col]))
            cv_prob.append((acc, col))

        accuracy_sin_var, variable_quitada = max(cv_prob)

        # Stop when even the cheapest removal costs more than the tolerance
        if accuracy_sin_var < (mejor_accuracy - tolerancia):
            print("No se puede eliminar más sin sacrificar accuracy")
            break

        cols.remove(variable_quitada)
        acc_actual = accuracy_sin_var
        # The reference only moves up, so small losses cannot accumulate
        if accuracy_sin_var > mejor_accuracy:
            mejor_accuracy = accuracy_sin_var
        print(f"Eliminada: {variable_quitada} (Acc: {accuracy_sin_var:.4f})")

    # Return the accuracy of the set actually returned. Previously the best
    # value ever seen was returned, which could belong to a larger set.
    return cols, acc_actual

# --- C) STEPWISE BIDIRECCIONAL---
def stepwise_bidireccional(X_tr, y_tr, X_te, y_te):
    """Bidirectional stepwise selection driven by accuracy on (X_te, y_te).

    Each round adds the candidate that yields the highest accuracy, provided it
    strictly improves the best accuracy so far. Once more than two variables
    are selected, it then drops the variable whose removal yields the highest
    accuracy, but only if that strictly improves the best accuracy too.

    NOTE(review): the evaluation set steers the selection, so the returned
    accuracy is optimistic if that set is also the final test set.

    Returns:
        (selected columns, best accuracy reached).
    """
    def _accuracy(columnas):
        # Fit a fresh model on `columnas` and score it on the evaluation set.
        clf = LogisticRegression(solver='liblinear', random_state=42)
        clf.fit(X_tr[columnas], y_tr)
        return accuracy_score(y_te, clf.predict(X_te[columnas]))

    col_actual = []
    restante = list(X_tr.columns)
    mejor_accuracy = 0

    print("\n----- Stepwise bidireccional-----")
    while restante:
        # 1. Forward step: best single addition
        acc_top, agregado = max(
            (_accuracy(col_actual + [col]), col) for col in restante
        )
        if acc_top <= mejor_accuracy:
            break

        mejor_accuracy = acc_top
        col_actual.append(agregado)
        restante.remove(agregado)
        print(f"Agregada: {agregado} (Acc: {mejor_accuracy:.4f})")

        # 2. Backward step: only attempted with more than two variables
        if len(col_actual) <= 2:
            continue

        acc_sin, quitados = max(
            (_accuracy([f for f in col_actual if f != col]), col)
            for col in col_actual
        )
        # Drop a variable only when doing so beats the current best accuracy
        if acc_sin > mejor_accuracy:
            mejor_accuracy = acc_sin
            col_actual.remove(quitados)
            restante.append(quitados)
            print(f"Eliminada por redundancia: {quitados} (Acc subió a: {mejor_accuracy:.4f})")

    return col_actual, mejor_accuracy


# Run the three selection strategies on the WOE-encoded data
mejor_fwd, acc_fwd = aplicar_forward(X_train_woe, y_train, X_test_woe, y_test)

mejor_bwd, acc_bwd = aplicar_backward(X_train_woe, y_train, X_test_woe, y_test, tolerancia=0.0005)

mejor_bidir, acc_bidir = stepwise_bidireccional(X_train_woe, y_train, X_test_woe, y_test)

# --- Results
print("\n" + "="*30)
print(" COMPARATIVA FINAL")
print("="*30)
print(f"1. Forward:        {acc_fwd:.4f} ({len(mejor_fwd)} variables)")
print(f"2. Backward:       {acc_bwd:.4f} ({len(mejor_bwd)} variables)")
print(f"3. Bidireccional:  {acc_bidir:.4f} ({len(mejor_bidir)} variables)")

# One record per strategy: (name, accuracy, number of variables, variables)
resultados = [
    ('Forward', acc_fwd, len(mejor_fwd), mejor_fwd),
    ('Backward Smart', acc_bwd, len(mejor_bwd), mejor_bwd),
    ('Bidireccional', acc_bidir, len(mejor_bidir), mejor_bidir)
]

# Sort by accuracy, highest first; the first record is the winner
resultados.sort(key=lambda x: x[1], reverse=True)

nombre_ganador, acc_ganador, ganadores_var_len, variables_ganadores = resultados[0]
print(f"\n>>> Estrategia ganadora: {nombre_ganador}")

# Final training on the winning variable set
modelo_final = LogisticRegression(solver='liblinear', random_state=42)
modelo_final.fit(X_train_woe[variables_ganadores], y_train)

# Cut-off search between 0.10 and 0.89.
# NOTE(review): the cut-off is tuned on y_test, so the "optimised" accuracy is
# an optimistic estimate.
probs = modelo_final.predict_proba(X_test_woe[variables_ganadores])[:, 1]
mejor_t = 0.5
max_acc = 0
for t in np.arange(0.1, 0.9, 0.01):
    acc = accuracy_score(y_test, (probs >= t).astype(int))
    if acc > max_acc:
        max_acc = acc
        mejor_t = t

print(f"Accuracy Optimizado: {max_acc:.4f} (Threshold: {mejor_t:.2f})")

In [None]:
# Continuous predictors of the training frame. The target is excluded by name:
# slicing off the last column only works while "Default" is last, which stops
# being true once dummy columns are appended, and would then feed the target
# into the derived features.
col_continuas = [
    col for col in df_train.columns
    if col != "Default" and df_train[col].dtype in ["int64", "float64"]
]

def combinacion_de_variables(df, variables):
    """Return a copy of `df` extended with pairwise products and one ratio per pair.

    For every unordered pair (x, y) taken from `variables`, the columns
    "Prod_x_y" and "Ratio_x_y" are appended. A tiny constant is added to the
    denominator to avoid division by zero.

    NOTE(review): this redefinition shadows an earlier function of the same
    name that also adds the inverse ratio and uses a different constant.

    Args:
        df: Source DataFrame; it is not modified.
        variables: Column names to combine.

    Returns:
        A new DataFrame with the original columns followed by the derived ones.
    """
    df_2 = df.copy()
    for x, y in combinations(variables, 2):
        df_2[f"Prod_{x}_{y}"] = df[x] * df[y]
        df_2[f"Ratio_{x}_{y}"] = df[x] / (df[y] + 0.00000000001)

    return df_2

df_train_total = combinacion_de_variables(df_train, col_continuas)
df_test_total = combinacion_de_variables(df_test, col_continuas)
        

In [None]:
variables_numericas_total = [col for col in df_train_total.columns if col not in ["Default", "Nivel_Educacional"]]
variables_numericas_total

In [None]:
# Variables kept by a backward selection run over the full set of derived
# variables. That run took very long (the original note mentions more than
# 2 hours), so its result is hard-coded here instead of repeating the
# iteration on every execution.
variables_backward = ['Ingresos', 'Deuda_Credito', 'Prod_Edad_Deuda_Comercial',
       'Prod_Edad_Otras_Deudas', 'Ratio_Edad_Otras_Deudas',
       'Prod_Edad_Ratio_Ingresos_Deudas', 'Ratio_Edad_Ratio_Ingresos_Deudas',
       'Prod_Años_Trabajando_Ingresos', 'Prod_Años_Trabajando_Deuda_Comercial',
       'Prod_Años_Trabajando_Deuda_Credito',
       'Ratio_Años_Trabajando_Deuda_Credito',
       'Prod_Años_Trabajando_Otras_Deudas',
       'Ratio_Años_Trabajando_Otras_Deudas',
       'Prod_Años_Trabajando_Ratio_Ingresos_Deudas',
       'Prod_Ingresos_Deuda_Credito', 'Prod_Ingresos_Otras_Deudas',
       'Ratio_Ingresos_Ratio_Ingresos_Deudas',
       'Ratio_Deuda_Comercial_Deuda_Credito',
       'Ratio_Deuda_Comercial_Otras_Deudas',
       'Prod_Deuda_Comercial_Ratio_Ingresos_Deudas',
       'Prod_Deuda_Credito_Ratio_Ingresos_Deudas',
       'Ratio_Deuda_Credito_Ratio_Ingresos_Deudas',
       'Prod_Otras_Deudas_Ratio_Ingresos_Deudas',
       'Ratio_Otras_Deudas_Ratio_Ingresos_Deudas', 'Años_Trabajando', 'Deuda_Comercial',
       'Otras_Deudas']

# Rebuild the design matrices from the frames that contain the derived columns.
# NOTE(review): this overwrites the notebook-level x_train / y_train / x_test / y_test.
x_train = df_train_total[variables_backward]
y_train = df_train_total["Default"]

x_test = df_test_total[variables_backward]
y_test = df_test_total["Default"]


def modelo_stepwise(x_train, x_test, y_train, y_test, direccion):
    """Select features sequentially, refit a logistic regression and report it.

    The selector scores candidate subsets with 5-fold cross-validated accuracy
    on the training data. The number of features kept is left at the
    scikit-learn default (presumably half of the columns; confirm for the
    installed version).

    Args:
        x_train: Training predictors (DataFrame).
        x_test: Test predictors with the same columns as `x_train`.
        y_train: Training target.
        y_test: Test target.
        direccion: "forward" or "backward", passed to SequentialFeatureSelector.

    Prints the test accuracy, the selected columns, the intercept and the
    coefficient table. Returns None.
    """
    modelo = LogisticRegression()
    stepwise = SequentialFeatureSelector(
        modelo,
        direction=direccion,
        scoring="accuracy",
        cv=5,
    )

    stepwise.fit(x_train, y_train)
    col_stepwise = x_train.columns[stepwise.get_support()]

    # Refit on the selected columns only and evaluate on the hold-out set.
    modelo.fit(x_train[col_stepwise], y_train)
    y_pred = modelo.predict(x_test[col_stepwise])

    accuracy = accuracy_score(y_test, y_pred)

    print(f"Valor de Accuracy: {accuracy}")
    # The selector is SequentialFeatureSelector, not RFE; label it accordingly.
    print(f"Variables seleccionadas con stepwise ({direccion}):")
    print(col_stepwise)

    coef = modelo.coef_[0]
    intercepto = modelo.intercept_[0]

    df_coef = pd.DataFrame({
        "Variable": col_stepwise,
        "Coeficiente": coef})

    print("Intercepto:", intercepto)
    print(df_coef)


modelo_stepwise(x_train, x_test, y_train, y_test, "backward")


In [None]:
optb = OptimalBinning(name="Edad", dtype="numerical", max_n_bins=10) 
optb.fit(df_train_total["Edad"], df_train_total["Default"])
tabla = optb.binning_table.build()
tabla

In [None]:
# Information value (IV) of every numeric variable, using optimal binning
# fitted on the training frame.
ivs = []

for col in variables_numericas_total:
    optb = OptimalBinning(name=col, dtype="numerical", max_n_bins=10)
    optb.fit(df_train_total[col], df_train_total["Default"])

    tabla = optb.binning_table.build()
    # The built table ends with a "Totals" row that already holds the sum of
    # the "IV" column. Summing the whole column counted that row too and
    # doubled every IV, so read the total directly.
    iv = tabla.loc["Totals", "IV"]

    ivs.append((col, iv))

# Highest IV first
ranking_iv = sorted(ivs, key=lambda x: x[1], reverse=True)


for col, iv in ranking_iv:
    print(f"{col} -> IV={iv:.4f}")



### Modelo final 2

In [None]:
def crear_df_woe(lista_variables):
    """WOE-encode the requested columns of `df_train_total` with optimal binning.

    One OptimalBinning is fitted per column against "Default", weighted by the
    notebook-level `peso_train2`, and used to transform that same column.
    "Nivel_Educacional" is treated as categorical (cast to str); every other
    column is numerical with at most 20 bins.

    The result is built in a fresh DataFrame on every call. Previously a
    module-level `df_woe` was mutated, so columns from earlier calls leaked
    into later results.

    Args:
        lista_variables: Iterable of column names present in `df_train_total`.

    Returns:
        DataFrame with one WOE column per variable plus the "Default" target,
        on a fresh positional index.
    """
    objetivo = df_train_total["Default"].values
    columnas_woe = {}

    for col in lista_variables:
        if col != "Nivel_Educacional":
            optb = OptimalBinning(
                name=col,
                dtype="numerical",
                max_n_bins=20
            )
            valores = df_train_total[col].values
        else:
            optb = OptimalBinning(
                name=col,
                dtype="categorical"
            )
            # Cast to str so every category is handled as text
            valores = df_train_total[col].astype(str).values

        optb.fit(valores, objetivo, sample_weight=peso_train2)
        columnas_woe[col] = optb.transform(valores, metric="woe")

    resultado = pd.DataFrame(columnas_woe)
    resultado["Default"] = objetivo

    return resultado


In [None]:
# Every column of the training frame except the target. The target is excluded
# by name because "Default" is not the last column once dummies are appended.
variables_pivote = [col for col in df_train.columns if col != "Default"]
df_woe = crear_df_woe(variables_pivote)

df_woe.head()

In [None]:
df_train_total["Ratio_Años_Trabajando_Deuda_Total"] = df_train_total["Ratio_Años_Trabajando_Deuda_Comercial"] + df_train_total["Ratio_Años_Trabajando_Deuda_Credito"] + df_train_total["Ratio_Años_Trabajando_Otras_Deudas"]

In [None]:
#Variables Jonathan

variables_pivote = ["Edad", "Deuda_Comercial", "Deuda_Credito", "Años_Trabajando", "Ratio_Años_Trabajando_Deuda_Total"]
df_woe = crear_df_woe(variables_pivote)

x_train = df_woe[variables_pivote]
y_train = df_woe["Default"]

x_test = df_woe[variables_pivote]
y_test = df_woe["Default"]


def modelo_stepwise(x_train, x_test, y_train, y_test, direccion):
    """Select features sequentially, fit a logistic regression and report it.

    A `SequentialFeatureSelector` (5-fold CV, accuracy scoring) picks the
    columns; the model is then refitted on those columns using the
    notebook-level `peso_train2` as sample weights. Accuracy, coefficients
    and the accuracy-maximising probability cut-off on `x_test` are printed.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature matrices; the selected columns are addressed by name.
    y_train, y_test : array-like
        Binary targets matching the rows of `x_train` / `x_test`.
    direccion : str
        Forwarded to `SequentialFeatureSelector(direction=...)`.

    Returns
    -------
    tuple
        `(y_scores, y_pred)`: positive-class probabilities and the class
        predictions of `modelo.predict` for `x_test`.

    Notes
    -----
    The selector itself is fitted without sample weights; only the final
    refit uses `peso_train2`.
    """
    modelo = LogisticRegression()
    stepwise = SequentialFeatureSelector(
        modelo,
        direction=direccion,
        scoring="accuracy",
        cv=5,
    )

    stepwise.fit(x_train, y_train)
    col_stepwise = x_train.columns[stepwise.get_support()]

    modelo.fit(x_train[col_stepwise], y_train, sample_weight=peso_train2)
    y_pred = modelo.predict(x_test[col_stepwise])
    y_scores = modelo.predict_proba(x_test[col_stepwise])[:, 1]

    # scikit-learn convention: (y_true, y_pred).
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Valor de Accuracy: {accuracy}")
    # The selection is done by SequentialFeatureSelector, not RFE.
    print("Variables seleccionadas con SequentialFeatureSelector:")
    print(col_stepwise)

    df_coef = pd.DataFrame({
        "Variable": col_stepwise,
        "Coeficiente": modelo.coef_[0]})

    print("Intercepto:", modelo.intercept_[0])
    print(df_coef)

    # Scan cut-offs from 0.00 to 1.00 and keep the first one with the highest
    # accuracy; the best value is computed once, after the scan.
    thresholds = np.arange(0, 1.01, 0.01)
    accuracies = [
        accuracy_score(y_test, (y_scores >= t).astype(int)) for t in thresholds
    ]
    idx_mejor = int(np.argmax(accuracies))
    best_t = thresholds[idx_mejor]
    best_acc = accuracies[idx_mejor]

    print(f"Con el corte optimo {best_t} el Accuracy es: {best_acc}")

    return y_scores, y_pred

# Run the selection in "backward" direction and store the returned scores and predictions.
y_scores, y_pred = modelo_stepwise(x_train, x_test, y_train, y_test, "backward")

In [None]:
def crear_df_woe(lista_variables, max_n_bins=10):
    """Build a WOE-encoded frame for the requested `df_train_total` columns.

    Redefines the `crear_df_woe` of the earlier cell with a default of 10
    bins (instead of 20) for numerical columns.

    Every column is binned with `OptimalBinning`, fitted against
    `df_train_total["Default"]` with `peso_train2` as sample weights, and then
    replaced by the WOE value returned by `transform(..., metric="woe")`.
    "Nivel_Educacional" is fitted as a categorical variable (values cast to
    str); every other column is fitted as numerical.

    A new DataFrame is created on each call. The previous version wrote into
    a module-level `df_woe`, so columns produced by earlier calls silently
    stayed in later results.

    Parameters
    ----------
    lista_variables : iterable of str
        Column names of the notebook-level `df_train_total` to transform.
    max_n_bins : int, default 10
        Forwarded to `OptimalBinning(max_n_bins=...)` for numerical columns.

    Returns
    -------
    pandas.DataFrame
        One WOE column per requested variable (in the given order) followed
        by the "Default" target column.
    """
    objetivo = df_train_total["Default"].values
    columnas_woe = {}

    for col in lista_variables:
        if col == "Nivel_Educacional":
            # Cast to str so the categorical binning sees uniform labels.
            valores = df_train_total[col].astype(str).values
            optb = OptimalBinning(name=col, dtype="categorical")
        else:
            valores = df_train_total[col].values
            optb = OptimalBinning(name=col, dtype="numerical", max_n_bins=max_n_bins)

        optb.fit(valores, objetivo, sample_weight=peso_train2)
        columnas_woe[col] = optb.transform(valores, metric="woe")

    df_woe = pd.DataFrame(columnas_woe)
    df_woe["Default"] = objetivo

    return df_woe

In [None]:
# Display the aggregated ratio column as the cell output.
df_train_total["Ratio_Años_Trabajando_Deuda_Total"]

In [None]:
# Candidate variables (raw and engineered) that are WOE-encoded for this model.
variables_pivote = [
    "Ratio_Años_Trabajando_Deuda_Total",
    "Ratio_Edad_Deuda_Comercial",
    "Ratio_Edad_Ratio_Ingresos_Deudas",
    "Ratio_Años_Trabajando_Ratio_Ingresos_Deudas",
    "Prod_Deuda_Comercial_Ratio_Ingresos_Deudas",
    "Ratio_Ingresos_Deudas",
    "Ratio_Ingresos_Deudas_cuadrado",
    "Prod_Deuda_Credito_Ratio_Ingresos_Deudas",
    "Ratio_Ingresos_Deuda_Credito",
    "Ratio_Ingresos_Deuda_Comercial",
    "Deuda_Comercial",
    "Deuda_Comercial_cuadrado",
    "Prod_Edad_Años_Trabajando",
    "Ratio_Años_Trabajando_Ingresos",
    "Ratio_Ingresos_Ratio_Ingresos_Deudas",
    "Años_Trabajando",
    "Años_Trabajando_cuadrado",
    "Ratio_Edad_Años_Trabajando",
    "Ratio_Edad_Deuda_Credito",
    "Edad",
    "Edad_cuadrado",
    "Prod_Años_Trabajando_Ingresos",
    "Prod_Deuda_Comercial_Deuda_Credito",
    "Prod_Ingresos_Ratio_Ingresos_Deudas",
    "Prod_Otras_Deudas_Ratio_Ingresos_Deudas",
    "Ratio_Ingresos_Otras_Deudas",
    "Prod_Edad_Ratio_Ingresos_Deudas",
    "Ratio_Edad_Otras_Deudas"
]

#variables_pivote = ["Ratio_Años_Trabajando_Deuda_Total", "Ingresos", "Nivel_Educacional", "Deuda_Comercial" ]#, "Prod_Deuda"]# "Edad", "Deuda_Credito", "Años_Trabajando", "Nivel_Educacional"]
df_woe = crear_df_woe(variables_pivote)
x_train = df_woe[variables_pivote]
y_train = df_woe["Default"]

# NOTE(review): the "test" objects come from the same df_woe as the training
# objects, so the accuracies printed below are in-sample.
x_test = df_woe[variables_pivote]
y_test = df_woe["Default"]

RL = LogisticRegression(penalty="l2")

RL.fit(x_train, y_train, sample_weight=peso_train2)
y_predic = RL.predict(x_test)
y_scores = RL.predict_proba(x_test)[:, 1]

acc = accuracy_score(y_test, y_predic)
# Single report line (the former extra label print duplicated this message).
print(f"El Accuracy del modelo es {acc:.3f}")


coef = RL.coef_[0]
intercepto = RL.intercept_[0]

df_coef = pd.DataFrame({
    "Variable": variables_pivote,
    "Coeficiente": coef
})

print("Intercepto:", intercepto)
print(df_coef)

# Scan probability cut-offs from 0.00 to 1.00.
thresholds = np.arange(0, 1.01, 0.01)
accuracies = []

for t in thresholds:
    y_pred_t = (y_scores >= t).astype(int)
    accuracies.append(accuracy_score(y_test, y_pred_t))

# Pick the best cut-off once, after the scan (it used to be recomputed on
# every iteration of the loop).
best_t = thresholds[np.argmax(accuracies)]
best_acc = max(accuracies)


print(f"Con el corte optimo {best_t} el Accuracy es: {best_acc}")


In [None]:
# Reduced variable set; the bare last expression displays the WOE-encoded frame.
variables_pivote = ["Edad", "Deuda_Credito", "Años_Trabajando", "Nivel_Educacional", "Ratio_Años_Trabajando_Deuda_Total"]
df_woe = crear_df_woe(variables_pivote)
df_woe

In [None]:
# Reduced variable set used for this logistic regression.
variables_pivote = ["Edad", "Deuda_Credito", "Años_Trabajando", "Nivel_Educacional", "Ratio_Años_Trabajando_Deuda_Total"]
df_woe = crear_df_woe(variables_pivote)

x_train = df_woe[variables_pivote]
y_train = df_woe["Default"]

# NOTE(review): the "test" objects come from the same df_woe as the training
# objects, so the accuracies printed below are in-sample.
x_test = df_woe[variables_pivote]
y_test = df_woe["Default"]

RL = LogisticRegression()

RL.fit(x_train, y_train, sample_weight=peso_train2)
y_predic = RL.predict(x_test)
y_scores = RL.predict_proba(x_test)[:, 1]

acc = accuracy_score(y_test, y_predic)
# Single report line (the former extra label print duplicated this message).
print(f"El Accuracy del modelo es {acc:.3f}")


coef = RL.coef_[0]
intercepto = RL.intercept_[0]

df_coef = pd.DataFrame({
    "Variable": variables_pivote,
    "Coeficiente": coef
})

print("Intercepto:", intercepto)
print(df_coef)

# Scan probability cut-offs from 0.00 to 1.00.
thresholds = np.arange(0, 1.01, 0.01)
accuracies = []

for t in thresholds:
    y_pred_t = (y_scores >= t).astype(int)
    accuracies.append(accuracy_score(y_test, y_pred_t))

# Pick the best cut-off once, after the scan (it used to be recomputed on
# every iteration of the loop).
best_t = thresholds[np.argmax(accuracies)]
best_acc = max(accuracies)


print(f"Con el corte optimo {best_t} el Accuracy es: {best_acc}")


# Modelo Final Grupo (INFORME)

En el siguiente código se utilizaron todas las herramientas que como grupo pudimos interiorizar anteriormente, las cuales consideran: creación de variables, transformación a WOE, selección de variables y optimización del threshold, todo con el fin de entregar el mejor modelo y el mejor accuracy que como grupo logramos desarrollar.  

Por lo anterior, se vuelve a construir el modelo y se vuelve a cargar la data desde cero, con el fin de generar un código final más limpio y sencillo de leer para su evaluación.

In [None]:

def crear_woes(x_train, target):
    """Learn decile-based WOE values for one numeric series.

    Returns a tuple `(woe_by_bin, iv, bins)`: a dict mapping each bin label
    (as a string) to its WOE, the summed information value, and the bin
    edges with the outer edges opened to -inf / +inf.
    """
    # Decile edges; repeated edges are dropped, outer edges are made open-ended.
    cortes = pd.qcut(x_train, q=10, duplicates='drop', retbins=True)[1]
    cortes[0], cortes[-1] = -np.inf, np.inf

    # Bin labels as strings so they can be used as dictionary keys.
    etiquetas = pd.cut(x_train, bins=cortes).astype(str)

    # Per-bin row count and number of positives (target == 1).
    tabla = (
        pd.DataFrame({'bin': etiquetas, 'target': target})
        .groupby('bin')['target']
        .agg(['count', 'sum'])
    )
    # Empty classes are replaced by 0.5 so the logarithm stays finite.
    tabla['malos_pagadores'] = tabla['sum'].replace(0, 0.5)
    tabla['buenos_pagadores'] = (tabla['count'] - tabla['sum']).replace(0, 0.5)

    n_malos = target.sum()
    n_buenos = target.count() - n_malos

    prop_buenos = tabla['buenos_pagadores'] / n_buenos
    prop_malos = tabla['malos_pagadores'] / n_malos

    tabla['WOE'] = np.log(prop_buenos / prop_malos)
    tabla['IV'] = (prop_buenos - prop_malos) * tabla['WOE']

    return tabla['WOE'].to_dict(), tabla['IV'].sum(), cortes

def transformar_a_woe(x_train, tabla_woes, bins):
    """Encode a series with previously learned WOE values.

    The series is cut with `bins`, each bin label (as a string) is looked up
    in `tabla_woes`, and labels missing from the table get a WOE of 0.
    """
    etiquetas = pd.cut(x_train, bins=bins).astype(str)
    valores_woe = etiquetas.map(tabla_woes)
    # Bins that are not present in the table are encoded as a neutral 0.
    return valores_woe.fillna(0)


def regresion_y_metricas(df, nombre_de_la_prueba):
    """Fit and report a WOE-based logistic regression for one feature set.

    Steps: stratified 70/30 split, per-column WOE encoding learned on the
    training part only, IV filter (IV >= 0.01), liblinear logistic
    regression, accuracy with `predict`, accuracy-maximising probability
    cut-off on the test part and 10-fold stratified cross-validation on the
    training part. Everything is printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column "Default". The columns "Id_Cliente"
        and "Ratio_Ingresos_Deudas" are excluded when present. The caller's
        frame is not modified.
    nombre_de_la_prueba : str
        Label used in the printed report.

    Raises
    ------
    ValueError
        If no variable reaches the IV threshold.
    """
    print(f' ============ Estrategia {nombre_de_la_prueba} ============')

    # Drop on a new frame instead of in place, so the DataFrame owned by the
    # caller (possibly a slice of another frame) is never mutated.
    df = df.drop(columns=["Id_Cliente", "Ratio_Ingresos_Deudas"], errors="ignore")

    target = 'Default'

    # Split the raw data first; WOE is learned afterwards on the training part.
    x_en_bruto = df.drop(columns=[target])
    y_en_bruto = df[target]
    x_train_bruto, X_test_bruto, y_train, y_test = train_test_split(
        x_en_bruto, y_en_bruto, test_size=0.3, random_state=42, stratify=y_en_bruto
    )

    # WOE encoding and information value per column.
    print("Calculando WOE")

    X_train_woe = pd.DataFrame(index=x_train_bruto.index)
    X_test_woe = pd.DataFrame(index=X_test_bruto.index)
    iv_resumen = {}

    for col in x_train_bruto.columns:
        # Learn the WOE table on the training part only to avoid leakage.
        tabla_woes, iv, bins = crear_woes(x_train_bruto[col], y_train)
        iv_resumen[col] = iv

        # Apply the learned table to both parts.
        X_train_woe[col] = transformar_a_woe(x_train_bruto[col], tabla_woes, bins)
        X_test_woe[col] = transformar_a_woe(X_test_bruto[col], tabla_woes, bins)

    print(f"Variables transformadas: {X_train_woe.shape[1]}")

    # Keep only the variables whose IV reaches the threshold.
    iv_umbral = 0.01
    variables_fuertes_iv = [k for k, v in iv_resumen.items() if v >= iv_umbral]
    # The label now matches the inclusive filter above.
    print(f"Variables con IV >= {iv_umbral}: {len(variables_fuertes_iv)}")

    X_train_woe = X_train_woe[variables_fuertes_iv]
    X_test_woe = X_test_woe[variables_fuertes_iv]

    print("\nResumen del IV:")
    iv_df = pd.DataFrame(list(iv_resumen.items()), columns=['Variable', 'IV']).sort_values('IV', ascending=False)
    print(iv_df)

    # Fail with a clear message instead of an opaque estimator error.
    if not variables_fuertes_iv:
        raise ValueError(
            f"Ninguna variable alcanza el IV minimo de {iv_umbral}; no se puede ajustar el modelo"
        )

    RL = LogisticRegression(solver='liblinear', random_state=42)
    RL.fit(X_train_woe, y_train)
    y_predic = RL.predict(X_test_woe)

    acc = accuracy_score(y_test, y_predic)
    print(f"\n{nombre_de_la_prueba} El Accuracy del modelo es {acc:.3f}")

    df_coef = pd.DataFrame({
        "Variable": list(X_train_woe.columns),
        "Coeficiente": RL.coef_[0]
    })

    print("Intercepto:", RL.intercept_[0])
    print(df_coef)

    # Scan probability cut-offs from 0.00 to 1.00.
    # NOTE(review): the cut-off is chosen on the same test part that is used
    # to report it, so the maximum below is an optimistic estimate.
    y_prob = RL.predict_proba(X_test_woe)[:, 1]
    thresholds = np.arange(0, 1.01, 0.01)
    accuracies = [
        accuracy_score(y_test, (y_prob >= t).astype(int)) for t in thresholds
    ]

    mejor_t = thresholds[np.argmax(accuracies)]
    mejor_accuracy = max(accuracies)

    print(f"\nMejor threshold = {mejor_t:.2f}")
    print(f"Accuracy máximo = {mejor_accuracy:.4f}")

    # Cross-validate on the training part to check the accuracy.
    # NOTE(review): the WOE columns were learned from the whole training
    # target before the folds are made, so fold scores are not fully
    # independent of their validation rows.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv_prob = cross_val_score(RL, X_train_woe, y_train, cv=cv, scoring='accuracy')

    print("\n----- Resultados de Validación Cruzada (10 Folds) ----")
    print(f"Accuracy de cada intento: {cv_prob}")
    print(f"\nAccuracy Promedio Real: {cv_prob.mean():.4f}")
    print(f"Desviación Estándar: {cv_prob.std():.4f}")

    print("\ninterpretación:")
    print(f"modelo oscila entre {(cv_prob.mean() - cv_prob.std()):.4f} y {(cv_prob.mean() + cv_prob.std()):.4f}")


## Creación de variables

In [None]:
# Small constant added to every denominator below so a zero value does not
# produce a division by zero.
eps = 1e-10

# Total debt: sum of the three debt columns. It feeds several ratios below.
df["Total_Deudas"] = df["Deuda_Comercial"] + df["Deuda_Credito"] + df["Otras_Deudas"]
# Engineered ratio ("Ratio_A_B" = A / B) and product ("Prod_A_B" = A * B)
# features, written directly into the shared df.
df['Ratio_Total_Deudas_Edad'] = df['Total_Deudas'] / (df['Edad'] + eps)
df['Ratio_Años_Trabajando_Deuda_Credito'] = df['Años_Trabajando'] / (df['Deuda_Credito'] + eps)
df['Prod_Edad_Ingresos'] = df['Edad'] * df['Ingresos']
df['Ratio_Edad_Deuda_Credito'] = df['Edad'] / (df['Deuda_Credito'] + eps)
df['Prod_Años_Trabajando_Deuda_Comercial'] = df['Años_Trabajando'] * df['Deuda_Comercial']
df['Prod_Ingresos_Deuda_Comercial'] = df['Ingresos'] * df['Deuda_Comercial']
df['Ratio_Ingresos_Edad'] = df['Ingresos'] / (df['Edad'] + eps)
df['Prod_Otras_Deudas_Total_Deudas'] = df['Otras_Deudas'] * df['Total_Deudas']
df['Ratio_Edad_Total_Deudas'] = df['Edad'] / (df['Total_Deudas'] + eps)
df['Ratio_Total_Deudas_Deuda_Credito'] = df['Total_Deudas'] / (df['Deuda_Credito'] + eps)
df['Ratio_Otras_Deudas_Total_Deudas'] = df['Otras_Deudas'] / (df['Total_Deudas'] + eps)

In [None]:
# Engineered features used by the final model, plus the target column.
features_elegidas = ['Ratio_Total_Deudas_Edad',
 'Ratio_Años_Trabajando_Deuda_Credito',
 'Prod_Edad_Ingresos',
 'Ratio_Edad_Deuda_Credito',
 'Prod_Años_Trabajando_Deuda_Comercial',
 'Prod_Ingresos_Deuda_Comercial',
 'Ratio_Ingresos_Edad',
 'Prod_Otras_Deudas_Total_Deudas',
 'Ratio_Edad_Total_Deudas',
 'Ratio_Total_Deudas_Deuda_Credito',
 'Ratio_Otras_Deudas_Total_Deudas',
 'Default']

# Column subset of df handed to the reporting function.
df_features = df[features_elegidas]

# Split, WOE-encode, fit and print the metrics for this feature set.
regresion_y_metricas(df_features, 'Mejores features: WOE + Stepwise(Variables elegidas con backward)')

## Pasos para llegar a las features seleccionadas

1. Se hace una combinación de variables para crear nuevas features
2. Se calcula el WOE con train y luego se transforman train y test
3. Se evalúa el IV para descartar variables
4. Se aplica regresión logística sobre el modelo con WOE

    4.1. Se evalúan las features con forward

    4.2. Se evalúan las features con backward

    4.3. Se evalúan las features bidireccionalmente con forward y backward

    4.4. Se evalúan las features con Ridge (L2) y Lasso (L1)

    4.5. Se comparan los accuracy y se elige la mejor estrategia

    4.6. De la mejor estrategia se optimiza el threshold

5. Se muestra la mejor estrategia, las mejores features y el accuracy


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')


# Reload the development sheet from scratch for this self-contained section.
df = pd.read_excel("Tabla Trabajo Grupal N°2.xlsx", sheet_name= "Desarrollo")

# Columns removed before the pairwise features are generated.
df.drop(["Id_Cliente"], axis=1, inplace=True)
df.drop(["Ratio_Ingresos_Deudas"], axis=1, inplace=True)
df.drop(["Nivel_Educacional"], axis=1, inplace=True)

# The income/debt ratio was dropped above because the ratios are generated again below.
# Total debt: sum of the three debt columns.
df["Total_Deudas"] = df["Deuda_Comercial"] + df["Deuda_Credito"] + df["Otras_Deudas"]
target = 'Default'

# Columns used to generate ratios: int64/float64 columns other than the target.
col_continuas = [col for col in df.columns if df[col].dtype in ["int64", "float64"] and col != target]

def combinacion_de_variables(df, variables, eps=1e-6):
    """Add the product and both ratios for every pair of the given columns.

    For each unordered pair (x, y) of `variables`, three columns are added to
    a copy of `df`: "Prod_x_y" = x * y, "Ratio_x_y" = x / (y + eps) and
    "Ratio_y_x" = y / (x + eps). A generated name that already exists
    overwrites that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    variables : iterable of str
        Column names to combine pairwise.
    eps : float, default 1e-6
        Constant added to each denominator so a zero value does not produce a
        division by zero.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the new columns appended.
    """
    df_2 = df.copy()
    for x, y in combinations(variables, 2):
        df_2[f"Prod_{x}_{y}"] = df[x] * df[y]
        df_2[f"Ratio_{x}_{y}"] = df[x] / (df[y] + eps)
        df_2[f"Ratio_{y}_{x}"] = df[y] / (df[x] + eps)
    return df_2

# Replace df with the version that also holds every pairwise product/ratio.
df = combinacion_de_variables(df, col_continuas)

# Split the raw data for now; the WOE encoding is applied afterwards.
x_en_bruto = df.drop(columns=[target])
y_en_bruto = df[target]
x_train_bruto, X_test_bruto, y_train, y_test = train_test_split(x_en_bruto, y_en_bruto, test_size=0.3, random_state=42, stratify=y_en_bruto)

# Aplicacion del woe e IV
def crear_woes(x_train, target):
    """Learn decile-based WOE values for one numeric series.

    Same logic as the `crear_woes` defined in the earlier report cell.
    Returns a tuple `(woe_by_bin, iv, bins)`: a dict mapping each bin label
    (as a string) to its WOE, the summed information value, and the bin
    edges with the outer edges opened to -inf / +inf.
    """
    # Decile edges; repeated edges are dropped, outer edges are made open-ended.
    cortes = pd.qcut(x_train, q=10, duplicates='drop', retbins=True)[1]
    cortes[0], cortes[-1] = -np.inf, np.inf

    # Bin labels as strings so they can be used as dictionary keys.
    etiquetas = pd.cut(x_train, bins=cortes).astype(str)

    # Per-bin row count and number of positives (target == 1).
    tabla = (
        pd.DataFrame({'bin': etiquetas, 'target': target})
        .groupby('bin')['target']
        .agg(['count', 'sum'])
    )
    # Empty classes are replaced by 0.5 so the logarithm stays finite.
    tabla['malos_pagadores'] = tabla['sum'].replace(0, 0.5)
    tabla['buenos_pagadores'] = (tabla['count'] - tabla['sum']).replace(0, 0.5)

    n_malos = target.sum()
    n_buenos = target.count() - n_malos

    prop_buenos = tabla['buenos_pagadores'] / n_buenos
    prop_malos = tabla['malos_pagadores'] / n_malos

    tabla['WOE'] = np.log(prop_buenos / prop_malos)
    tabla['IV'] = (prop_buenos - prop_malos) * tabla['WOE']

    return tabla['WOE'].to_dict(), tabla['IV'].sum(), cortes

def transformar_a_woe(x_train, tabla_woes, bins):
    """Encode a series with previously learned WOE values.

    Same logic as the `transformar_a_woe` defined in the earlier report cell.
    The series is cut with `bins`, each bin label (as a string) is looked up
    in `tabla_woes`, and labels missing from the table get a WOE of 0.
    """
    etiquetas = pd.cut(x_train, bins=bins).astype(str)
    valores_woe = etiquetas.map(tabla_woes)
    # Bins that are not present in the table are encoded as a neutral 0.
    return valores_woe.fillna(0)

# --- Apply the WOE transformation
print("Calculando WOE")

# Empty frames that keep the row index of each part of the split.
X_train_woe = pd.DataFrame(index=x_train_bruto.index)
X_test_woe = pd.DataFrame(index=X_test_bruto.index)
iv_resumen = {}

for col in x_train_bruto.columns:
    # Learn the WOE table on the training part only to avoid information leakage.
    tabla_woes, iv, bins = crear_woes(x_train_bruto[col], y_train)
    iv_resumen[col] = iv
        
    # Apply the table learned on train to both train and test.
    X_train_woe[col] = transformar_a_woe(x_train_bruto[col], tabla_woes, bins)
    X_test_woe[col] = transformar_a_woe(X_test_bruto[col], tabla_woes, bins)

print(f"Variables transformadas: {X_train_woe.shape[1]}")

# Use the IV to discard weak variables.
# NOTE(review): the filter below is inclusive (>=) while the printed label says ">".
iv_umbral = 0.01
variables_fuertes_iv = [k for k, v in iv_resumen.items() if v >= iv_umbral]
print(f"Variables con IV > {iv_umbral}: {len(variables_fuertes_iv)}")

X_train_woe = X_train_woe[variables_fuertes_iv]
X_test_woe = X_test_woe[variables_fuertes_iv]


#APLICACION DE ESTRATEGIAS

# --- A) FORWARD
def aplicar_forward(X_tr, y_tr, X_te, y_te):
    """Greedy forward selection scored by accuracy on (X_te, y_te).

    Starting from no columns, each round fits one logistic regression per
    remaining column and adds the column with the highest accuracy, as long
    as it strictly improves the best accuracy so far.

    Returns the list of selected columns and the best accuracy reached.
    """
    seleccionadas = []
    pendientes = list(X_tr.columns)
    mejor_accuracy = 0
    print("\n----- Forward -----")

    while pendientes:
        candidatos = []
        for nombre in pendientes:
            columnas = seleccionadas + [nombre]
            clf = LogisticRegression(solver='liblinear', random_state=42)
            clf.fit(X_tr[columnas], y_tr)
            candidatos.append((accuracy_score(y_te, clf.predict(X_te[columnas])), nombre))

        # Highest (accuracy, column) pair of this round.
        acc_top, col_top = max(candidatos)
        if not acc_top > mejor_accuracy:
            break

        mejor_accuracy = acc_top
        seleccionadas.append(col_top)
        pendientes.remove(col_top)
        print(f"Agregada: {col_top} (Acc: {mejor_accuracy:.4f})")

    return seleccionadas, mejor_accuracy

# --- B) BACKWARD CON TOLERANCIA
def aplicar_backward(X_tr, y_tr, X_te, y_te, tolerancia=0.001):
    """Backward elimination with a tolerance, scored by accuracy on (X_te, y_te).

    Starting from all columns, each round evaluates the removal of every
    column and drops the one whose removal leaves the highest accuracy, as
    long as that accuracy is not lower than the best accuracy seen so far
    minus `tolerancia`.

    Parameters
    ----------
    X_tr, X_te : pandas.DataFrame
        WOE-encoded training and evaluation features.
    y_tr, y_te : array-like
        Matching binary targets.
    tolerancia : float, default 0.001
        Accepted accuracy loss, in absolute accuracy units (0.001 = 0.1
        percentage points), in exchange for removing one variable.

    Returns
    -------
    tuple
        `(cols, accuracy)`: the remaining columns and the accuracy obtained
        with exactly those columns. The previous version returned the best
        accuracy ever seen, which could belong to a larger column set than
        the one returned.
    """
    cols = list(X_tr.columns)
    m = LogisticRegression(solver='liblinear', random_state=42)
    m.fit(X_tr[cols], y_tr)
    acc_actual = accuracy_score(y_te, m.predict(X_te[cols]))
    # Reference for the tolerance check: the best accuracy seen so far.
    mejor_accuracy = acc_actual

    print(f"\n--- Backward (Tolerancia: {tolerancia}) ---")
    print(f"Inicio: {len(cols)} vars | Acc: {mejor_accuracy:.4f}")

    while len(cols) > 1:
        # Evaluate the effect of removing each column in turn.
        candidatos = []
        for col in cols:
            temporal_col = [f for f in cols if f != col]
            m.fit(X_tr[temporal_col], y_tr)
            candidatos.append((accuracy_score(y_te, m.predict(X_te[temporal_col])), col))

        accuracy_sin_var, variable_quitada = max(candidatos)

        # Stop when even the best removal loses more than the tolerance.
        if accuracy_sin_var < (mejor_accuracy - tolerancia):
            print("No se puede eliminar más sin sacrificar accuracy")
            break

        cols.remove(variable_quitada)
        # Accuracy of the column set that is actually kept.
        acc_actual = accuracy_sin_var
        # The reference only moves up, so small losses cannot accumulate.
        mejor_accuracy = max(mejor_accuracy, accuracy_sin_var)
        print(f"Eliminada: {variable_quitada} (Acc: {accuracy_sin_var:.4f})")

    return cols, acc_actual

# --- C) STEPWISE BIDIRECCIONAL---
def stepwise_bidireccional(X_tr, y_tr, X_te, y_te):
    """Bidirectional stepwise selection scored by accuracy on (X_te, y_te).

    Each round adds the remaining column that gives the highest accuracy (it
    must strictly improve the best so far). Once more than two columns are
    selected, it then removes one selected column if doing so strictly
    improves the accuracy again; a removed column becomes a candidate anew.

    Returns the list of selected columns and the best accuracy reached.
    """
    def _accuracy_con(columnas):
        # Fit a fresh model on the given columns and score it on the evaluation set.
        clf = LogisticRegression(solver='liblinear', random_state=42)
        clf.fit(X_tr[columnas], y_tr)
        return accuracy_score(y_te, clf.predict(X_te[columnas]))

    col_actual, restante = [], list(X_tr.columns)
    mejor_accuracy = 0

    print("\n----- Stepwise bidireccional-----")
    while restante:
        # 1. Forward step: best column to add.
        acc_entrada, agregado = max(
            [(_accuracy_con(col_actual + [c]), c) for c in restante]
        )
        if not acc_entrada > mejor_accuracy:
            break

        mejor_accuracy = acc_entrada
        col_actual.append(agregado)
        restante.remove(agregado)
        print(f"Agregada: {agregado} (Acc: {mejor_accuracy:.4f})")

        # 2. Backward step: only checked once more than two columns are selected.
        if len(col_actual) <= 2:
            continue

        acc_salida, quitados = max(
            [(_accuracy_con([f for f in col_actual if f != c]), c) for c in col_actual]
        )
        # Remove a column only when that strictly beats the current best.
        if acc_salida > mejor_accuracy:
            mejor_accuracy = acc_salida
            col_actual.remove(quitados)
            restante.append(quitados)
            print(f"Eliminada por redundancia: {quitados} (Acc subió a: {mejor_accuracy:.4f})")

    return col_actual, mejor_accuracy


# Run the three selection strategies on the same WOE-encoded split.
mejor_fwd, acc_fwd = aplicar_forward(X_train_woe, y_train, X_test_woe, y_test)

mejor_bwd, acc_bwd = aplicar_backward(X_train_woe, y_train, X_test_woe, y_test, tolerancia=0.0005)

mejor_bidir, acc_bidir = stepwise_bidireccional(X_train_woe, y_train, X_test_woe, y_test)

# --- Results
print("\n" + "="*30)
print(" COMPARATIVA FINAL")
print("="*30)
print(f"1. Forward:        {acc_fwd:.4f} ({len(mejor_fwd)} variables)")
print(f"2. Backward:       {acc_bwd:.4f} ({len(mejor_bwd)} variables)")
# Fixed the garbled "va   riables" in the printed line.
print(f"3. Bidireccional:  {acc_bidir:.4f} ({len(mejor_bidir)} variables)")

# One tuple per strategy: (name, accuracy, number of variables, variables).
resultados = [
    ('Forward', acc_fwd, len(mejor_fwd), mejor_fwd),
    ('Backward Smart', acc_bwd, len(mejor_bwd), mejor_bwd),
    ('Bidireccional', acc_bidir, len(mejor_bidir), mejor_bidir)
]

# Sort by accuracy, highest first; the first entry is the winner.
resultados.sort(key=lambda x: x[1], reverse=True)

nombre_ganador, acc_ganador, ganadores_var_len, variables_ganadores = resultados[0]
print(f"\n>>> Estrategia ganadora: {nombre_ganador}")

# Final training on the winning variables.
modelo_final = LogisticRegression(solver='liblinear', random_state=42)
modelo_final.fit(X_train_woe[variables_ganadores], y_train)

# Threshold optimisation: scan cut-offs from 0.10 up to (not including) 0.90
# and keep the first one with the highest accuracy.
# NOTE(review): the cut-off is tuned on the same test split used to report it.
probs = modelo_final.predict_proba(X_test_woe[variables_ganadores])[:, 1]
mejor_t = 0.5
max_acc = 0
for t in np.arange(0.1, 0.9, 0.01):
    acc = accuracy_score(y_test, (probs >= t).astype(int))
    if acc > max_acc:
        max_acc = acc
        mejor_t = t

print(f"Accuracy Optimizado: {max_acc:.4f} (Threshold: {mejor_t:.2f})")

# Modelo Pipeline

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
fro
