In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, 
    recall_score, accuracy_score, classification_report
)
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier
from keras.models import Model
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

print("Librerias cargadas correctamente")

Librerias cargadas correctamente


## Cargar Datos

In [2]:
ruta_datos = os.path.join(os.getcwd(), "data", "data", "application_train.csv")

if not os.path.exists(ruta_datos):
    print(f"Error: No se encuentra el archivo {ruta_datos}")
else:
    datos = pd.read_csv(ruta_datos)
    print(f"Datos cargados: {datos.shape}")
    print(f"Tasa de fraude: {datos['TARGET'].mean()*100:.2f}%")

Datos cargados: (307511, 122)
Tasa de fraude: 8.07%


## Resultados de Evaluaciones

In [3]:
resultados = []

### 1. Autoencoder

In [4]:
print("--- Evaluando Autoencoder ---")

# Variables de tiempo
datos['EDAD'] = -datos['DAYS_BIRTH'] / 365
datos['ANOS_EMPLEADO'] = -datos['DAYS_EMPLOYED'] / 365
datos['ANOS_EMPLEADO'] = datos['ANOS_EMPLEADO'].replace(1000.67, np.nan)

# Ratios financieros
datos['RATIO_CREDITO_INGRESO'] = datos['AMT_CREDIT'] / datos['AMT_INCOME_TOTAL']
datos['RATIO_ANUALIDAD_INGRESO'] = datos['AMT_ANNUITY'] / datos['AMT_INCOME_TOTAL']
datos['INGRESO_PER_CAPITA'] = datos['AMT_INCOME_TOTAL'] / (datos['CNT_FAM_MEMBERS'] + 1)

# Variables de inconsistencia
datos['INCONS_SCORE_INGRESO'] = 0
mask1 = (datos['EXT_SOURCE_2'] < 0.3) & (datos['AMT_INCOME_TOTAL'] > datos['AMT_INCOME_TOTAL'].quantile(0.75))
datos.loc[mask1, 'INCONS_SCORE_INGRESO'] = 1

variables = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    'RATIO_CREDITO_INGRESO', 'RATIO_ANUALIDAD_INGRESO',
    'INGRESO_PER_CAPITA', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'EDAD', 'ANOS_EMPLEADO', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
    'INCONS_SCORE_INGRESO'
]

datos_trabajo = datos[variables + ['TARGET']].copy()

for col in variables:
    if datos_trabajo[col].isnull().sum() > 0:
        datos_trabajo[col].fillna(datos_trabajo[col].median(), inplace=True)

X = datos_trabajo[variables].values
y = datos_trabajo['TARGET'].values

# Separar datos normales
mascara_normal = y == 0
X_normales = X[mascara_normal]

X_train, X_val = train_test_split(X_normales, test_size=0.2, random_state=42)

escalador = RobustScaler()
X_train_esc = escalador.fit_transform(X_train)
X_val_esc = escalador.transform(X_val)
X_todos_esc = escalador.transform(X)

print(f"Datos preparados - Train: {X_train.shape}, Val: {X_val.shape}")

--- Evaluando Autoencoder ---
Datos preparados - Train: (226148, 13), Val: (56538, 13)


In [5]:
# Arquitectura del autoencoder
dim_entrada = X_train_esc.shape[1]
entrada = Input(shape=(dim_entrada,))

x = Dense(64, activation='relu')(entrada)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = Dense(32, activation='relu')(x)
x = BatchNormalization()(x)

codigo = Dense(16, activation='relu')(x)

x = Dense(32, activation='relu')(codigo)
x = BatchNormalization()(x)

x = Dense(64, activation='relu')(x)
salida = Dense(dim_entrada, activation='linear')(x)

autoencoder = Model(entrada, salida)
autoencoder.compile(optimizer='adam', loss='mae')

parada = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0)

print("Entrenando autoencoder...")
autoencoder.fit(
    X_train_esc, X_train_esc,
    epochs=50,
    batch_size=256,
    validation_data=(X_val_esc, X_val_esc),
    callbacks=[parada],
    verbose=1
)
print("Entrenamiento completado")

Entrenando autoencoder...
Epoch 1/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 12ms/step - loss: 1.3689 - val_loss: 0.2686
Epoch 2/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 0.2343 - val_loss: 0.1538
Epoch 3/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 0.1856 - val_loss: 0.1104
Epoch 4/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - loss: 0.1748 - val_loss: 0.1220
Epoch 5/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - loss: 0.1660 - val_loss: 0.1032
Epoch 6/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - loss: 0.1585 - val_loss: 0.1099
Epoch 7/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - loss: 0.1547 - val_loss: 0.1029
Epoch 8/50
[1m884/884[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - loss: 0.1461 - val_loss: 0.1138
Epoch 9

In [6]:
# Calcular errores de reconstruccion
X_reconstruido = autoencoder.predict(X_todos_esc, verbose=0)
error_reconstruccion = np.mean(np.abs(X_todos_esc - X_reconstruido), axis=1)

# Optimizar umbral
umbrales = np.percentile(error_reconstruccion, range(85, 100, 1))
mejor_f1 = 0
mejor_umbral = 0

for umbral in umbrales:
    predicciones = (error_reconstruccion > umbral).astype(int)
    f1 = f1_score(y, predicciones, zero_division=0)
    if f1 > mejor_f1:
        mejor_f1 = f1
        mejor_umbral = umbral

predicciones_finales = (error_reconstruccion > mejor_umbral).astype(int)

# Metricas
metricas_ae = {
    'modelo': 'Autoencoder',
    'accuracy': accuracy_score(y, predicciones_finales),
    'precision': precision_score(y, predicciones_finales, zero_division=0),
    'recall': recall_score(y, predicciones_finales, zero_division=0),
    'f1_score': f1_score(y, predicciones_finales, zero_division=0),
    'roc_auc': roc_auc_score(y, error_reconstruccion)
}

resultados.append(metricas_ae)
print(f"AUC-ROC: {metricas_ae['roc_auc']:.4f}")
print(f"F1-Score: {metricas_ae['f1_score']:.4f}")
print(f"Precision: {metricas_ae['precision']:.4f}")
print(f"Recall: {metricas_ae['recall']:.4f}")

AUC-ROC: 0.4792
F1-Score: 0.0835
Precision: 0.0643
Recall: 0.1194


### 2. Isolation Forest

In [7]:
print("\n--- Evaluando Isolation Forest ---")

columnas_numericas = datos.select_dtypes(include=[np.number]).columns.tolist()
if 'SK_ID_CURR' in columnas_numericas:
    columnas_numericas.remove('SK_ID_CURR')
if 'TARGET' in columnas_numericas:
    columnas_numericas.remove('TARGET')
    
datos_num = datos[columnas_numericas].fillna(datos[columnas_numericas].median())
X_if = datos_num.values
y_if = datos['TARGET'].values

# Tomar muestra
muestra_size = min(10000, len(X_if))
indices = np.random.choice(len(X_if), muestra_size, replace=False)
X_muestra = X_if[indices]
y_muestra = y_if[indices]

modelo_if = IsolationForest(
    contamination=0.08,
    random_state=42,
    n_estimators=100
)

print(f"Entrenando con muestra de {muestra_size} registros...")
predicciones = modelo_if.fit_predict(X_muestra)
predicciones = (predicciones == -1).astype(int)

scores = modelo_if.score_samples(X_muestra)

# Metricas
metricas_if = {
    'modelo': 'Isolation Forest',
    'accuracy': accuracy_score(y_muestra, predicciones),
    'precision': precision_score(y_muestra, predicciones, zero_division=0),
    'recall': recall_score(y_muestra, predicciones, zero_division=0),
    'f1_score': f1_score(y_muestra, predicciones, zero_division=0),
    'roc_auc': roc_auc_score(y_muestra, -scores)
}

resultados.append(metricas_if)
print(f"AUC-ROC: {metricas_if['roc_auc']:.4f}")
print(f"F1-Score: {metricas_if['f1_score']:.4f}")
print(f"Precision: {metricas_if['precision']:.4f}")
print(f"Recall: {metricas_if['recall']:.4f}")


--- Evaluando Isolation Forest ---
Entrenando con muestra de 10000 registros...
AUC-ROC: 0.4590
F1-Score: 0.0590
Precision: 0.0612
Recall: 0.0570


### 3. LOF (Local Outlier Factor)

In [8]:
print("\n--- Evaluando LOF ---")

# Tomar muestra
muestra_size = min(5000, len(X_if))
indices = np.random.choice(len(X_if), muestra_size, replace=False)
X_muestra_lof = X_if[indices]
y_muestra_lof = y_if[indices]

modelo_lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.08,
    novelty=False
)

print(f"Entrenando con muestra de {muestra_size} registros...")
predicciones_lof = modelo_lof.fit_predict(X_muestra_lof)
predicciones_lof = (predicciones_lof == -1).astype(int)

scores_lof = modelo_lof.negative_outlier_factor_

# Metricas
metricas_lof = {
    'modelo': 'LOF',
    'accuracy': accuracy_score(y_muestra_lof, predicciones_lof),
    'precision': precision_score(y_muestra_lof, predicciones_lof, zero_division=0),
    'recall': recall_score(y_muestra_lof, predicciones_lof, zero_division=0),
    'f1_score': f1_score(y_muestra_lof, predicciones_lof, zero_division=0),
    'roc_auc': roc_auc_score(y_muestra_lof, -scores_lof)
}

resultados.append(metricas_lof)
print(f"AUC-ROC: {metricas_lof['roc_auc']:.4f}")
print(f"F1-Score: {metricas_lof['f1_score']:.4f}")
print(f"Precision: {metricas_lof['precision']:.4f}")
print(f"Recall: {metricas_lof['recall']:.4f}")


--- Evaluando LOF ---
Entrenando con muestra de 5000 registros...
AUC-ROC: 0.5051
F1-Score: 0.0941
Precision: 0.0925
Recall: 0.0959


### 4. Self-Training

In [9]:
print("\n--- Evaluando Self-Training ---")

columnas_num = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 
                'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
                'EXT_SOURCE_2', 'EXT_SOURCE_3']

datos_num_st = datos[columnas_num + ['TARGET']].copy()
datos_num_st = datos_num_st.fillna(datos_num_st.median())

X_st = datos_num_st.drop('TARGET', axis=1).values
y_st = datos_num_st['TARGET'].values

X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(
    X_st, y_st, test_size=0.2, random_state=42, stratify=y_st
)

# Simular datos sin etiquetas
porcentaje_etiquetado = 0.1
n_etiquetados = int(len(X_train_st) * porcentaje_etiquetado)

y_train_semi = y_train_st.copy()
indices_sin_etiquetar = np.random.choice(
    range(n_etiquetados, len(y_train_st)),
    size=len(y_train_st) - n_etiquetados,
    replace=False
)
y_train_semi[indices_sin_etiquetar] = -1

escalador_st = StandardScaler()
X_train_esc_st = escalador_st.fit_transform(X_train_st)
X_test_esc_st = escalador_st.transform(X_test_st)

print(f"Datos etiquetados: {n_etiquetados} ({porcentaje_etiquetado*100:.0f}%)")
print("Entrenando modelo Self-Training...")

modelo_base_st = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=10)
modelo_st = SelfTrainingClassifier(modelo_base_st, threshold=0.9, max_iter=10, verbose=False)

modelo_st.fit(X_train_esc_st, y_train_semi)
predicciones_st = modelo_st.predict(X_test_esc_st)
predicciones_proba_st = modelo_st.predict_proba(X_test_esc_st)[:, 1]

# Metricas
metricas_st = {
    'modelo': 'Self-Training',
    'accuracy': accuracy_score(y_test_st, predicciones_st),
    'precision': precision_score(y_test_st, predicciones_st, zero_division=0),
    'recall': recall_score(y_test_st, predicciones_st, zero_division=0),
    'f1_score': f1_score(y_test_st, predicciones_st, zero_division=0),
    'roc_auc': roc_auc_score(y_test_st, predicciones_proba_st)
}

resultados.append(metricas_st)
print(f"AUC-ROC: {metricas_st['roc_auc']:.4f}")
print(f"F1-Score: {metricas_st['f1_score']:.4f}")
print(f"Precision: {metricas_st['precision']:.4f}")
print(f"Recall: {metricas_st['recall']:.4f}")


--- Evaluando Self-Training ---
Datos etiquetados: 24600 (10%)
Entrenando modelo Self-Training...
AUC-ROC: 0.7092
F1-Score: 0.0012
Precision: 0.7500
Recall: 0.0006


### 5. Label Spreading

In [10]:
print("\n--- Evaluando Label Spreading ---")

columnas_num_ls = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
                   'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
                   'EXT_SOURCE_2', 'EXT_SOURCE_3']

datos_num_ls = datos[columnas_num_ls + ['TARGET']].copy()
datos_num_ls = datos_num_ls.fillna(datos_num_ls.median())

X_ls = datos_num_ls.drop('TARGET', axis=1).values
y_ls = datos_num_ls['TARGET'].values

# Tomar muestra
muestra_size_ls = min(5000, len(X_ls))
indices_ls = np.random.choice(len(X_ls), muestra_size_ls, replace=False)
X_muestra_ls = X_ls[indices_ls]
y_muestra_ls = y_ls[indices_ls]

X_train_ls, X_test_ls, y_train_ls, y_test_ls = train_test_split(
    X_muestra_ls, y_muestra_ls, test_size=0.2, random_state=42, stratify=y_muestra_ls
)

# Simular datos sin etiquetas
porcentaje_etiquetado_ls = 0.1
n_etiquetados_ls = int(len(X_train_ls) * porcentaje_etiquetado_ls)

y_train_semi_ls = y_train_ls.copy()
indices_sin_etiquetar_ls = np.random.choice(
    range(n_etiquetados_ls, len(y_train_ls)),
    size=len(y_train_ls) - n_etiquetados_ls,
    replace=False
)
y_train_semi_ls[indices_sin_etiquetar_ls] = -1

escalador_ls = StandardScaler()
X_train_esc_ls = escalador_ls.fit_transform(X_train_ls)
X_test_esc_ls = escalador_ls.transform(X_test_ls)

print(f"Muestra: {muestra_size_ls} registros")
print(f"Datos etiquetados: {n_etiquetados_ls} ({porcentaje_etiquetado_ls*100:.0f}%)")
print("Entrenando modelo Label Spreading...")

modelo_ls = LabelSpreading(kernel='rbf', alpha=0.2, max_iter=30)
modelo_ls.fit(X_train_esc_ls, y_train_semi_ls)

predicciones_ls = modelo_ls.predict(X_test_esc_ls)
predicciones_proba_ls = modelo_ls.predict_proba(X_test_esc_ls)[:, 1]

# Metricas
metricas_ls = {
    'modelo': 'Label Spreading',
    'accuracy': accuracy_score(y_test_ls, predicciones_ls),
    'precision': precision_score(y_test_ls, predicciones_ls, zero_division=0),
    'recall': recall_score(y_test_ls, predicciones_ls, zero_division=0),
    'f1_score': f1_score(y_test_ls, predicciones_ls, zero_division=0),
    'roc_auc': roc_auc_score(y_test_ls, predicciones_proba_ls)
}

resultados.append(metricas_ls)
print(f"AUC-ROC: {metricas_ls['roc_auc']:.4f}")
print(f"F1-Score: {metricas_ls['f1_score']:.4f}")
print(f"Precision: {metricas_ls['precision']:.4f}")
print(f"Recall: {metricas_ls['recall']:.4f}")


--- Evaluando Label Spreading ---
Muestra: 5000 registros
Datos etiquetados: 400 (10%)
Entrenando modelo Label Spreading...
AUC-ROC: 0.5348
F1-Score: 0.0839
Precision: 0.0952
Recall: 0.0750


## Comparacion Final de Resultados

In [11]:
df_resultados = pd.DataFrame(resultados)

print("\n" + "="*80)
print("COMPARACION DE MODELOS - DETECCION DE FRAUDE")
print("="*80)
print(df_resultados.to_string(index=False))


COMPARACION DE MODELOS - DETECCION DE FRAUDE
          modelo  accuracy  precision   recall  f1_score  roc_auc
     Autoencoder  0.788547   0.064257 0.119396  0.083549 0.479229
Isolation Forest  0.843800   0.061250 0.056977  0.059036 0.459046
             LOF  0.857600   0.092500 0.095855  0.094148 0.505083
   Self-Training  0.919305   0.750000 0.000604  0.001207 0.709237
 Label Spreading  0.869000   0.095238 0.075000  0.083916 0.534837


In [12]:
# Identificar mejor modelo por metrica
print("\n" + "="*80)
print("MEJOR MODELO POR METRICA")
print("="*80)

for metrica in ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']:
    mejor_idx = df_resultados[metrica].idxmax()
    mejor_modelo = df_resultados.loc[mejor_idx, 'modelo']
    mejor_valor = df_resultados.loc[mejor_idx, metrica]
    print(f"{metrica.upper():15s}: {mejor_modelo:20s} ({mejor_valor:.4f})")


MEJOR MODELO POR METRICA
ACCURACY       : Self-Training        (0.9193)
PRECISION      : Self-Training        (0.7500)
RECALL         : Autoencoder          (0.1194)
F1_SCORE       : LOF                  (0.0941)
ROC_AUC        : Self-Training        (0.7092)


In [13]:
# Ranking general
df_rank = df_resultados.copy()
for col in ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']:
    df_rank[f'rank_{col}'] = df_rank[col].rank(ascending=False)

df_rank['ranking_promedio'] = df_rank[[f'rank_{col}' for col in ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']]].mean(axis=1)
df_rank = df_rank.sort_values('ranking_promedio')

print("\n" + "="*80)
print("RANKING GENERAL (menor es mejor)")
print("="*80)
for idx, row in df_rank.iterrows():
    print(f"{int(row['ranking_promedio']):2d}. {row['modelo']:20s} (Promedio: {row['ranking_promedio']:.2f})")


RANKING GENERAL (menor es mejor)
 2. Label Spreading      (Promedio: 2.20)
 2. LOF                  (Promedio: 2.40)
 2. Self-Training        (Promedio: 2.60)
 3. Autoencoder          (Promedio: 3.40)
 4. Isolation Forest     (Promedio: 4.40)


In [14]:
# Exportar resultados
df_resultados.to_csv('comparacion_modelos.csv', index=False)
print("\nResultados exportados a: comparacion_modelos.csv")


Resultados exportados a: comparacion_modelos.csv


In [15]:
# Mostrar tabla final formateada
df_resultados

Unnamed: 0,modelo,accuracy,precision,recall,f1_score,roc_auc
0,Autoencoder,0.788547,0.064257,0.119396,0.083549,0.479229
1,Isolation Forest,0.8438,0.06125,0.056977,0.059036,0.459046
2,LOF,0.8576,0.0925,0.095855,0.094148,0.505083
3,Self-Training,0.919305,0.75,0.000604,0.001207,0.709237
4,Label Spreading,0.869,0.095238,0.075,0.083916,0.534837
