# SISTEMA HÍBRIDO DE DETECCIÓN DE FRAUDE CREDITICIO
### Integración Optimizada: Isolation Forest + Autoencoder + LSTM + Semi-Supervisado


## PASO 1: IMPORTAR LIBRERÍAS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    precision_recall_curve, f1_score, precision_score, recall_score,
    accuracy_score, average_precision_score
)

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, RepeatVector, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

print("Librerías cargadas correctamente")

Librerías cargadas correctamente


## PASO 2: CARGAR DATOS

In [None]:
ruta_datos = "data/data/application_train.csv"
datos = pd.read_csv(ruta_datos)

print("="*80)
print("INFORMACIÓN DEL DATASET")
print("="*80)
print(f"Registros: {len(datos):,}")
print(f"Variables: {datos.shape[1]}")
print(f"\nDistribución TARGET:")
print(f"  Normal (0): {(datos['TARGET']==0).sum():,} ({(datos['TARGET']==0).mean()*100:.2f}%)")
print(f"  Fraude (1): {(datos['TARGET']==1).sum():,} ({(datos['TARGET']==1).mean()*100:.2f}%)")

# Usar muestra estratificada para acelerar desarrollo
TAMAÑO_MUESTRA = 50000
if len(datos) > TAMAÑO_MUESTRA:
    datos, _ = train_test_split(datos, train_size=TAMAÑO_MUESTRA, random_state=42, stratify=datos['TARGET'])
    print(f"\nUsando muestra estratificada: {TAMAÑO_MUESTRA:,} registros")

INFORMACIÓN DEL DATASET
Registros: 307,511
Variables: 122

Distribución TARGET:
  Normal (0): 282,686 (91.93%)
  Fraude (1): 24,825 (8.07%)

Usando muestra estratificada: 50,000 registros


## PASO 3: FEATURE ENGINEERING AVANZADO

In [None]:
print("="*80)
print("FEATURE ENGINEERING")
print("="*80)

df = datos.copy()

# Variables temporales
df['edad'] = -df['DAYS_BIRTH'] / 365
df['años_empleado'] = -df['DAYS_EMPLOYED'] / 365
df['años_empleado'] = df['años_empleado'].replace(1000.67, np.nan)
df['dias_registro'] = -df['DAYS_REGISTRATION']
df['dias_id_publicacion'] = -df['DAYS_ID_PUBLISH']

# Ratios financieros clave
df['ratio_credito_ingreso'] = df['AMT_CREDIT'] / (df['AMT_INCOME_TOTAL'] + 1)
df['ratio_anualidad_ingreso'] = df['AMT_ANNUITY'] / (df['AMT_INCOME_TOTAL'] + 1)
df['ratio_anualidad_credito'] = df['AMT_ANNUITY'] / (df['AMT_CREDIT'] + 1)
df['ratio_bienes_credito'] = df['AMT_GOODS_PRICE'] / (df['AMT_CREDIT'] + 1)

# Variables per capita
df['ingreso_per_capita'] = df['AMT_INCOME_TOTAL'] / (df['CNT_FAM_MEMBERS'] + 1)
df['credito_per_capita'] = df['AMT_CREDIT'] / (df['CNT_FAM_MEMBERS'] + 1)

# Indicadores de riesgo
df['ratio_edad_empleo'] = df['años_empleado'] / (df['edad'] + 1)
df['tiene_telefono'] = df['FLAG_MOBIL'].fillna(0)
df['tiene_email'] = df['FLAG_EMAIL'].fillna(0)
df['tiene_trabajo_telefono'] = df['FLAG_WORK_PHONE'].fillna(0)

# Inconsistencias lógicas (señales de fraude)
df['inconsistencia_empleo_edad'] = (df['años_empleado'] > df['edad']).astype(int)
df['inconsistencia_ingreso_alto'] = ((df['AMT_INCOME_TOTAL'] > 500000) & (df['REGION_RATING_CLIENT'] == 3)).astype(int)

# Codificar categóricas importantes
variables_categoricas = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE'
]

label_encoders = {}
for col in variables_categoricas:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = df[col].fillna('Desconocido')
        df[f'{col}_cod'] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

print("Feature Engineering completado")

FEATURE ENGINEERING
Feature Engineering completado


## PASO 4: SELECCIÓN Y PREPARACIÓN DE VARIABLES

In [None]:
variables_numericas = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    'ratio_credito_ingreso', 'ratio_anualidad_ingreso', 'ratio_anualidad_credito',
    'ratio_bienes_credito', 'ingreso_per_capita', 'credito_per_capita',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'edad', 'años_empleado', 'dias_registro', 'dias_id_publicacion',
    'CNT_CHILDREN', 'CNT_FAM_MEMBERS',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'REG_REGION_NOT_LIVE_REGION', 'REG_CITY_NOT_WORK_CITY',
    'ratio_edad_empleo', 'tiene_telefono', 'tiene_email', 'tiene_trabajo_telefono',
    'inconsistencia_empleo_edad', 'inconsistencia_ingreso_alto'
]

variables_categoricas_cod = [f'{col}_cod' for col in variables_categoricas if f'{col}_cod' in df.columns]
variables_finales = [v for v in variables_numericas if v in df.columns] + variables_categoricas_cod

datos_modelo = df[variables_finales + ['TARGET', 'SK_ID_CURR']].copy()

# Imputar nulos
for col in variables_finales:
    if datos_modelo[col].isnull().sum() > 0:
        if col in ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']:
            datos_modelo[col].fillna(datos_modelo[col].median(), inplace=True)
        else:
            datos_modelo[col].fillna(0, inplace=True)

print(f"Variables finales: {len(variables_finales)}")
print(f"Nulos restantes: {datos_modelo[variables_finales].isnull().sum().sum()}")

Variables finales: 37
Nulos restantes: 0


## PASO 5: PREPARAR DATOS Y NORMALIZAR

In [None]:
X = datos_modelo[variables_finales].values
y = datos_modelo['TARGET'].values
ids = datos_modelo['SK_ID_CURR'].values

# Normalizar con RobustScaler (mejor con outliers)
escalador = RobustScaler()
X_escalado = escalador.fit_transform(X)

# Separar para entrenamiento de modelos no supervisados
X_normales = X_escalado[y == 0]
y_normales = y[y == 0]

# Split para validación
X_train, X_test, y_train, y_test = train_test_split(
    X_escalado, y, test_size=0.3, random_state=42, stratify=y
)

print("Preparación de datos completada")
print(f"Total muestras: {len(X_escalado):,}")
print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")
print(f"Features: {X_escalado.shape[1]}")

Preparación de datos completada
Total muestras: 50,000
Train: 35,000 | Test: 15,000
Features: 37


## PASO 6: MODELO 1 - ISOLATION FOREST OPTIMIZADO

In [None]:
print("="*80)
print("MODELO 1: ISOLATION FOREST")
print("="*80)

# Entrenar con parámetros optimizados
modelo_if = IsolationForest(
    n_estimators=200,
    max_samples='auto',
    contamination=0.08,  # Aproximado al porcentaje real de fraude
    max_features=1.0,
    random_state=42,
    n_jobs=-1
)

modelo_if.fit(X_train)

# Obtener scores (decision_function: más negativo = más anómalo)
scores_if_train = modelo_if.decision_function(X_train)
scores_if_test = modelo_if.decision_function(X_test)

# Normalizar scores a [0,1] donde 1 = más anómalo
def normalizar_scores(scores):
    return (scores.max() - scores) / (scores.max() - scores.min())

scores_if_train_norm = normalizar_scores(scores_if_train)
scores_if_test_norm = normalizar_scores(scores_if_test)

# Buscar mejor umbral en train
umbrales = np.percentile(scores_if_train_norm, np.arange(85, 100, 1))
mejor_f1_if = 0
mejor_umbral_if = 0

for umbral in umbrales:
    pred = (scores_if_train_norm > umbral).astype(int)
    f1 = f1_score(y_train, pred, zero_division=0)
    if f1 > mejor_f1_if:
        mejor_f1_if = f1
        mejor_umbral_if = umbral

# Evaluar en test
pred_if_test = (scores_if_test_norm > mejor_umbral_if).astype(int)
roc_auc_if = roc_auc_score(y_test, scores_if_test_norm)
f1_if = f1_score(y_test, pred_if_test)
precision_if = precision_score(y_test, pred_if_test, zero_division=0)
recall_if = recall_score(y_test, pred_if_test)

print(f"\nResultados Isolation Forest:")
print(f"  ROC-AUC: {roc_auc_if:.4f}")
print(f"  F1-Score: {f1_if:.4f}")
print(f"  Precision: {precision_if:.4f}")
print(f"  Recall: {recall_if:.4f}")
print(f"  Umbral óptimo: {mejor_umbral_if:.4f}")

MODELO 1: ISOLATION FOREST

Resultados Isolation Forest:
  ROC-AUC: 0.4844
  F1-Score: 0.0828
  Precision: 0.0692
  Recall: 0.1032
  Umbral óptimo: 0.4626


## PASO 7: MODELO 2 - AUTOENCODER DENSO OPTIMIZADO

In [7]:
print("="*80)
print("MODELO 2: AUTOENCODER DENSO")
print("="*80)

# Entrenar solo con datos normales
X_normales_train = X_train[y_train == 0]
X_ae_train, X_ae_val = train_test_split(X_normales_train, test_size=0.2, random_state=42)

dim_entrada = X_train.shape[1]

# Arquitectura optimizada
entrada = Input(shape=(dim_entrada,))
x = Dense(128, activation='relu')(entrada)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
cuello_botella = Dense(32, activation='relu', name='bottleneck')(x)
x = Dense(64, activation='relu')(cuello_botella)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x)
salida = Dense(dim_entrada, activation='linear')(x)

autoencoder = Model(inputs=entrada, outputs=salida)
autoencoder.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=0)

historial_ae = autoencoder.fit(
    X_ae_train, X_ae_train,
    epochs=50,
    batch_size=256,
    validation_data=(X_ae_val, X_ae_val),
    callbacks=[early_stop, reduce_lr],
    verbose=0
)

# Calcular error de reconstrucción
X_train_reconstruido = autoencoder.predict(X_train, verbose=0)
X_test_reconstruido = autoencoder.predict(X_test, verbose=0)

error_ae_train = np.mean(np.abs(X_train - X_train_reconstruido), axis=1)
error_ae_test = np.mean(np.abs(X_test - X_test_reconstruido), axis=1)

# Normalizar
scores_ae_train = (error_ae_train - error_ae_train.min()) / (error_ae_train.max() - error_ae_train.min())
scores_ae_test = (error_ae_test - error_ae_train.min()) / (error_ae_train.max() - error_ae_train.min())
scores_ae_test = np.clip(scores_ae_test, 0, 1)

# Optimizar umbral
umbrales_ae = np.percentile(scores_ae_train, np.arange(85, 100, 1))
mejor_f1_ae = 0
mejor_umbral_ae = 0

for umbral in umbrales_ae:
    pred = (scores_ae_train > umbral).astype(int)
    f1 = f1_score(y_train, pred, zero_division=0)
    if f1 > mejor_f1_ae:
        mejor_f1_ae = f1
        mejor_umbral_ae = umbral

# Evaluar
pred_ae_test = (scores_ae_test > mejor_umbral_ae).astype(int)
roc_auc_ae = roc_auc_score(y_test, scores_ae_test)
f1_ae = f1_score(y_test, pred_ae_test)
precision_ae = precision_score(y_test, pred_ae_test, zero_division=0)
recall_ae = recall_score(y_test, pred_ae_test)

print(f"\nResultados Autoencoder:")
print(f"  ROC-AUC: {roc_auc_ae:.4f}")
print(f"  F1-Score: {f1_ae:.4f}")
print(f"  Precision: {precision_ae:.4f}")
print(f"  Recall: {recall_ae:.4f}")
print(f"  Épocas: {len(historial_ae.history['loss'])}")

MODELO 2: AUTOENCODER DENSO

Resultados Autoencoder:
  ROC-AUC: 0.4902
  F1-Score: 0.0852
  Precision: 0.0654
  Recall: 0.1222
  Épocas: 50
