### 1. Importacion de Librerias y Carga de Datos

In [4]:
# Librerias basicas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, confusion_matrix, classification_report,
    roc_auc_score, balanced_accuracy_score, matthews_corrcoef,
    roc_curve, precision_recall_curve, average_precision_score
)

# Algoritmo semi-supervisado
from sklearn.semi_supervised import LabelSpreading
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

# Configuracion de graficos
plt.style.use('ggplot')
sns.set_palette("husl")

# CARGAR DATOS
ruta = '../../data/data/application_train.csv'
datos = pd.read_csv(ruta)

print(f"Datos: {datos.shape} | Fraude: {datos['TARGET'].mean()*100:.1f}%")
datos.head()

Datos: (307511, 122) | Fraude: 8.1%


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Seleccionar variables numericas
datos_numericos = datos.select_dtypes(include=[np.number])
print(f"Variables numericas: {datos_numericos.shape[1]}")

Variables numericas: 106


In [None]:
# Seleccionar variables categoricas
datos_categoricos = datos.select_dtypes(include=['object'])
print(f"Variables categoricas: {datos_categoricos.shape[1]}")

Variables categoricas: 16


In [5]:
# PREPARACION DE DATOS MIXTOS (NUMERICOS Y CATEGORICOS)
nombres_espanol = {
    'TARGET': 'objetivo', 'CNT_CHILDREN': 'num_hijos',
    'AMT_INCOME_TOTAL': 'ingreso_total', 'AMT_CREDIT': 'monto_credito',
    'AMT_ANNUITY': 'anualidad', 'AMT_GOODS_PRICE': 'precio_bienes',
    'DAYS_BIRTH': 'edad_dias', 'DAYS_EMPLOYED': 'dias_empleado',
    'EXT_SOURCE_1': 'score_externo_1', 'EXT_SOURCE_2': 'score_externo_2',
    'EXT_SOURCE_3': 'score_externo_3', 'NAME_CONTRACT_TYPE': 'tipo_contrato',
    'CODE_GENDER': 'genero', 'FLAG_OWN_CAR': 'tiene_auto',
    'NAME_INCOME_TYPE': 'tipo_ingreso', 'NAME_EDUCATION_TYPE': 'nivel_educacion',
    'NAME_FAMILY_STATUS': 'estado_civil', 'NAME_HOUSING_TYPE': 'tipo_vivienda'
}

datos = datos.rename(columns=nombres_espanol)

columnas_importantes = [
    'objetivo', 'num_hijos', 'ingreso_total', 'monto_credito', 'anualidad',
    'precio_bienes', 'edad_dias', 'dias_empleado', 'score_externo_1',
    'score_externo_2', 'score_externo_3', 'tipo_contrato', 'genero',
    'tiene_auto', 'tipo_ingreso', 'nivel_educacion', 'estado_civil', 'tipo_vivienda'
]

columnas_usar = [col for col in columnas_importantes if col in datos.columns]
datos_filtrados = datos[columnas_usar].copy()

datos_numericos = datos_filtrados.select_dtypes(include=[np.number])
datos_categoricos = datos_filtrados.select_dtypes(include=['object'])

In [6]:
# PROCESAMIENTO DE VARIABLES NUMERICAS
for col in datos_numericos.columns:
    if datos_numericos[col].isnull().any():
        datos_numericos[col].fillna(datos_numericos[col].median(), inplace=True)

In [7]:
# PROCESAMIENTO DE VARIABLES CATEGORICAS
datos_categoricos = datos_categoricos.fillna('Desconocido')
label_encoders = {}
for col in datos_categoricos.columns:
    le = LabelEncoder()
    datos_categoricos[col] = le.fit_transform(datos_categoricos[col])
    label_encoders[col] = le

In [8]:
# COMBINACION Y PREPARACION FINAL
datos_procesados = pd.concat([datos_numericos, datos_categoricos], axis=1)
muestra = datos_procesados.sample(n=5000, random_state=42)

X = muestra.drop('objetivo', axis=1)
y = muestra['objetivo']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# CREACION DE CONJUNTO SEMI-SUPERVISADO
porcentaje_etiquetado = 0.2
n_etiquetados = int(len(X_train) * porcentaje_etiquetado)
indices_etiquetados = np.random.choice(len(X_train), n_etiquetados, replace=False)
mask_etiquetados = np.zeros(len(X_train), dtype=bool)
mask_etiquetados[indices_etiquetados] = True
y_semisup = y_train.copy()
y_semisup[~mask_etiquetados] = -1

In [10]:
# ENTRENAMIENTO DE MODELOS
modelo_base = RandomForestClassifier(n_estimators=100, random_state=42)
modelo_base.fit(X_train_scaled[mask_etiquetados], y_train.iloc[mask_etiquetados])

modelo_semisup = LabelSpreading(
    kernel='rbf',
    gamma=0.1,
    alpha=0.2,
    max_iter=30,
    tol=1e-3
)
modelo_semisup.fit(X_train_scaled, y_semisup)

In [13]:
# IMPLEMENTACION DEL ALGORITMO LABEL SPREADING SEGUN PSEUDOCODIGO
class LabelSpreadingCustomCategoricas:
    def __init__(self, alpha=0.2, gamma=0.1, max_iter=30, tol=1e-3):
        self.alpha = alpha  # Parametro de clamping
        self.gamma = gamma  # Parametro del kernel RBF
        self.max_iter = max_iter
        self.tol = tol
        
    def _construir_grafo_similitud(self, X):
        """1. Construccion del Grafo de Similitud"""
        n = X.shape[0]
        W = np.zeros((n, n))
        
        # Calcular matriz de similitud con kernel RBF
        for i in range(n):
            for j in range(n):
                if i != j:
                    distancia_cuadrada = np.sum((X[i] - X[j]) ** 2)
                    W[i, j] = np.exp(-self.gamma * distancia_cuadrada)
        return W
    
    def _normalizar_matriz(self, W):
        """2. Normalizacion Simetrica"""
        # Matriz diagonal D
        D = np.diag(np.sum(W, axis=1))
        D_sqrt_inv = np.diag(1.0 / np.sqrt(np.diag(D) + 1e-8))
        
        # S = D^(-1/2) * W * D^(-1/2)
        S = D_sqrt_inv @ W @ D_sqrt_inv
        return S
    
    def _inicializar_etiquetas(self, y, n_total, n_labeled):
        """3. Inicializacion de Etiquetas"""
        num_clases = len(np.unique(y[y >= 0]))
        Y = np.zeros((n_total, num_clases))
        
        # Para datos etiquetados: one-hot encoding
        for i in range(n_labeled):
            if y[i] >= 0:
                Y[i, int(y[i])] = 1.0
                
        Y_inicial = Y.copy()
        return Y, Y_inicial
    
    def _propagar_etiquetas(self, S, Y, Y_inicial):
        """4. Propagacion Iterativa con Soft Clamping"""
        for iteration in range(self.max_iter):
            Y_anterior = Y.copy()
            
            # Y_new = alpha * S * Y + (1 - alpha) * Y_inicial
            Y_new = self.alpha * (S @ Y) + (1 - self.alpha) * Y_inicial
            
            # Verificar convergencia
            diferencia = np.linalg.norm(Y_new - Y_anterior)
            if diferencia < self.tol:
                break
                
            Y = Y_new
            
        return Y
    
    def _asignar_etiquetas_finales(self, Y):
        """5. Asignacion Final de Etiquetas"""
        return np.argmax(Y, axis=1)
    
    def fit(self, X, y):
        """Metodo principal que implementa el algoritmo completo"""
        n_total = len(X)
        n_labeled = np.sum(y >= 0)
        
        # 1. Construir grafo de similitud
        W = self._construir_grafo_similitud(X)
        
        # 2. Normalizar matriz
        S = self._normalizar_matriz(W)
        
        # 3. Inicializar etiquetas
        Y, Y_inicial = self._inicializar_etiquetas(y, n_total, n_labeled)
        
        # 4. Propagacion iterativa
        Y_final = self._propagar_etiquetas(S, Y, Y_inicial)
        
        # 5. Asignar etiquetas finales
        self.etiquetas_predichas = self._asignar_etiquetas_finales(Y_final)
        self.probabilidades = Y_final
        
        return self
    
    def predict(self, X_test):
        """Prediccion para nuevos datos"""
        return self.etiquetas_predichas[:len(X_test)]
    
    def predict_proba(self, X_test):
        """Probabilidades para nuevos datos"""
        return self.probabilidades[:len(X_test)]

# ENTRENAMIENTO CON IMPLEMENTACION PERSONALIZADA
modelo_custom_cat = LabelSpreadingCustomCategoricas(alpha=0.2, gamma=0.1, max_iter=30, tol=1e-3)

# Preparar datos para el algoritmo personalizado
X_combined_cat = np.vstack([X_train_scaled, X_test_scaled])
y_combined_cat = np.concatenate([y_semisup, np.full(len(y_test), -1)])

modelo_custom_cat.fit(X_combined_cat, y_combined_cat)

<__main__.LabelSpreadingCustomCategoricas at 0x1784304b6d0>

### 4. Modelo Base

In [14]:
# PREDICCIONES Y EVALUACION - MODELOS COMPARATIVOS
y_pred_base = modelo_base.predict(X_test_scaled)
y_pred_semisup = modelo_semisup.predict(X_test_scaled)

# Predicciones del modelo personalizado
y_pred_custom_cat = modelo_custom_cat.predict(X_test_scaled)
y_proba_custom_cat = modelo_custom_cat.predict_proba(X_test_scaled)

# Metricas modelo base
metricas_base = {
    'Accuracy': accuracy_score(y_test, y_pred_base),
    'Precision': precision_score(y_test, y_pred_base, zero_division=0),
    'Recall': recall_score(y_test, y_pred_base),
    'F1-Score': f1_score(y_test, y_pred_base),
    'ROC-AUC': roc_auc_score(y_test, modelo_base.predict_proba(X_test_scaled)[:, 1])
}

# Metricas modelo sklearn
metricas_sklearn = {
    'Accuracy': accuracy_score(y_test, y_pred_semisup),
    'Precision': precision_score(y_test, y_pred_semisup, zero_division=0),
    'Recall': recall_score(y_test, y_pred_semisup),
    'F1-Score': f1_score(y_test, y_pred_semisup),
    'ROC-AUC': roc_auc_score(y_test, modelo_semisup.predict_proba(X_test_scaled)[:, 1])
}

# Metricas modelo personalizado con categoricas
metricas_custom_cat = {
    'Accuracy': accuracy_score(y_test, y_pred_custom_cat),
    'Precision': precision_score(y_test, y_pred_custom_cat, zero_division=0),
    'Recall': recall_score(y_test, y_pred_custom_cat),
    'F1-Score': f1_score(y_test, y_pred_custom_cat),
    'ROC-AUC': roc_auc_score(y_test, y_proba_custom_cat[:, 1]) if y_proba_custom_cat.shape[1] > 1 else 0.5
}

In [16]:
# METRICAS DE EVALUACION FINALES - COMPARACION COMPLETA CON CATEGORICAS
tabla_metricas_cat = pd.DataFrame({
    'Modelo': ['Modelo Base', 'Label Spreading (sklearn)', 'Label Spreading (Pseudocodigo)'],
    'Features': [X_train.shape[1], X_train.shape[1], X_train.shape[1]],
    'ROC-AUC': [metricas_base['ROC-AUC'], metricas_sklearn['ROC-AUC'], metricas_custom_cat['ROC-AUC']],
    'Precision': [metricas_base['Precision'], metricas_sklearn['Precision'], metricas_custom_cat['Precision']],
    'Recall': [metricas_base['Recall'], metricas_sklearn['Recall'], metricas_custom_cat['Recall']],
    'F1-Score': [metricas_base['F1-Score'], metricas_sklearn['F1-Score'], metricas_custom_cat['F1-Score']]
})

print("Comparacion de Modelos - Label Spreading con Variables Categoricas:")
print("=" * 70)
print(tabla_metricas_cat.round(6).to_string(index=False))


Comparacion de Modelos - Label Spreading con Variables Categoricas:
                        Modelo  Features  ROC-AUC  Precision   Recall  F1-Score
                   Modelo Base        17 0.681619        1.0 0.008065     0.016
     Label Spreading (sklearn)        17 0.660857        0.0 0.000000     0.000
Label Spreading (Pseudocodigo)        17 0.562506        0.0 0.000000     0.000
