In [2]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# -------------------------
# 1. Controlar aleatoriedad
# -------------------------
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# -------------------------
# 2. Cargar datasets
# -------------------------
df_real = pd.read_excel("dataset_reales_imputados.xlsx")
df_sint = pd.read_excel("datos_sinteticos_mahalanobis.xlsx")

# -------------------------
# 3. Codificación categórica
# -------------------------
# Unificamos para codificar igual ambos
df_real["es_sintetico"] = 0
df_sint["es_sintetico"] = 1
df_total = pd.concat([df_real, df_sint], ignore_index=True)

# One-hot encoding de variables categóricas
df_total = pd.get_dummies(df_total, columns=["Tipo", "Tipo_vasculitis"], drop_first=True)

# Eliminar columnas innecesarias
cols_a_eliminar = ["Evolucion Final", "es_sintetico"]
df_total = df_total.drop(columns=[col for col in cols_a_eliminar if col in df_total.columns])

# -------------------------
# 4. Generar escenarios
# -------------------------
# Separar nuevamente
df_reales = df_total[df_total.index < len(df_real)]
df_sint = df_total[df_total.index >= len(df_real)]

df_5a = pd.concat([df_reales, df_sint.sample(n=75, random_state=SEED)], ignore_index=True)
df_5b = pd.concat([df_reales, df_sint.sample(n=150, random_state=SEED)], ignore_index=True)

# -------------------------
# 5. Función de entrenamiento
# -------------------------
def entrenar_modelo(df, nombre="Modelo"):
    print(f"\n=== Entrenando {nombre} ===")

    X = df.drop(columns=["target"])
    y = df["target"]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    accs, precs, recalls, f1s, aucs = [], [], [], [], []

    for train_idx, val_idx in skf.split(X_scaled, y):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = Sequential([
            Input(shape=(X.shape[1],)),
            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        model.fit(X_train, y_train,
                  validation_data=(X_val, y_val),
                  epochs=100, batch_size=16,
                  callbacks=[es], verbose=0)

        y_pred_probs = model.predict(X_val).ravel()
        y_pred = (y_pred_probs > 0.5).astype(int)

        accs.append(accuracy_score(y_val, y_pred))
        precs.append(precision_score(y_val, y_pred, zero_division=0))
        recalls.append(recall_score(y_val, y_pred, zero_division=0))
        f1s.append(f1_score(y_val, y_pred, zero_division=0))
        aucs.append(roc_auc_score(y_val, y_pred_probs))

    print(f"Accuracy:  {np.mean(accs):.4f} ± {np.std(accs):.4f}")
    print(f"Precision: {np.mean(precs):.4f} ± {np.std(precs):.4f}")
    print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
    print(f"F1-score:  {np.mean(f1s):.4f} ± {np.std(f1s):.4f}")
    print(f"AUC:       {np.mean(aucs):.4f} ± {np.std(aucs):.4f}")

# -------------------------
# 6. Ejecutar ambos modelos
# -------------------------
entrenar_modelo(df_5a, nombre="Escenario 5a (75 sintéticos)")
entrenar_modelo(df_5b, nombre="Escenario 5b (150 sintéticos)")



=== Entrenando Escenario 5a (75 sintéticos) ===
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Accuracy:  0.7333 ± 0.0699
Precision: 0.7508 ± 0.0596
Recall:    0.8737 ± 0.0537
F1-score:  0.8064 ± 0.0487
AUC:       0.7416 ± 0.1037

=== Entrenando Escenario 5b (150 sintéticos) ===
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Accuracy:  0.7022 ± 0.0301
Precision: 0.7354 ±

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Cargar datasets
df_real = pd.read_excel("dataset_reales_imputados.xlsx")
df_sint = pd.read_excel("datos_sinteticos_mahalanobis.xlsx")

# Crear los escenarios
df_5a = pd.concat([df_real, df_sint.sample(n=75, random_state=42)], ignore_index=True)
df_5b = pd.concat([df_real, df_sint.sample(n=150, random_state=42)], ignore_index=True)

def preparar_dataset(df):
    df = df.drop(columns=["Evolución Final", "IRC", "ERCA", "Muerte", "Dialisis"], errors='ignore')
    df = pd.get_dummies(df)
    X = df.drop(columns=["target"])
    y = df["target"]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

def entrenar_modelo(X, y, nombre_escenario):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    resultados = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = Sequential()
        model.add(Dense(128, activation='relu', input_dim=X.shape[1]))
        model.add(Dropout(0.3))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy')

        early_stop = EarlyStopping(patience=10, restore_best_weights=True, monitor='val_loss', verbose=0)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=16, callbacks=[early_stop], verbose=0)

        y_pred_prob = model.predict(X_val).ravel()
        y_pred = (y_pred_prob >= 0.5).astype(int)

        resultados.append({
            "accuracy": accuracy_score(y_val, y_pred),
            "precision": precision_score(y_val, y_pred),
            "recall": recall_score(y_val, y_pred),
            "f1": f1_score(y_val, y_pred),
            "auc": roc_auc_score(y_val, y_pred_prob)
        })

    df_resultados = pd.DataFrame(resultados)
    print(f"\n=== Resultados por fold - {nombre_escenario} ===")
    print(df_resultados)
    print(f"\nResumen estadístico:\n{df_resultados.describe()}")

# Preparar y entrenar ambos escenarios
X_5a, y_5a = preparar_dataset(df_5a)
X_5b, y_5b = preparar_dataset(df_5b)

entrenar_modelo(X_5a, y_5a, "Escenario 5a (75 sintéticos)")
entrenar_modelo(X_5b, y_5b, "Escenario 5b (150 sintéticos)")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step

=== Resultados por fold - Escenario 5a (75 sintéticos) ===
   accuracy  precision    recall        f1       auc
0  1.000000   1.000000  1.000000  1.000000  1.000000
1  0.933333   0.947368  0.947368  0.947368  0.990431
2  0.866667   0.826087  1.000000  0.904762  0.980861
3  0.933333   0.947368  0.947368  0.947368  0.995215
4  0.800000   0.842105  0.842105  0.842105  0.933014

Resumen estadístico:
       accuracy  precision    recall        f1       auc
count  5.000000   5.000000  5.000000  5.000000  5.000000
mean   0.906667   0.912586  0.947368  0.928321  0.979904
std    0.076012   0.075017  0.064460  0.058847  0.027151
min    0.800000   0.826087  0.842105  0.842105  0.933014
25%    0.866667   0.842105  0.947368  0.904762  0.980861
50%    0.933333   0.947368  0.947368  0.947368  0.990431
75%    0.933333   0.947368  1.000000  0.947368  0.995215
max    1.000000   1.000000  1.000000  1.000000  1.000000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step

=== Resultados por fold - Escenario 5b (150 sintéticos) ===
   accuracy  precision    recall        f1       auc
0  0.911111   0.906250  0.966667  0.935484  0.982222
1  0.977778   0.967742  1.000000  0.983607  1.000000
2  0.977778   0.966667  1.000000  0.983051  1.000000
3  0.911111   0.903226  0.965517  0.933333  0.989224
4  1.000000   1.000000  1.000000  1.000000  1.000000

Resumen estadístico:
       accuracy  precision    recall        f1       auc
count  5.000000   5.000000  5.000000  5.000000  5.000000
mean   0.955556   0.948777  0.986437  0.967095  0.994289
std    0.041574   0.042388  0.018577  0.030615  0.008202
min    0.911111   0.903226  0.965517  0.933333  0.982222
25%    0.911111   0.906250  0.966667  0.935484  0.989224
50%    0.977778   0.966667  1.000000  0.983051  1.000000
75%    0.977778   0.967742  1.000000  0.983607  1.000000
max    1.000000   1.000000  1.000000  1.000000  1.000000


In [5]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# -------------------------
# 1. Controlar aleatoriedad
# -------------------------
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# -------------------------
# 2. Cargar datasets
# -------------------------
df_real = pd.read_excel("dataset_reales_imputados.xlsx")
df_sint = pd.read_excel("datos_sinteticos_mahalanobis.xlsx")

# -------------------------
# 3. Codificación categórica
# -------------------------
df_real["es_sintetico"] = 0
df_sint["es_sintetico"] = 1
df_total = pd.concat([df_real, df_sint], ignore_index=True)

df_total = pd.get_dummies(df_total, columns=["Tipo", "Tipo_vasculitis"], drop_first=True)

cols_a_eliminar = ["Evolucion Final", "es_sintetico"]
df_total = df_total.drop(columns=[col for col in cols_a_eliminar if col in df_total.columns])

# -------------------------
# 4. Generar escenarios
# -------------------------
df_reales = df_total[df_total.index < len(df_real)]
df_sint = df_total[df_total.index >= len(df_real)]

df_5a = pd.concat([df_reales, df_sint.sample(n=75, random_state=SEED)], ignore_index=True)
df_5b = pd.concat([df_reales, df_sint.sample(n=150, random_state=SEED)], ignore_index=True)

# -------------------------
# 5. Función de entrenamiento
# -------------------------
def entrenar_modelo(df, nombre="Modelo"):
    print(f"\n=== Resultados por fold - {nombre} ===")
    X = df.drop(columns=["target"])
    y = df["target"]

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    resultados = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        model = Sequential([
            Input(shape=(X.shape[1],)),
            Dense(128, activation='relu'),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy')
        es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        model.fit(X_train, y_train, validation_data=(X_val, y_val),
                  epochs=100, batch_size=16, callbacks=[es], verbose=0)

        y_pred_probs = model.predict(X_val).ravel()
        y_pred = (y_pred_probs > 0.5).astype(int)

        resultados.append({
            "accuracy": accuracy_score(y_val, y_pred),
            "precision": precision_score(y_val, y_pred, zero_division=0),
            "recall": recall_score(y_val, y_pred, zero_division=0),
            "f1": f1_score(y_val, y_pred, zero_division=0),
            "auc": roc_auc_score(y_val, y_pred_probs)
        })

    df_resultados = pd.DataFrame(resultados)
    print(df_resultados)
    print("\nResumen estadístico:")
    print(df_resultados.describe())

# -------------------------
# 6. Ejecutar ambos modelos
# -------------------------
entrenar_modelo(df_5a, nombre="Escenario 5a (75 sintéticos)")
entrenar_modelo(df_5b, nombre="Escenario 5b (150 sintéticos)")



=== Resultados por fold - Escenario 5a (75 sintéticos) ===
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
   accuracy  precision    recall        f1       auc
0  0.800000   0.782609  0.947368  0.857143  0.808612
1  0.800000   0.809524  0.894737  0.850000  0.885167
2  0.666667   0.680000  0.894737  0.772727  0.602871
3  0.700000   0.727273  0.842105  0.780488  0.655502
4  0.666667   0.714286  0.789474  0.750000  0.765550

Resumen estadístico:
       accuracy  precision    recall        f1       auc
count  5.000000   5.000000  5.000000  5.000000  5.000000
mean   0.726667   0.742738  0.873684  0.802072  0.743541
std    0.068313   0.052523  0.060009  0.048395  0.114363
min    0.666