In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# === CARGAR LOS DATOS ===
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# Seleccionar subconjuntos
df_reales_esc4 = df_reales.sample(n=75, random_state=42)
df_sinteticos_esc4 = df_sinteticos.sample(n=300, random_state=42)

# Unir ambos conjuntos
df_escenario4 = pd.concat([df_reales_esc4, df_sinteticos_esc4], ignore_index=True)

# Codificar variables categóricas
df_escenario4 = pd.get_dummies(df_escenario4, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# Eliminar columna de texto si existe
if 'Evolucion Final' in df_escenario4.columns:
    df_escenario4.drop(columns=['Evolucion Final'], inplace=True)

# === SEPARAR VARIABLES ===
X = df_escenario4.drop(columns='target')
y = df_escenario4['target']

# === MODELO Y VALIDACIÓN ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=8,
              callbacks=[es], verbose=0)

    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# Mostrar resultados
df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
   accuracy  precision    recall        f1       auc
0  0.666667   0.678571  0.844444  0.752475  0.666667
1  0.573333   0.622642  0.733333  0.673469  0.618519
2  0.626667   0.644068  0.844444  0.730769  0.563704
3  0.560000   0.603448  0.777778  0.679612  0.579259
4  0.773333   0.773585  0.891304  0.828283  0.742879

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.6400     0.6645  0.8183  0.7329  0.6342
std      0.0859     0.0671  0.0624  0.0630  0.0726
min      0.5600     0.6034  0.7333  0.6735  0.5637
25%      0.5733     0.6226  0.7778  0.6796  0

In [6]:
##PARA QUE LOS RESULTADOS SEAN REPRODUCIBLES 

import os
import random
import numpy as np
import tensorflow as tf

# Establecer todas las semillas
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Forzar comportamiento determinista en TensorFlow (puede hacer más lento el entrenamiento)
os.environ['TF_DETERMINISTIC_OPS'] = '1'


from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)




Incluso con esto, en algunas configuraciones hardware (especialmente si usas GPU y ciertas versiones de TensorFlow), puede seguir habiendo pequeñas variaciones debido a operaciones no deterministas. Pero en CPU o con determinismo habilitado, los resultados deberían ser idénticos en cada ejecución.

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# === CARGAR LOS DATOS ===
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# Seleccionar subconjuntos
df_reales_esc4 = df_reales.sample(n=75, random_state=42)
df_sinteticos_esc4 = df_sinteticos.sample(n=300, random_state=42)

# Unir ambos conjuntos
df_escenario4 = pd.concat([df_reales_esc4, df_sinteticos_esc4], ignore_index=True)

# Codificar variables categóricas
df_escenario4 = pd.get_dummies(df_escenario4, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# Eliminar columna de texto si existe
if 'Evolucion Final' in df_escenario4.columns:
    df_escenario4.drop(columns=['Evolucion Final'], inplace=True)

# === SEPARAR VARIABLES ===
X = df_escenario4.drop(columns='target')
y = df_escenario4['target']

# === MODELO Y VALIDACIÓN ===
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
resultados = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=8,
              callbacks=[es], verbose=0)

    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# Mostrar resultados
df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
   accuracy  precision    recall        f1       auc
0  0.640000   0.655172  0.844444  0.737864  0.605185
1  0.693333   0.683333  0.911111  0.780952  0.716296
2  0.613333   0.633333  0.844444  0.723810  0.585926
3  0.586667   0.616667  0.822222  0.704762  0.517778
4  0.546667   0.607143  0.739130  0.666667  0.479760

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.6160     0.6391  0.8323  0.7228  0.5810
std      0.0553     0.0307  0.0618  0.0421  0.0911
min      0.5467     0.6071  0.7391  0.6667  0.4798
25%      0.5867     0.6167  0.8222  0.7048  0

In [10]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# ================================
# Controlar aleatoriedad global
# ================================
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# ====================================
# Cargar y preparar datasets
# ====================================
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# Seleccionar 300 registros sintéticos aleatorios (si hay suficientes)
df_sinteticos_muestra = df_sinteticos.sample(n=300, random_state=SEED)

# Unificar datos
df_total = pd.concat([df_reales, df_sinteticos_muestra], ignore_index=True)

# Codificar variables categóricas
df_total = pd.get_dummies(df_total, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# Eliminar variable textual
if 'Evolucion Final' in df_total.columns:
    df_total.drop(columns=['Evolucion Final'], inplace=True)

# Separar variables
X = df_total.drop(columns=['target'])
y = df_total['target']

# ====================================
# Validación cruzada
# ====================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
resultados = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Escalado
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Modelo base
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=16,
              callbacks=[es], verbose=0)

    # Evaluación
    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# ====================================
# Mostrar resultados
# ====================================
df_resultados = pd.DataFrame(resultados)
print("\nResultados por fold:\n", df_resultados.round(4))
print("\nResumen estadístico:\n", df_resultados.describe().round(4))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step

Resultados por fold:
    accuracy  precision  recall      f1     auc
0    0.6267     0.6552  0.8261  0.7308  0.6522
1    0.6133     0.6538  0.7556  0.7010  0.6415
2    0.6133     0.6333  0.8444  0.7238  0.5504
3    0.5867     0.6129  0.8444  0.7103  0.5504
4    0.6667     0.6923  0.8000  0.7423  0.6578

Resumen estadístico:
        accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.6213     0.6495  0.8141  0.7216  0.6104
std      0.0292     0.0295  0.0375  0.0163  0.0551
min      0.5867     0.6129  0.7556  0.7010  0.5504
25%      0.6133     0.6333  0.8000  0.7103  0.5504
50%   