In [7]:
import pandas as pd
import numpy as np

# --- Dataset reales imputados ---
df_reales = pd.read_excel("dataset_resultado_categorizado.xlsx")

# Imputar valores numéricos con la mediana
df_reales_imputado = df_reales.copy()
for col in df_reales_imputado.select_dtypes(include=[np.number]).columns:
    df_reales_imputado[col] = df_reales_imputado[col].fillna(df_reales_imputado[col].median())

# Eliminar columnas innecesarias
df_reales_imputado = df_reales_imputado.loc[:, ~df_reales_imputado.columns.str.contains('^Unnamed')]

# Crear columna target
df_reales_imputado['target'] = df_reales_imputado['Evolucion Final'].apply(lambda x: 1 if x != 'Nada' else 0)

# Guardar el dataset limpio
df_reales_imputado.to_excel("dataset_reales_imputados.xlsx", index=False)


# --- Dataset sintéticos ---
df_sinteticos = pd.read_excel("datos_sinteticos_CTGAN_solo_positivos.xlsx")

# Eliminar columnas innecesarias
df_sinteticos = df_sinteticos.loc[:, ~df_sinteticos.columns.str.contains('^Unnamed')]

# Crear columna target
df_sinteticos['target'] = df_sinteticos['Evolucion Final'].apply(lambda x: 1 if x != 'Nada' else 0)

# Guardar el dataset limpio
df_sinteticos.to_excel("datos_sinteticos_finales.xlsx", index=False)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Cargar los datasets preparados
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# Unificar datasets
df_total = pd.concat([df_reales, df_sinteticos], ignore_index=True)

# Codificación one-hot
df_total = pd.get_dummies(df_total, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# Eliminar columna de texto no necesaria
if 'Evolucion Final' in df_total.columns:
    df_total.drop(columns=['Evolucion Final'], inplace=True)

# Separar X e y
X = df_total.drop(columns='target')
y = df_total['target']

# Validación cruzada estratificada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=8, callbacks=[es], verbose=0)

    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# Mostrar resultados
df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))



[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
   accuracy  precision    recall        f1       auc
0  0.595174   0.608069  0.933628  0.736475  0.532298
1  0.576408   0.611842  0.823009  0.701887  0.533412
2  0.600536   0.621451  0.871681  0.725599  0.528445
3  0.568365   0.606667  0.808889  0.693333  0.562673
4  0.621984   0.618644  0.973333  0.756477  0.527477

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.5925     0.6133  0.8821  0.7228  0.5369
std      0.0211     0.0065  0.0706  0.0257  0.0146
min      0.5684     0.6067  0.8089  0.6933  0.5275
25%      0.5764     0.6081  0.8230 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Cargar los datasets preparados
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# Unificar los datos
df_total = pd.concat([df_reales, df_sinteticos], ignore_index=True)

# One-hot encoding (por si acaso)
df_total = pd.get_dummies(df_total, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# Variables predictoras y objetivo
X = df_total.drop(columns=['Evolucion Final', 'target'])
y = df_total['target']

# Validación cruzada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Modelo profundo
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(256, activation='relu'),
        Dropout(0.4),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=8, callbacks=[es], verbose=0)

    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
   accuracy  precision    recall        f1       auc
0  0.597855   0.607345  0.951327  0.741379  0.506983
1  0.613941   0.620588  0.933628  0.745583  0.542141
2  0.616622   0.635179  0.862832  0.731707  0.577539
3  0.597855   0.601078  0.991111  0.748322  0.574024
4  0.611260   0.611111  0.977778  0.752137  0.560751

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.6075     0.6151  0.9433  0.7438  0.5523
std      0.0090     0.0133  0.0503  0.0078  0.0289
min      0.5979     0.6011  0.8628  0.7317  0.5070
25%      0.5979     0.6073  0.9336 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Cargar los datasets ya preparados
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# Unir ambos datasets
df_total = pd.concat([df_reales, df_sinteticos], ignore_index=True)

# Codificar variables categóricas si no lo están
df_total = pd.get_dummies(df_total, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# Separar variables predictoras y objetivo
X = df_total.drop(columns=['Evolucion Final', 'target'])
y = df_total['target']

# Validación cruzada estratificada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
resultados = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Escalado de características
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Arquitectura más ligera
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.1),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Entrenamiento
    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=16, callbacks=[es], verbose=0)

    # Predicción y evaluación
    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# Mostrar resultados
df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
   accuracy  precision    recall        f1       auc
0  0.592493   0.614198  0.880531  0.723636  0.506833
1  0.589812   0.610272  0.893805  0.725314  0.505840
2  0.576408   0.612583  0.818584  0.700758  0.574409
3  0.611260   0.621212  0.911111  0.738739  0.601021
4  0.533512   0.597701  0.693333  0.641975  0.492342

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.5807     0.6112  0.8395  0.7061  0.5361
std      0.0292     0.0086  0.0888  0.0383  0.0484
min      0.5335     0.5977  0.6933  0.6420  0.4923
25%      0.5764     0.6103  0.8186 

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# 1. Cargar datasets
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# 2. Seleccionar 75 reales y 75 sintéticos
df_reales_75 = df_reales.sample(n=75, random_state=42)
df_sinteticos_75 = df_sinteticos.sample(n=75, random_state=42)

# 3. Concatenar ambos datasets
df_total = pd.concat([df_reales_75, df_sinteticos_75], ignore_index=True)

# 4. One-hot encoding para variables categóricas
df_total = pd.get_dummies(df_total, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# 5. Eliminar columna de texto si existe
if 'Evolucion Final' in df_total.columns:
    df_total.drop(columns=['Evolucion Final'], inplace=True)

# 6. Separar variables predictoras y variable objetivo
X = df_total.drop(columns='target')
y = df_total['target']

# 7. Validación cruzada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 8. Resultados
resultados = []

# 9. MODELO - Arquitectura PROFUNDA (opción 2, la mejor anterior)
for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Escalar
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Modelo
    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(256, activation='relu'),
        Dropout(0.4),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=8, callbacks=[es], verbose=0)

    # Predicción
    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    # Evaluación
    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# 10. Mostrar resultados
df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
   accuracy  precision    recall        f1       auc
0  0.733333   0.708333  0.944444  0.809524  0.726852
1  0.633333   0.652174  0.833333  0.731707  0.587963
2  0.833333   0.842105  0.888889  0.864865  0.800926
3  0.800000   0.809524  0.894737  0.850000  0.779904
4  0.633333   0.653846  0.894737  0.755556  0.736842

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.7267     0.7332  0.8912  0.8023  0.7265
std      0.0925     0.0883  0.0394  0.0579  0.0832
min      0.6333     0.6522  0.8333  0.7317  0.5880
25%      0.6333     0.6538  0.8889  0.7556  0

VERSION REPRODUCIBLE

In [1]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# 0. Fijar semillas para reproducibilidad
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Cargar datasets
df_reales = pd.read_excel("dataset_reales_imputados.xlsx")
df_sinteticos = pd.read_excel("datos_sinteticos_finales.xlsx")

# 2. Seleccionar 75 reales y 75 sintéticos
df_reales_75 = df_reales.sample(n=75, random_state=SEED)
df_sinteticos_75 = df_sinteticos.sample(n=75, random_state=SEED)

# 3. Concatenar ambos datasets
df_total = pd.concat([df_reales_75, df_sinteticos_75], ignore_index=True)

# 4. One-hot encoding para variables categóricas
df_total = pd.get_dummies(df_total, columns=['Tipo_vasculitis', 'Tipo'], drop_first=True)

# 5. Eliminar columna de texto si existe
if 'Evolucion Final' in df_total.columns:
    df_total.drop(columns=['Evolucion Final'], inplace=True)

# 6. Separar variables predictoras y variable objetivo
X = df_total.drop(columns='target')
y = df_total['target']

# 7. Validación cruzada estratificada
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# 8. Resultados por fold
resultados = []

# 9. Modelo: Arquitectura profunda (opción 2)
for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = Sequential([
        Input(shape=(X.shape[1],)),
        Dense(256, activation='relu'),
        Dropout(0.4),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=100, batch_size=8, callbacks=[es], verbose=0)

    y_pred_probs = model.predict(X_val_scaled).ravel()
    y_pred = (y_pred_probs > 0.5).astype(int)

    resultados.append({
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'auc': roc_auc_score(y_val, y_pred_probs)
    })

# 10. Mostrar resultados
df_resultados = pd.DataFrame(resultados)
print(df_resultados)
print("\nResumen estadístico:")
print(df_resultados.describe().round(4))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
   accuracy  precision    recall        f1       auc
0  0.700000   0.680000  0.944444  0.790698  0.685185
1  0.700000   0.680000  0.944444  0.790698  0.625000
2  0.833333   0.809524  0.944444  0.871795  0.842593
3  0.700000   0.750000  0.789474  0.769231  0.712919
4  0.733333   0.761905  0.842105  0.800000  0.775120

Resumen estadístico:
       accuracy  precision  recall      f1     auc
count    5.0000     5.0000  5.0000  5.0000  5.0000
mean     0.7333     0.7363  0.8930  0.8045  0.7282
std      0.0577     0.0560  0.0729  0.0393  0.0837
min      0.7000     0.6800  0.7895  0.7692  0.6250
25%      0.7000     0.6800  0.8421  0.7907  0