In [None]:
!pip install gradio

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import sklearn.metrics as mt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from scipy.stats import norm
from plotnine import*
import matplotlib.pyplot as plt
from collections import Counter
from PIL import Image
import io
import gradio as gr

import warnings
warnings.filterwarnings("ignore")

In [None]:
def runExperiments(n_samples, n_features, weights_array, prop_partition, repetitions, random_state=None):
  """Repeat a random hold-out evaluation of a depth-4 decision tree on simulated data.

  A binary dataset is simulated once with ``make_classification`` (fixed seed 111).
  Then, ``repetitions`` times, the row indices are split at random into train and
  test partitions, a ``DecisionTreeClassifier`` is fitted on the train rows and the
  ROC AUC (rounded to 3 decimals) is computed on both partitions.

  Parameters
  ----------
  n_samples : int
      Number of simulated rows.
  n_features : int
      Number of simulated columns.
  weights_array : list of float
      Class proportions forwarded to ``make_classification(weights=...)``.
  prop_partition : float
      Forwarded to ``train_test_split(test_size=...)``.
  repetitions : int
      Number of independent hold-out splits to evaluate.
  random_state : int or None, default None
      Seed for the sequence of hold-out splits. With ``None`` (the default) the
      splits are unseeded, exactly as before this parameter existed, so two calls
      give different results. With an int, the whole sequence of splits is
      reproducible while each repetition still gets a different split.

  Returns
  -------
  tuple
      ``(auc_train, auc_test, X, y, all_test_index)``: two arrays of length
      ``repetitions`` with the per-split AUC values, the simulated data, and the
      set of row indices that appeared in at least one test partition.
  """
  X, y = make_classification(n_samples = n_samples, n_features= n_features, n_classes = 2,
                             weights = weights_array, class_sep= 0.82, hypercube = False, random_state=111)

  auc_train = np.zeros(repetitions)
  auc_test = np.zeros(repetitions)
  all_test_index = set()
  index = np.arange(X.shape[0])

  # A single generator shared by all repetitions: consecutive splits differ,
  # but the sequence as a whole is fixed by the seed.
  split_rng = None if random_state is None else np.random.RandomState(random_state)

  for i in range(repetitions):

    # Split the row indices (not the data) so the test rows can be tracked.
    train_index, test_index = train_test_split(index, test_size=prop_partition, random_state=split_rng)
    all_test_index.update(test_index.tolist())

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=4, min_samples_split=2, min_samples_leaf=1, random_state=0)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba holds the score for class 1.
    pred_score_train = model.predict_proba(X_train)[:,1]
    pred_score_test = model.predict_proba(X_test)[:,1]

    fpr_train, tpr_train, _ = mt.roc_curve(y_train, pred_score_train, pos_label=1)
    auc_train[i] = round(mt.auc(fpr_train,tpr_train),3)
    fpr_test, tpr_test, _ = mt.roc_curve(y_test, pred_score_test, pos_label=1)
    auc_test[i] = round(mt.auc(fpr_test,tpr_test),3)

  return(auc_train, auc_test, X, y, all_test_index)

def createDataFrame(auc_train,auc_test,repetitions ):
  """Build a long-format table of the AUC values obtained in each repetition.

  The result has one row per (repetition, partition) pair with the columns
  ``iteracion`` (0 .. repetitions-1), ``particion`` (``'auc_train'`` or
  ``'auc_test'``) and ``value``. Missing values are replaced by 0.
  """
  wide = pd.DataFrame({
      'iteracion': list(range(repetitions)),
      'auc_train': auc_train,
      'auc_test': auc_test,
  })
  long_format = wide.melt(
      id_vars=['iteracion'],
      value_vars=['auc_train', 'auc_test'],
      var_name='particion',
  )
  return long_format.fillna(0)


def ROC_graph(pred_score_train, pred_score_test, y_train, y_test):
  """Draw the train and test ROC curves of one model on a single figure.

  Parameters
  ----------
  pred_score_train, pred_score_test
      Predicted scores for class 1 on the train and test partitions.
  y_train, y_test
      True labels of the same partitions (class 1 is the positive label).

  The legend reports the AUC of each curve rounded to 3 decimals. The figure is
  shown with ``plt.show()``; nothing is returned.
  """
  plt.figure(figsize=(12,10))

  # (legend prefix, true labels, scores, line colour) for each partition
  curves = (
      ("AUC data entrenamiento, AUC=", y_train, pred_score_train, 'blueviolet'),
      ("AUC data test, AUC=", y_test, pred_score_test, 'orange'),
  )
  for label_prefix, y_true, scores, line_color in curves:
    fpr, tpr, _ = mt.roc_curve(y_true, scores, pos_label=1)
    auc_value = round(mt.auc(fpr, tpr), 3)
    plt.plot(fpr, tpr, label=label_prefix + str(auc_value), color=line_color)

  # Diagonal = performance of a random classifier
  plt.plot([0,1], [0,1],color='gray', lw=2, linestyle="--")

  # Label the axes so the figure can be read on its own
  plt.xlabel('FPR')
  plt.ylabel('TPR')
  plt.title('ROC Curves')
  plt.legend(fontsize=14)
  plt.show()

In [None]:
# Simulation settings
n_samples, n_features = 1000, 10
weights_array = [0.7, 0.3]

# Simulated binary dataset (fixed seed)
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_classes=2,
    weights=weights_array,
    class_sep=0.82,
    hypercube=False,
    random_state=111,
)

# Project the features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Visualize: one point per case, filled by class label
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(y=y)

(
    ggplot(df)
    + aes(x='PC1', y='PC2', fill='factor(y)')
    + geom_point(size=2.5, alpha=0.6)
    + theme_bw()
    + labs(title='Visualización de los datos simulados', x='PC1', y='PC2', fill='Enfermedad')
    + scale_fill_manual(values=['lightgray', 'brown'])
    + theme(figure_size=(8,6))
)


In [None]:
## Hold-out partition (seeded, so the split is the same on every run)
prop_partition = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=prop_partition, random_state=50
)
print("Dimensiones de set train:", X_train.shape)
print("Dimensiones de set test:", X_test.shape)

### Build and fit the model on the training partition
model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
).fit(X_train, y_train)

## Predictions: class-1 scores for each partition
pred_score_train, pred_score_test = (
    model.predict_proba(partition)[:, 1] for partition in (X_train, X_test)
)

## Plot both ROC curves
ROC_graph(pred_score_train, pred_score_test, y_train, y_test)

In [None]:
def _plot_to_image(plot):
    """Render a plotnine plot to an in-memory PNG and return it as a PIL image."""
    fig = plot.draw()
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        # Close the figure once it has been rasterised; otherwise every call
        # from the interface leaves another matplotlib figure open.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)


def ejecutar_experimento(n_samples, n_features, prop_partition, repetitions, weight_major):
    """Run the repeated hold-out experiment and build the outputs shown in the UI.

    Parameters
    ----------
    n_samples, n_features, repetitions
        Whole-number settings; they are coerced to ``int`` because they are used
        as array sizes and ranges downstream.
    prop_partition : float
        Proportion of rows placed in the test partition of each split.
    weight_major : float
        Proportion of the majority class; the minority class gets the remainder.

    Returns
    -------
    tuple
        ``(text, image_auc, image_pca)``: a line of text with the standard
        deviation of the test AUC values, a bar chart of the test AUC per
        repetition, and a PCA scatter marking the rows that were part of at
        least one test partition. With a single repetition the sample standard
        deviation is undefined, so the text reports ``nan``.
    """
    # UI widgets may hand over whole numbers as floats; sizes and ranges need ints.
    n_samples, n_features, repetitions = int(n_samples), int(n_features), int(repetitions)

    weights_array = [weight_major, 1 - weight_major]
    auc_train, auc_test, X, _, all_test_index = runExperiments(n_samples, n_features, weights_array, prop_partition, repetitions)

    df = createDataFrame(auc_train, auc_test, repetitions)

    # Keep only the test AUC rows; the same std is attached to every row so the
    # error bars can be expressed as 'value ± std'.
    df_test = df[df.particion == 'auc_test'].copy()
    std = df_test['value'].std()
    df_test['std'] = std

    # Plot 1: test AUC per repetition (plotnine)
    plot = (
        ggplot(df_test)
        + aes(x='factor(iteracion)', y='value')
        + geom_col(fill='dodgerblue', color='black')
        + geom_errorbar(aes(ymin='value - std', ymax='value + std'), width=0.2)
        + theme_bw()
        + labs(title='Valores de AUC de grupo test en cada experimento', x='Iteración', y='AUC')
    )
    image_auc = _plot_to_image(plot)

    # Plot 2: PCA projection of all rows, filled by test membership (plotnine)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    df_test_index = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    # The frame has the default 0..n-1 index, so it lines up with the row indices
    # collected from the test partitions.
    df_test_index['cases'] = df_test_index.index.isin(all_test_index)

    plot_pca = (
        ggplot(df_test_index)
        + aes(x='PC1', y='PC2', fill='factor(cases)')
        + geom_point(size=2.7, alpha=0.5)
        + theme_bw()
        + labs(title='Casos incluidos en grupo TEST', x='PC1', y='PC2', fill='Test')
        + scale_fill_manual(values=['gray', 'brown'])
    )
    image_pca = _plot_to_image(plot_pca)

    return f"Desviación estándar (test AUC): {std:.4f}", image_auc, image_pca

# Gradio interface: five sliders in, one text box and two images out.
# The slider order matches the parameter order of ejecutar_experimento.
input_sliders = [
    gr.Slider(100, 5000, value=100, label="Número de muestras"),
    gr.Slider(4, 20, value=5, step= 1.0, label="Número de características"),
    gr.Slider(0.1, 0.5, value=0.1, step=0.05, label="Proporción test"),
    gr.Slider(1, 30, value=5, step=1.0, label="Repeticiones"),
    gr.Slider(0.5, 0.99, value=0.9, step=0.05, label="Peso clase mayoritaria (sanos)"),
]

output_panels = [
    gr.Textbox(label="Resultado"),
    gr.Image(type="pil", label="Gráfico AUC"),
    gr.Image(type="pil", label="Visualización Casos testeados"),
]

iface = gr.Interface(
    fn=ejecutar_experimento,
    inputs=input_sliders,
    outputs=output_panels,
    title="🧪 Interfaz Interactiva: AUC + Visualización PCA",
    description="Ajusta los parámetros para ver el rendimiento del modelo en cada iteración y cómo se distribuyen los pacientes testeados en el espacio de datos usando PCA",
    article="Ciclo de Machine Learning para Radiología: @Héctor Henríquez",
)

iface.launch(debug=True)

### Validación cruzada k-fold

In [None]:
# Settings
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
)

# Per-fold results: TPR curves on a common FPR grid, and AUC values
mean_fpr = np.linspace(0, 1, 100)
tprs = []
aucs = []

for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    aucs.append(auc)

    # Interpolate the TPR onto the shared FPR grid so the folds can be averaged
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.0  # force the curve to start at 0
    tprs.append(tpr_interp)

# Point-wise statistics across folds
tprs = np.array(tprs)
mean_tpr = tprs.mean(axis=0)
std_tpr = tprs.std(axis=0, ddof=1)
n = len(tprs)
z = norm.ppf(0.975)

# 95% confidence band for the TPR (normal quantile), clipped to [0, 1]
tpr_margin = z * std_tpr / np.sqrt(n)
tpr_lower = np.clip(mean_tpr - tpr_margin, 0, 1)
tpr_upper = np.clip(mean_tpr + tpr_margin, 0, 1)

# Mean AUC with the same kind of interval
mean_auc = np.mean(aucs)
auc_margin = z * np.std(aucs, ddof=1) / np.sqrt(k)
ci_auc_low = mean_auc - auc_margin
ci_auc_high = mean_auc + auc_margin

print(f"AUC promedio: {mean_auc:.2f} ({ci_auc_low:.2f} - {ci_auc_high:.2f})")
print("")
print("Resultados por fold:")
for i, fold_auc in enumerate(aucs):
  print(f"AUC Fold número {i + 1}: {round(fold_auc,2)}")


In [None]:

# Mean ROC curve across folds with its 95% TPR band
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(mean_fpr, mean_tpr, color='blue', label=f'Promedio ROC (AUC = {mean_auc:.2f})')
ax.fill_between(
    mean_fpr,
    tpr_lower,
    tpr_upper,
    color='dodgerblue',
    alpha=0.2,
    label='IC 95% de TPR',
)
# Diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Aleatorio')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title(f'Curva ROC promedio (AUC IC95%: {ci_auc_low:.2f} - {ci_auc_high:.2f})')
ax.legend(loc='lower right')
ax.grid(True)
fig.tight_layout()
plt.show()