<a href="https://colab.research.google.com/github/MAGLUDEM/OSDI6_arvo/blob/main/OSDI6_arvo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OSDI6 arvo – Modelo TOTAL6 v2

Este notebook permite:

1. Cargar los datasets congelados (`train.csv`, `eval.csv`, `external.csv`) desde Google Drive.
2. Reconstruir y reentrenar el modelo OSDI6 TOTAL6 v2 dentro de Colab usando la configuración del `osdi6_model_config.json`.
3. Evaluar rendimiento en TRAIN / EVAL / EXTERNAL.
4. Usar el modelo para predecir nuevos pacientes (registro único o archivo CSV).

Autor: **Dr. Manuel A. Garza León**  
Contexto: **ARVO / modelo OSDI6 TOTAL6 v2**

> **Requisitos previos en Google Drive**  
> Carpeta: `Mi unidad / OSDI6_arvo/`  
> - `data/train.csv`  
> - `data/eval.csv`  
> - `data/external.csv`  
> - `models/osdi6_model_config.json`


In [23]:
from google.colab import drive
import os

# Montar Google Drive
drive.mount('/content/drive')

# Ruta base del proyecto en tu Drive
base_dir = "/content/drive/MyDrive/OSDI6_arvo"

if not os.path.isdir(base_dir):
    raise FileNotFoundError(
        f"No se encontró la carpeta base: {base_dir}\n"
        "Verifica que exista la carpeta 'OSDI6_arvo' dentro de tu 'Mi unidad' en Google Drive,\n"
        "o ajusta esta ruta para que apunte a la ubicación correcta."
    )

os.chdir(base_dir)
print("Directorio actual:", os.getcwd())
print("Contenido de OSDI6_arvo:")
!ls
print("\nContenido de data/:")
!ls data
print("\nContenido de models/:")
!ls models


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directorio actual: /content/drive/MyDrive/OSDI6_arvo
Contenido de OSDI6_arvo:
data  models

Contenido de data/:
eval.csv  external.csv	train.csv

Contenido de models/:
osdi6_model_config.json


In [24]:
import os, json
import pandas as pd
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report

# 1) Read the model configuration.
# The encoding is pinned to UTF-8 so the read does not depend on the
# runtime's locale default encoding.
cfg_path = os.path.join("models", "osdi6_model_config.json")
with open(cfg_path, "r", encoding="utf-8") as f:
    cfg = json.load(f)

target_col = cfg["target"]      # name of the target column, e.g. "label"
num_cols   = cfg["num_cols"]    # numeric feature columns, e.g. ['Age', 'cigarettes_per_day', 'years_since_surgery']
cat_cols   = cfg["cat_cols"]    # categorical feature columns, e.g. ['Sex', 'smoking', 'screen_hours_day', ...]

print("Target:", target_col)
print("Num cols:", num_cols)
print("Cat cols:", cat_cols)

# 2) Loader shared by every data split
def load_split(name):
    """Read ``data/<name>.csv`` and separate the features from the target.

    Uses the module-level ``num_cols``, ``cat_cols`` and ``target_col`` to
    decide which columns to select. The path is relative to the current
    working directory.

    Parameters
    ----------
    name : str
        Split name without the ``.csv`` extension (this notebook uses
        "train", "eval" and "external").

    Returns
    -------
    tuple
        ``(df, X, y)``: the full frame as read from disk, the feature columns
        (numeric columns first, then categorical), and the target column.
    """
    csv_path = os.path.join("data", f"{name}.csv")
    frame = pd.read_csv(csv_path)
    feature_cols = num_cols + cat_cols
    return frame, frame[feature_cols], frame[target_col]

train_df, X_train, y_train = load_split("train")

# 3) Model definition: per-column-type preprocessing (with missing-value
#    imputation) followed by a classifier.

# Logistic regression with balanced class weights and a fixed seed
clf = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)

# Numeric features: fill gaps with the column median, then standardize
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own sub-pipeline
preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

pipe = Pipeline([
    ("preprocess", preprocess),
    ("clf", clf),
])

# 4) Refit the whole pipeline on the frozen training data
pipe.fit(X_train, y_train)
print("✅ Modelo reentrenado en Colab (con imputer).")

# 5) Persist the fitted pipeline under models/ (inside the Drive project folder)
model_out = os.path.join("models", "osdi6_lr_balanced_colab.joblib")
joblib.dump(value=pipe, filename=model_out)
print("Modelo guardado en:", model_out)

# 6) Evaluate on train / eval / external at the 0.44 decision threshold
THR = 0.44

def eval_split(name):
    """Score one split with the fitted pipeline and print its metrics.

    Loads the split through ``load_split``, takes the second column of
    ``pipe.predict_proba`` as the score, prints ROC AUC, average precision and
    a classification report for the predictions obtained with
    ``score >= THR``.

    Parameters
    ----------
    name : str
        Split name understood by ``load_split``.

    Returns
    -------
    tuple
        ``(df, X, y, proba)``: the loaded frame, its features, its target and
        the per-row scores.
    """
    frame, features, labels = load_split(name)
    scores = pipe.predict_proba(features)[:, 1]

    # Threshold-free ranking metrics
    auc = roc_auc_score(labels, scores)
    ap = average_precision_score(labels, scores)

    # Hard 0/1 predictions at the module-level threshold
    hard_preds = (scores >= THR).astype(int)

    print(f"\n{name.upper()}: AUC={auc:.3f}  AP={ap:.3f}  (thr={THR})")
    print(classification_report(labels, hard_preds, digits=3))
    return frame, features, labels, scores

# Evaluate the three splits in order (train, eval, external) and keep, for each,
# the frame, features, target and predicted scores under their own names.
(
    (train_df, X_train, y_train, p_train),
    (eval_df, X_eval, y_eval, p_eval),
    (ext_df, X_ext, y_ext, p_ext),
) = [eval_split(split_name) for split_name in ("train", "eval", "external")]


Target: label
Num cols: ['Age', 'cigarettes_per_day', 'years_since_surgery']
Cat cols: ['Sex', 'smoking', 'screen_hours_day', 'refractive_surgery', 'other_eye_surgery', 'Eye_drops', 'contact_lens_user']
✅ Modelo reentrenado en Colab (con imputer).
Modelo guardado en: models/osdi6_lr_balanced_colab.joblib

TRAIN: AUC=0.862  AP=0.731  (thr=0.44)
              precision    recall  f1-score   support

           0      0.988     0.566     0.720       703
           1      0.530     0.986     0.689       349

    accuracy                          0.705      1052
   macro avg      0.759     0.776     0.705      1052
weighted avg      0.836     0.705     0.710      1052


EVAL: AUC=0.844  AP=0.650  (thr=0.44)
              precision    recall  f1-score   support

           0      1.000     0.616     0.762       281
           1      0.565     1.000     0.722       140

    accuracy                          0.743       421
   macro avg      0.782     0.808     0.742       421
weighted avg    

In [25]:
import os, json
import pandas as pd
import joblib

# Confirm the working directory is still the project root; the paths below are relative
print("Directorio actual:", os.getcwd())

# 1) Configuration.
# The encoding is pinned to UTF-8 so the read does not depend on the
# runtime's locale default encoding.
with open(os.path.join("models", "osdi6_model_config.json"), "r", encoding="utf-8") as f:
    cfg = json.load(f)

target_col = cfg["target"]
num_cols   = cfg["num_cols"]
cat_cols   = cfg["cat_cols"]

print("Target:", target_col)
print("Num cols:", num_cols)
print("Cat cols:", cat_cols)

# 2) Load the pipeline that was trained and saved in Colab.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that you produced yourself.
model_path = os.path.join("models", "osdi6_lr_balanced_colab.joblib")
pipe = joblib.load(model_path)
print("✅ Modelo cargado desde:", model_path)

THR = 0.44  # same threshold as in the evaluation


Directorio actual: /content/drive/MyDrive/OSDI6_arvo
Target: label
Num cols: ['Age', 'cigarettes_per_day', 'years_since_surgery']
Cat cols: ['Sex', 'smoking', 'screen_hours_day', 'refractive_surgery', 'other_eye_surgery', 'Eye_drops', 'contact_lens_user']
✅ Modelo cargado desde: models/osdi6_lr_balanced_colab.joblib


In [29]:
from sklearn.metrics import roc_curve
import numpy as np

# Frozen "official" decision threshold
THR_OFFICIAL = 0.44

# Score the EXTERNAL split and pick the first ROC point whose sensitivity is >= 0.95.
# NOTE(review): this threshold is tuned on EXTERNAL, the same split it is later
# reported on, so its EXTERNAL metrics are optimistic. Consider tuning on EVAL.
ext_df, X_ext, y_ext = load_split("external")
proba_ext = pipe.predict_proba(X_ext)[:, 1]

fpr, tpr, thr = roc_curve(y_ext, proba_ext)

# roc_curve returns thresholds in decreasing order, so the first index that
# meets the target corresponds to the largest qualifying threshold.
if np.any(tpr >= 0.95):
    idx = np.argmax(tpr >= 0.95)
else:
    idx = np.argmax(tpr)  # fall back to the point with the highest TPR
THR_TUNED = float(thr[idx])

print(f"THR_OFFICIAL = {THR_OFFICIAL:.2f}")
print(f"THR_TUNED (sens≈0.95 en EXTERNAL) = {THR_TUNED:.4f}")


THR_OFFICIAL = 0.44
THR_TUNED (sens≈0.95 en EXTERNAL) = 0.6329


In [28]:
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix

def eval_with_thresholds(split_name, thresholds=(0.44, 0.63)):
    """Compute summary metrics for one split at several decision thresholds.

    The split is loaded with ``load_split`` and scored with the module-level
    ``pipe``. ROC AUC and average precision are computed once per split; the
    confusion-matrix metrics are computed once per threshold.

    Parameters
    ----------
    split_name : str
        Split name understood by ``load_split``.
    thresholds : iterable of float
        Cut-offs applied as ``proba >= t`` to obtain 0/1 predictions.

    Returns
    -------
    list of tuple
        One tuple per threshold:
        ``(SPLIT, thr, auc, ap, acc, sens, spec, tp, fp, tn, fn)``.
    """
    _, X, y = load_split(split_name)
    proba = pipe.predict_proba(X)[:, 1]

    # Threshold-independent metrics
    auc = roc_auc_score(y, proba)
    ap  = average_precision_score(y, proba)

    rows = []
    for t in thresholds:
        pred = (proba >= t).astype(int)
        # labels=[0, 1] pins the matrix to 2x2. Without it, a case where only
        # one class appears in both y and pred yields a 1x1 matrix and the
        # four-way unpack raises ValueError.
        tn, fp, fn, tp = confusion_matrix(y, pred, labels=[0, 1]).ravel()
        # Guard the ratios against splits that lack positives or negatives
        sens = tp / (tp + fn) if (tp + fn) else 0
        spec = tn / (tn + fp) if (tn + fp) else 0
        acc  = (tp + tn) / (tp + tn + fp + fn)
        rows.append((split_name.upper(), t, auc, ap, acc, sens, spec, tp, fp, tn, fn))
    return rows

# Column names of the summary table, in the order eval_with_thresholds emits them
head = ["Split","Thr","AUC","AP","Acc","Sens","Spec","TP","FP","TN","FN"]

# Collect one row per (split, threshold) pair.
# NOTE(review): 0.63 is hard-coded here; it matches THR_TUNED rounded to two
# decimals in the recorded output. Confirm whether the exact value should be used.
table = []
for s in ("train", "eval", "external"):
    table.extend(eval_with_thresholds(s, thresholds=(0.44, 0.63)))

# Show the rows in insertion order with floats formatted to 3 decimals
import pandas as pd
out = pd.DataFrame(table, columns=head)
print(out.to_string(index=False, float_format=lambda x: f"{x:.3f}"))


   Split   Thr   AUC    AP   Acc  Sens  Spec  TP  FP  TN  FN
   TRAIN 0.440 0.862 0.731 0.705 0.986 0.566 344 305 398   5
   TRAIN 0.630 0.862 0.731 0.772 0.705 0.805 246 137 566 103
    EVAL 0.440 0.844 0.650 0.743 1.000 0.616 140 108 173   0
    EVAL 0.630 0.844 0.650 0.734 0.936 0.633 131 103 178   9
EXTERNAL 0.440 0.822 0.637 0.690 1.000 0.537 209 196 227   0
EXTERNAL 0.630 0.822 0.637 0.698 0.952 0.572 199 181 242  10
