In [None]:
import sys, torch, torch_geometric
print("Python:", sys.version)
print("Exe:", sys.executable)
print("Torch:", torch.__version__, "| CUDA?", torch.cuda.is_available(), "| CUDA runtime:", torch.version.cuda)
print("PyG:", torch_geometric.__version__)



Python: 3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]
Exe: c:\Users\guzmam19\VSCode\tfm-gnn\.venv\Scripts\python.exe
Torch: 2.8.0+cpu | CUDA? False | CUDA runtime: None
PyG: 2.6.1


In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

dataset = Planetoid(root="data/Planetoid", name="Cora")
data = dataset[0]   # un grafo con .x (features), .edge_index (aristas), .y (labels)

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

device = torch.device("cpu")        # "cuda" si luego usas GPU
model = GCN(dataset.num_node_features, 16, dataset.num_classes).to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    accs = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        acc = (pred[mask] == data.y[mask]).sum() / mask.sum()
        accs.append(acc.item())
    return accs

for epoch in range(1, 51):
    loss = train()
    train_acc, val_acc, test_acc = test()
    if epoch % 10 == 0:
        print(f"Epoch {epoch:03d} | Loss {loss:.4f} | ValAcc {val_acc:.3f} | TestAcc {test_acc:.3f}")


Epoch 010 | Loss 0.8703 | ValAcc 0.716 | TestAcc 0.712
Epoch 020 | Loss 0.2379 | ValAcc 0.754 | TestAcc 0.790
Epoch 030 | Loss 0.0914 | ValAcc 0.758 | TestAcc 0.787
Epoch 040 | Loss 0.0478 | ValAcc 0.762 | TestAcc 0.789
Epoch 050 | Loss 0.0415 | ValAcc 0.762 | TestAcc 0.785


In [10]:
import pandas as pd
import os

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)


df['VISDATE'] = pd.to_datetime(df['VISDATE'], errors='coerce')

df_sorted = df.sort_values(by=['RID','VISDATE'])

df_unique = df_sorted.drop_duplicates(subset='RID', keep='first')

df_unique.shape


(4945, 84)

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)


cols = ["RID", "PTDOBYY", "PTEDUCAT", "PTGENDER", "PTMARRY", "PTADDX"]
for c in cols:
    if c not in df.columns:
        print("⚠️ Falta columna:", c)
df = df[[c for c in cols if c in df.columns]].copy()

target_col = "PTADDX" if "PTADDX" in df.columns else "PTADBEG"
df = df[~df[target_col].isna()].copy()

X = df[["PTDOBYY", "PTEDUCAT", "PTGENDER", "PTMARRY"]].copy()

for num_col in ["PTDOBYY", "PTEDUCAT"]:
    X[num_col] = pd.to_numeric(X[num_col], errors="coerce")
X["PTEDUCAT"] = X["PTEDUCAT"].fillna(X["PTEDUCAT"].median())
X["PTDOBYY"] = X["PTDOBYY"].fillna(X["PTDOBYY"].median())

X = pd.get_dummies(X, columns=["PTGENDER", "PTMARRY"], drop_first=True)

scaler = StandardScaler()
X[["PTDOBYY","PTEDUCAT"]] = scaler.fit_transform(X[["PTDOBYY","PTEDUCAT"]])

y = pd.to_numeric(df[target_col], errors="coerce").astype(float)
mask_valid = ~y.isna()
X = X.loc[mask_valid].reset_index(drop=True)
y = y.loc[mask_valid].reset_index(drop=True)

print("X:", X.shape, "y:", y.shape)
print("Ejemplo columnas:", list(X.columns)[:10])
print("Rango y (año diagnóstico):", y.min(), "→", y.max())


X: (3977, 8) y: (3977,)
Ejemplo columnas: ['PTDOBYY', 'PTEDUCAT', 'PTGENDER_2.0', 'PTMARRY_2.0', 'PTMARRY_3.0', 'PTMARRY_4.0', 'PTMARRY_5.0', 'PTMARRY_6.0']
Rango y (año diagnóstico): 1980.0 → 9999.0


In [16]:
import torch
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
y = pd.to_numeric(y, errors='coerce').fillna(0)

print(X.dtypes)
print(y.dtypes)
print(X.isna().sum().sum(), "missing after coercion")

import torch
x = torch.tensor(X.values, dtype=torch.float)
y_t = torch.tensor(y.values, dtype=torch.float)


x = torch.tensor(X.values, dtype=torch.float)
y_t = torch.tensor(y.values, dtype=torch.float)

k = 8
nbrs = NearestNeighbors(n_neighbors=k+1, metric="euclidean").fit(X.values)
_, idx = nbrs.kneighbors(X.values)
src, dst = [], []
for i in range(idx.shape[0]):
    for j in idx[i, 1:]:
        src.append(i); dst.append(j)
edge_index = torch.tensor([src, dst], dtype=torch.long)
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)

n = len(y_t)
perm = torch.randperm(n)
tr_n, va_n = int(0.7*n), int(0.15*n)
train_mask = torch.zeros(n, dtype=torch.bool); train_mask[perm[:tr_n]] = True
val_mask   = torch.zeros(n, dtype=torch.bool); val_mask[perm[tr_n:tr_n+va_n]] = True
test_mask  = torch.zeros(n, dtype=torch.bool); test_mask[perm[tr_n+va_n:]] = True

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

from torch_geometric.nn import GCNConv
class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=32, p=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.p = p
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.p, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")
model = GNNRegressor(data.num_node_features).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

for ep in range(1, 61):
    model.train(); opt.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward(); opt.step()
    if ep % 10 == 0:
        model.eval()
        with torch.no_grad():
            v = loss_fn(out[data.val_mask], data.y[data.val_mask]).item()
            t = loss_fn(out[data.test_mask], data.y[data.test_mask]).item()
        print(f"ep {ep:03d} | train {loss.item():.3f} | val {v:.3f} | test {t:.3f}")


PTDOBYY         float64
PTEDUCAT        float64
PTGENDER_2.0       bool
PTMARRY_2.0        bool
PTMARRY_3.0        bool
PTMARRY_4.0        bool
PTMARRY_5.0        bool
PTMARRY_6.0        bool
dtype: object
float64
0 missing after coercion


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)

base_cols = ["RID", "PTDOBYY", "PTEDUCAT", "PTGENDER", "PTMARRY"]
target_col = "PTADDX" if "PTADDX" in df.columns else ("PTADBEG" if "PTADBEG" in df.columns else None)

missing = [c for c in base_cols if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas en el CSV: {missing}")

if target_col is None:
    raise ValueError("No encuentro PTADDX ni PTADBEG en el CSV para usar como target.")

df = df[base_cols + [target_col]].copy()

df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[target_col]).reset_index(drop=True)

for c in ["PTDOBYY", "PTEDUCAT"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df["PTDOBYY"] = df["PTDOBYY"].fillna(df["PTDOBYY"].median())
df["PTEDUCAT"] = df["PTEDUCAT"].fillna(df["PTEDUCAT"].median())

X = df[["PTDOBYY", "PTEDUCAT", "PTGENDER", "PTMARRY"]].copy()
X = pd.get_dummies(X, columns=["PTGENDER", "PTMARRY"], drop_first=True)

scaler = StandardScaler()
X[["PTDOBYY", "PTEDUCAT"]] = scaler.fit_transform(X[["PTDOBYY", "PTEDUCAT"]])

X = X.astype(float)

y = df[target_col].astype(float)

print("Shapes -> X:", X.shape, " y:", y.shape)
print("Tipos de X:")
print(X.dtypes)

k = 8  # ajústalo si quieres más/menos aristas por nodo
nbrs = NearestNeighbors(n_neighbors=k+1, metric="euclidean")
nbrs.fit(X.values)
_, idx = nbrs.kneighbors(X.values)

src, dst = [], []
for i in range(idx.shape[0]):
    for j in idx[i, 1:]:  # saltamos self
        src.append(i); dst.append(j)
edge_index = torch.tensor([src, dst], dtype=torch.long)
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)

x = torch.tensor(X.values, dtype=torch.float)
y_t = torch.tensor(y.values, dtype=torch.float)

n = len(y_t)
perm = torch.randperm(n)
tr_n, va_n = int(0.7*n), int(0.15*n)

train_mask = torch.zeros(n, dtype=torch.bool)
val_mask   = torch.zeros(n, dtype=torch.bool)
test_mask  = torch.zeros(n, dtype=torch.bool)

train_mask[perm[:tr_n]] = True
val_mask[perm[tr_n:tr_n+va_n]] = True
test_mask[perm[tr_n+va_n:]] = True

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")  # si luego usas GPU: torch.device("cuda")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def evaluate(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        if split == "val":
            mask = data.val_mask
        elif split == "test":
            mask = data.test_mask
        else:
            mask = data.train_mask
        loss = loss_fn(out[mask], data.y[mask]).item()
    return loss

for ep in range(1, 61):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    opt.step()

    if ep % 10 == 0:
        val_loss = evaluate("val")
        test_loss = evaluate("test")
        print(f"ep {ep:03d} | train {loss.item():.3f} | val {val_loss:.3f} | test {test_loss:.3f}")



Shapes -> X: (3977, 8)  y: (3977,)
Tipos de X:
PTDOBYY         float64
PTEDUCAT        float64
PTGENDER_2.0    float64
PTMARRY_2.0     float64
PTMARRY_3.0     float64
PTMARRY_4.0     float64
PTMARRY_5.0     float64
PTMARRY_6.0     float64
dtype: object
Data(x=[3977, 8], edge_index=[2, 63632], y=[3977], train_mask=[3977], val_mask=[3977], test_mask=[3977])
ep 010 | train 87087480.000 | val 88189488.000 | test 86944544.000
ep 020 | train 87023864.000 | val 88120576.000 | test 86873456.000
ep 030 | train 86913832.000 | val 88003952.000 | test 86752720.000
ep 040 | train 86746304.000 | val 87825568.000 | test 86567800.000
ep 050 | train 86506264.000 | val 87575352.000 | test 86308248.000
ep 060 | train 86191952.000 | val 87245608.000 | test 85966152.000


In [19]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"  # <-- ajusta la ruta si procede
df = pd.read_csv(csv_path)

base_cols = ["RID", "PTDOBYY", "PTEDUCAT", "PTGENDER", "PTMARRY"]
target_col = "PTADDX" if "PTADDX" in df.columns else ("PTADBEG" if "PTADBEG" in df.columns else None)

missing = [c for c in base_cols if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas en el CSV: {missing}")

if target_col is None:
    raise ValueError("No encuentro PTADDX ni PTADBEG en el CSV para usar como target.")

df = df[base_cols + [target_col]].copy()

df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df = df.dropna(subset=[target_col]).reset_index(drop=True)


def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {
    "1": "male",
    "2": "female",
    "male": "male",
    "female": "female",
    "m": "male",
    "f": "female"
}

marry_map = {
    "1": "married",
    "2": "widowed",
    "3": "divorced",
    "4": "never_married",
    "6": "domestic_partnership"
}

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)

if "PTMARRY" in df.columns:
    df["PTMARRY"] = norm_codes_to_labels(df["PTMARRY"], marry_map)

print("PTGENDER (normalizado):")
print(df["PTGENDER"].value_counts(dropna=False))
print("\nPTMARRY  (normalizado):")
print(df["PTMARRY"].value_counts(dropna=False))

for c in ["PTDOBYY", "PTEDUCAT"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df["PTDOBYY"] = df["PTDOBYY"].fillna(df["PTDOBYY"].median())
df["PTEDUCAT"] = df["PTEDUCAT"].fillna(df["PTEDUCAT"].median())

num_cols = ["PTDOBYY", "PTEDUCAT"]
cat_cols = ["PTGENDER", "PTMARRY"]

X_parts = []

scaler = StandardScaler()
X_num = pd.DataFrame(
    scaler.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index
)
X_parts.append(X_num)

X_cat = pd.get_dummies(
    df[cat_cols],
    prefix=cat_cols,
    drop_first=False,
    dtype=float
)
X_parts.append(X_cat)

X = pd.concat(X_parts, axis=1).astype(float)

y = df[target_col].astype(float)

print("Shapes -> X:", X.shape, " y:", y.shape)
print("Tipos de X:")
print(X.dtypes)

n_samples = X.shape[0]
k = 8
k = min(k, max(1, n_samples - 1))

nbrs = NearestNeighbors(n_neighbors=k+1, metric="euclidean")
nbrs.fit(X.values)
_, idx = nbrs.kneighbors(X.values)

src, dst = [], []
for i in range(idx.shape[0]):
    for j in idx[i, 1:]:  # saltamos self
        src.append(i); dst.append(j)
edge_index = torch.tensor([src, dst], dtype=torch.long)
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)

x = torch.tensor(X.values, dtype=torch.float)
y_t = torch.tensor(y.values, dtype=torch.float)

n = len(y_t)
perm = torch.randperm(n)
tr_n, va_n = int(0.7*n), int(0.15*n)

train_mask = torch.zeros(n, dtype=torch.bool)
val_mask   = torch.zeros(n, dtype=torch.bool)
test_mask  = torch.zeros(n, dtype=torch.bool)

train_mask[perm[:tr_n]] = True
val_mask[perm[tr_n:tr_n+va_n]] = True
test_mask[perm[tr_n+va_n:]] = True

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")  # si luego usas GPU: torch.device("cuda")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def evaluate(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        if split == "val":
            mask = data.val_mask
        elif split == "test":
            mask = data.test_mask
        else:
            mask = data.train_mask
        loss = loss_fn(out[mask], data.y[mask]).item()
    return loss

for ep in range(1, 61):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    opt.step()

    if ep % 10 == 0:
        val_loss = evaluate("val")
        test_loss = evaluate("test")
        print(f"ep {ep:03d} | train {loss.item():.3f} | val {val_loss:.3f} | test {test_loss:.3f}")



PTGENDER (normalizado):
PTGENDER
female    2127
male      1850
Name: count, dtype: int64

PTMARRY  (normalizado):
PTMARRY
married                 2742
divorced                 496
widowed                  431
never_married            259
domestic_partnership      30
NaN                       19
Name: count, dtype: int64
Shapes -> X: (3977, 9)  y: (3977,)
Tipos de X:
PTDOBYY                         float64
PTEDUCAT                        float64
PTGENDER_female                 float64
PTGENDER_male                   float64
PTMARRY_divorced                float64
PTMARRY_domestic_partnership    float64
PTMARRY_married                 float64
PTMARRY_never_married           float64
PTMARRY_widowed                 float64
dtype: object
Data(x=[3977, 9], edge_index=[2, 63632], y=[3977], train_mask=[3977], val_mask=[3977], test_mask=[3977])
ep 010 | train 87065392.000 | val 88166616.000 | test 86922920.000
ep 020 | train 86969400.000 | val 88066136.000 | test 86822432.000
ep 030 | train 868

In [20]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"  # <-- ajusta la ruta si procede
df = pd.read_csv(csv_path)

base_cols = ["RID", "PTDOBYY", "PTEDUCAT", "PTGENDER", "PTMARRY"]
onset_cols = [c for c in ["PTCOGBEG", "PTADBEG", "PTADDX"] if c in df.columns]

missing = [c for c in base_cols if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas en el CSV: {missing}")

if not onset_cols:
    raise ValueError("No encuentro PTCOGBEG/PTADBEG/PTADDX en el CSV.")

df = df[base_cols + onset_cols].copy()

for c in onset_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

def row_min_nonnull(row):
    vals = [row[c] for c in onset_cols if pd.notna(row[c])]
    return min(vals) if vals else np.nan

df["YEAR_ONSET"] = df.apply(row_min_nonnull, axis=1)

def first_notnull(s):
    return next((v for v in s if pd.notna(v)), np.nan)

agg_dict = {
    "YEAR_ONSET": "min",
    "PTDOBYY": "median",
    "PTEDUCAT": "median",
    "PTGENDER": first_notnull,
    "PTMARRY": first_notnull,
}
df = df.groupby("RID", as_index=False).agg(agg_dict)

df["YEAR_ONSET"] = pd.to_numeric(df["YEAR_ONSET"], errors="coerce")
before = len(df)
df = df.dropna(subset=["YEAR_ONSET"]).reset_index(drop=True)
print(f"Pacientes con YEAR_ONSET válido: {len(df)} / {before}")


def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {
    "1": "male",
    "2": "female",
    "male": "male",
    "female": "female",
    "m": "male",
    "f": "female"
}

marry_map = {
    "1": "married",
    "2": "widowed",
    "3": "divorced",
    "4": "never_married",
    "6": "domestic_partnership"
}

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)

if "PTMARRY" in df.columns:
    df["PTMARRY"] = norm_codes_to_labels(df["PTMARRY"], marry_map)

print("PTGENDER (normalizado):")
print(df["PTGENDER"].value_counts(dropna=False))
print("\nPTMARRY  (normalizado):")
print(df["PTMARRY"].value_counts(dropna=False))

for c in ["PTDOBYY", "PTEDUCAT"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df["PTDOBYY"] = df["PTDOBYY"].fillna(df["PTDOBYY"].median())
df["PTEDUCAT"] = df["PTEDUCAT"].fillna(df["PTEDUCAT"].median())

num_cols = ["PTDOBYY", "PTEDUCAT"]
cat_cols = ["PTGENDER", "PTMARRY"]

X_parts = []

scaler = StandardScaler()
X_num = pd.DataFrame(
    scaler.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index
)
X_parts.append(X_num)

X_cat = pd.get_dummies(
    df[cat_cols],
    prefix=cat_cols,
    drop_first=False,
    dtype=float
)
X_parts.append(X_cat)

X = pd.concat(X_parts, axis=1).astype(float)

y = df["YEAR_ONSET"].astype(float)

print("Shapes -> X:", X.shape, " y:", y.shape)
print("Tipos de X:")
print(X.dtypes)

n_samples = X.shape[0]
k = 8
k = min(k, max(1, n_samples - 1))

nbrs = NearestNeighbors(n_neighbors=k+1, metric="euclidean")
nbrs.fit(X.values)
_, idx = nbrs.kneighbors(X.values)

src, dst = [], []
for i in range(idx.shape[0]):
    for j in idx[i, 1:]:  # saltamos self
        src.append(i); dst.append(j)
edge_index = torch.tensor([src, dst], dtype=torch.long)
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)

x = torch.tensor(X.values, dtype=torch.float)
y_t = torch.tensor(y.values, dtype=torch.float)

n = len(y_t)
perm = torch.randperm(n)
tr_n, va_n = int(0.7*n), int(0.15*n)

train_mask = torch.zeros(n, dtype=torch.bool)
val_mask   = torch.zeros(n, dtype=torch.bool)
test_mask  = torch.zeros(n, dtype=torch.bool)

train_mask[perm[:tr_n]] = True
val_mask[perm[tr_n:tr_n+va_n]] = True
test_mask[perm[tr_n+va_n:]] = True

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")  # si luego usas GPU: torch.device("cuda")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def evaluate(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        if split == "val":
            mask = data.val_mask
        elif split == "test":
            mask = data.test_mask
        else:
            mask = data.train_mask
        loss = loss_fn(out[mask], data.y[mask]).item()
    return loss

for ep in range(1, 61):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    opt.step()

    if ep % 10 == 0:
        val_loss = evaluate("val")
        test_loss = evaluate("test")
        print(f"ep {ep:03d} | train {loss.item():.3f} | val {val_loss:.3f} | test {test_loss:.3f}")



Pacientes con YEAR_ONSET válido: 4876 / 4945
PTGENDER (normalizado):
PTGENDER
female    2479
male      2397
Name: count, dtype: int64

PTMARRY  (normalizado):
PTMARRY
married                 3467
divorced                 555
widowed                  498
never_married            298
NaN                       38
domestic_partnership      20
Name: count, dtype: int64
Shapes -> X: (4876, 9)  y: (4876,)
Tipos de X:
PTDOBYY                         float64
PTEDUCAT                        float64
PTGENDER_female                 float64
PTGENDER_male                   float64
PTMARRY_divorced                float64
PTMARRY_domestic_partnership    float64
PTMARRY_married                 float64
PTMARRY_never_married           float64
PTMARRY_widowed                 float64
dtype: object
Data(x=[4876, 9], edge_index=[2, 78016], y=[4876], train_mask=[4876], val_mask=[4876], test_mask=[4876])
ep 010 | train 30461614.000 | val 31072988.000 | test 32362452.000
ep 020 | train 30424528.000 | val 310327

In [21]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)

def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {"1":"male","2":"female","male":"male","female":"female","m":"male","f":"female"}
marry_map  = {"1":"married","2":"widowed","3":"divorced","4":"never_married","6":"domestic_partnership"}

onset_cols = [c for c in ["PTCOGBEG","PTADBEG","PTADDX"] if c in df.columns]
for c in onset_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

def row_min_nonnull(row):
    vals = [row[c] for c in onset_cols if pd.notna(row[c])]
    return min(vals) if vals else np.nan

df["YEAR_ONSET"] = df.apply(row_min_nonnull, axis=1) if onset_cols else np.nan
df["YEAR_ONSET"] = pd.to_numeric(df["YEAR_ONSET"], errors="coerce")

for c in ["PTDOBYY","PTEDUCAT"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)
if "PTMARRY" in df.columns:
    df["PTMARRY"]  = norm_codes_to_labels(df["PTMARRY"], marry_map)

date_col = "EXAMDATE" if "EXAMDATE" in df.columns else ("VISDATE" if "VISDATE" in df.columns else None)
has_visits = ("RID" in df.columns) and (date_col is not None)

if has_visits:
    print(f">> Modo VISITAS (usando {date_col} como fecha de visita)")
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df["EXAM_YEAR"] = df[date_col].dt.year

    df["AGE_AT_VISIT"] = np.where(
        df["EXAM_YEAR"].notna() & df["PTDOBYY"].notna(),
        df["EXAM_YEAR"] - df["PTDOBYY"],
        np.nan
    )

    df["YEARS_TO_ONSET"] = np.where(
        df["YEAR_ONSET"].notna() & df["EXAM_YEAR"].notna(),
        df["YEAR_ONSET"] - df["EXAM_YEAR"],
        np.nan
    )
    df["HAS_LABEL"] = df["YEARS_TO_ONSET"].notna() & (df["YEARS_TO_ONSET"] >= 0)

    num_cols = [c for c in ["AGE_AT_VISIT","PTEDUCAT"] if c in df.columns]
    cat_cols = [c for c in ["PTGENDER","PTMARRY"] if c in df.columns]

    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

    parts = []
    if num_cols:
        scaler = StandardScaler()
        X_num = pd.DataFrame(
            scaler.fit_transform(df[num_cols]),
            columns=num_cols, index=df.index
        )
        parts.append(X_num)
    if cat_cols:
        X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float)
        parts.append(X_cat)
    if not parts:
        raise ValueError("No hay columnas de features seleccionadas en modo visitas.")

    X = pd.concat(parts, axis=1).astype(float)

    y_full = df["YEARS_TO_ONSET"].astype(float)
    y_t = torch.tensor(y_full.fillna(0).values, dtype=torch.float32)
    label_mask = torch.tensor(df["HAS_LABEL"].fillna(False).values, dtype=torch.bool)

    X_clean = X.replace([np.inf,-np.inf], np.nan).fillna(0.0)
    n_samples = X_clean.shape[0]
    k = min(8, max(1, n_samples - 1))
    n_neighbors = min(n_samples, k + 1)

    if n_samples >= 2:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        nbrs.fit(X_clean.values)
        _, idx = nbrs.kneighbors(X_clean.values)
        src_knn, dst_knn = [], []
        for i in range(idx.shape[0]):
            for j in idx[i, 1:]:
                src_knn.append(i); dst_knn.append(j)
        edge_knn = torch.tensor([src_knn, dst_knn], dtype=torch.long)
    else:
        edge_knn = torch.empty((2, 0), dtype=torch.long)

    if "RID" not in df.columns:
        raise ValueError("PTDEMOG no tiene RID; necesario para aristas temporales.")
    tmp = df.reset_index()[["index","RID", date_col]].dropna(subset=[date_col])
    tmp = tmp.sort_values(["RID", date_col])
    src_tmp, dst_tmp = [], []
    for rid, g in tmp.groupby("RID"):
        ids = g["index"].tolist()
        for a, b in zip(ids[:-1], ids[1:]):
            src_tmp.append(a); dst_tmp.append(b)
    edge_tmp = torch.tensor([src_tmp, dst_tmp], dtype=torch.long) if src_tmp else torch.empty((2,0), dtype=torch.long)

    def undirected(e): 
        return torch.cat([e, e.flip(0)], dim=1) if e.numel() else e

    edges = []
    if edge_knn.numel(): edges.append(undirected(edge_knn))
    if edge_tmp.numel(): edges.append(undirected(edge_tmp))
    edge_index = torch.cat(edges, dim=1) if edges else torch.empty((2,0), dtype=torch.long)
    if edge_index.numel():
        edge_index = torch.unique(edge_index, dim=1)

else:
    print(">> Modo PACIENTE (fallback: no hay EXAMDATE/VISDATE)")
    if "RID" not in df.columns:
        raise ValueError("PTDEMOG no tiene RID.")
    keep_cols = [c for c in ["RID","PTDOBYY","PTEDUCAT","PTGENDER","PTMARRY","YEAR_ONSET"] if c in df.columns]
    df = df[keep_cols].copy()

    def first_notnull(s):
        return next((v for v in s if pd.notna(v)), np.nan)
    agg = {
        "PTDOBYY":"median",
        "PTEDUCAT":"median",
        "PTGENDER":first_notnull,
        "PTMARRY":first_notnull,
        "YEAR_ONSET":"min"
    }
    df = df.groupby("RID", as_index=False).agg({k:v for k,v in agg.items() if k in df.columns})

    num_cols = [c for c in ["PTDOBYY","PTEDUCAT"] if c in df.columns]
    cat_cols = [c for c in ["PTGENDER","PTMARRY"] if c in df.columns]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

    parts = []
    if num_cols:
        scaler = StandardScaler()
        X_num = pd.DataFrame(
            scaler.fit_transform(df[num_cols]),
            columns=num_cols, index=df.index
        )
        parts.append(X_num)
    if cat_cols:
        X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float)
        parts.append(X_cat)
    if not parts:
        raise ValueError("No hay columnas de features en modo paciente.")
    X = pd.concat(parts, axis=1).astype(float)

    y_full = df["YEAR_ONSET"].astype(float) if "YEAR_ONSET" in df.columns else pd.Series(np.nan, index=df.index)
    y_t = torch.tensor(y_full.fillna(0).values, dtype=torch.float32)
    label_mask = torch.tensor(y_full.notna().values, dtype=torch.bool)

    X_clean = X.replace([np.inf,-np.inf], np.nan).fillna(0.0)
    n_samples = X_clean.shape[0]
    k = min(8, max(1, n_samples - 1))
    n_neighbors = min(n_samples, k + 1)
    if n_samples >= 2:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        nbrs.fit(X_clean.values)
        _, idx = nbrs.kneighbors(X_clean.values)
        src, dst = [], []
        for i in range(idx.shape[0]):
            for j in idx[i, 1:]:
                src.append(i); dst.append(j)
        edge_index = torch.tensor([src, dst], dtype=torch.long)
        edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

x = torch.tensor(X_clean.values, dtype=torch.float32)
n = x.size(0)

labeled_idx = torch.nonzero(label_mask, as_tuple=False).view(-1)
n_lab = labeled_idx.numel()
print(f"Nodos: {n} | etiquetados: {n_lab} | edges: {edge_index.size(1)}")

if n_lab > 0:
    perm = labeled_idx[torch.randperm(n_lab)]
    tr_n = max(1, int(0.7*n_lab)) if n_lab >= 3 else max(1, n_lab)
    va_n = max(0, int(0.15*n_lab)) if n_lab >= 7 else 0
    if tr_n + va_n > max(0, n_lab-1):
        va_n = max(0, n_lab - 1 - tr_n)

    train_idx = perm[:tr_n]
    val_idx   = perm[tr_n:tr_n+va_n]
    test_idx  = perm[tr_n+va_n:]
else:
    train_idx = val_idx = test_idx = torch.tensor([], dtype=torch.long)

train_mask = torch.zeros(n, dtype=torch.bool); train_mask[train_idx] = True
val_mask   = torch.zeros(n, dtype=torch.bool); val_mask[val_idx]     = True
test_mask  = torch.zeros(n, dtype=torch.bool); test_mask[test_idx]   = True

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def evaluate(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        mask = data.train_mask if split=="train" else (data.val_mask if split=="val" else data.test_mask)
        if not mask.any():
            return float("nan")
        return loss_fn(out[mask], data.y[mask]).item()

for ep in range(1, 61):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    if data.train_mask.any():
        loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
        if torch.isnan(loss) or torch.isinf(loss):
            raise RuntimeError("Loss NaN/Inf; revisa X/y/fechas.")
        loss.backward()
        opt.step()
    else:
        loss = torch.tensor(float("nan"))
    if ep % 10 == 0:
        print(f"ep {ep:03d} | train {float(loss):.3f} | val {evaluate('val'):.3f} | test {evaluate('test'):.3f}")


>> Modo VISITAS (usando VISDATE como fecha de visita)
Nodos: 6210 | etiquetados: 1942 | edges: 72424
Data(x=[6210, 9], edge_index=[2, 72424], y=[6210], train_mask=[6210], val_mask=[6210], test_mask=[6210])


Consider using tensor.detach() first. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\generated\python_variable_methods.cpp:836.)
  print(f"ep {ep:03d} | train {float(loss):.3f} | val {evaluate('val'):.3f} | test {evaluate('test'):.3f}")


ep 010 | train 60674200.000 | val 61875528.000 | test 61226168.000
ep 020 | train 60587500.000 | val 61782024.000 | test 61136012.000
ep 030 | train 60441728.000 | val 61627844.000 | test 60987496.000
ep 040 | train 60221352.000 | val 61396260.000 | test 60765316.000
ep 050 | train 59914252.000 | val 61075724.000 | test 60458104.000
ep 060 | train 59512608.000 | val 60654476.000 | test 60054388.000


In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)

def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {"1":"male","2":"female","male":"male","female":"female","m":"male","f":"female"}
marry_map  = {"1":"married","2":"widowed","3":"divorced","4":"never_married","6":"domestic_partnership"}

def to_year(s):
    s = pd.to_numeric(s, errors="coerce")
    s = s.where((s >= 1900) & (s <= 2100))
    return s

onset_cols = [c for c in ["PTCOGBEG","PTADBEG","PTADDX"] if c in df.columns]
for c in onset_cols:
    df[c] = to_year(df[c])

def row_min_nonnull(row):
    vals = [row[c] for c in onset_cols if pd.notna(row[c])]
    return min(vals) if vals else np.nan

df["YEAR_ONSET"] = row_min_nonnull(df) if False else (
    df.apply(row_min_nonnull, axis=1) if onset_cols else np.nan
)
df["YEAR_ONSET"] = to_year(df["YEAR_ONSET"])

for c in ["PTDOBYY","PTEDUCAT"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)
if "PTMARRY" in df.columns:
    df["PTMARRY"]  = norm_codes_to_labels(df["PTMARRY"], marry_map)

date_col = "EXAMDATE" if "EXAMDATE" in df.columns else ("VISDATE" if "VISDATE" in df.columns else None)
has_visits = ("RID" in df.columns) and (date_col is not None)

if has_visits:
    print(f">> Modo VISITAS (usando {date_col} como fecha de visita)")
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df["EXAM_YEAR"] = df[date_col].dt.year
    df["EXAM_YEAR"] = to_year(df["EXAM_YEAR"])

    df["AGE_AT_VISIT"] = np.where(
        df["EXAM_YEAR"].notna() & df["PTDOBYY"].notna(),
        df["EXAM_YEAR"] - df["PTDOBYY"],
        np.nan
    )

    df["YEARS_TO_ONSET"] = np.where(
        df["YEAR_ONSET"].notna() & df["EXAM_YEAR"].notna(),
        df["YEAR_ONSET"] - df["EXAM_YEAR"],
        np.nan
    )

    df.loc[(df["YEARS_TO_ONSET"] < 0) & (df["YEAR_ONSET"].notna()), "YEARS_TO_ONSET"] = np.nan
    df.loc[df["YEARS_TO_ONSET"] > 50, "YEARS_TO_ONSET"] = np.nan

    df["HAS_LABEL"] = df["YEARS_TO_ONSET"].notna()

    num_cols = [c for c in ["AGE_AT_VISIT","PTEDUCAT"] if c in df.columns]
    cat_cols = [c for c in ["PTGENDER","PTMARRY"] if c in df.columns]

    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

    parts = []
    if num_cols:
        scaler = StandardScaler()
        X_num = pd.DataFrame(
            scaler.fit_transform(df[num_cols]),
            columns=num_cols, index=df.index
        )
        parts.append(X_num)
    if cat_cols:
        X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float)
        parts.append(X_cat)
    if not parts:
        raise ValueError("No hay columnas de features seleccionadas en modo visitas.")

    X = pd.concat(parts, axis=1).astype(float)

    y_full = df["YEARS_TO_ONSET"].astype(float)
    label_mask_np = df["HAS_LABEL"].fillna(False).to_numpy()
    label_mask = torch.tensor(label_mask_np, dtype=torch.bool)

    y_mu = float(y_full[label_mask_np].mean()) if label_mask_np.any() else 0.0
    y_std = float(y_full[label_mask_np].std(ddof=0)) if label_mask_np.any() else 1.0
    if y_std == 0 or np.isnan(y_std):
        y_std = 1.0

    y_scaled = (y_full - y_mu) / y_std
    y_t = torch.tensor(y_scaled.fillna(0).values, dtype=torch.float32)

    X_clean = X.replace([np.inf,-np.inf], np.nan).fillna(0.0)
    n_samples = X_clean.shape[0]
    k = min(8, max(1, n_samples - 1))
    n_neighbors = min(n_samples, k + 1)

    if n_samples >= 2:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        nbrs.fit(X_clean.values)
        _, idx = nbrs.kneighbors(X_clean.values)
        src_knn, dst_knn = [], []
        for i in range(idx.shape[0]):
            for j in idx[i, 1:]:
                src_knn.append(i); dst_knn.append(j)
        edge_knn = torch.tensor([src_knn, dst_knn], dtype=torch.long)
    else:
        edge_knn = torch.empty((2, 0), dtype=torch.long)

    if "RID" not in df.columns:
        raise ValueError("PTDEMOG no tiene RID; necesario para aristas temporales.")
    tmp = df.reset_index()[["index","RID", date_col]].dropna(subset=[date_col])
    tmp = tmp.sort_values(["RID", date_col])
    src_tmp, dst_tmp = [], []
    for rid, g in tmp.groupby("RID"):
        ids = g["index"].tolist()
        for a, b in zip(ids[:-1], ids[1:]):
            src_tmp.append(a); dst_tmp.append(b)
    edge_tmp = torch.tensor([src_tmp, dst_tmp], dtype=torch.long) if src_tmp else torch.empty((2,0), dtype=torch.long)

    def undirected(e): 
        return torch.cat([e, e.flip(0)], dim=1) if e.numel() else e

    edges = []
    if edge_knn.numel(): edges.append(undirected(edge_knn))
    if edge_tmp.numel(): edges.append(undirected(edge_tmp))
    edge_index = torch.cat(edges, dim=1) if edges else torch.empty((2,0), dtype=torch.long)
    if edge_index.numel():
        edge_index = torch.unique(edge_index, dim=1)

else:
    print(">> Modo PACIENTE (fallback: no hay EXAMDATE/VISDATE)")
    if "RID" not in df.columns:
        raise ValueError("PTDEMOG no tiene RID.")
    keep_cols = [c for c in ["RID","PTDOBYY","PTEDUCAT","PTGENDER","PTMARRY","YEAR_ONSET"] if c in df.columns]
    df = df[keep_cols].copy()

    def first_notnull(s):
        return next((v for v in s if pd.notna(v)), np.nan)
    agg = {"PTDOBYY":"median","PTEDUCAT":"median","PTGENDER":first_notnull,"PTMARRY":first_notnull,"YEAR_ONSET":"min"}
    df = df.groupby("RID", as_index=False).agg({k:v for k,v in agg.items() if k in df.columns})

    num_cols = [c for c in ["PTDOBYY","PTEDUCAT"] if c in df.columns]
    cat_cols = [c for c in ["PTGENDER","PTMARRY"] if c in df.columns]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

    parts = []
    if num_cols:
        scaler = StandardScaler()
        X_num = pd.DataFrame(scaler.fit_transform(df[num_cols]), columns=num_cols, index=df.index); parts.append(X_num)
    if cat_cols:
        X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float); parts.append(X_cat)
    if not parts:
        raise ValueError("No hay features en modo paciente.")
    X = pd.concat(parts, axis=1).astype(float)

    y_full = to_year(df["YEAR_ONSET"]).astype(float)
    label_mask_np = y_full.notna().to_numpy()
    label_mask = torch.tensor(label_mask_np, dtype=torch.bool)

    y_mu = float(y_full[label_mask_np].mean()) if label_mask_np.any() else 0.0
    y_std = float(y_full[label_mask_np].std(ddof=0)) if label_mask_np.any() else 1.0
    if y_std == 0 or np.isnan(y_std): y_std = 1.0

    y_scaled = (y_full - y_mu) / y_std
    y_t = torch.tensor(y_scaled.fillna(0).values, dtype=torch.float32)

    X_clean = X.replace([np.inf,-np.inf], np.nan).fillna(0.0)
    n_samples = X_clean.shape[0]
    k = min(8, max(1, n_samples - 1))
    n_neighbors = min(n_samples, k + 1)
    if n_samples >= 2:
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        nbrs.fit(X_clean.values)
        _, idx = nbrs.kneighbors(X_clean.values)
        src, dst = [], []
        for i in range(idx.shape[0]):
            for j in idx[i, 1:]:
                src.append(i); dst.append(j)
        edge_index = torch.tensor([src, dst], dtype=torch.long)
        edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

x = torch.tensor(X_clean.values, dtype=torch.float32)
n = x.size(0)

labeled_idx = torch.nonzero(label_mask, as_tuple=False).view(-1)
n_lab = labeled_idx.numel()
print(f"Nodos: {n} | etiquetados: {n_lab} | edges: {edge_index.size(1)}")

if n_lab > 0:
    perm = labeled_idx[torch.randperm(n_lab)]
    tr_n = max(1, int(0.7*n_lab)) if n_lab >= 3 else max(1, n_lab)
    va_n = max(0, int(0.15*n_lab)) if n_lab >= 7 else 0
    if tr_n + va_n > max(0, n_lab-1):
        va_n = max(0, n_lab - 1 - tr_n)

    train_idx = perm[:tr_n]
    val_idx   = perm[tr_n:tr_n+va_n]
    test_idx  = perm[tr_n+va_n:]
else:
    train_idx = val_idx = test_idx = torch.tensor([], dtype=torch.long)

train_mask = torch.zeros(n, dtype=torch.bool); train_mask[train_idx] = True
val_mask   = torch.zeros(n, dtype=torch.bool); val_mask[val_idx]     = True
test_mask  = torch.zeros(n, dtype=torch.bool); test_mask[test_idx]   = True

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
print(data)

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def eval_metrics(split="val", y_mu=0.0, y_std=1.0):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        mask = data.train_mask if split=="train" else (data.val_mask if split=="val" else data.test_mask)
        if not mask.any():
            return float("nan"), float("nan")
        mae_scaled = torch.mean(torch.abs(out[mask] - data.y[mask])).item()
        rmse_scaled = torch.sqrt(loss_fn(out[mask], data.y[mask])).item()
        mae_years = mae_scaled * y_std
        rmse_years = rmse_scaled * y_std
        return mae_years, rmse_years

for ep in range(1, 61):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    if data.train_mask.any():
        loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
        if torch.isnan(loss) or torch.isinf(loss):
            raise RuntimeError("Loss NaN/Inf; revisa X/y/fechas.")
        loss.backward()
        opt.step()
        loss_val = loss.detach().item()
    else:
        loss_val = float("nan")

    if ep % 10 == 0:
        tr_mae, tr_rmse = eval_metrics("train", y_mu, y_std)
        va_mae, va_rmse = eval_metrics("val", y_mu, y_std)
        te_mae, te_rmse = eval_metrics("test", y_mu, y_std)
        print(
            f"ep {ep:03d} | train_loss(MSE_scaled) {loss_val:.4f} "
            f"| TR MAE {tr_mae:.3f}y RMSE {tr_rmse:.3f}y "
            f"| VAL MAE {va_mae:.3f}y RMSE {va_rmse:.3f}y "
            f"| TEST MAE {te_mae:.3f}y RMSE {te_rmse:.3f}y"
        )


>> Modo VISITAS (usando VISDATE como fecha de visita)
Nodos: 6210 | etiquetados: 82 | edges: 72424
Data(x=[6210, 9], edge_index=[2, 72424], y=[6210], train_mask=[6210], val_mask=[6210], test_mask=[6210])
ep 010 | train_loss(MSE_scaled) 0.0090 | TR MAE 0.006y RMSE 0.007y | VAL MAE 0.092y RMSE 0.289y | TEST MAE 0.009y RMSE 0.013y
ep 020 | train_loss(MSE_scaled) 0.0024 | TR MAE 0.002y RMSE 0.003y | VAL MAE 0.087y RMSE 0.289y | TEST MAE 0.004y RMSE 0.005y
ep 030 | train_loss(MSE_scaled) 0.0022 | TR MAE 0.002y RMSE 0.003y | VAL MAE 0.086y RMSE 0.289y | TEST MAE 0.003y RMSE 0.004y
ep 040 | train_loss(MSE_scaled) 0.0012 | TR MAE 0.002y RMSE 0.003y | VAL MAE 0.087y RMSE 0.289y | TEST MAE 0.005y RMSE 0.007y
ep 050 | train_loss(MSE_scaled) 0.0014 | TR MAE 0.001y RMSE 0.002y | VAL MAE 0.085y RMSE 0.289y | TEST MAE 0.002y RMSE 0.003y
ep 060 | train_loss(MSE_scaled) 0.0009 | TR MAE 0.001y RMSE 0.002y | VAL MAE 0.085y RMSE 0.288y | TEST MAE 0.003y RMSE 0.004y


In [26]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)

def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {"1":"male","2":"female","male":"male","female":"female","m":"male","f":"female"}
marry_map  = {"1":"married","2":"widowed","3":"divorced","4":"never_married","6":"domestic_partnership"}

def to_year(s):
    s = pd.to_numeric(s, errors="coerce")
    s = s.where((s >= 1900) & (s <= 2100))
    return s

onset_cols = [c for c in ["PTCOGBEG","PTADBEG","PTADDX"] if c in df.columns]
for c in onset_cols:
    df[c] = to_year(df[c])

def row_min_nonnull(row):
    vals = [row[c] for c in onset_cols if pd.notna(row[c])]
    return min(vals) if vals else np.nan

df["YEAR_ONSET"] = df.apply(row_min_nonnull, axis=1) if onset_cols else np.nan
df["YEAR_ONSET"] = to_year(df["YEAR_ONSET"])

for c in ["PTDOBYY","PTEDUCAT"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)
if "PTMARRY" in df.columns:
    df["PTMARRY"]  = norm_codes_to_labels(df["PTMARRY"], marry_map)

date_col = "EXAMDATE" if "EXAMDATE" in df.columns else ("VISDATE" if "VISDATE" in df.columns else None)
has_visits = ("RID" in df.columns) and (date_col is not None)

if not has_visits:
    raise ValueError("No encuentro columna de fecha de visita (VISDATE/EXAMDATE) o RID en PTDEMOG.")

print(f">> Modo VISITAS (usando {date_col} como fecha de visita)")
df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
df["EXAM_YEAR"] = to_year(df[date_col].dt.year)

df["AGE_AT_VISIT"] = np.where(
    df["EXAM_YEAR"].notna() & df["PTDOBYY"].notna(),
    df["EXAM_YEAR"] - df["PTDOBYY"],
    np.nan
)

df["YEARS_TO_ONSET"] = np.where(
    df["YEAR_ONSET"].notna() & df["EXAM_YEAR"].notna(),
    df["YEAR_ONSET"] - df["EXAM_YEAR"],
    np.nan
)
df.loc[(df["YEARS_TO_ONSET"] < 0) & df["YEAR_ONSET"].notna(), "YEARS_TO_ONSET"] = np.nan
df.loc[df["YEARS_TO_ONSET"] > 50, "YEARS_TO_ONSET"] = np.nan

df["HAS_LABEL"] = df["YEARS_TO_ONSET"].notna()

num_cols = [c for c in ["AGE_AT_VISIT","PTEDUCAT"] if c in df.columns]
cat_cols = [c for c in ["PTGENDER","PTMARRY"] if c in df.columns]

for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

parts = []
if num_cols:
    scaler = StandardScaler()
    X_num = pd.DataFrame(
        scaler.fit_transform(df[num_cols]),
        columns=num_cols, index=df.index
    )
    parts.append(X_num)
if cat_cols:
    X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float)
    parts.append(X_cat)
if not parts:
    raise ValueError("No hay columnas de features seleccionadas en modo visitas.")

X = pd.concat(parts, axis=1).astype(float)
X_clean = X.replace([np.inf, -np.inf], np.nan).fillna(0.0)

n_samples = X_clean.shape[0]
k = min(8, max(1, n_samples - 1))
n_neighbors = min(n_samples, k + 1)

if n_samples >= 2:
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
    nbrs.fit(X_clean.values)
    _, idx = nbrs.kneighbors(X_clean.values)
    src_knn, dst_knn = [], []
    for i in range(idx.shape[0]):
        for j in idx[i, 1:]:
            src_knn.append(i); dst_knn.append(j)
    edge_knn = torch.tensor([src_knn, dst_knn], dtype=torch.long)
else:
    edge_knn = torch.empty((2, 0), dtype=torch.long)

tmp = df.reset_index()[["index","RID", date_col]].dropna(subset=[date_col]).sort_values(["RID", date_col])
src_tmp, dst_tmp = [], []
for rid, g in tmp.groupby("RID"):
    ids = g["index"].tolist()
    for a, b in zip(ids[:-1], ids[1:]):
        src_tmp.append(a); dst_tmp.append(b)
edge_tmp = torch.tensor([src_tmp, dst_tmp], dtype=torch.long) if src_tmp else torch.empty((2,0), dtype=torch.long)

def undirected(e): 
    return torch.cat([e, e.flip(0)], dim=1) if e.numel() else e

edges = []
if edge_knn.numel(): edges.append(undirected(edge_knn))
if edge_tmp.numel(): edges.append(undirected(edge_tmp))
edge_index = torch.cat(edges, dim=1) if edges else torch.empty((2,0), dtype=torch.long)
if edge_index.numel():
    edge_index = torch.unique(edge_index, dim=1)

print(f"Nodos: {len(df)} | etiquetados (pre-evento): {int(df['HAS_LABEL'].sum())} | edges totales: {edge_index.size(1)}")

df["USE_FOR_LABEL"] = False
if df["HAS_LABEL"].any():
    idx_last_pre = df.loc[df["HAS_LABEL"]].groupby("RID")[date_col].idxmax()
    df.loc[idx_last_pre, "USE_FOR_LABEL"] = True

rids_with_label = df.loc[df["USE_FOR_LABEL"], "RID"].dropna().unique()
rng = np.random.default_rng(42)
rng.shuffle(rids_with_label)
n_lab_rids = len(rids_with_label)
tr_n = max(1, int(0.7 * n_lab_rids))
va_n = max(0, int(0.15 * n_lab_rids))
if tr_n + va_n > max(0, n_lab_rids - 1):
    va_n = max(0, n_lab_rids - 1 - tr_n)

train_rids = set(rids_with_label[:tr_n])
val_rids   = set(rids_with_label[tr_n:tr_n+va_n])
test_rids  = set(rids_with_label[tr_n+va_n:])

node_split = np.full(len(df), "train", dtype=object)
node_rids = df["RID"].to_numpy()
node_split[np.isin(node_rids, list(val_rids))]  = "val"
node_split[np.isin(node_rids, list(test_rids))] = "test"

use_for_label = df["USE_FOR_LABEL"].to_numpy()
train_mask_np = (node_split == "train") & use_for_label
val_mask_np   = (node_split == "val")   & use_for_label
test_mask_np  = (node_split == "test")  & use_for_label

split_map = {"train":0, "val":1, "test":2}
split_idx = np.vectorize(split_map.get)(node_split)
src_np = edge_index[0].cpu().numpy()
dst_np = edge_index[1].cpu().numpy()
keep_edges = split_idx[src_np] == split_idx[dst_np]
edge_index = edge_index[:, torch.tensor(keep_edges)]

y_full = df["YEARS_TO_ONSET"].astype(float)  # NaNs fuera de USE_FOR_LABEL no importan
y_mu  = float(y_full[train_mask_np].mean()) if train_mask_np.any() else 0.0
y_std = float(y_full[train_mask_np].std(ddof=0)) if train_mask_np.any() else 1.0
if not np.isfinite(y_std) or y_std == 0.0:
    y_std = 1.0

y_scaled = (y_full - y_mu) / y_std
y_t = torch.tensor(y_scaled.fillna(0).values, dtype=torch.float32)

x = torch.tensor(X_clean.values, dtype=torch.float32)
train_mask = torch.tensor(train_mask_np, dtype=torch.bool)
val_mask   = torch.tensor(val_mask_np,   dtype=torch.bool)
test_mask  = torch.tensor(test_mask_np,  dtype=torch.bool)

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

print(f"RIDs con etiqueta: {n_lab_rids} | "
      f"train_rids={len(train_rids)}, val_rids={len(val_rids)}, test_rids={len(test_rids)}")
print(f"Labels -> train:{train_mask.sum().item()} val:{val_mask.sum().item()} test:{test_mask.sum().item()}")
print(f"Edges intra-split: {edge_index.size(1)}")

Y_MEAN_TRAIN, Y_STD_TRAIN = y_mu, y_std

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def eval_metrics(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        mask = data.train_mask if split=="train" else (data.val_mask if split=="val" else data.test_mask)
        if not mask.any():
            return float("nan"), float("nan")
        mae_scaled = torch.mean(torch.abs(out[mask] - data.y[mask])).item()
        rmse_scaled = torch.sqrt(loss_fn(out[mask], data.y[mask])).item()
        return mae_scaled * Y_STD_TRAIN, rmse_scaled * Y_STD_TRAIN  # en AÑOS

for ep in range(1, 61):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    if data.train_mask.any():
        loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
        if torch.isnan(loss) or torch.isinf(loss):
            raise RuntimeError("Loss NaN/Inf; revisa X/y/fechas.")
        loss.backward()
        opt.step()
        loss_val = loss.detach().item()
    else:
        loss_val = float("nan")

    if ep % 10 == 0:
        tr_mae, tr_rmse = eval_metrics("train")
        va_mae, va_rmse = eval_metrics("val")
        te_mae, te_rmse = eval_metrics("test")
        print(
            f"ep {ep:03d} | train_loss(MSE_scaled) {loss_val:.4f} "
            f"| TR MAE {tr_mae:.3f}y RMSE {tr_rmse:.3f}y "
            f"| VAL MAE {va_mae:.3f}y RMSE {va_rmse:.3f}y "
            f"| TEST MAE {te_mae:.3f}y RMSE {te_rmse:.3f}y"
        )


>> Modo VISITAS (usando VISDATE como fecha de visita)
Nodos: 6210 | etiquetados (pre-evento): 82 | edges totales: 72424
RIDs con etiqueta: 82 | train_rids=57, val_rids=12, test_rids=13
Labels -> train:57 val:12 test:13
Edges intra-split: 71756
ep 010 | train_loss(MSE_scaled) 0.8334 | TR MAE 0.046y RMSE 0.120y | VAL MAE 0.038y RMSE 0.056y | TEST MAE 0.033y RMSE 0.061y
ep 020 | train_loss(MSE_scaled) 0.7920 | TR MAE 0.044y RMSE 0.115y | VAL MAE 0.036y RMSE 0.049y | TEST MAE 0.042y RMSE 0.072y
ep 030 | train_loss(MSE_scaled) 0.7105 | TR MAE 0.040y RMSE 0.110y | VAL MAE 0.047y RMSE 0.068y | TEST MAE 0.043y RMSE 0.089y
ep 040 | train_loss(MSE_scaled) 0.6600 | TR MAE 0.041y RMSE 0.105y | VAL MAE 0.049y RMSE 0.086y | TEST MAE 0.049y RMSE 0.114y
ep 050 | train_loss(MSE_scaled) 0.5895 | TR MAE 0.037y RMSE 0.099y | VAL MAE 0.069y RMSE 0.110y | TEST MAE 0.056y RMSE 0.135y
ep 060 | train_loss(MSE_scaled) 0.5556 | TR MAE 0.036y RMSE 0.095y | VAL MAE 0.068y RMSE 0.119y | TEST MAE 0.060y RMSE 0.155y


In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)

def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {"1":"male","2":"female","male":"male","female":"female","m":"male","f":"female"}
marry_map  = {"1":"married","2":"widowed","3":"divorced","4":"never_married","6":"domestic_partnership"}

def to_year(s):
    s = pd.to_numeric(s, errors="coerce")
    s = s.where((s >= 1900) & (s <= 2100))
    return s

onset_cols = [c for c in ["PTCOGBEG","PTADBEG","PTADDX"] if c in df.columns]
for c in onset_cols:
    df[c] = to_year(df[c])

def row_min_nonnull(row):
    vals = [row[c] for c in onset_cols if pd.notna(row[c])]
    return min(vals) if vals else np.nan

df["YEAR_ONSET"] = df.apply(row_min_nonnull, axis=1) if onset_cols else np.nan
df["YEAR_ONSET"] = to_year(df["YEAR_ONSET"])

for c in ["PTDOBYY","PTEDUCAT"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)
if "PTMARRY" in df.columns:
    df["PTMARRY"]  = norm_codes_to_labels(df["PTMARRY"], marry_map)

date_col = "EXAMDATE" if "EXAMDATE" in df.columns else ("VISDATE" if "VISDATE" in df.columns else None)
has_visits = ("RID" in df.columns) and (date_col is not None)
if not has_visits:
    raise ValueError("No encuentro columna de fecha de visita (VISDATE/EXAMDATE) o RID en PTDEMOG.")

print(f">> Modo VISITAS (usando {date_col} como fecha de visita)")
df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
df["EXAM_YEAR"] = to_year(df[date_col].dt.year)

df["AGE_AT_VISIT"] = np.where(
    df["EXAM_YEAR"].notna() & df["PTDOBYY"].notna(),
    df["EXAM_YEAR"] - df["PTDOBYY"],
    np.nan
)

df["ONSET_DATE"] = pd.NaT
mask_on = df["YEAR_ONSET"].notna()
df.loc[mask_on, "ONSET_DATE"] = pd.to_datetime(
    df.loc[mask_on, "YEAR_ONSET"].astype(int).astype(str) + "-07-01",
    errors="coerce"
)
df["YEARS_TO_ONSET"] = np.where(
    mask_on & df[date_col].notna(),
    (df["ONSET_DATE"] - df[date_col]).dt.days / 365.25,
    np.nan
)
df.loc[df["YEARS_TO_ONSET"] < 0, "YEARS_TO_ONSET"] = np.nan
df.loc[df["YEARS_TO_ONSET"] > 15, "YEARS_TO_ONSET"] = np.nan
df.drop(columns=["ONSET_DATE"], inplace=True)

df["HAS_LABEL"] = df["YEARS_TO_ONSET"].notna()

num_cols = [c for c in ["AGE_AT_VISIT","PTEDUCAT"] if c in df.columns]
cat_cols = [c for c in ["PTGENDER","PTMARRY"] if c in df.columns]

for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

parts = []
if num_cols:
    scaler = StandardScaler()
    X_num = pd.DataFrame(
        scaler.fit_transform(df[num_cols]),
        columns=num_cols, index=df.index
    )
    parts.append(X_num)
if cat_cols:
    X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float)
    parts.append(X_cat)
if not parts:
    raise ValueError("No hay columnas de features seleccionadas en modo visitas.")

X = pd.concat(parts, axis=1).astype(float)
X_clean = X.replace([np.inf, -np.inf], np.nan).fillna(0.0)

n_samples = X_clean.shape[0]
k = min(8, max(1, n_samples - 1))
n_neighbors = min(n_samples, k + 1)

if n_samples >= 2:
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
    nbrs.fit(X_clean.values)
    _, idx = nbrs.kneighbors(X_clean.values)
    src_knn, dst_knn = [], []
    for i in range(idx.shape[0]):
        for j in idx[i, 1:]:
            src_knn.append(i); dst_knn.append(j)
    edge_knn = torch.tensor([src_knn, dst_knn], dtype=torch.long)
else:
    edge_knn = torch.empty((2, 0), dtype=torch.long)

tmp = df.reset_index()[["index","RID", date_col]].dropna(subset=[date_col]).sort_values(["RID", date_col])
src_tmp, dst_tmp = [], []
for rid, g in tmp.groupby("RID"):
    ids = g["index"].tolist()
    for a, b in zip(ids[:-1], ids[1:]):
        src_tmp.append(a); dst_tmp.append(b)
edge_tmp = torch.tensor([src_tmp, dst_tmp], dtype=torch.long) if src_tmp else torch.empty((2,0), dtype=torch.long)

def undirected(e): 
    return torch.cat([e, e.flip(0)], dim=1) if e.numel() else e

edges = []
if edge_knn.numel(): edges.append(undirected(edge_knn))
if edge_tmp.numel(): edges.append(undirected(edge_tmp))
edge_index = torch.cat(edges, dim=1) if edges else torch.empty((2,0), dtype=torch.long)
if edge_index.numel():
    edge_index = torch.unique(edge_index, dim=1)

print(f"Nodos: {len(df)} | etiquetados (pre-evento): {int(df['HAS_LABEL'].sum())} | edges totales: {edge_index.size(1)}")

df["USE_FOR_LABEL"] = False
if df["HAS_LABEL"].any():
    idx_last_pre = df.loc[df["HAS_LABEL"]].groupby("RID")[date_col].idxmax()
    df.loc[idx_last_pre, "USE_FOR_LABEL"] = True

rids_with_label = df.loc[df["USE_FOR_LABEL"], "RID"].dropna().unique()
rng = np.random.default_rng(42)
rng.shuffle(rids_with_label)
n_lab_rids = len(rids_with_label)
tr_n = max(1, int(0.7 * n_lab_rids))
va_n = max(0, int(0.15 * n_lab_rids))
if tr_n + va_n > max(0, n_lab_rids - 1):
    va_n = max(0, n_lab_rids - 1 - tr_n)

train_rids = set(rids_with_label[:tr_n])
val_rids   = set(rids_with_label[tr_n:tr_n+va_n])
test_rids  = set(rids_with_label[tr_n+va_n:])

node_split = np.full(len(df), "train", dtype=object)
node_rids = df["RID"].to_numpy()
node_split[np.isin(node_rids, list(val_rids))]  = "val"
node_split[np.isin(node_rids, list(test_rids))] = "test"

use_for_label = df["USE_FOR_LABEL"].to_numpy()
train_mask_np = (node_split == "train") & use_for_label
val_mask_np   = (node_split == "val")   & use_for_label
test_mask_np  = (node_split == "test")  & use_for_label

split_map = {"train":0, "val":1, "test":2}
split_idx = np.vectorize(split_map.get)(node_split)
src_np = edge_index[0].cpu().numpy()
dst_np = edge_index[1].cpu().numpy()
keep_edges = split_idx[src_np] == split_idx[dst_np]
edge_index = edge_index[:, torch.tensor(keep_edges)]

y_full = df["YEARS_TO_ONSET"].astype(float)  # años (fraccional)
y_mu  = float(y_full[train_mask_np].mean()) if train_mask_np.any() else 0.0
y_std = float(y_full[train_mask_np].std(ddof=0)) if train_mask_np.any() else 1.0
if not np.isfinite(y_std) or y_std == 0.0:
    y_std = 1.0

y_scaled = (y_full - y_mu) / y_std
y_t = torch.tensor(y_scaled.fillna(0).values, dtype=torch.float32)

x = torch.tensor(X_clean.values, dtype=torch.float32)
train_mask = torch.tensor(train_mask_np, dtype=torch.bool)
val_mask   = torch.tensor(val_mask_np,   dtype=torch.bool)
test_mask  = torch.tensor(test_mask_np,  dtype=torch.bool)

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

print(f"RIDs con etiqueta: {n_lab_rids} | "
      f"train_rids={len(train_rids)}, val_rids={len(val_rids)}, test_rids={len(test_rids)}")
print(f"Labels -> train:{train_mask.sum().item()} val:{val_mask.sum().item()} test:{test_mask.sum().item()}")
print(f"Edges intra-split: {edge_index.size(1)}")
Y_MEAN_TRAIN, Y_STD_TRAIN = y_mu, y_std

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=32, dropout=0.5):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")
model = GNNRegressor(in_ch=data.num_node_features, hid=32, dropout=0.5).to(device)
data = data.to(device)

opt = torch.optim.AdamW(model.parameters(), lr=5e-3, weight_decay=2e-4)
loss_fn = nn.MSELoss()

def eval_metrics(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        mask = data.train_mask if split=="train" else (data.val_mask if split=="val" else data.test_mask)
        if not mask.any():
            return float("nan"), float("nan")
        mae_scaled = torch.mean(torch.abs(out[mask] - data.y[mask])).item()
        rmse_scaled = torch.sqrt(loss_fn(out[mask], data.y[mask])).item()
        return mae_scaled * Y_STD_TRAIN, rmse_scaled * Y_STD_TRAIN  # años

best_val_rmse = float("inf"); bad = 0; patience = 10
best_state = None

for ep in range(1, 121):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    if data.train_mask.any():
        loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
        if torch.isnan(loss) or torch.isinf(loss): raise RuntimeError("Loss NaN/Inf; revisa X/y/fechas.")
        loss.backward()
        opt.step()
        loss_val = loss.detach().item()
    else:
        loss_val = float("nan")

    if ep % 10 == 0:
        tr_mae, tr_rmse = eval_metrics("train")
        va_mae, va_rmse = eval_metrics("val")
        te_mae, te_rmse = eval_metrics("test")
        print(f"ep {ep:03d} | train_loss(MSE_scaled) {loss_val:.4f} "
              f"| TR MAE {tr_mae:.3f}y RMSE {tr_rmse:.3f}y "
              f"| VAL MAE {va_mae:.3f}y RMSE {va_rmse:.3f}y "
              f"| TEST MAE {te_mae:.3f}y RMSE {te_rmse:.3f}y")

    _, val_rmse = eval_metrics("val")
    if np.isfinite(val_rmse) and (val_rmse + 1e-4 < best_val_rmse):
        best_val_rmse = val_rmse
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        bad = 0
    else:
        bad += 1
        if bad >= patience:
            break

if best_state:
    model.load_state_dict(best_state)
tr_mae, tr_rmse = eval_metrics("train")
va_mae, va_rmse = eval_metrics("val")
te_mae, te_rmse = eval_metrics("test")
print(f"[BEST] TR MAE {tr_mae:.3f}y RMSE {tr_rmse:.3f}y | VAL MAE {va_mae:.3f}y RMSE {va_rmse:.3f}y | TEST MAE {te_mae:.3f}y RMSE {te_rmse:.3f}y")

y_years = y_full  # ya en años
def baseline_mae(y, mask, const):
    arr = y[mask].to_numpy()
    if arr.size == 0: return float("nan")
    return float(np.mean(np.abs(arr - const)))

train_mask_np = train_mask.cpu().numpy()
val_mask_np   = val_mask.cpu().numpy()
test_mask_np  = test_mask.cpu().numpy()

for split, m in [("VAL", val_mask_np), ("TEST", test_mask_np)]:
    mae_mean = baseline_mae(y_years, m, Y_MEAN_TRAIN)
    mae_zero = baseline_mae(y_years, m, 0.0)
    print(f"[Baseline {split}]  mean(TRAIN) MAE={mae_mean:.3f}y | zero MAE={mae_zero:.3f}y")


>> Modo VISITAS (usando VISDATE como fecha de visita)
Nodos: 6210 | etiquetados (pre-evento): 23 | edges totales: 72424
RIDs con etiqueta: 23 | train_rids=16, val_rids=3, test_rids=4
Labels -> train:16 val:3 test:4
Edges intra-split: 72206
ep 010 | train_loss(MSE_scaled) 1.0048 | TR MAE 0.115y RMSE 0.133y | VAL MAE 0.073y RMSE 0.081y | TEST MAE 0.188y RMSE 0.264y
ep 020 | train_loss(MSE_scaled) 0.8418 | TR MAE 0.104y RMSE 0.120y | VAL MAE 0.071y RMSE 0.073y | TEST MAE 0.172y RMSE 0.245y
ep 030 | train_loss(MSE_scaled) 0.6665 | TR MAE 0.089y RMSE 0.111y | VAL MAE 0.061y RMSE 0.073y | TEST MAE 0.161y RMSE 0.227y
[BEST] TR MAE 0.092y RMSE 0.113y | VAL MAE 0.063y RMSE 0.072y | TEST MAE 0.164y RMSE 0.231y
[Baseline VAL]  mean(TRAIN) MAE=0.077y | zero MAE=0.159y
[Baseline TEST]  mean(TRAIN) MAE=0.190y | zero MAE=0.313y


In [28]:
import random
from sklearn.metrics import roc_auc_score, average_precision_score

device = torch.device("cpu")

tmp = df.reset_index()[["index", "RID", date_col]].dropna(subset=[date_col]).sort_values(["RID", date_col])
pos_src, pos_dst = [], []
for rid, g in tmp.groupby("RID"):
    ids = g["index"].tolist()
    for a, b in zip(ids[:-1], ids[1:]):
        pos_src += [a, b]   # bidireccional
        pos_dst += [b, a]
pos_edge = torch.tensor([pos_src, pos_dst], dtype=torch.long)
print(f"[SSL] Positives (temporal): {pos_edge.size(1)}")

rids = df["RID"].dropna().unique()
rng = np.random.default_rng(123)
rng.shuffle(rids)
n = len(rids)
tr_n, va_n = max(1, int(0.7*n)), max(1, int(0.15*n))
if tr_n + va_n > n - 1: va_n = max(1, n - 1 - tr_n)
train_rids = set(rids[:tr_n]); val_rids = set(rids[tr_n:tr_n+va_n]); test_rids = set(rids[tr_n+va_n:])

node_split = np.full(len(df), "train", dtype=object)
node_rids = df["RID"].to_numpy()
node_split[np.isin(node_rids, list(val_rids))]  = "val"
node_split[np.isin(node_rids, list(test_rids))] = "test"

def mask_edges_by_split(e, split):
    src, dst = e[0].numpy(), e[1].numpy()
    keep = (np.array([node_split[s] for s in src]) == split) & (np.array([node_split[d] for d in dst]) == split)
    keep = torch.tensor(keep)
    return e[:, keep]

pos_train = mask_edges_by_split(pos_edge, "train")
pos_val   = mask_edges_by_split(pos_edge, "val")
pos_test  = mask_edges_by_split(pos_edge, "test")
print(f"[SSL] Pos train:{pos_train.size(1)} val:{pos_val.size(1)} test:{pos_test.size(1)}")

pos_set = set((int(s), int(d)) for s, d in zip(pos_edge[0].tolist(), pos_edge[1].tolist()))

def sample_negatives(num_edges, split):
    idxs = np.where(node_split == split)[0]
    neg = []
    tries = 0
    needed = num_edges
    while len(neg) < needed and tries < needed * 50:
        a, b = np.random.choice(idxs, size=2, replace=False)
        tries += 1
        if (a, b) in pos_set or (b, a) in pos_set: 
            continue
        neg.append((a, b))
    if not neg:
        return torch.empty((2,0), dtype=torch.long)
    arr = np.array(neg[:needed])
    return torch.tensor(arr.T, dtype=torch.long)

class GCNEncoder(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.4):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, hid)
        self.dropout = dropout
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x  # embeddings

def dot_decode(z, edges):
    return (z[edges[0]] * z[edges[1]]).sum(dim=1)  # logits

encoder = GCNEncoder(in_ch=X_clean.shape[1], hid=64, dropout=0.4).to(device)
x = torch.tensor(X_clean.values, dtype=torch.float32, device=device)
ei = edge_index.to(device)

opt = torch.optim.AdamW(encoder.parameters(), lr=5e-3, weight_decay=2e-4)
bce = nn.BCEWithLogitsLoss()

def eval_link(z, pos_e, split):
    if pos_e.size(1) == 0:
        return float("nan"), float("nan")
    neg_e = sample_negatives(pos_e.size(1), split)
    if neg_e.numel() == 0:
        return float("nan"), float("nan")
    neg_e = neg_e.to(z.device)
    with torch.no_grad():
        pos_s = torch.sigmoid(dot_decode(z, pos_e.to(z.device))).cpu().numpy()
        neg_s = torch.sigmoid(dot_decode(z, neg_e)).cpu().numpy()
    y_true = np.concatenate([np.ones_like(pos_s), np.zeros_like(neg_s)])
    y_score = np.concatenate([pos_s, neg_s])
    return roc_auc_score(y_true, y_score), average_precision_score(y_true, y_score)

best_val_auc, patience, bad = -1.0, 10, 0
best_state = None

for ep in range(1, 121):
    encoder.train(); opt.zero_grad()
    z = encoder(x, ei)
    neg_train = sample_negatives(pos_train.size(1), "train").to(device)
    if neg_train.numel() == 0:
        break
    pos_logits = dot_decode(z, pos_train.to(device))
    neg_logits = dot_decode(z, neg_train)
    y = torch.cat([torch.ones_like(pos_logits), torch.zeros_like(neg_logits)])
    logits = torch.cat([pos_logits, neg_logits])
    loss = bce(logits, y)
    loss.backward(); opt.step()

    if ep % 10 == 0:
        encoder.eval()
        with torch.no_grad():
            z = encoder(x, ei)
        tr_auc, tr_ap = eval_link(z, pos_train, "train")
        va_auc, va_ap = eval_link(z, pos_val,   "val")
        te_auc, te_ap = eval_link(z, pos_test,  "test")
        print(f"ep {ep:03d} | loss {loss.item():.4f} "
              f"| TR AUC {tr_auc:.3f} AP {tr_ap:.3f} "
              f"| VAL AUC {va_auc:.3f} AP {va_ap:.3f} "
              f"| TEST AUC {te_auc:.3f} AP {te_ap:.3f}")

        if np.isfinite(va_auc) and va_auc > best_val_auc + 1e-4:
            best_val_auc, bad = va_auc, 0
            best_state = {k: v.detach().cpu().clone() for k, v in encoder.state_dict().items()}
        else:
            bad += 1
            if bad >= patience:
                break

if best_state is not None:
    encoder.load_state_dict(best_state)

encoder.eval()
with torch.no_grad():
    Z = encoder(x, ei).cpu().numpy()

emb_df = pd.DataFrame(Z, index=df.index)
emb_df.insert(0, "RID", df["RID"].values)
emb_df.insert(1, "VISDATE", pd.to_datetime(df[date_col]).dt.date.astype("string"))
emb_df.to_csv("visit_embeddings_ssl.csv", index=False)
print("Embeddings guardados en visit_embeddings_ssl.csv")


[SSL] Positives (temporal): 2528
[SSL] Pos train:1784 val:350 test:394
ep 010 | loss 0.5473 | TR AUC 0.894 AP 0.876 | VAL AUC 0.902 AP 0.886 | TEST AUC 0.901 AP 0.884
ep 020 | loss 0.5279 | TR AUC 0.902 AP 0.886 | VAL AUC 0.899 AP 0.888 | TEST AUC 0.897 AP 0.881
ep 030 | loss 0.5023 | TR AUC 0.911 AP 0.888 | VAL AUC 0.893 AP 0.884 | TEST AUC 0.918 AP 0.907
ep 040 | loss 0.4895 | TR AUC 0.918 AP 0.904 | VAL AUC 0.890 AP 0.873 | TEST AUC 0.920 AP 0.912
ep 050 | loss 0.4847 | TR AUC 0.926 AP 0.912 | VAL AUC 0.948 AP 0.944 | TEST AUC 0.938 AP 0.925
ep 060 | loss 0.4781 | TR AUC 0.937 AP 0.921 | VAL AUC 0.935 AP 0.923 | TEST AUC 0.934 AP 0.911
ep 070 | loss 0.4928 | TR AUC 0.931 AP 0.915 | VAL AUC 0.929 AP 0.916 | TEST AUC 0.939 AP 0.926
ep 080 | loss 0.4684 | TR AUC 0.946 AP 0.934 | VAL AUC 0.944 AP 0.937 | TEST AUC 0.944 AP 0.936
ep 090 | loss 0.4755 | TR AUC 0.942 AP 0.930 | VAL AUC 0.950 AP 0.941 | TEST AUC 0.925 AP 0.904
ep 100 | loss 0.4539 | TR AUC 0.944 AP 0.927 | VAL AUC 0.940 AP 0