# GNN con Datos DemogrÃ¡ficos (Baseline)
Este notebook es el **modelo baseline** que usa Ãºnicamente datos demogrÃ¡ficos:
- AGE_AT_VISIT (Edad en la visita)
- PTEDUCAT (AÃ±os de educaciÃ³n)
- PTGENDER (GÃ©nero)
- PTMARRY (Estado civil)

**Objetivo**: Predecir aÃ±os hasta el inicio de sÃ­ntomas de Alzheimer usando GNN

Este modelo servirÃ¡ como **referencia** para comparar con modelos que incluyen biomarcadores.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

torch.manual_seed(42)
np.random.seed(42)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
csv_path = "./data/adni/demographics/PTDEMOG.csv"
df = pd.read_csv(csv_path)

print(f"Demographics loaded: {df.shape}")
print(f"Columns: {list(df.columns)[:10]}...")

Demographics loaded: (6210, 84)
Columns: ['PHASE', 'PTID', 'RID', 'VISCODE', 'VISCODE2', 'VISDATE', 'PTSOURCE', 'PTGENDER', 'PTDOB', 'PTDOBYY']...


In [3]:
def norm_codes_to_labels(s: pd.Series, mapping: dict) -> pd.Series:
    out = s.astype(str).str.strip().str.replace(r"\.0$", "", regex=True)
    out = out.map(mapping)
    return out

gender_map = {"1":"male","2":"female","male":"male","female":"female","m":"male","f":"female"}
marry_map  = {"1":"married","2":"widowed","3":"divorced","4":"never_married","6":"domestic_partnership"}

def to_year(s):
    s = pd.to_numeric(s, errors="coerce")
    s = s.where((s >= 1900) & (s <= 2100))
    return s

print("Utility functions defined.")

Utility functions defined.


In [4]:

onset_cols = [c for c in ["PTCOGBEG","PTADBEG","PTADDX"] if c in df.columns]
for c in onset_cols:
    df[c] = to_year(df[c])

def row_min_nonnull(row):
    vals = [row[c] for c in onset_cols if pd.notna(row[c])]
    return min(vals) if vals else np.nan

df["YEAR_ONSET"] = df.apply(row_min_nonnull, axis=1) if onset_cols else np.nan
df["YEAR_ONSET"] = to_year(df["YEAR_ONSET"])

for c in ["PTDOBYY","PTEDUCAT"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

if "PTGENDER" in df.columns:
    df["PTGENDER"] = norm_codes_to_labels(df["PTGENDER"], gender_map)
if "PTMARRY" in df.columns:
    df["PTMARRY"]  = norm_codes_to_labels(df["PTMARRY"], marry_map)

print(f"\nData processed. Shape: {df.shape}")
print(f"Patients with YEAR_ONSET: {df['YEAR_ONSET'].notna().sum()}")


Data processed. Shape: (6210, 85)
Patients with YEAR_ONSET: 2908


In [5]:

date_col = "EXAMDATE" if "EXAMDATE" in df.columns else ("VISDATE" if "VISDATE" in df.columns else None)
if not date_col:
    raise ValueError("No se encuentra columna de fecha (VISDATE/EXAMDATE)")

df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
df["EXAM_YEAR"] = to_year(df[date_col].dt.year)

df["AGE_AT_VISIT"] = np.where(
    df["EXAM_YEAR"].notna() & df["PTDOBYY"].notna(),
    df["EXAM_YEAR"] - df["PTDOBYY"],
    np.nan
)

df["YEARS_TO_ONSET"] = np.where(
    df["YEAR_ONSET"].notna() & df["EXAM_YEAR"].notna(),
    df["YEAR_ONSET"] - df["EXAM_YEAR"],
    np.nan
)

df.loc[(df["YEARS_TO_ONSET"] < 0) & df["YEAR_ONSET"].notna(), "YEARS_TO_ONSET"] = np.nan
df.loc[df["YEARS_TO_ONSET"] > 50, "YEARS_TO_ONSET"] = np.nan

df["HAS_LABEL"] = df["YEARS_TO_ONSET"].notna()

print(f"\nVisits prepared: {len(df)}")
print(f"Labeled visits (with YEARS_TO_ONSET): {df['HAS_LABEL'].sum()}")
print(f"Using date column: {date_col}")


Visits prepared: 6210
Labeled visits (with YEARS_TO_ONSET): 82
Using date column: VISDATE


In [6]:

num_cols = [c for c in ["AGE_AT_VISIT", "PTEDUCAT"] if c in df.columns]

cat_cols = [c for c in ["PTGENDER", "PTMARRY"] if c in df.columns]

for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

parts = []

if num_cols:
    scaler = StandardScaler()
    X_num = pd.DataFrame(
        scaler.fit_transform(df[num_cols]),
        columns=num_cols,
        index=df.index
    )
    parts.append(X_num)
    print(f"Numeric features ({len(num_cols)}): {num_cols}")

if cat_cols:
    X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols, drop_first=False, dtype=float)
    parts.append(X_cat)
    print(f"Categorical features ({len(cat_cols)}): {cat_cols} -> {X_cat.shape[1]} one-hot columns")

if not parts:
    raise ValueError("No features available")

X = pd.concat(parts, axis=1).astype(float)
X_clean = X.replace([np.inf, -np.inf], np.nan).fillna(0.0)

print(f"\nðŸ“Š BASELINE Feature matrix (Demographics only): {X_clean.shape}")
print(f"Total features: {X_clean.shape[1]}")
print(f"Feature list: {list(X_clean.columns)}")

Numeric features (2): ['AGE_AT_VISIT', 'PTEDUCAT']
Categorical features (2): ['PTGENDER', 'PTMARRY'] -> 7 one-hot columns

ðŸ“Š BASELINE Feature matrix (Demographics only): (6210, 9)
Total features: 9
Feature list: ['AGE_AT_VISIT', 'PTEDUCAT', 'PTGENDER_female', 'PTGENDER_male', 'PTMARRY_divorced', 'PTMARRY_domestic_partnership', 'PTMARRY_married', 'PTMARRY_never_married', 'PTMARRY_widowed']


In [7]:

n_samples = X_clean.shape[0]
k = min(8, max(1, n_samples - 1))
n_neighbors = min(n_samples, k + 1)

if n_samples >= 2:
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
    nbrs.fit(X_clean.values)
    _, idx = nbrs.kneighbors(X_clean.values)
    src_knn, dst_knn = [], []
    for i in range(idx.shape[0]):
        for j in idx[i, 1:]:
            src_knn.append(i)
            dst_knn.append(j)
    edge_knn = torch.tensor([src_knn, dst_knn], dtype=torch.long)
else:
    edge_knn = torch.empty((2, 0), dtype=torch.long)

tmp = df.reset_index()[["index", "RID", date_col]].dropna(subset=[date_col]).sort_values(["RID", date_col])
src_tmp, dst_tmp = [], []
for rid, g in tmp.groupby("RID"):
    ids = g["index"].tolist()
    for a, b in zip(ids[:-1], ids[1:]):
        src_tmp.append(a)
        dst_tmp.append(b)
edge_tmp = torch.tensor([src_tmp, dst_tmp], dtype=torch.long) if src_tmp else torch.empty((2,0), dtype=torch.long)

def undirected(e):
    return torch.cat([e, e.flip(0)], dim=1) if e.numel() else e

edges = []
if edge_knn.numel(): edges.append(undirected(edge_knn))
if edge_tmp.numel(): edges.append(undirected(edge_tmp))
edge_index = torch.cat(edges, dim=1) if edges else torch.empty((2,0), dtype=torch.long)
if edge_index.numel():
    edge_index = torch.unique(edge_index, dim=1)

print(f"\nGraph constructed:")
print(f"  Nodes: {len(df)}")
print(f"  kNN edges: {edge_knn.size(1)}")
print(f"  Temporal edges: {edge_tmp.size(1)}")
print(f"  Total edges (undirected, unique): {edge_index.size(1)}")


Graph constructed:
  Nodes: 6210
  kNN edges: 49680
  Temporal edges: 1264
  Total edges (undirected, unique): 72424


In [8]:

df["USE_FOR_LABEL"] = False
if df["HAS_LABEL"].any():
    idx_last_pre = df.loc[df["HAS_LABEL"]].groupby("RID")[date_col].idxmax()
    df.loc[idx_last_pre, "USE_FOR_LABEL"] = True

rids_with_label = df.loc[df["USE_FOR_LABEL"], "RID"].dropna().unique()
rng = np.random.default_rng(42)
rng.shuffle(rids_with_label)
n_lab_rids = len(rids_with_label)

tr_n = max(1, int(0.7 * n_lab_rids))
va_n = max(0, int(0.15 * n_lab_rids))
if tr_n + va_n > max(0, n_lab_rids - 1):
    va_n = max(0, n_lab_rids - 1 - tr_n)

train_rids = set(rids_with_label[:tr_n])
val_rids   = set(rids_with_label[tr_n:tr_n+va_n])
test_rids  = set(rids_with_label[tr_n+va_n:])

node_split = np.full(len(df), "train", dtype=object)
node_rids = df["RID"].to_numpy()
node_split[np.isin(node_rids, list(val_rids))]  = "val"
node_split[np.isin(node_rids, list(test_rids))] = "test"

use_for_label = df["USE_FOR_LABEL"].to_numpy()
train_mask_np = (node_split == "train") & use_for_label
val_mask_np   = (node_split == "val")   & use_for_label
test_mask_np  = (node_split == "test")  & use_for_label

split_map = {"train":0, "val":1, "test":2}
split_idx = np.vectorize(split_map.get)(node_split)
src_np = edge_index[0].cpu().numpy()
dst_np = edge_index[1].cpu().numpy()
keep_edges = split_idx[src_np] == split_idx[dst_np]
edge_index = edge_index[:, torch.tensor(keep_edges)]

print(f"\nSplits created:")
print(f"  RIDs with labels: {n_lab_rids}")
print(f"  Train RIDs: {len(train_rids)}")
print(f"  Val RIDs: {len(val_rids)}")
print(f"  Test RIDs: {len(test_rids)}")
print(f"\n  Labeled nodes:")
print(f"    Train: {train_mask_np.sum()}")
print(f"    Val: {val_mask_np.sum()}")
print(f"    Test: {test_mask_np.sum()}")
print(f"\n  Edges (intra-split): {edge_index.size(1)}")


Splits created:
  RIDs with labels: 82
  Train RIDs: 57
  Val RIDs: 12
  Test RIDs: 13

  Labeled nodes:
    Train: 57
    Val: 12
    Test: 13

  Edges (intra-split): 71756


In [9]:

y_full = df["YEARS_TO_ONSET"].astype(float)
y_mu  = float(y_full[train_mask_np].mean()) if train_mask_np.any() else 0.0
y_std = float(y_full[train_mask_np].std(ddof=0)) if train_mask_np.any() else 1.0
if not np.isfinite(y_std) or y_std == 0.0:
    y_std = 1.0

y_scaled = (y_full - y_mu) / y_std
y_t = torch.tensor(y_scaled.fillna(0).values, dtype=torch.float32)

x = torch.tensor(X_clean.values, dtype=torch.float32)
train_mask = torch.tensor(train_mask_np, dtype=torch.bool)
val_mask   = torch.tensor(val_mask_np,   dtype=torch.bool)
test_mask  = torch.tensor(test_mask_np,  dtype=torch.bool)

data = Data(x=x, edge_index=edge_index, y=y_t,
            train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

print(f"\nData object created:")
print(f"  x.shape: {data.x.shape}")
print(f"  edge_index.shape: {data.edge_index.shape}")
print(f"  y.shape: {data.y.shape}")
print(f"  Target scaling: mean={y_mu:.3f}, std={y_std:.3f}")

Y_MEAN_TRAIN, Y_STD_TRAIN = y_mu, y_std


Data object created:
  x.shape: torch.Size([6210, 9])
  edge_index.shape: torch.Size([2, 71756])
  y.shape: torch.Size([6210])
  Target scaling: mean=0.018, std=0.131


In [10]:

class GNNRegressor(nn.Module):
    def __init__(self, in_ch, hid=64, dropout=0.3):
        super().__init__()
        self.c1 = GCNConv(in_ch, hid)
        self.c2 = GCNConv(hid, 1)
        self.dropout = dropout
        
    def forward(self, x, edge_index):
        x = F.relu(self.c1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.c2(x, edge_index)
        return x.squeeze(-1)

device = torch.device("cpu")
model = GNNRegressor(in_ch=data.num_node_features, hid=64, dropout=0.3).to(device)
data = data.to(device)

print(f"\nðŸ§  Model created (BASELINE - Demographics only):")
print(f"  Input features: {data.num_node_features}")
print(f"  Hidden dim: 64")
print(f"  Architecture: GCNConv -> ReLU -> Dropout -> GCNConv -> Output")
print(f"  Model parameters: {sum(p.numel() for p in model.parameters())}")


ðŸ§  Model created (BASELINE - Demographics only):
  Input features: 9
  Hidden dim: 64
  Architecture: GCNConv -> ReLU -> Dropout -> GCNConv -> Output
  Model parameters: 705


In [11]:

opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()

def eval_metrics(split="val"):
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index)
        mask = data.train_mask if split=="train" else (data.val_mask if split=="val" else data.test_mask)
        if not mask.any():
            return float("nan"), float("nan")
        mae_scaled = torch.mean(torch.abs(out[mask] - data.y[mask])).item()
        rmse_scaled = torch.sqrt(loss_fn(out[mask], data.y[mask])).item()
        return mae_scaled * Y_STD_TRAIN, rmse_scaled * Y_STD_TRAIN

print("\nðŸš€ Starting training...\n")

for ep in range(1, 101):
    model.train()
    opt.zero_grad()
    out = model(data.x, data.edge_index)
    
    if data.train_mask.any():
        loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
        if torch.isnan(loss) or torch.isinf(loss):
            raise RuntimeError("Loss NaN/Inf")
        loss.backward()
        opt.step()
        loss_val = loss.detach().item()
    else:
        loss_val = float("nan")
    
    if ep % 10 == 0:
        tr_mae, tr_rmse = eval_metrics("train")
        va_mae, va_rmse = eval_metrics("val")
        te_mae, te_rmse = eval_metrics("test")
        print(
            f"ep {ep:03d} | train_loss(MSE) {loss_val:.4f} "
            f"| TR MAE {tr_mae:.3f}y RMSE {tr_rmse:.3f}y "
            f"| VAL MAE {va_mae:.3f}y RMSE {va_rmse:.3f}y "
            f"| TEST MAE {te_mae:.3f}y RMSE {te_rmse:.3f}y"
        )

print("\nâœ… Training completed!")


ðŸš€ Starting training...

ep 010 | train_loss(MSE) 0.8084 | TR MAE 0.046y RMSE 0.119y | VAL MAE 0.039y RMSE 0.055y | TEST MAE 0.035y RMSE 0.060y
ep 020 | train_loss(MSE) 0.7994 | TR MAE 0.044y RMSE 0.114y | VAL MAE 0.036y RMSE 0.053y | TEST MAE 0.042y RMSE 0.074y
ep 030 | train_loss(MSE) 0.7035 | TR MAE 0.042y RMSE 0.109y | VAL MAE 0.043y RMSE 0.068y | TEST MAE 0.049y RMSE 0.096y
ep 040 | train_loss(MSE) 0.6928 | TR MAE 0.039y RMSE 0.104y | VAL MAE 0.056y RMSE 0.088y | TEST MAE 0.052y RMSE 0.115y
ep 050 | train_loss(MSE) 0.5927 | TR MAE 0.038y RMSE 0.099y | VAL MAE 0.065y RMSE 0.105y | TEST MAE 0.060y RMSE 0.136y
ep 060 | train_loss(MSE) 0.5710 | TR MAE 0.035y RMSE 0.095y | VAL MAE 0.070y RMSE 0.120y | TEST MAE 0.061y RMSE 0.156y
ep 070 | train_loss(MSE) 0.5550 | TR MAE 0.035y RMSE 0.092y | VAL MAE 0.065y RMSE 0.122y | TEST MAE 0.067y RMSE 0.167y
ep 080 | train_loss(MSE) 0.5292 | TR MAE 0.034y RMSE 0.089y | VAL MAE 0.075y RMSE 0.139y | TEST MAE 0.069y RMSE 0.180y
ep 090 | train_loss(

In [12]:

model.eval()
with torch.no_grad():
    final_out = model(data.x, data.edge_index)

tr_mae, tr_rmse = eval_metrics("train")
va_mae, va_rmse = eval_metrics("val")
te_mae, te_rmse = eval_metrics("test")

print("\n" + "="*70)
print("ðŸ“Š BASELINE RESULTS (Demographics Only)")
print("="*70)
print(f"Features used: AGE_AT_VISIT, PTEDUCAT, PTGENDER, PTMARRY")
print(f"Total features: {data.num_node_features}")
print("\nPerformance:")
print(f"  TRAIN | MAE: {tr_mae:.3f} years | RMSE: {tr_rmse:.3f} years")
print(f"  VAL   | MAE: {va_mae:.3f} years | RMSE: {va_rmse:.3f} years")
print(f"  TEST  | MAE: {te_mae:.3f} years | RMSE: {te_rmse:.3f} years")
print("="*70)

baseline_results = {
    'model': 'Demographics Only (Baseline)',
    'features': data.num_node_features,
    'train_mae': tr_mae,
    'train_rmse': tr_rmse,
    'val_mae': va_mae,
    'val_rmse': va_rmse,
    'test_mae': te_mae,
    'test_rmse': te_rmse
}

print("\nðŸ’¡ Next step: Run DemoAndBiomarkers.ipynb to see improvement with CSF biomarkers!")


ðŸ“Š BASELINE RESULTS (Demographics Only)
Features used: AGE_AT_VISIT, PTEDUCAT, PTGENDER, PTMARRY
Total features: 9

Performance:
  TRAIN | MAE: 0.038 years | RMSE: 0.087 years
  VAL   | MAE: 0.090 years | RMSE: 0.175 years
  TEST  | MAE: 0.083 years | RMSE: 0.205 years

ðŸ’¡ Next step: Run DemoAndBiomarkers.ipynb to see improvement with CSF biomarkers!


In [13]:

import json

with open('baseline_demographics_results.json', 'w') as f:
    json.dump(baseline_results, f, indent=2)

print("\nâœ… Results saved to: baseline_demographics_results.json")
print("\nYou can compare these results with DemoAndBiomarkers.ipynb")


âœ… Results saved to: baseline_demographics_results.json

You can compare these results with DemoAndBiomarkers.ipynb
