<a href="https://colab.research.google.com/github/JesiyaFernandes/heartattackrisk/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import json
import pandas as pd

json_path = "/content/drive/MyDrive/data_info.json"   # update path
csv_path  = "/content/drive/MyDrive/data_csv.csv"

# The JSON file is a dict keyed by patient ID:
# { "2491006": { ... }, "3730004": { ... }, ... }
with open(json_path, "r", encoding="utf-8") as f:
    raw = json.load(f)

# (JSON key holding the image filename, eye code written to the CSV)
EYE_KEYS = (("right_eye", "R"), ("left_eye", "L"))

rows = []
for patient_id, info in raw.items():
    # Patient-level fields, repeated on every eye row of this patient.
    shared = {
        "gender": info.get("gender"),
        "thickness": info.get("thickness"),
        "label": info.get("label"),       # 0/1 risk label
        "group": info.get("group"),
        "True_age": info.get("True_age"),
        "age_norm": info.get("age"),      # normalized age if present
    }

    # One output row per eye that has a non-empty filename.
    for eye_key, eye_code in EYE_KEYS:
        filename = info.get(eye_key)
        if not filename:
            continue
        rows.append({
            "PatientID": patient_id,
            "eye": eye_code,
            "filename": filename,
            **shared,
        })

# Build the table in one go and write it out.
df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)
print("Saved:", csv_path)
print(df.head())


Saved: /content/drive/MyDrive/data_csv.csv
  PatientID eye       filename  gender  thickness  label  group  True_age  \
0   2491006   R  2491006_R.png       0        0.8      0      1        63   
1   2491006   L  2491006_L.png       0        0.8      0      1        63   
2   3730004   R  3730004_R.png       1        1.2      1      1        61   
3   3730004   L  3730004_L.png       1        1.2      1      1        61   
4   3730006   R  3730006_R.png       1        1.2      1      1        64   

   age_norm  
0  0.684932  
1  0.684932  
2  0.657534  
3  0.657534  
4  0.698630  


In [3]:
"""
Preprocessing pipeline for retinal fundus dataset
- CSV cleaning and encoding
- Patient-level train/val/test split
- Image cropping, resizing, and normalization-ready storage

Author: <your name>
"""

import os
from pathlib import Path
import shutil
import random

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# -----------------------------
# Configuration
# -----------------------------
DATA_ROOT = Path("/content/drive/MyDrive/Fundus_CIMT_2903_cropped_512")  # folder containing images
# Read the CSV from the same Drive location the JSON -> CSV cell writes to,
# so the notebook works on a fresh runtime without a manual copy into /content.
CSV_PATH = Path("/content/drive/MyDrive/data_csv.csv")
OUTPUT_ROOT = Path("/content/drive/MyDrive/processed_dataset")

IMAGE_EXT = ".png"
IMAGE_SIZE = 384              # final square size (e.g. 224, 299, 384)
RANDOM_SEED = 42
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15              # test will be 1 - TRAIN_RATIO - VAL_RATIO

# Column names in your CSV (adjust if different)
COL_PATIENT_ID = "PatientID"
COL_FILENAME = "filename"
COL_EYE = "eye"               # 'L' or 'R'
COL_GENDER = "gender"         # 0/1 or M/F
COL_LABEL = "label"           # risk label
COL_GROUP = "group"           # optional extra grouping

# -----------------------------
# Utility functions
# -----------------------------

def set_seed(seed: int = 42):
    """Seed both Python's `random` module and NumPy's global RNG with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def load_and_clean_csv(csv_path: Path) -> pd.DataFrame:
    """
    Read the metadata CSV and return only rows that point at an existing image.

    Steps: drop exact duplicate rows and rows missing a patient ID or filename,
    cast both key columns to `str`, upper-case/strip the eye column, map textual
    gender values to 0/1, and add an `image_path` column (DATA_ROOT / filename).
    Rows whose image file does not exist on disk are dropped with a warning.
    """
    table = (
        pd.read_csv(csv_path)
        .drop_duplicates()
        .dropna(subset=[COL_PATIENT_ID, COL_FILENAME])
    )

    # Key columns are always handled as strings downstream.
    for key_col in (COL_PATIENT_ID, COL_FILENAME):
        table[key_col] = table[key_col].astype(str)

    if COL_EYE in table.columns:
        table[COL_EYE] = table[COL_EYE].str.upper().str.strip()

    # Only textual gender columns are encoded; numeric ones are left untouched.
    # Values outside the mapping become NaN.
    if COL_GENDER in table.columns:
        gender_is_numeric = np.issubdtype(table[COL_GENDER].dtype, np.number)
        if not gender_is_numeric:
            gender_codes = {"M": 1, "MALE": 1, "F": 0, "FEMALE": 0}
            normalized = table[COL_GENDER].str.upper().str.strip()
            table[COL_GENDER] = normalized.map(gender_codes)

    table["image_path"] = table[COL_FILENAME].map(lambda name: str(DATA_ROOT / name))

    # Temporary flag column used to report and filter missing image files.
    table["image_exists"] = table["image_path"].map(os.path.exists)
    missing = table[~table["image_exists"]]
    if len(missing) > 0:
        print(f"Warning: {len(missing)} entries without images will be dropped.")
        table = table[table["image_exists"]]

    return table.drop(columns=["image_exists"])


def crop_fundus_circle(img: np.ndarray, padding: int = 10, threshold: int = 10) -> np.ndarray:
    """
    Crop an image to the bounding box of its non-dark region.

    The image is converted to grayscale, blurred with a 5x5 Gaussian, and
    thresholded; the bounding rectangle of all pixels above the threshold is
    expanded by `padding` pixels on each side (clamped to the image borders)
    and used as the crop window.

    Args:
        img: Image array. 3-channel input is treated as BGR and 4-channel as
            BGRA; 2-D or single-channel input is used as grayscale directly.
        padding: Pixels added around the detected bounding box.
        threshold: Gray level (after blurring) above which a pixel counts as
            foreground. Defaults to 10, the value previously hard-coded.

    Returns:
        The cropped view of `img`, or `img` unchanged when no pixel exceeds
        the threshold.
    """
    # Pick the grayscale conversion that matches the channel layout, so that
    # non-BGR inputs do not crash inside cvtColor.
    if img.ndim == 2:
        gray = img
    elif img.shape[2] == 1:
        gray = np.ascontiguousarray(img[:, :, 0])
    elif img.shape[2] == 4:
        gray = cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Blur first so isolated noisy pixels do not enlarge the bounding box.
    gray_blur = cv2.GaussianBlur(gray, (5, 5), 0)
    _, mask = cv2.threshold(gray_blur, threshold, 255, cv2.THRESH_BINARY)

    coords = cv2.findNonZero(mask)
    if coords is None:
        return img  # fallback: nothing above threshold, return original

    x, y, w, h = cv2.boundingRect(coords)

    # Clamp the padded box to the image borders.
    x0 = max(x - padding, 0)
    y0 = max(y - padding, 0)
    x1 = min(x + w + padding, img.shape[1])
    y1 = min(y + h + padding, img.shape[0])

    return img[y0:y1, x0:x1]


def preprocess_image(path: str, image_size: int = 384) -> np.ndarray:
    """
    Read one image, crop away its dark border, and resize it to a square.

    The file is read as a 3-channel colour image, cropped with
    `crop_fundus_circle` (padding of 10 pixels), and resized to
    `image_size` x `image_size` using area interpolation. The resized array
    is returned as-is; no intensity normalization is applied here.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file at `path`.
    """
    loaded = cv2.imread(path, cv2.IMREAD_COLOR)
    if loaded is None:
        raise FileNotFoundError(f"Could not read image: {path}")

    target_shape = (image_size, image_size)
    return cv2.resize(
        crop_fundus_circle(loaded, padding=10),
        target_shape,
        interpolation=cv2.INTER_AREA,
    )


def patient_level_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Partition `df` into train/val/test frames by patient ID.

    Unique patient IDs (not rows) are split, so every row of a given patient
    ends up in exactly one of the three returned frames. Split sizes come from
    TRAIN_RATIO and VAL_RATIO; both shuffles use RANDOM_SEED.

    Returns:
        (train_df, val_df, test_df), each with a fresh 0..n-1 index.
    """
    patient_ids = df[COL_PATIENT_ID].unique()

    # Stage 1: carve the training patients off from everyone else.
    holdout_fraction = 1 - TRAIN_RATIO
    train_ids, holdout_ids = train_test_split(
        patient_ids, test_size=holdout_fraction, random_state=RANDOM_SEED, shuffle=True
    )

    # Stage 2: divide the held-out patients between validation and test.
    val_share = VAL_RATIO / holdout_fraction
    val_ids, test_ids = train_test_split(
        holdout_ids, test_size=(1 - val_share), random_state=RANDOM_SEED, shuffle=True
    )

    def _rows_for(ids):
        """Select all rows belonging to the given patient IDs."""
        return df[df[COL_PATIENT_ID].isin(ids)].reset_index(drop=True)

    train_df, val_df, test_df = (_rows_for(ids) for ids in (train_ids, val_ids, test_ids))

    print(f"Patients: train={len(train_ids)}, val={len(val_ids)}, test={len(test_ids)}")
    print(f"Images:   train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

    return train_df, val_df, test_df


def save_split_and_images(
    df: pd.DataFrame,
    split_name: str,
    output_root: Path,
    image_size: int = 384,
    limit: int | None = None,
):
    """
    Preprocess the images of one split and write them plus a metadata CSV.

    Output layout:
        output_root/split_name/images/<original filename>
        output_root/split_name/<split_name>_metadata.csv

    Args:
        df: Rows of this split; must contain an `image_path` column.
        split_name: Name of the split, used for the folder and CSV names.
        output_root: Root directory under which the split folder is created.
        image_size: Side length passed to `preprocess_image`.
        limit: If given, only the first `limit` rows are processed.

    Rows whose image cannot be preprocessed or written are reported and left
    out of the metadata CSV. The caller's `df` is never modified.
    """
    split_root = output_root / split_name
    img_out_dir = split_root / "images"
    img_out_dir.mkdir(parents=True, exist_ok=True)  # also creates split_root

    # Always work on a copy: adding `processed_path` below must not leak a new
    # column into the caller's DataFrame (it used to when limit was None).
    rows = (df if limit is None else df.iloc[:limit]).copy()

    processed_paths = []
    for src in rows["image_path"]:
        # Preserve the original filename inside the split's images folder.
        dst = img_out_dir / os.path.basename(src)

        try:
            img = preprocess_image(src, image_size=image_size)
            # cv2.imwrite signals failure by returning False instead of
            # raising, so turn that into an error the handler below reports.
            if not cv2.imwrite(str(dst), img):
                raise OSError(f"cv2.imwrite failed for {dst}")
            processed_paths.append(str(dst.relative_to(split_root)))
        except Exception as e:
            print(f"[{split_name}] Skipping {src}: {e}")
            processed_paths.append(None)

    rows["processed_path"] = processed_paths
    rows = rows.dropna(subset=["processed_path"])

    # Save CSV for this split
    csv_out = split_root / f"{split_name}_metadata.csv"
    rows.to_csv(csv_out, index=False)
    print(f"Saved {len(rows)} records for {split_name} to {csv_out}")


# -----------------------------
# Main entrypoint
# -----------------------------
if __name__ == "__main__":
    set_seed(RANDOM_SEED)

    # 1. Load & clean CSV
    df_all = load_and_clean_csv(CSV_PATH)
    print(f"Total valid rows after cleaning: {len(df_all)}")

    # 2. Sanity check: class balance, when a label column is present
    if COL_LABEL in df_all.columns:
        print("Label distribution:")
        print(df_all[COL_LABEL].value_counts(dropna=False))

    # 3. Patient-level split
    train_df, val_df, test_df = patient_level_split(df_all)

    # 4. Preprocess and save every split in full.
    #    Pass `limit=<n>` to save_split_and_images for a quick partial run.
    OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
        save_split_and_images(split_df, split_name, OUTPUT_ROOT, image_size=IMAGE_SIZE)

    print("Preprocessing completed.")


Total valid rows after cleaning: 5806
Label distribution:
label
1    4108
0    1698
Name: count, dtype: int64
Patients: train=2032, val=435, test=436
Images:   train=4064, val=870, test=872
Saved 4064 records for train to /content/drive/MyDrive/processed_dataset/train/train_metadata.csv
Saved 870 records for val to /content/drive/MyDrive/processed_dataset/val/val_metadata.csv
Saved 872 records for test to /content/drive/MyDrive/processed_dataset/test/test_metadata.csv
Preprocessing completed.


In [2]:
"""
Pure image feature extraction for retinal fundus dataset.
Assumes directory structure:
  PROCESSED_ROOT/
      train/images/*.png
      val/images/*.png
      test/images/*.png

Outputs:
  PROCESSED_ROOT/{split}/{split}_features.npy
  PROCESSED_ROOT/{split}/{split}_filenames.npy
"""

import os
from pathlib import Path
import numpy as np
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights

import cv2

# -----------------------------
# Configuration
# -----------------------------
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Root folder holding the {train,val,test}/images directories; the extracted
# feature and filename arrays are written next to them.
PROCESSED_ROOT = Path("/content/drive/MyDrive/processed_dataset")
BATCH_SIZE = 32     # images per forward pass
NUM_WORKERS = 2     # DataLoader worker processes
IMAGE_SIZE = 384    # side length the transform resizes every image to
# Width of the feature vector produced once the classifier head is replaced by
# an identity layer. Within this cell it is only mentioned in a shape comment.
FEATURE_DIM = 1280  # EfficientNet-B0 last conv feature size

# -----------------------------
# Dataset and transforms
# -----------------------------

class FundusImageOnlyDataset(Dataset):
    """
    Dataset over the image files found directly inside one directory.

    Files ending in .png, .jpg or .jpeg (case-insensitive) are collected in
    sorted order. Each item is `(image, filename)`: the image is read with
    OpenCV, converted from BGR to RGB, and passed through `transform` when
    one is given.
    """

    def __init__(self, images_dir: Path, transform=None):
        self.images_dir = images_dir
        self.transform = transform

        allowed_suffixes = [".png", ".jpg", ".jpeg"]
        candidates = images_dir.glob("*")
        self.image_paths = sorted(
            filter(lambda entry: entry.suffix.lower() in allowed_suffixes, candidates)
        )

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_file = self.image_paths[idx]

        bgr = cv2.imread(str(image_file), cv2.IMREAD_COLOR)
        if bgr is None:
            # OpenCV returns None instead of raising for unreadable files.
            raise FileNotFoundError(str(image_file))
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

        sample = rgb if self.transform is None else self.transform(rgb)
        return sample, str(image_file.name)

# Transform applied to each RGB array returned by the dataset: convert to a PIL
# image, resize to IMAGE_SIZE x IMAGE_SIZE, convert to a float tensor, and
# normalize each channel with the ImageNet mean/std that the pretrained
# EfficientNet weights expect.
image_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# -----------------------------
# Feature extractor model
# -----------------------------

def build_feature_extractor():
    """
    Return an ImageNet-pretrained EfficientNet-B0 set up as a feature extractor.

    The classifier head is swapped for an identity layer so the forward pass
    yields the features that would normally feed the head. The model is put in
    eval mode and moved to DEVICE.
    """
    backbone = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
    backbone.classifier = nn.Identity()
    return backbone.eval().to(DEVICE)

# -----------------------------
# Extraction loop
# -----------------------------

def extract_features_from_folder(split_name: str, model=None):
    """
    Extract one feature vector per image of a split and save them to disk.

    Reads images from PROCESSED_ROOT/<split_name>/images and writes
        PROCESSED_ROOT/<split_name>/<split_name>_features.npy
        PROCESSED_ROOT/<split_name>/<split_name>_filenames.npy
    where row i of the feature array belongs to entry i of the filename array.

    Args:
        split_name: Name of the split folder (e.g. "train").
        model: Optional, already-built feature extractor to reuse across
            calls. When omitted a new one is built via
            `build_feature_extractor()`.

    Raises:
        ValueError: if the images folder contains no supported image files.
    """
    images_dir = PROCESSED_ROOT / split_name / "images"
    dataset = FundusImageOnlyDataset(images_dir, transform=image_transform)
    if len(dataset) == 0:
        # Fail with the folder name rather than np.concatenate's opaque
        # "need at least one array to concatenate" further down.
        raise ValueError(f"No images found in {images_dir}")

    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,                    # keep features aligned with filenames
        num_workers=NUM_WORKERS,
        pin_memory=(DEVICE == "cuda"),    # pinning only helps host->GPU copies
    )

    if model is None:
        model = build_feature_extractor()
    model.eval()

    all_features = []
    all_filenames = []

    with torch.no_grad():
        for imgs, names in tqdm(dataloader, desc=f"{split_name} feature extraction"):
            imgs = imgs.to(DEVICE, non_blocking=True)
            feats = model(imgs)               # (B, FEATURE_DIM)
            all_features.append(feats.cpu().numpy())
            all_filenames.extend(names)

    all_features = np.concatenate(all_features, axis=0)

    # Save
    split_root = PROCESSED_ROOT / split_name
    split_root.mkdir(parents=True, exist_ok=True)

    feat_path = split_root / f"{split_name}_features.npy"
    names_path = split_root / f"{split_name}_filenames.npy"

    np.save(feat_path, all_features)
    np.save(names_path, np.array(all_filenames))

    print(f"{split_name}: features shape = {all_features.shape}")
    print(f"Saved features to {feat_path}")
    print(f"Saved filenames to {names_path}")


if __name__ == "__main__":
    # Run the extractor once per dataset split, in pipeline order.
    for split in ("train", "val", "test"):
        extract_features_from_folder(split)
    print("Done.")


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|██████████| 20.5M/20.5M [00:00<00:00, 134MB/s] 
train feature extraction: 100%|██████████| 127/127 [03:00<00:00,  1.42s/it]


train: features shape = (4064, 1280)
Saved features to /content/drive/MyDrive/processed_dataset/train/train_features.npy
Saved filenames to /content/drive/MyDrive/processed_dataset/train/train_filenames.npy


val feature extraction: 100%|██████████| 28/28 [01:11<00:00,  2.54s/it]


val: features shape = (870, 1280)
Saved features to /content/drive/MyDrive/processed_dataset/val/val_features.npy
Saved filenames to /content/drive/MyDrive/processed_dataset/val/val_filenames.npy


test feature extraction: 100%|██████████| 28/28 [00:53<00:00,  1.92s/it]

test: features shape = (872, 1280)
Saved features to /content/drive/MyDrive/processed_dataset/test/test_features.npy
Saved filenames to /content/drive/MyDrive/processed_dataset/test/test_filenames.npy
Done.





In [5]:

import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# -----------------------------
# Config
# -----------------------------
# Input: per-image metadata CSV. Output: the same table with the latent
# columns f0..f{LATENT_DIM-1} appended.
CSV_PATH = Path("/content/drive/MyDrive/data_csv.csv")          # update if needed
OUT_CSV_PATH = Path("/content/drive/MyDrive/data_csv_normalized.csv")

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 256
NUM_EPOCHS = 30
LR = 1e-3        # Adam learning rate
LATENT_DIM = 8   # size of normalized feature vector (change as you like)

# Numeric CSV columns fed to the autoencoder.
# NOTE(review): `thickness` sits next to `label` in the CSV; if the label is
# derived from thickness, encoding it here leaks the target into f0..f7 --
# confirm before trusting downstream metrics.
# NOTE(review): `True_age` and `age_norm` look like the same quantity on two
# scales -- confirm both are wanted.
NUM_COLS = ["gender", "thickness", "True_age", "age_norm"]

# -----------------------------
# Dataset
# -----------------------------
class TabularFeatureDataset(Dataset):
    """Dataset wrapping a feature matrix; item `i` is row `i` as float32."""

    def __init__(self, x: np.ndarray):
        # Store a float32 copy so batches match the model's parameter dtype.
        self.x = x.astype("float32")

    def __len__(self):
        n_rows = self.x.shape[0]
        return n_rows

    def __getitem__(self, idx):
        row = self.x[idx]
        return row


# -----------------------------
# FC autoencoder normalizer
# -----------------------------
class TabularNormalizer(nn.Module):
    """
    Small fully connected autoencoder for tabular features.

    Encoder: in_dim -> 32 (BatchNorm + ReLU) -> latent_dim.
    Decoder: latent_dim -> 32 (ReLU) -> in_dim.
    `forward` returns both the latent code and the reconstruction.
    """

    def __init__(self, in_dim: int, latent_dim: int):
        super().__init__()
        hidden = 32

        encoder_layers = [
            nn.Linear(in_dim, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(latent_dim, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, in_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        latent = self.encoder(x)
        return latent, self.decoder(latent)


# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    # Fix the RNG so weight init and batch shuffling -- and therefore the
    # f0..f{LATENT_DIM-1} columns written below -- are the same on every run.
    SEED = 42
    torch.manual_seed(SEED)

    # 1) Load CSV and extract numeric features
    df = pd.read_csv(CSV_PATH)

    # Fail early with a clear message instead of training on bad input:
    # a missing column raises deep inside pandas, and a single NaN turns the
    # loss (and every latent feature) into NaN without any error.
    missing_cols = [col for col in NUM_COLS if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns missing from {CSV_PATH}: {missing_cols}")
    nan_counts = df[NUM_COLS].isna().sum()
    if nan_counts.any():
        raise ValueError(
            f"NaN values in feature columns: {nan_counts[nan_counts > 0].to_dict()}"
        )

    # NOTE(review): inputs are fed unscaled, so the column with the largest
    # range dominates the MSE objective -- consider standardizing first.
    x = df[NUM_COLS].values
    in_dim = x.shape[1]

    dataset = TabularFeatureDataset(x)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # 2) Train normalizer (autoencoder) to reconstruct its own input
    model = TabularNormalizer(in_dim, LATENT_DIM).to(DEVICE)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    model.train()
    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0.0
        for batch in loader:
            batch = batch.to(DEVICE)
            z, recon = model(batch)
            loss = criterion(recon, batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-row mean.
            epoch_loss += loss.item() * batch.size(0)

        epoch_loss /= len(dataset)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, recon MSE: {epoch_loss:.6f}")

    # 3) Get normalized features for all rows (unshuffled, so that row i of
    #    the encoded matrix lines up with row i of the DataFrame)
    full_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    model.eval()
    all_z = []

    with torch.no_grad():
        for batch in tqdm(full_loader, desc="Encoding all rows"):
            batch = batch.to(DEVICE)
            z, _ = model(batch)
            all_z.append(z.cpu().numpy())

    all_z = np.concatenate(all_z, axis=0)

    # 4) Save back to CSV
    for i in range(LATENT_DIM):
        df[f"f{i}"] = all_z[:, i]

    df.to_csv(OUT_CSV_PATH, index=False)
    print(f"Saved normalized features to {OUT_CSV_PATH}")


Epoch 1/30, recon MSE: 615.450817
Epoch 2/30, recon MSE: 603.846178
Epoch 3/30, recon MSE: 578.942717
Epoch 4/30, recon MSE: 521.525258
Epoch 5/30, recon MSE: 403.560743
Epoch 6/30, recon MSE: 236.124893
Epoch 7/30, recon MSE: 114.552533
Epoch 8/30, recon MSE: 73.702872
Epoch 9/30, recon MSE: 55.952132
Epoch 10/30, recon MSE: 42.705575
Epoch 11/30, recon MSE: 30.719453
Epoch 12/30, recon MSE: 22.714344
Epoch 13/30, recon MSE: 16.780765
Epoch 14/30, recon MSE: 13.177212
Epoch 15/30, recon MSE: 10.894963
Epoch 16/30, recon MSE: 9.137455
Epoch 17/30, recon MSE: 7.937293
Epoch 18/30, recon MSE: 6.279897
Epoch 19/30, recon MSE: 5.870447
Epoch 20/30, recon MSE: 5.070012
Epoch 21/30, recon MSE: 5.211785
Epoch 22/30, recon MSE: 4.806831
Epoch 23/30, recon MSE: 3.736278
Epoch 24/30, recon MSE: 3.341305
Epoch 25/30, recon MSE: 2.984862
Epoch 26/30, recon MSE: 3.323820
Epoch 27/30, recon MSE: 3.351059
Epoch 28/30, recon MSE: 2.664830
Epoch 29/30, recon MSE: 2.644258
Epoch 30/30, recon MSE: 2.1571

Encoding all rows: 100%|██████████| 23/23 [00:00<00:00, 729.12it/s]

Saved normalized features to /content/drive/MyDrive/data_csv_normalized.csv





In [6]:
"""
Multimodal feature fusion and classification for heart attack risk.

Inputs per split (train/val/test):
  - Image features: {split}_features.npy  (N, 1280)
  - Filenames:      {split}_filenames.npy
  - Clinical + normalized features CSV: /content/drive/MyDrive/data_csv_normalized.csv
       columns: PatientID, eye, filename, gender, thickness, label, group,
                True_age, age_norm, f0..f7

Steps:
  1. Join image features with clinical latent features f0..f7 using 'filename'.
  2. Concatenate -> fused feature of size 1280 + 8 = 1288.
  3. Train FC classifier on train, validate on val, evaluate on test.

Author: <your name>
"""

import numpy as np
import pandas as pd
from pathlib import Path
from typing import Tuple

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score

# -----------------------------
# Paths and config
# -----------------------------
# Root folder holding the per-split feature and filename arrays.
BASE = Path("/content/drive/MyDrive/processed_dataset")

# Per-split inputs: an image-feature matrix and the matching filename array.
TRAIN_FEATS = BASE / "train" / "train_features.npy"
TRAIN_NAMES = BASE / "train" / "train_filenames.npy"

VAL_FEATS   = BASE / "val" / "val_features.npy"
VAL_NAMES   = BASE / "val" / "val_filenames.npy"

TEST_FEATS  = BASE / "test" / "test_features.npy"
TEST_NAMES  = BASE / "test" / "test_filenames.npy"

# CSV providing the label and the clinical latent columns for every filename.
CLIN_CSV = Path("/content/drive/MyDrive/data_csv_normalized.csv")

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64
LR = 1e-3          # Adam learning rate
NUM_EPOCHS = 30
IMAGE_FEAT_DIM = 1280   # columns expected in each *_features.npy
CLIN_FEAT_COLS = [f"f{i}" for i in range(8)]   # latent columns f0..f7 in CLIN_CSV
# Width of the classifier input: image features followed by clinical latents.
FUSED_DIM = IMAGE_FEAT_DIM + len(CLIN_FEAT_COLS)

# -----------------------------
# Data preparation
# -----------------------------

def load_split(
    feat_path: Path,
    name_path: Path,
    clin_df: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """
    For one split, align image features and clinical features via filename,
    then return fused features and labels.
    """
    img_feats = np.load(feat_path)          # (N, 1280)
    filenames = np.load(name_path)          # (N,)

    # Map filename -> row in clinical CSV
    clin_sub = clin_df.set_index("filename").loc[filenames]
    clin_feats = clin_sub[CLIN_FEAT_COLS].values.astype(np.float32)  # (N, 8)
    labels = clin_sub["label"].values.astype(np.int64)               # (N,)

    # Concatenate image + clinical latent features
    fused = np.concatenate([img_feats.astype(np.float32), clin_feats], axis=1)

    assert fused.shape[0] == labels.shape[0]
    return fused, labels


class FusionDataset(Dataset):
    """Dataset of (feature row, label) pairs stored as float32 / int64 arrays."""

    def __init__(self, x: np.ndarray, y: np.ndarray):
        # Copies with fixed dtypes: float32 features, int64 labels.
        self.x = x.astype("float32")
        self.y = y.astype("int64")

    def __len__(self):
        n_samples = self.x.shape[0]
        return n_samples

    def __getitem__(self, idx):
        features, target = self.x[idx], self.y[idx]
        return features, target


# -----------------------------
# Model
# -----------------------------

class FusionClassifier(nn.Module):
    """
    Fully connected binary classifier over fused feature vectors.

    Two hidden blocks (512 then 128 units), each Linear -> BatchNorm -> ReLU ->
    Dropout(0.3), followed by a single-logit output layer. `forward` returns a
    flat tensor with one logit per input row.
    """

    def __init__(self, in_dim: int):
        super().__init__()
        layers = [
            # hidden block 1
            nn.Linear(in_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            # hidden block 2
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            # output: one logit (binary classification)
            nn.Linear(128, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # (B, 1) -> (B,)
        return self.net(x).view(-1)


# -----------------------------
# Training & evaluation loops
# -----------------------------

def run_epoch(model, loader, criterion, optimizer=None):
    """
    Run one pass over `loader`, training when an optimizer is supplied.

    Args:
        model: Network returning one logit per sample.
        loader: DataLoader yielding (features, labels) batches.
        criterion: Loss taking (logits, float targets).
        optimizer: If given, the model is put in train mode and updated after
            every batch; if None, the model is put in eval mode and gradient
            tracking is disabled for the whole pass.

    Returns:
        (mean loss per sample, accuracy at a 0.5 threshold, ROC AUC).
        AUC is NaN when scikit-learn cannot compute it (ValueError).
    """
    is_train = optimizer is not None
    model.train(is_train)

    total_loss = 0.0
    all_targets, all_probs = [], []

    # Only track gradients when they are needed: evaluation passes otherwise
    # build autograd graphs that are never used.
    with torch.set_grad_enabled(is_train):
        for xb, yb in loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)

            logits = model(xb)
            loss = criterion(logits, yb.float())

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight by batch size so the final value is a per-sample mean.
            total_loss += loss.item() * xb.size(0)

            probs = torch.sigmoid(logits).detach().cpu().numpy()
            all_probs.append(probs)
            all_targets.append(yb.cpu().numpy())

    total_loss /= len(loader.dataset)
    all_probs = np.concatenate(all_probs)
    all_targets = np.concatenate(all_targets)

    preds = (all_probs >= 0.5).astype(int)
    acc = accuracy_score(all_targets, preds)
    try:
        auc = roc_auc_score(all_targets, all_probs)
    except ValueError:
        auc = float("nan")

    return total_loss, acc, auc


# -----------------------------
# Main
# -----------------------------
if __name__ == "__main__":
    # Fix the RNG so weight init, dropout and batch order are repeatable.
    SEED = 42
    torch.manual_seed(SEED)

    # Where the best checkpoint is written; the evaluation cell loads
    # "best_model.pth" from this folder and reads its "model_state" entry.
    CHECKPOINT_PATH = Path("/content/drive/MyDrive/heart_risk_results") / "best_model.pth"

    # 1. Load clinical CSV
    clin_df = pd.read_csv(CLIN_CSV)

    # 2. Build fused features for each split
    x_train, y_train = load_split(TRAIN_FEATS, TRAIN_NAMES, clin_df)
    x_val,   y_val   = load_split(VAL_FEATS,   VAL_NAMES,   clin_df)
    x_test,  y_test  = load_split(TEST_FEATS,  TEST_NAMES,  clin_df)

    print("Train fused:", x_train.shape, "Val fused:", x_val.shape, "Test fused:", x_test.shape)

    train_ds = FusionDataset(x_train, y_train)
    val_ds   = FusionDataset(x_val,   y_val)
    test_ds  = FusionDataset(x_test,  y_test)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False)
    test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

    # 3. Model, loss, optimizer
    model = FusionClassifier(FUSED_DIM).to(DEVICE)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)

    best_val_auc = 0.0
    best_state = None

    # 4. Training loop
    for epoch in range(1, NUM_EPOCHS + 1):
        train_loss, train_acc, train_auc = run_epoch(model, train_loader, criterion, optimizer)
        val_loss, val_acc, val_auc = run_epoch(model, val_loader, criterion, optimizer=None)

        print(
            f"Epoch {epoch:02d} | "
            f"Train loss {train_loss:.4f}, acc {train_acc:.3f}, AUC {train_auc:.3f} | "
            f"Val loss {val_loss:.4f}, acc {val_acc:.3f}, AUC {val_auc:.3f}"
        )

        if val_auc > best_val_auc:
            best_val_auc = val_auc
            # state_dict() hands out references to the live parameter tensors,
            # and dict.copy() is shallow, so the snapshot must clone every
            # tensor. Otherwise later optimizer steps overwrite the "best"
            # weights in place and the final epoch is what gets restored.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    # 5. Test evaluation using best model
    if best_state is not None:
        model.load_state_dict(best_state)
        # Persist the snapshot so evaluation does not depend on kernel state.
        CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
        torch.save(
            {"model_state": best_state, "val_auc": float(best_val_auc)},
            CHECKPOINT_PATH,
        )
        print(f"Saved best model (val AUC {best_val_auc:.3f}) to {CHECKPOINT_PATH}")

    test_loss, test_acc, test_auc = run_epoch(model, test_loader, criterion, optimizer=None)
    print(f"TEST | loss {test_loss:.4f}, acc {test_acc:.3f}, AUC {test_auc:.3f}")


Train fused: (4064, 1288) Val fused: (870, 1288) Test fused: (872, 1288)
Epoch 01 | Train loss 0.4395, acc 0.804, AUC 0.852 | Val loss 0.3869, acc 0.824, AUC 0.882
Epoch 02 | Train loss 0.3240, acc 0.860, AUC 0.922 | Val loss 0.4828, acc 0.783, AUC 0.923
Epoch 03 | Train loss 0.2453, acc 0.900, AUC 0.958 | Val loss 0.2912, acc 0.879, AUC 0.938
Epoch 04 | Train loss 0.2003, acc 0.916, AUC 0.972 | Val loss 0.3748, acc 0.853, AUC 0.948
Epoch 05 | Train loss 0.1666, acc 0.933, AUC 0.981 | Val loss 0.3017, acc 0.874, AUC 0.951
Epoch 06 | Train loss 0.1487, acc 0.938, AUC 0.985 | Val loss 0.2271, acc 0.897, AUC 0.961
Epoch 07 | Train loss 0.1390, acc 0.947, AUC 0.986 | Val loss 0.5169, acc 0.825, AUC 0.954
Epoch 08 | Train loss 0.1242, acc 0.951, AUC 0.989 | Val loss 0.3149, acc 0.871, AUC 0.955
Epoch 09 | Train loss 0.0909, acc 0.966, AUC 0.995 | Val loss 0.2281, acc 0.907, AUC 0.965
Epoch 10 | Train loss 0.0838, acc 0.968, AUC 0.995 | Val loss 0.2614, acc 0.908, AUC 0.962
Epoch 11 | Train 

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import torch
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
)

# Folder for evaluation artefacts (saved arrays and figures).
OUTPUT_DIR = Path("/content/drive/MyDrive/heart_risk_results")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------------------------------------
# Evaluate best model on test set (fixed torch.load)
# -------------------------------------------------
# This cell expects best_model.pth to already exist in OUTPUT_DIR, and relies
# on `model`, `DEVICE` and `test_loader` still being defined from an earlier
# cell in the same kernel session.
checkpoint_path = OUTPUT_DIR / "best_model.pth"

# If you are on PyTorch >= 2.6, explicitly set weights_only=False.
# NOTE(review): weights_only=False unpickles arbitrary Python objects, so only
# load checkpoint files you created yourself.
checkpoint = torch.load(checkpoint_path, map_location=DEVICE, weights_only=False)
# The checkpoint is a dict whose "model_state" entry holds the state dict.
model.load_state_dict(checkpoint["model_state"])

criterion = torch.nn.BCEWithLogitsLoss()

def eval_epoch(model, loader, criterion):
    """
    Evaluate `model` on every batch of `loader` without tracking gradients.

    Returns:
        (mean loss per sample, accuracy at a 0.5 threshold, ROC AUC,
        targets array, predicted-probability array). AUC is NaN when
        scikit-learn raises ValueError while computing it.
    """
    model.eval()

    loss_sum = 0.0
    prob_chunks, target_chunks = [], []

    with torch.no_grad():
        for features, targets in loader:
            features = features.to(DEVICE)
            targets = targets.to(DEVICE)

            logits = model(features)
            batch_loss = criterion(logits, targets.float())

            # Weight by batch size so the result is a per-sample mean.
            loss_sum += batch_loss.item() * features.size(0)

            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    mean_loss = loss_sum / len(loader.dataset)
    all_probs = np.concatenate(prob_chunks)
    all_targets = np.concatenate(target_chunks)

    hard_preds = (all_probs >= 0.5).astype(int)
    acc = accuracy_score(all_targets, hard_preds)
    try:
        auc = roc_auc_score(all_targets, all_probs)
    except ValueError:
        auc = float("nan")

    return mean_loss, acc, auc, all_targets, all_probs

# Score the restored model on the test loader.
test_loss, test_acc, test_auc, y_true, y_prob = eval_epoch(model, test_loader, criterion)
print(f"TEST | loss {test_loss:.4f}, acc {test_acc:.3f}, AUC {test_auc:.3f}")

# Keep labels and predicted probabilities on disk for later metric cells.
np.save(OUTPUT_DIR / "y_true.npy", y_true)
np.save(OUTPUT_DIR / "y_prob.npy", y_prob)

# -----------------------------
# ROC curve
# -----------------------------
fpr, tpr, _ = roc_curve(y_true, y_prob)

roc_fig, roc_ax = plt.subplots(figsize=(6, 6))
roc_ax.plot(fpr, tpr, label=f"ROC (AUC = {test_auc:.3f})")
roc_ax.plot([0, 1], [0, 1], "k--", label="Random")   # chance-level diagonal
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve - Heart Risk Classifier")
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
roc_fig.tight_layout()
roc_fig.savefig(OUTPUT_DIR / "roc_curve.png", dpi=300)
plt.close(roc_fig)

# -----------------------------
# Confusion matrix
# -----------------------------
y_pred = (y_prob >= 0.5).astype(int)
cm = confusion_matrix(y_true, y_pred)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
cm_fig, cm_ax = plt.subplots(figsize=(5, 5))
disp.plot(ax=cm_ax, cmap="Blues", colorbar=False)
cm_ax.set_title("Confusion Matrix - Heart Risk Classifier")
cm_fig.tight_layout()
cm_fig.savefig(OUTPUT_DIR / "confusion_matrix.png", dpi=300)
plt.close(cm_fig)

print(f"Saved ROC curve and confusion matrix to {OUTPUT_DIR}")


TEST | loss 0.3915, acc 0.882, AUC 0.959
Saved ROC curve and confusion matrix to /content/drive/MyDrive/heart_risk_results


In [9]:
import numpy as np
from pathlib import Path
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

OUTPUT_DIR = Path("/content/drive/MyDrive/heart_risk_results")

# Reload the saved test labels and probabilities so this cell does not depend
# on variables left over from the evaluation cell.
y_true = np.load(OUTPUT_DIR / "y_true.npy")
y_prob = np.load(OUTPUT_DIR / "y_prob.npy")

# Hard predictions at a 0.5 probability threshold.
y_pred = (y_prob >= 0.5).astype(int)

# Binary metrics with class 1 as the positive class.
precision, recall, f1 = (
    metric(y_true, y_pred, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

summary_lines = [
    f"Precision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1-score:  {f1:.3f}",
]
print("\n".join(summary_lines))

# Per-class breakdown plus macro/weighted averages.
print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=3))


Precision: 0.969
Recall:    0.862
F1-score:  0.913

Classification report:
              precision    recall  f1-score   support

           0      0.729     0.931     0.818       248
           1      0.969     0.862     0.913       624

    accuracy                          0.882       872
   macro avg      0.849     0.897     0.865       872
weighted avg      0.901     0.882     0.886       872

