In [6]:
# ========================== SETUP ==========================
# !pip install -q pandas torch torchvision scikit-learn tqdm

import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image, ImageFile
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import GroupShuffleSplit
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from tqdm import tqdm

ImageFile.LOAD_TRUNCATED_IMAGES = True

# Let cuDNN auto-tune convolution algorithms for the input shapes it encounters.
torch.backends.cudnn.benchmark = True
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the RNGs this notebook relies on so weight init, shuffling and
# augmentation start from a known state on every fresh kernel.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Device:", DEVICE)

# ========================== 1) PATHS ==========================
# Root of the dataset; contains images_001, images_002, ... and the metadata CSV.
BASE     = Path("./chest-xray")                        # <- change if needed
CSV_PATH = BASE / "Data_Entry_2017.csv"

# Fail early, CSV first, when the expected inputs are not on disk.
for required_path, failure_msg in (
    (CSV_PATH, f"CSV not found: {CSV_PATH}"),
    (BASE, f"Base images folder not found: {BASE}"),
):
    assert required_path.exists(), failure_msg

# ========================== 2) LOAD CSV ==========================
df = pd.read_csv(CSV_PATH)
print("df shape:", df.shape)

# ========================== 3) LABELS + MULTI-HOT ==========================
LABELS = [
    'Atelectasis','Cardiomegaly','Effusion','Infiltration','Mass','Nodule',
    'Pneumonia','Pneumothorax','Consolidation','Edema','Emphysema',
    'Fibrosis','Pleural_Thickening','Hernia'
]

# Label name -> column position in the multi-hot vector (one dict lookup per token).
_LABEL_TO_IDX = {name: pos for pos, name in enumerate(LABELS)}

def to_multi_hot(lbl_str: str) -> np.ndarray:
    """Encode a '|'-separated findings string as a multi-hot float32 vector.

    Args:
        lbl_str: Findings such as "Mass|Nodule". Whitespace around each token
            is ignored. Non-string input yields an all-zero vector.

    Returns:
        Array of shape (len(LABELS),) with 1.0 at the position of every token
        that is present in LABELS. Tokens not in LABELS (including
        "No Finding") contribute nothing.
    """
    y = np.zeros(len(LABELS), dtype=np.float32)
    if not isinstance(lbl_str, str):
        return y
    for token in lbl_str.split("|"):
        pos = _LABEL_TO_IDX.get(token.strip())
        if pos is not None:
            y[pos] = 1.0
    return y

# Multi-hot label matrix for the whole CSV; used here only for the summary prints.
finding_strings = df["Finding Labels"].astype(str)
Y = np.stack(list(map(to_multi_hot, finding_strings)), axis=0)
print("Y shape:", Y.shape)

positives_per_class = Y.sum(axis=0).astype(int)
print("Positives per class:", dict(zip(LABELS, positives_per_class)))

# ========================== 4) PATIENT-LEVEL SPLIT (80/10/10) ==========================
# Rows are grouped by "Patient ID" so all images of one patient land in the
# same split. train_size applies to groups (patients), so the row counts are
# only approximately 80/10/10.
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, temp_idx = next(gss.split(df, groups=df["Patient ID"]))
temp_df = df.iloc[temp_idx].reset_index(drop=True)

# Split the held-out 20% of patients in half -> validation / test.
gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=42)
val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["Patient ID"]))
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df   = temp_df.iloc[val_idx].reset_index(drop=True)
test_df  = temp_df.iloc[test_idx].reset_index(drop=True)

# Sanity checks: every row is used exactly once and no patient crosses splits.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "split does not cover all rows"
train_patients = set(train_df["Patient ID"])
val_patients   = set(val_df["Patient ID"])
test_patients  = set(test_df["Patient ID"])
assert train_patients.isdisjoint(val_patients), "patient leakage: train/val"
assert train_patients.isdisjoint(test_patients), "patient leakage: train/test"
assert val_patients.isdisjoint(test_patients), "patient leakage: val/test"

print("Split sizes -> Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))



# ========================== 5) INDEX FILES ACROSS SHARDS ==========================
# Your layout: BASE / images_XXX / images / *.png
# File name -> full path for every PNG found under BASE/images_*/images/.
# If the same file name occurs in more than one shard, the last one globbed wins.
name_to_path = {p.name: str(p) for p in BASE.glob("images_*/images/*.png")}

print("Indexed files:", len(name_to_path))

# Quick spot check on the first rows of the CSV.
first20 = df["Image Index"].head(20).tolist()
missing20 = [n for n in first20 if n not in name_to_path]
print("Missing among first 20:", len(missing20))
if missing20:
    print("Example missing:", missing20[:5])

# Full check: a missing file otherwise only surfaces as FileNotFoundError
# from the dataset's __getitem__, possibly deep into an epoch.
missing_total = int((~df["Image Index"].isin(list(name_to_path))).sum())
if missing_total:
    print(f"WARNING: {missing_total} of {len(df)} CSV rows have no indexed image file")

# ========================== 6) DATASET / DATALOADERS ==========================
IMG_SIZE = 384

# Per-channel normalization constants shared by both pipelines.
NORM_MEAN = [0.485,0.456,0.406]
NORM_STD  = [0.229,0.224,0.225]

def _tensor_and_normalize():
    """Return fresh ToTensor + Normalize steps to append to a pipeline."""
    return [transforms.ToTensor(), transforms.Normalize(NORM_MEAN, NORM_STD)]

# Training: resize slightly larger than the target, then random crop + flip.
train_tfms = transforms.Compose(
    [
        transforms.Resize(int(IMG_SIZE*1.1)),
        transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8,1.0)),
        transforms.RandomHorizontalFlip(),
    ]
    + _tensor_and_normalize()
)

# Validation / test / inference: deterministic resize + center crop.
val_tfms = transforms.Compose(
    [
        transforms.Resize(IMG_SIZE),
        transforms.CenterCrop(IMG_SIZE),
    ]
    + _tensor_and_normalize()
)

def row_to_multi_hot_tensor(row):
    """Return the float32 multi-hot label tensor for one metadata row.

    The row's "Finding Labels" value is encoded with to_multi_hot.
    """
    encoded = to_multi_hot(row["Finding Labels"])
    return torch.tensor(encoded, dtype=torch.float32)

class ChestXray(Dataset):
    """Dataset yielding (image tensor, multi-hot label tensor) pairs.

    Args:
        df: Metadata frame with "Image Index" and "Finding Labels" columns.
        index_map: Mapping from image file name to its path on disk.
        tfm: Callable applied to the RGB PIL image.
    """

    def __init__(self, df, index_map, tfm):
        # Positional access in __getitem__ relies on a fresh 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.idx = index_map
        self.tfm = tfm

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, i):
        record = self.df.iloc[i]
        image_name = record["Image Index"]
        location = self.idx.get(image_name)
        if location is not None:
            rgb = Image.open(location).convert("RGB")
            return self.tfm(rgb), row_to_multi_hot_tensor(record)
        raise FileNotFoundError(f"Image not indexed: {image_name}")

BATCH_SIZE  = 32
NUM_WORKERS = 2  # set 0 if multiprocessing issues

# Only the training split gets the augmenting transform.
train_ds, val_ds, test_ds = (
    ChestXray(frame, name_to_path, tfm)
    for frame, tfm in ((train_df, train_tfms), (val_df, val_tfms), (test_df, val_tfms))
)

# Settings common to all three loaders; only `shuffle` differs.
loader_opts = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_dl = DataLoader(train_ds, shuffle=True,  **loader_opts)
val_dl   = DataLoader(val_ds,   shuffle=False, **loader_opts)
test_dl  = DataLoader(test_ds,  shuffle=False, **loader_opts)

print("DL sizes ->", len(train_ds), len(val_ds), len(test_ds))

# ========================== 7) MODEL ==========================
# DenseNet-121 with pretrained weights; the classifier is replaced by a
# linear layer with one logit per entry in LABELS.
backbone = models.densenet121(weights=models.DenseNet121_Weights.IMAGENET1K_V1)
in_features = backbone.classifier.in_features
backbone.classifier = nn.Linear(in_features, len(LABELS))
model = backbone.to(DEVICE)

# ========================== 8) LOSS (pos_weight from TRAIN) ==========================
# Multi-hot labels of the TRAIN split only, so class statistics never see val/test rows.
train_multi = np.vstack([to_multi_hot(s) for s in train_df["Finding Labels"].astype(str)])
pos = train_multi.sum(axis=0)    # per-class positives in TRAIN
N = len(train_df)
pos = np.clip(pos, 1.0, None)    # avoid div-by-zero
# negatives / positives per class: rarer classes get a larger positive weight.
pos_weight = torch.tensor((N - pos) / pos, dtype=torch.float32, device=DEVICE)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)

# The cosine schedule must span the whole run. With T_max shorter than the
# number of epochs the learning rate reaches its minimum at T_max and then
# climbs back up for the remaining epochs. Keep this equal to EPOCHS in the
# training section.
# NOTE(review): a checkpoint written with a different T_max will presumably
# restore that old value via scheduler.load_state_dict - confirm before resuming.
SCHEDULER_EPOCHS = 20
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=SCHEDULER_EPOCHS)
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE == "cuda"))

# ========================== 9) EVALUATION ==========================
def evaluate(model, loader, threshold=0.5):
    """Score `model` on every batch of `loader`.

    Args:
        model: Network returning one logit per entry in LABELS.
        loader: Iterable of (images, multi-hot labels) batches.
        threshold: Probability cut-off used for the F1 scores only.

    Returns:
        Tuple (mean_auc, per_class_auc, micro_f1, macro_f1). per_class_auc maps
        each label name to its AUROC, or NaN when roc_auc_score raised
        ValueError for that class. mean_auc ignores NaN entries and is NaN
        when no class could be scored.

    Note:
        The model is left in eval mode.
    """
    model.eval()
    ys, ps = [], []
    with torch.no_grad():
        for xb, yb in loader:
            # Only the images are needed on the device; the labels are
            # consumed on the CPU below, so they are not transferred.
            xb = xb.to(DEVICE, non_blocking=True)
            p = torch.sigmoid(model(xb))
            ys.append(yb.cpu()); ps.append(p.cpu())
    ys = torch.cat(ys, 0).numpy()
    ps = torch.cat(ps, 0).numpy()

    # AUROC per class
    aurocs = []
    for c in range(len(LABELS)):
        try:
            aurocs.append(roc_auc_score(ys[:, c], ps[:, c]))
        except ValueError:
            aurocs.append(np.nan)
    # Mean over the classes that could be scored (same value as np.nanmean,
    # without its RuntimeWarning when every entry is NaN).
    scored = [a for a in aurocs if not np.isnan(a)]
    mean_auc = float(np.mean(scored)) if scored else float("nan")

    # F1 at fixed threshold (reference only)
    preds = (ps >= threshold).astype("int32")
    micro_f1 = f1_score(ys, preds, average="micro", zero_division=0)
    macro_f1 = f1_score(ys, preds, average="macro", zero_division=0)
    return mean_auc, dict(zip(LABELS, aurocs)), micro_f1, macro_f1

# ========================== 10) TRAIN with CHECKPOINTS ==========================
# Checkpoint files: "best" holds the highest validation AUROC, "last" the most recent epoch.
best_path, last_path = "densenet121_best.pt", "densenet121_last.pt"
RESUME    = True   # continue from last_path when it exists
EPOCHS    = 20

def make_ckpt(epoch, best_auc):
    """Snapshot the global model/optimizer/scheduler/scaler into one dict.

    Args:
        epoch: Epoch number stored under "epoch".
        best_auc: Score stored under "best_auc".

    Returns:
        Dict with "epoch", "best_auc" and one "<name>_state" entry per component.
    """
    snapshot = {"epoch": epoch, "best_auc": best_auc}
    components = (
        ("model_state", model),
        ("optimizer_state", optimizer),
        ("scheduler_state", scheduler),
        ("scaler_state", scaler),
    )
    for key, component in components:
        snapshot[key] = component.state_dict()
    return snapshot

def _atomic_save(ckpt, path):
    """Write `ckpt` to a temporary sibling file, then rename it over `path`.

    An interrupted write therefore leaves the previous checkpoint at `path`
    untouched instead of a truncated file.
    """
    tmp_path = f"{path}.tmp"
    torch.save(ckpt, tmp_path)
    os.replace(tmp_path, path)

def save_last(epoch, best_auc):
    """Save the current training state to last_path (used for resuming)."""
    _atomic_save(make_ckpt(epoch, best_auc), last_path)
    print(f" Saved last: {last_path} (epoch={epoch})")

def save_best(epoch, best_auc):
    """Save the current training state to best_path (best validation score so far)."""
    _atomic_save(make_ckpt(epoch, best_auc), best_path)
    print(f" Saved BEST: {best_path} (epoch={epoch}, best_auc={best_auc:.4f})")

# Resume
# Defaults for a fresh run; overwritten below when a checkpoint is restored.
start_epoch, best_auc = 1, -1.0

can_resume = RESUME and os.path.exists(last_path)
if not can_resume:
    print(" Starting fresh training")
else:
    ckpt = torch.load(last_path, map_location=DEVICE)
    # Restore every stateful component from its entry in the checkpoint.
    for component, state_key in (
        (model, "model_state"),
        (optimizer, "optimizer_state"),
        (scheduler, "scheduler_state"),
        (scaler, "scaler_state"),
    ):
        component.load_state_dict(ckpt[state_key])
    start_epoch = ckpt["epoch"] + 1          # continue with the epoch after the saved one
    best_auc    = ckpt.get("best_auc", best_auc)
    print(f"Resuming from epoch {start_epoch} (best_auc={best_auc:.4f})")

# Loop
for ep in range(start_epoch, EPOCHS + 1):
    # ---- one pass over the training set (mixed precision when on CUDA) ----
    model.train()
    running_loss = 0.0
    for xb, yb in tqdm(train_dl, desc=f"Epoch {ep}/{EPOCHS}"):
        xb, yb = xb.to(DEVICE, non_blocking=True), yb.to(DEVICE, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(DEVICE == "cuda")):
            logits = model(xb)
            loss = criterion(logits, yb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Weight by batch size so the epoch average is per sample.
        running_loss += loss.item() * xb.size(0)

    scheduler.step()
    train_loss = running_loss / len(train_ds)

    # ---- validation ----
    val_mean_auc, _, val_micro_f1, val_macro_f1 = evaluate(model, val_dl)
    print(f"[Val] mean AUROC={val_mean_auc:.4f} | microF1={val_micro_f1:.4f} | macroF1={val_macro_f1:.4f} | train_loss={train_loss:.4f}")

    # Update best_auc BEFORE writing "last": the resume code reads best_auc
    # from the last checkpoint, so a stale value there would let a later,
    # worse epoch overwrite the best checkpoint after a resume.
    improved = val_mean_auc > best_auc
    if improved:
        best_auc = val_mean_auc

    # Always save "last"
    save_last(ep, best_auc)

    # Save "best" if improved
    if improved:
        save_best(ep, best_auc)

# ========================== 11) TEST ==========================
# Evaluate the best checkpoint when one exists; otherwise keep the weights in memory.
if not Path(best_path).exists():
    print(" BEST checkpoint not found, using last model weights in memory.")
else:
    ckpt = torch.load(best_path, map_location=DEVICE)
    model.load_state_dict(ckpt["model_state"])
    print(f" Loaded BEST from epoch {ckpt['epoch']} (best_auc={ckpt['best_auc']:.4f})")

test_mean_auc, test_per_cls, test_micro_f1, test_macro_f1 = evaluate(model, test_dl)
print(f"[TEST] mean AUROC={test_mean_auc:.4f} | microF1={test_micro_f1:.4f} | macroF1={test_macro_f1:.4f}")

# NaN scores are shown as None; everything else as a plain Python float.
per_class_report = {}
for label_name, auc_value in test_per_cls.items():
    per_class_report[label_name] = None if np.isnan(auc_value) else float(auc_value)
print("Per-class AUROC:", per_class_report)

# ========================== 12) (Optional) SINGLE-IMAGE PREDICT ==========================
@torch.no_grad()
def predict_image(img_path, threshold=0.5, top_k=5):
    """Run the global model on a single image file.

    Args:
        img_path: Path of the image to classify.
        threshold: Minimum probability for a label to count as predicted.
        top_k: Number of highest-probability labels to return.

    Returns:
        Tuple (probs_by_label, pred_labels, top):
        probs_by_label maps every label to its probability as a float,
        pred_labels lists labels with probability >= threshold,
        top lists (label, probability) for the top_k labels, highest first.
    """
    model.eval()
    batch = val_tfms(Image.open(img_path).convert("RGB")).unsqueeze(0).to(DEVICE)
    probs = torch.sigmoid(model(batch)).squeeze(0).cpu().numpy()

    probs_by_label = {name: float(p) for name, p in zip(LABELS, probs)}
    pred_labels = [LABELS[i] for i in range(len(probs)) if probs[i] >= threshold]

    # Negate so argsort's ascending order ranks the largest probability first.
    ranked = np.argsort(-probs)[:top_k]
    top = [(LABELS[j], float(probs[j])) for j in ranked]
    return probs_by_label, pred_labels, top

# Example:
# img_example = "./chest-xray/images_001/images/00001335_006.png"  # relative to BASE, like the paths above
# probs, preds, top5 = predict_image(img_example, threshold=0.5, top_k=5)
# print("Predicted (>=0.5):", preds)
# print("Top-5:", top5)


Device: cuda
df shape: (112120, 12)
Y shape: (112120, 14)
Positives per class: {'Atelectasis': 11559, 'Cardiomegaly': 2776, 'Effusion': 13317, 'Infiltration': 19894, 'Mass': 5782, 'Nodule': 6331, 'Pneumonia': 1431, 'Pneumothorax': 5302, 'Consolidation': 4667, 'Edema': 2303, 'Emphysema': 2516, 'Fibrosis': 1686, 'Pleural_Thickening': 3385, 'Hernia': 227}
Split sizes -> Train: 89826 Val: 11679 Test: 10615
Indexed files: 112120
Missing among first 20: 0
DL sizes -> 89826 11679 10615
Resuming from epoch 18 (best_auc=0.8579)


Epoch 18/20: 100%|██████████| 2808/2808 [18:34<00:00,  2.52it/s]


[Val] mean AUROC=0.8364 | microF1=0.2410 | macroF1=0.2181 | train_loss=0.9114
 Saved last: densenet121_last.pt (epoch=18)


Epoch 19/20: 100%|██████████| 2808/2808 [18:30<00:00,  2.53it/s]


[Val] mean AUROC=0.8256 | microF1=0.2100 | macroF1=0.2071 | train_loss=0.9224
 Saved last: densenet121_last.pt (epoch=19)


Epoch 20/20: 100%|██████████| 2808/2808 [18:20<00:00,  2.55it/s]


[Val] mean AUROC=0.8196 | microF1=0.2434 | macroF1=0.2180 | train_loss=0.9067
 Saved last: densenet121_last.pt (epoch=20)
 Loaded BEST from epoch 11 (best_auc=0.8579)
[TEST] mean AUROC=0.8606 | microF1=0.2677 | macroF1=0.2526
Per-class AUROC: {'Atelectasis': 0.8157046667850688, 'Cardiomegaly': 0.9410573598319599, 'Effusion': 0.8860951507104899, 'Infiltration': 0.6986046944590047, 'Mass': 0.8599246331526968, 'Nodule': 0.7936213649358839, 'Pneumonia': 0.8078863298109334, 'Pneumothorax': 0.8893055947526689, 'Consolidation': 0.8381257189231456, 'Edema': 0.9095466667595384, 'Emphysema': 0.9481564363560316, 'Fibrosis': 0.8508452414308396, 'Pleural_Thickening': 0.8166046463704607, 'Hernia': 0.9928841479881177}


In [2]:
!git status


On branch main
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   .ipynb_checkpoints/chestX-ray-checkpoint.ipynb[m
	[31mmodified:   chestX-ray.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [3]:
!git add .

In [4]:
! git commit -m "second session"

[main f3fc19d] second session
 2 files changed, 974 insertions(+), 2390 deletions(-)
 rewrite .ipynb_checkpoints/chestX-ray-checkpoint.ipynb (93%)
 rewrite chestX-ray.ipynb (74%)


In [5]:
!git push origin main

Enumerating objects: 7, done.
Counting objects: 100% (7/7), done.
Delta compression using up to 96 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 7.43 KiB | 7.43 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To github.com:MatinM-96/Chest-X-ray.git
   6045f7f..f3fc19d  main -> main
