In [7]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from read_data import ChestXrayDataSet
from sklearn.metrics import roc_auc_score
import gc

In [8]:
CKPT_PATH = 'model.pth.tar'
N_CLASSES = 14
CLASS_NAMES = [ 'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia',
                'Pneumothorax', 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia']
DATA_DIR = './ChestX-ray14/images'
TEST_IMAGE_LIST = './ChestX-ray14/labels/test_list.txt'
BATCH_SIZE = 64

In [9]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print("cuda runtime:", torch.version.cuda)
print("allocated (bytes):", torch.cuda.memory_allocated())
print("reserved  (bytes):", torch.cuda.memory_reserved())
print(torch.cuda.memory_summary(abbreviated=True))


torch: 2.7.1+cu118
cuda available: True
cuda runtime: 11.8
allocated (bytes): 65870336
reserved  (bytes): 75497472
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  64326 KiB | 397692 KiB |   2871 GiB |   2871 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  64326 KiB | 397692 KiB |   2871 GiB |   2871 GiB |
|---------------------------------------------------------------------------|
| Requested memory      |  64114 KiB | 395944 KiB |   2865 GiB |   2865 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  

In [10]:
def main():

    cudnn.benchmark = True

    # initialize and load the model
    # model = DenseNet121(N_CLASSES).cuda()
    # model = torch.nn.DataParallel(model).cuda()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = DenseNet121(N_CLASSES).to(device)
    # use DataParallel only if you actually have >1 GPU
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)


    if os.path.isfile(CKPT_PATH):
        print("=> loading checkpoint")
        checkpoint = torch.load(CKPT_PATH)
        model.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint")
    else:
        print("=> no checkpoint found")

    normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])

    test_dataset = ChestXrayDataSet(data_dir=DATA_DIR,
                                    image_list_file=TEST_IMAGE_LIST,
                                    transform=transforms.Compose([
                                        transforms.Resize(256),
                                        transforms.TenCrop(224),
                                        transforms.Lambda
                                        (lambda crops: torch.stack([transforms.ToTensor()(crop) for crop in crops])),
                                        transforms.Lambda
                                        (lambda crops: torch.stack([normalize(crop) for crop in crops]))
                                    ]))
    test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, num_workers=0, pin_memory=True)

    # initialize the ground truth and output tensor
    # gt = torch.FloatTensor()
    # gt = gt.cuda()
    # pred = torch.FloatTensor()
    # pred = pred.cuda()
    gt = torch.empty((0, N_CLASSES), dtype=torch.float32, device=device)
    pred = torch.empty((0, N_CLASSES), dtype=torch.float32, device=device)


    # switch to evaluate mode
    model.eval()

    for i, (inp, target) in enumerate(test_loader):
        # target = target.cuda()
        # gt = torch.cat((gt, target), 0)
        # bs, n_crops, c, h, w = inp.size()
        # input_var = torch.autograd.Variable(inp.view(-1, c, h, w).cuda(), volatile=True)
        # output = model(input_var)
        # output_mean = output.view(bs, n_crops, -1).mean(1)
        # pred = torch.cat((pred, output_mean.data), 0)

        bs, n_crops, c, h, w = inp.size()

        # move inputs to device (once) and reshape for TenCrop
        inp = inp.to(device)                          # shape (bs, n_crops, c, h, w)
        input_tensor = inp.view(-1, c, h, w)          # shape (bs * n_crops, c, h, w)
        
        # inference with no grad + mixed precision (saves memory)
        with torch.no_grad():
            from torch.cuda.amp import autocast
            with autocast():
                output = model(input_tensor)          # (bs * n_crops, n_classes)
                output_mean = output.view(bs, n_crops, -1).mean(1)   # (bs, n_classes)
        
        # accumulate predictions to pred (keep pred on GPU or CPU as you prefer)

        # change in code here
        target = target.to(device)
        gt = torch.cat((gt, target), 0)
        
        pred = torch.cat((pred, output_mean.detach().to(pred.device)), 0)
        
        # --- immediate cleanup of large temporaries to free GPU RAM ---
        del input_tensor, output, output_mean, inp
        gc.collect()
        torch.cuda.empty_cache()
        try:
            torch.cuda.ipc_collect()
        except Exception:
            pass


        # # --- safe GPU cleanup (drop-in) ---
        # try:
        #     # delete large per-batch tensors if they exist
        #     del inp
        # except NameError:
        #     pass
        
        # for _name in ("input_var", "output", "output_mean", "target", "images", "labels"):
        #     try:
        #         del globals()[_name]
        #     except Exception:
        #         # NameError / KeyError / others — ignore if not present
        #         pass
        
        # # run python garbage collector
        # gc.collect()
        
        # # free CUDA cached memory (makes memory available to other processes)
        # torch.cuda.empty_cache()
        
        # # optional: collect shared CUDA IPC (safe to call)
        # try:
        #     torch.cuda.ipc_collect()
        # except Exception:
        #     pass
        
        # # optional: brief memory summary (prints CUDA info if available)
        # try:
        #     print(torch.cuda.memory_summary(device=None, abbreviated=True))
        # except Exception:
        #     # fallback: print basic allocated/reserved info
        #     if torch.cuda.is_available():
        #         print(f"allocated: {torch.cuda.memory_allocated()} bytes, "
        #               f"reserved: {torch.cuda.memory_reserved()} bytes")
        #     else:
        #         print("CUDA not available")
        # # --- end cleanup ---


    AUROCs = compute_AUCs(gt, pred)
    AUROC_avg = np.array(AUROCs).mean()
    print('The average AUROC is {AUROC_avg:.3f}'.format(AUROC_avg=AUROC_avg))
    for i in range(N_CLASSES):
        print('The AUROC of {} is {}'.format(CLASS_NAMES[i], AUROCs[i]))


In [12]:
def compute_AUCs(gt, pred):
    """Computes Area Under the Curve (AUC) from prediction scores.

    Args:
        gt: Pytorch tensor on GPU, shape = [n_samples, n_classes]
          true binary labels.
        pred: Pytorch tensor on GPU, shape = [n_samples, n_classes]
          can either be probability estimates of the positive class,
          confidence values, or binary decisions.

    Returns:
        List of AUROCs of all classes.
    """
    AUROCs = []
    gt_np = gt.cpu().numpy()
    pred_np = pred.cpu().numpy()
    for i in range(N_CLASSES):
        AUROCs.append(roc_auc_score(gt_np[:, i], pred_np[:, i]))
    return AUROCs

In [13]:
import torch
print("torch.__version__:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())


torch.__version__: 2.7.1+cu118
cuda available: True
cuda device count: 1


In [14]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA runtime version:", torch.version.cuda)
print("Device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))


PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA runtime version: 11.8
Device count: 1
Device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [15]:
# ---- REPLACE current DenseNet121 class with this ----
class DenseNet121(nn.Module):
    """DenseNet121 backbone with a linear multi-label head (no Sigmoid)."""
    
    def __init__(self, out_size):
        super(DenseNet121, self).__init__()
        # Using pretrained weights; newer versions may warn about 'pretrained', but this works
        self.densenet121 = torchvision.models.densenet121(pretrained=True)
        
        # Get the number of input features to the classifier
        num_ftrs = self.densenet121.classifier.in_features
        
        # Replace the classifier with a linear layer that outputs logits
        # (No sigmoid activation here — handle it in the loss function if needed)
        self.densenet121.classifier = nn.Linear(num_ftrs, out_size)

    def forward(self, x):
        x = self.densenet121(x)
        return x
# -----------------------------------------------------


# ---------- TRAINING SMOKE-TEST CELL (FAST: runs 5 batches) ----------
import os, random, numpy as np, torch
import torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
import torchvision

# ---------- 1) basic env + reproducibility ----------
seed = 42
random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------- 2) ensure train/val lists exist (split test_list.txt if needed) ----------
labels_dir = os.path.join(".", "ChestX-ray14", "labels")
os.makedirs(labels_dir, exist_ok=True)
master_list = os.path.join(labels_dir, "test_list.txt")
train_list = os.path.join(labels_dir, "train_list.txt")
val_list = os.path.join(labels_dir, "val_list.txt")

if not os.path.exists(train_list) or not os.path.exists(val_list):
    if os.path.exists(master_list):
        with open(master_list, 'r') as f:
            lines = [l.strip() for l in f if l.strip()]
        random.shuffle(lines)
        n = len(lines)
        n_train = int(0.8*n); n_val = int(0.1*n)
        train_lines = lines[:n_train]
        val_lines = lines[n_train:n_train+n_val]
        test_lines = lines[n_train+n_val:]
        with open(train_list, 'w') as f: f.write("\n".join(train_lines))
        with open(val_list, 'w') as f: f.write("\n".join(val_lines))
        with open(os.path.join(labels_dir, 'test_list_split.txt'), 'w') as f: f.write("\n".join(test_lines))
        print("Created train/val/test split:", len(train_lines), len(val_lines), len(test_lines))
    else:
        raise FileNotFoundError(f"No master list found at {master_list}. Place a list or tell me the correct path.")
else:
    print("train_list.txt and val_list.txt already present.")

# ---------- 3) transforms ----------
train_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(224),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])
val_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# ---------- 4) dataset & dataloader (uses your read_data.ChestXrayDataSet) ----------
try:
    from read_data import ChestXrayDataSet
except Exception as e:
    print("ERROR importing ChestXrayDataSet from read_data:", e)
    raise

DATA_DIR = globals().get('DATA_DIR', os.path.join(".", "ChestX-ray14", "images"))
N_CLASSES = globals().get('N_CLASSES', 14)
CKPT_PATH = globals().get('CKPT_PATH', os.path.join(".", "model.pth.tar"))

train_dataset = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=train_list, transform=train_transform)
val_dataset   = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=val_list, transform=val_transform)

# Use multiple workers and pin_memory to speed up data loading (good with GPU)
BATCH_SIZE = 16   # reduce if OOM
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
# val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)


print("Train/Val sizes:", len(train_dataset), len(val_dataset))

# ---------- 5) model (assumes DenseNet121 class already redefined without Sigmoid) ----------
model = DenseNet121(N_CLASSES).to(device)

# freeze everything except classifier head
for name, p in model.named_parameters():
    if "densenet121.classifier" not in name:
        p.requires_grad = False

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

# ---------- 6) QUICK train: only first few batches (smoke test) ----------
model.train()
running_loss = 0.0
max_batches = 5
for i, (imgs, targets) in enumerate(train_loader):
    if i >= max_batches:
        break
    imgs = imgs.to(device); targets = targets.to(device)
    optimizer.zero_grad()
    logits = model(imgs)             # logits
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    running_loss += loss.item() * imgs.size(0)
    print(f"Batch {i+1}/{max_batches} done, loss={loss.item():.4f}")

if running_loss == 0.0:
    print("Warning: no batches were processed (train_loader may be empty).")
else:
    print("Quick train done on", i+1, "batches, avg loss:", running_loss / ((i+1) * imgs.size(0)))

# ---------- 7) quick validation ----------
model.eval()
all_logits, all_gt = [], []
with torch.no_grad():
    for imgs, targets in val_loader:
        imgs = imgs.to(device); targets = targets.to(device)
        logits = model(imgs)
        all_logits.append(logits.cpu()); all_gt.append(targets.cpu())

if len(all_logits) == 0:
    print("Warning: validation loader produced 0 batches.")
    all_logits = torch.empty((0, N_CLASSES))
    all_gt = torch.empty((0, N_CLASSES))
else:
    all_logits = torch.cat(all_logits, dim=0)
    all_gt = torch.cat(all_gt, dim=0)

probs = torch.sigmoid(all_logits).numpy() if all_logits.numel() > 0 else np.empty((0, N_CLASSES))
gt_np = all_gt.numpy() if all_gt.numel() > 0 else np.empty((0, N_CLASSES))

aucs = []
for i in range(N_CLASSES):
    if gt_np.shape[0] == 0 or len(np.unique(gt_np[:,i])) < 2:
        aucs.append(float('nan'))
    else:
        try:
            aucs.append(float(roc_auc_score(gt_np[:,i], probs[:,i])))
        except Exception:
            aucs.append(float('nan'))

mean_auc = np.nanmean(aucs) if len(aucs) > 0 else float('nan')
print("Validation mean AUROC (smoke-test):", mean_auc)
print("Per-class AUROC (first 6):", aucs[:6])

# ---------- 8) save checkpoint ----------
state = {'epoch': 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_mean_auc': float(mean_auc)}
torch.save(state, CKPT_PATH)
print("Saved checkpoint to", CKPT_PATH)
# ---------- END ----------


Device: cuda
train_list.txt and val_list.txt already present.
Train/Val sizes: 78468 11219




Batch 1/5 done, loss=0.7472
Batch 2/5 done, loss=0.5521
Batch 3/5 done, loss=0.4363
Batch 4/5 done, loss=0.3429
Batch 5/5 done, loss=0.2618
Quick train done on 6 batches, avg loss: 0.3900510420401891
Validation mean AUROC (smoke-test): 0.5091742308442769
Per-class AUROC (first 6): [0.5818910094762828, 0.5310147023104714, 0.564197550789494, 0.5117441020167477, 0.4609779120256749, 0.4655738279818835]
Saved checkpoint to model.pth.tar


In [16]:
import os, torch
print("exists:", os.path.exists("model.pth.tar"))
if os.path.exists("model.pth.tar"):
    ckpt = torch.load("model.pth.tar", map_location='cpu')
    print("keys:", list(ckpt.keys()))
    print("epoch:", ckpt.get('epoch'), "val_mean_auc:", ckpt.get('val_mean_auc'))


exists: True
keys: ['epoch', 'state_dict', 'optimizer', 'val_mean_auc']
epoch: 1 val_mean_auc: 0.5091742308442769


In [17]:
###WE RUN THIS TO FIX THE TEST_LIST.TXT FILE ERROR IS FIXED ,I.E.-




# Run this cell to create or locate test_list_split.txt automatically
import os, random, shutil, glob

labels_dir = os.path.join(".", "ChestX-ray14", "labels")
images_dir = os.path.join(".", "ChestX-ray14", "images")
os.makedirs(labels_dir, exist_ok=True)
wanted = os.path.join(labels_dir, "test_list_split.txt")

print("Labels dir exists:", os.path.exists(labels_dir))
print("Files in labels dir:", os.listdir(labels_dir)[:200])

if os.path.exists(wanted):
    print("Test list already exists at:", wanted)
else:
    # 1) prefer existing explicit test files
    candidates = [
        os.path.join(labels_dir, "test_list.txt"),
        os.path.join(labels_dir, "test.txt"),
        os.path.join(labels_dir, "test_list_full.txt")
    ]
    found = None
    for c in candidates:
        if os.path.exists(c):
            found = c
            break
    # 2) more general search for any test*.txt
    if not found:
        for f in os.listdir(labels_dir):
            if f.lower().startswith("test") and f.lower().endswith(".txt"):
                found = os.path.join(labels_dir, f)
                break

    if found:
        shutil.copy(found, wanted)
        print(f"Copied existing test list {found} -> {wanted}")
    else:
        # 3) if train_list exists, sample 10% into test_list_split (without modifying train_list)
        train_f = os.path.join(labels_dir, "train_list.txt")
        if os.path.exists(train_f):
            with open(train_f, "r") as fh:
                train_lines = [l.strip() for l in fh if l.strip()]
            if len(train_lines) < 2:
                print("train_list.txt exists but has too few lines to sample. Skipping.")
            else:
                random.seed(42)
                random.shuffle(train_lines)
                n = len(train_lines)
                n_test = max(1, int(0.1 * n))
                test_lines = train_lines[:n_test]
                with open(wanted, "w") as fh:
                    fh.write("\n".join(test_lines))
                print(f"Created {wanted} by sampling {n_test} entries (10%) from train_list.txt (train_list NOT modified).")
        else:
            # 4) fallback: create small test_list_split from image files
            if os.path.exists(images_dir):
                im_files = []
                for ext in ("*.png","*.jpg","*.jpeg","*.PNG","*.JPG","*.JPEG"):
                    im_files.extend(glob.glob(os.path.join(images_dir, "**", ext), recursive=True))
                if len(im_files) == 0:
                    raise FileNotFoundError("No image files found in ChestX-ray14/images to create a test list.")
                random.seed(42)
                random.shuffle(im_files)
                n_test = min(100, max(1, int(0.01 * len(im_files))))  # 1% up to 100
                chosen = im_files[:n_test]
                # write relative paths with respect to images_dir (common format)
                rel_paths = [os.path.relpath(p, images_dir).replace("\\", "/") for p in chosen]
                with open(wanted, "w") as fh:
                    fh.write("\n".join(rel_paths))
                print(f"No label lists found. Created small {wanted} with {len(rel_paths)} image paths (relative to {images_dir}).")
            else:
                raise FileNotFoundError("No labels and no images folder found. Please provide a test_list file.")

# Final confirmation
print("Final files in labels dir (sample):", os.listdir(labels_dir)[:200])
print("Test list path:", wanted, "| exists:", os.path.exists(wanted))
if os.path.exists(wanted):
    with open(wanted, "r") as fh:
        lines = [l.strip() for l in fh if l.strip()]
    print("Number of lines in test_list_split.txt:", len(lines))
    print("First 10 lines (preview):")
    for ln in lines[:10]:
        print(" ", ln)


Labels dir exists: True
Files in labels dir: ['test_list.txt', 'test_list_split.txt', 'train_list.txt', 'val_list.txt']
Test list already exists at: .\ChestX-ray14\labels\test_list_split.txt
Final files in labels dir (sample): ['test_list.txt', 'test_list_split.txt', 'train_list.txt', 'val_list.txt']
Test list path: .\ChestX-ray14\labels\test_list_split.txt | exists: True
Number of lines in test_list_split.txt: 22433
First 10 lines (preview):
  00011997_000.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  00011997_001.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  00011997_002.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  00011997_003.png 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  00011997_004.png 0 1 1 1 0 0 0 0 0 0 0 0 0 0
  00011997_005.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  00011997_006.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  00011997_007.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  00011997_008.png 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  00011997_009.png 0 0 0 0 0 0 0 0 0 0 0 0 0 0


In [18]:
# Cell B: Full evaluation main() - paste & run
import os, torch, torchvision, numpy as np
from collections import OrderedDict
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from torch.utils.data import DataLoader

# === User config (edit if needed) ===
DATA_DIR = globals().get('DATA_DIR', os.path.join(".", "ChestX-ray14", "images"))
TEST_IMAGE_LIST = os.path.join(".", "ChestX-ray14", "labels", "test_list_split.txt")  # adjust if different
CKPT_PATH = globals().get('CKPT_PATH', os.path.join(".", "model.pth.tar"))
N_CLASSES = globals().get('N_CLASSES', 14)
BATCH_SIZE = 32
NUM_WORKERS = 4
PIN_MEMORY = True

# === helper: bootstrap CI for AUROC ===
def auc_bootstrap(y_true, y_score, n_boot=1000, seed=42):
    rng = np.random.RandomState(seed)
    scores = []
    n = len(y_score)
    for _ in range(n_boot):
        idx = rng.randint(0, n, n)
        if len(np.unique(y_true[idx])) < 2:
            continue
        try:
            s = roc_auc_score(y_true[idx], y_score[idx])
            scores.append(s)
        except Exception:
            pass
    if len(scores) == 0:
        return (float('nan'), float('nan'))
    scores = np.sort(scores)
    low = scores[int(0.025 * len(scores))]
    high = scores[int(0.975 * len(scores))]
    return float(low), float(high)

# === verify files ===
print("Using checkpoint:", CKPT_PATH)
print("Using test list:", TEST_IMAGE_LIST)
if not os.path.exists(CKPT_PATH):
    raise FileNotFoundError(f"Checkpoint not found at {CKPT_PATH}")
if not os.path.exists(TEST_IMAGE_LIST):
    raise FileNotFoundError(f"Test list not found at {TEST_IMAGE_LIST}")

# === dataset & dataloader ===
test_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

try:
    from read_data import ChestXrayDataSet
except Exception as e:
    raise ImportError(f"Failed to import ChestXrayDataSet from read_data: {e}")

test_dataset = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=TEST_IMAGE_LIST, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
print("Test dataset size:", len(test_dataset))

# === model: reuse DenseNet121 if defined, otherwise define minimal one ===
try:
    DenseNet121
except NameError:
    import torch.nn as nn
    class DenseNet121(nn.Module):
        def __init__(self, out_size):
            super().__init__()
            self.densenet121 = torchvision.models.densenet121(pretrained=True)
            num_ftrs = self.densenet121.classifier.in_features
            self.densenet121.classifier = nn.Linear(num_ftrs, out_size)
        def forward(self, x):
            return self.densenet121(x)

# === load checkpoint robustly ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
ckpt = torch.load(CKPT_PATH, map_location=device)
# extract state_dict
if 'state_dict' in ckpt:
    sd = ckpt['state_dict']
elif 'model_state_dict' in ckpt:
    sd = ckpt['model_state_dict']
else:
    sd = ckpt
# strip 'module.' prefix if needed
new_sd = OrderedDict()
for k,v in sd.items():
    nk = k[len('module.'):] if k.startswith('module.') else k
    new_sd[nk] = v

model = DenseNet121(N_CLASSES).to(device)
model.load_state_dict(new_sd, strict=False)
model.eval()
print("Model loaded.")

# === inference ===
all_probs = []
all_gt = []
with torch.no_grad():
    for imgs, targets in test_loader:
        imgs = imgs.to(device)
        logits = model(imgs)                   # logits
        probs = torch.sigmoid(logits).cpu().numpy()
        all_probs.append(probs)
        # convert targets safely
        if isinstance(targets, torch.Tensor):
            all_gt.append(targets.cpu().numpy())
        else:
            all_gt.append(np.asarray(targets))

all_probs = np.vstack(all_probs)
all_gt = np.vstack(all_gt)
print("Predictions shape:", all_probs.shape, "GT shape:", all_gt.shape)
if all_probs.shape[0] == 0:
    raise RuntimeError("No predictions produced (empty loader?)")

# === metrics: per-class AUROC + bootstrap CI ===
per_class_auc = []
per_class_ci = []
for i in range(N_CLASSES):
    y_true = all_gt[:, i]
    y_score = all_probs[:, i]
    if len(np.unique(y_true)) < 2:
        per_class_auc.append(float('nan'))
        per_class_ci.append((float('nan'), float('nan')))
    else:
        auc = float(roc_auc_score(y_true, y_score))
        per_class_auc.append(auc)
        low, high = auc_bootstrap(y_true, y_score, n_boot=1000)
        per_class_ci.append((low, high))

mean_auc = np.nanmean(per_class_auc)
print("\nPer-class AUROC:")
for i, auc in enumerate(per_class_auc):
    ci = per_class_ci[i]
    print(f" Class {i:02d}: AUROC = {auc:.4f}   95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")
print(f"\nMean AUROC: {mean_auc:.4f}")

# === save outputs ===
out_dir = "./eval_results"
os.makedirs(out_dir, exist_ok=True)
np.save(os.path.join(out_dir, "test_probs.npy"), all_probs)
np.save(os.path.join(out_dir, "test_gt.npy"), all_gt)
with open(os.path.join(out_dir, "per_class_auc.txt"), "w") as f:
    f.write("class,auc,ci_low,ci_high\n")
    for i, auc in enumerate(per_class_auc):
        ci = per_class_ci[i]
        f.write(f"{i},{auc},{ci[0]},{ci[1]}\n")
print("Saved results to", out_dir)


Using checkpoint: model.pth.tar
Using test list: .\ChestX-ray14\labels\test_list_split.txt
Test dataset size: 22433
Device: cuda




Model loaded.
Predictions shape: (22433, 14) GT shape: (22433, 14)

Per-class AUROC:
 Class 00: AUROC = 0.5649   95% CI = [0.5530, 0.5776]
 Class 01: AUROC = 0.4903   95% CI = [0.4670, 0.5131]
 Class 02: AUROC = 0.5420   95% CI = [0.5307, 0.5539]
 Class 03: AUROC = 0.5305   95% CI = [0.5209, 0.5404]
 Class 04: AUROC = 0.4778   95% CI = [0.4626, 0.4966]
 Class 05: AUROC = 0.4536   95% CI = [0.4391, 0.4689]
 Class 06: AUROC = 0.5235   95% CI = [0.4888, 0.5589]
 Class 07: AUROC = 0.4881   95% CI = [0.4712, 0.5055]
 Class 08: AUROC = 0.4366   95% CI = [0.4189, 0.4548]
 Class 09: AUROC = 0.6378   95% CI = [0.6108, 0.6645]
 Class 10: AUROC = 0.4849   95% CI = [0.4598, 0.5118]
 Class 11: AUROC = 0.4339   95% CI = [0.4045, 0.4634]
 Class 12: AUROC = 0.4778   95% CI = [0.4578, 0.4995]
 Class 13: AUROC = 0.3112   95% CI = [0.2427, 0.3827]

Mean AUROC: 0.4895
Saved results to ./eval_results


In [19]:
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Cell C: Full head-only epoch training (run after Cell A/B confirm)( only to RUN AFTER SUCCESSFUL CELL B)
from tqdm import tqdm

# assume train_loader, val_loader, model, criterion, optimizer, device already defined
log_every = 200
model.train()
running_loss = 0.0
total_batches = len(train_loader)
for i, (imgs, targets) in enumerate(train_loader, 1):
    imgs = imgs.to(device); targets = targets.to(device)
    optimizer.zero_grad()
    logits = model(imgs)
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    running_loss += loss.item() * imgs.size(0)
    if i % log_every == 0 or i == total_batches:
        avg_loss = running_loss / (i * imgs.size(0))
        print(f"[{i}/{total_batches}] batches processed; avg_loss_so_far={avg_loss:.4f}")

epoch_loss = running_loss / len(train_loader.dataset)
print("Full-head epoch done, avg loss:", epoch_loss)

# run validation (same as earlier) and compute mean_auc
model.eval()
all_logits, all_gt = [], []
with torch.no_grad():
    for imgs, targets in val_loader:
        imgs = imgs.to(device); targets = targets.to(device)
        logits = model(imgs)
        all_logits.append(logits.cpu()); all_gt.append(targets.cpu())
all_logits = torch.cat(all_logits, dim=0)
all_gt = torch.cat(all_gt, dim=0)
probs = torch.sigmoid(all_logits).numpy()
gt_np = all_gt.numpy()

from sklearn.metrics import roc_auc_score
aucs = []
for i in range(N_CLASSES):
    if len(np.unique(gt_np[:,i])) < 2:
        aucs.append(float('nan'))
    else:
        try:
            aucs.append(float(roc_auc_score(gt_np[:,i], probs[:,i])))
        except Exception:
            aucs.append(float('nan'))
mean_auc = np.nanmean(aucs)
print("Validation mean AUROC after full head epoch:", mean_auc)

# save checkpoint
torch.save({'epoch': 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_mean_auc': float(mean_auc)}, "model_head_epoch1.pth.tar")
print("Saved full-head checkpoint to model_head_epoch1.pth.tar")


[200/4905] batches processed; avg_loss_so_far=0.2344
[400/4905] batches processed; avg_loss_so_far=0.2344
[600/4905] batches processed; avg_loss_so_far=0.2344
[800/4905] batches processed; avg_loss_so_far=0.2350
[1000/4905] batches processed; avg_loss_so_far=0.2353
[1200/4905] batches processed; avg_loss_so_far=0.2351
[1400/4905] batches processed; avg_loss_so_far=0.2352
[1600/4905] batches processed; avg_loss_so_far=0.2355
[1800/4905] batches processed; avg_loss_so_far=0.2357
[2000/4905] batches processed; avg_loss_so_far=0.2358
[2200/4905] batches processed; avg_loss_so_far=0.2360
[2400/4905] batches processed; avg_loss_so_far=0.2358
[2600/4905] batches processed; avg_loss_so_far=0.2356
[2800/4905] batches processed; avg_loss_so_far=0.2355
[3000/4905] batches processed; avg_loss_so_far=0.2357
[3200/4905] batches processed; avg_loss_so_far=0.2358
[3400/4905] batches processed; avg_loss_so_far=0.2359
[3600/4905] batches processed; avg_loss_so_far=0.2360
[3800/4905] batches processed; a

In [21]:
# === Cell D: Evaluate full-head checkpoint ===
import os, torch, torchvision, numpy as np
from collections import OrderedDict
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader

# === Paths ===
DATA_DIR = os.path.join(".", "ChestX-ray14", "images")
TEST_IMAGE_LIST = os.path.join(".", "ChestX-ray14", "labels", "test_list_split.txt")
CKPT_PATH = os.path.join(".", "model_head_epoch1.pth.tar")
N_CLASSES = 14
BATCH_SIZE = 32
NUM_WORKERS = 4
PIN_MEMORY = True

print("Using checkpoint:", CKPT_PATH)
print("Using test list:", TEST_IMAGE_LIST)
if not os.path.exists(CKPT_PATH):
    raise FileNotFoundError(f"Checkpoint not found at {CKPT_PATH}")
if not os.path.exists(TEST_IMAGE_LIST):
    raise FileNotFoundError(f"Test list not found at {TEST_IMAGE_LIST}")

# === Dataset & dataloader ===
test_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.485,0.456,0.406],
                                     std=[0.229,0.224,0.225])
])

try:
    from read_data import ChestXrayDataSet
except Exception as e:
    raise ImportError(f"Failed to import ChestXrayDataSet from read_data: {e}")

test_dataset = ChestXrayDataSet(data_dir=DATA_DIR,s
                                image_list_file=TEST_IMAGE_LIST,
                                transform=test_transform)
test_loader = DataLoader(test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         num_workers=NUM_WORKERS,
                         pin_memory=PIN_MEMORY)
print("Test dataset size:", len(test_dataset))

# === Model definition (reuse DenseNet121) ===
import torch.nn as nn
class DenseNet121(nn.Module):
    def __init__(self, out_size):
        super().__init__()
        self.densenet121 = torchvision.models.densenet121(pretrained=True)
        num_ftrs = self.densenet121.classifier.in_features
        self.densenet121.classifier = nn.Linear(num_ftrs, out_size)
    def forward(self, x):
        return self.densenet121(x)

# === Load checkpoint ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

ckpt = torch.load(CKPT_PATH, map_location=device)
state_dict = ckpt.get('state_dict', ckpt.get('model_state_dict', ckpt))

# Strip 'module.' prefix if needed
new_state = OrderedDict()
for k, v in state_dict.items():
    nk = k[len('module.'):] if k.startswith('module.') else k
    new_state[nk] = v

model = DenseNet121(N_CLASSES).to(device)
model.load_state_dict(new_state, strict=False)
model.eval()
print("Model loaded successfully from:", CKPT_PATH)

# === Inference ===
all_probs, all_gt = [], []
with torch.no_grad():
    for imgs, targets in test_loader:
        imgs = imgs.to(device)
        logits = model(imgs)
        probs = torch.sigmoid(logits).cpu().numpy()
        all_probs.append(probs)
        if isinstance(targets, torch.Tensor):
            all_gt.append(targets.cpu().numpy())
        else:
            all_gt.append(np.asarray(targets))

all_probs = np.vstack(all_probs)
all_gt = np.vstack(all_gt)
print("Predictions shape:", all_probs.shape, "GT shape:", all_gt.shape)

# === Compute AUROC ===
per_class_auc = []
for i in range(N_CLASSES):
    y_true = all_gt[:, i]
    y_score = all_probs[:, i]
    if len(np.unique(y_true)) < 2:
        per_class_auc.append(float('nan'))
    else:
        per_class_auc.append(float(roc_auc_score(y_true, y_score)))

mean_auc = np.nanmean(per_class_auc)
print("\nPer-class AUROC:")
for i, auc in enumerate(per_class_auc):
    print(f" Class {i:02d}: AUROC = {auc:.4f}")
print(f"\nMean AUROC: {mean_auc:.4f}")

# === Save results ===
os.makedirs("./eval_results_head", exist_ok=True)
np.save("./eval_results_head/test_probs.npy", all_probs)
np.save("./eval_results_head/test_gt.npy", all_gt)
with open("./eval_results_head/per_class_auc.txt", "w") as f:
    f.write("class,auc\n")
    for i, auc in enumerate(per_class_auc):
        f.write(f"{i},{auc}\n")
print("Saved evaluation outputs to ./eval_results_head/")


Using checkpoint: .\model_head_epoch1.pth.tar
Using test list: .\ChestX-ray14\labels\test_list_split.txt
Test dataset size: 22433
Device: cuda




Model loaded successfully from: .\model_head_epoch1.pth.tar
Predictions shape: (22433, 14) GT shape: (22433, 14)

Per-class AUROC:
 Class 00: AUROC = 0.5372
 Class 01: AUROC = 0.4435
 Class 02: AUROC = 0.5371
 Class 03: AUROC = 0.5169
 Class 04: AUROC = 0.5075
 Class 05: AUROC = 0.4988
 Class 06: AUROC = 0.4737
 Class 07: AUROC = 0.4769
 Class 08: AUROC = 0.5046
 Class 09: AUROC = 0.5937
 Class 10: AUROC = 0.4663
 Class 11: AUROC = 0.4502
 Class 12: AUROC = 0.4484
 Class 13: AUROC = 0.3255

Mean AUROC: 0.4843
Saved evaluation outputs to ./eval_results_head/


In [22]:
# Run this to get label counts and pos_weight suggestion
import numpy as np, os, torch
gt = np.load("./eval_results/test_gt.npy") if os.path.exists("./eval_results/test_gt.npy") else np.load("./eval_results_head/test_gt.npy")
pos = gt.sum(axis=0)
neg = gt.shape[0] - pos
print("Positives per class:", pos.tolist())
print("Negatives per class:", neg.tolist())
# pos_weight = neg/pos (torch expects float tensor)
pos_weight = torch.tensor((neg/(pos+1e-6)).astype(float))
print("Suggested pos_weight tensor (for BCEWithLogitsLoss):", pos_weight)


Positives per class: [2420.0, 582.0, 2754.0, 3938.0, 1133.0, 1335.0, 242.0, 1089.0, 957.0, 413.0, 509.0, 362.0, 734.0, 42.0]
Negatives per class: [20013.0, 21851.0, 19679.0, 18495.0, 21300.0, 21098.0, 22191.0, 21344.0, 21476.0, 22020.0, 21924.0, 22071.0, 21699.0, 22391.0]
Suggested pos_weight tensor (for BCEWithLogitsLoss): tensor([  8.2698,  37.5447,   7.1456,   4.6965,  18.7996,  15.8037,  91.6983,
         19.5996,  22.4410,  53.3172,  43.0727,  60.9696,  29.5627, 533.1190],
       dtype=torch.float64)


In [None]:
################################################




















In [23]:
# ===== Step 2: head training (3 epochs) with clipped pos_weight =====
import os, torch, numpy as np, random
from torch.utils.data import DataLoader
import torch.optim as optim
import torchvision
from sklearn.metrics import roc_auc_score
from collections import OrderedDict

# -------- config ----------
DATA_DIR = os.path.join(".", "ChestX-ray14", "images")
labels_dir = os.path.join(".", "ChestX-ray14", "labels")
TRAIN_LIST = os.path.join(labels_dir, "train_list.txt")
VAL_LIST   = os.path.join(labels_dir, "val_list.txt")
N_CLASSES = 14
BATCH_SIZE = 16
NUM_WORKERS = 4
PIN_MEMORY = True
NUM_EPOCHS = 3
LOG_EVERY = 200
CKPT_BEST = "model_head_best.pth.tar"
MAX_POS_WEIGHT = 50.0   # clip pos_weight to this value to avoid instability

# -------- transforms ----------
train_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(224),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])
val_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

# -------- dataset & dataloader (your dataset class) ----------
from read_data import ChestXrayDataSet
train_ds = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=TRAIN_LIST, transform=train_transform)
val_ds   = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=VAL_LIST, transform=val_transform)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

print("Train/Val sizes:", len(train_ds), len(val_ds))

# -------- compute pos_weight from training labels (preferred) ----------
# This block tries to extract labels from dataset items. If your ChestXrayDataSet returns (img, labels) as numpy/torch, it will work.
pos_counts = np.zeros(N_CLASSES, dtype=float)
total_samples = 0
for i in range(len(train_ds)):
    try:
        item = train_ds[i]
        # expect (img, labels) where labels is array/tensor of shape (N_CLASSES,)
        labels = item[1]
        if isinstance(labels, torch.Tensor):
            labels = labels.cpu().numpy()
        labels = np.asarray(labels).astype(float)
        pos_counts += labels
        total_samples += 1
    except Exception:
        # fallback: stop scanning if dataset indexing is slow; you'll still be okay using precomputed pos_weight
        break

if total_samples > 0 and pos_counts.sum() > 0:
    neg_counts = total_samples - pos_counts
    pos_weight = (neg_counts / (pos_counts + 1e-8)).astype(float)
    print("Computed pos_weight from train dataset (first classes):", pos_weight[:6])
else:
    # fallback: use your provided numbers (from earlier). Replace the list below if you have different numbers.
    pos = np.array([2420., 582., 2754., 3938., 1133., 1335., 242., 1089., 957., 413., 509., 362., 734., 42.])
    neg = np.array([20013.,21851.,19679.,18495.,21300.,21098.,22191.,21344.,21476.,22020.,21924.,22071.,21699.,22391.])
    pos_weight = (neg/(pos+1e-6)).astype(float)
    print("Using fallback pos_weight (from provided counts):", pos_weight)

# clip pos_weight to a max value to stabilize training
pos_weight_clipped = np.minimum(pos_weight, MAX_POS_WEIGHT).astype(np.float32)
print("Clipped pos_weight (max={}):".format(MAX_POS_WEIGHT), pos_weight_clipped)

# convert to torch tensor on device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pos_weight_tensor = torch.tensor(pos_weight_clipped, dtype=torch.float32).to(device)

# -------- model setup ----------
import torch.nn as nn
class DenseNet121(nn.Module):
    def __init__(self, out_size):
        super().__init__()
        self.densenet121 = torchvision.models.densenet121(pretrained=True)
        num_ftrs = self.densenet121.classifier.in_features
        self.densenet121.classifier = nn.Linear(num_ftrs, out_size)
    def forward(self, x):
        return self.densenet121(x)

model = DenseNet121(N_CLASSES).to(device)

# if you want to initialize from previous checkpoint (optional)
for cand in ["model_head_epoch1.pth.tar", "model.pth.tar"]:
    if os.path.exists(cand):
        ck = torch.load(cand, map_location=device)
        sd = ck.get("state_dict", ck)
        new_sd = OrderedDict()
        for k,v in sd.items():
            nk = k[len("module."):] if k.startswith("module.") else k
            new_sd[nk] = v
        model.load_state_dict(new_sd, strict=False)
        print("Loaded initial weights from:", cand)
        break

# freeze backbone, train head only
for name, p in model.named_parameters():
    if "densenet121.classifier" not in name:
        p.requires_grad = False

# use BCEWithLogitsLoss with pos_weight
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

# -------- training loop with validation & best-checkpoint saving ----------
best_val = -1.0
for epoch in range(NUM_EPOCHS):
    model.train()
    running = 0.0
    total = 0
    for i, (imgs, targets) in enumerate(train_loader, 1):
        imgs = imgs.to(device); targets = targets.to(device)
        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        running += loss.item() * imgs.size(0)
        total += imgs.size(0)
        if i % LOG_EVERY == 0:
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} [{i}/{len(train_loader)}] avg_loss_so_far={running/total:.4f}")
    print(f"Epoch {epoch+1} train avg loss: {running/total:.4f}")

    # validation
    model.eval()
    all_probs = []; all_gt=[]
    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs = imgs.to(device)
            logits = model(imgs)
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            all_gt.append(targets.cpu().numpy() if isinstance(targets, torch.Tensor) else np.asarray(targets))
    all_probs = np.vstack(all_probs); all_gt = np.vstack(all_gt)
    mean_auc = np.nanmean([roc_auc_score(all_gt[:,i], all_probs[:,i]) if len(np.unique(all_gt[:,i]))>1 else np.nan for i in range(N_CLASSES)])
    print(f"Epoch {epoch+1} VAL mean AUROC: {mean_auc:.4f}")
    if mean_auc > best_val:
        best_val = mean_auc
        torch.save({'epoch': epoch+1, 'state_dict': model.state_dict(), 'val_mean_auc': float(mean_auc)}, CKPT_BEST)
        print("Saved new best head checkpoint:", CKPT_BEST, "val_mean_auc:", best_val)

print("Head training complete. Best val mean AUROC:", best_val)


Train/Val sizes: 78468 11219
Computed pos_weight from train dataset (first classes): [ 8.8134067  39.24        7.47295109  4.63949978 18.67602808 16.93554286]
Clipped pos_weight (max=50.0): [ 8.813407  39.24       7.472951   4.6394997 18.676027  16.935543
 50.        20.178947  23.04781   45.43077   42.617565  50.
 33.43089   50.       ]




Loaded initial weights from: model_head_epoch1.pth.tar
Epoch 1/3 [200/4905] avg_loss_so_far=1.3006
Epoch 1/3 [400/4905] avg_loss_so_far=1.2522
Epoch 1/3 [600/4905] avg_loss_so_far=1.2256
Epoch 1/3 [800/4905] avg_loss_so_far=1.2182
Epoch 1/3 [1000/4905] avg_loss_so_far=1.2082
Epoch 1/3 [1200/4905] avg_loss_so_far=1.1953
Epoch 1/3 [1400/4905] avg_loss_so_far=1.1914
Epoch 1/3 [1600/4905] avg_loss_so_far=1.1917
Epoch 1/3 [1800/4905] avg_loss_so_far=1.1890
Epoch 1/3 [2000/4905] avg_loss_so_far=1.1840
Epoch 1/3 [2200/4905] avg_loss_so_far=1.1823
Epoch 1/3 [2400/4905] avg_loss_so_far=1.1824
Epoch 1/3 [2600/4905] avg_loss_so_far=1.1820
Epoch 1/3 [2800/4905] avg_loss_so_far=1.1760
Epoch 1/3 [3000/4905] avg_loss_so_far=1.1744
Epoch 1/3 [3200/4905] avg_loss_so_far=1.1748
Epoch 1/3 [3400/4905] avg_loss_so_far=1.1728
Epoch 1/3 [3600/4905] avg_loss_so_far=1.1710
Epoch 1/3 [3800/4905] avg_loss_so_far=1.1685
Epoch 1/3 [4000/4905] avg_loss_so_far=1.1676
Epoch 1/3 [4200/4905] avg_loss_so_far=1.1669
Epoc

In [1]:
# ===== Step 2 (FAST VERSION): head training (3 epochs) with clipped pos_weight =====
import os, torch, numpy as np, random
from torch.utils.data import DataLoader, WeightedRandomSampler
import torch.optim as optim
import torchvision
from sklearn.metrics import roc_auc_score
from collections import OrderedDict

# -------- config ----------
DATA_DIR = os.path.join(".", "ChestX-ray14", "images")
labels_dir = os.path.join(".", "ChestX-ray14", "labels")
TRAIN_LIST = os.path.join(labels_dir, "train_list.txt")
VAL_LIST   = os.path.join(labels_dir, "val_list.txt")
N_CLASSES = 14
BATCH_SIZE = 16
NUM_WORKERS = 2
PIN_MEMORY = True
NUM_EPOCHS = 3
LOG_EVERY = 200
CKPT_BEST = "model_head_best.pth.tar"
MAX_POS_WEIGHT = 50.0   # clip pos_weight to this value to avoid instability

# -------- transforms ----------
train_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(224),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])
val_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

# -------- fast label-based pos_weight computation ----------
# This avoids loading all images — reads label text directly.
def compute_pos_weight_from_list(label_list_path, n_classes=14):
    pos_counts = np.zeros(n_classes, dtype=float)
    total = 0
    with open(label_list_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2: 
                continue
            labels_str = parts[1].split(',')
            labels = np.zeros(n_classes, dtype=float)
            for lab in labels_str:
                try:
                    idx = int(lab)
                    if 0 <= idx < n_classes:
                        labels[idx] = 1.0
                except ValueError:
                    pass
            pos_counts += labels
            total += 1
    neg_counts = total - pos_counts
    pos_weight = neg_counts / (pos_counts + 1e-8)
    return pos_weight, total

print("Computing pos_weight directly from label file (fast)...")
pos_weight, total = compute_pos_weight_from_list(TRAIN_LIST, N_CLASSES)
print(f"Found {int(total)} samples.")
print("Raw pos_weight (first 6):", pos_weight[:6])
pos_weight = np.minimum(pos_weight, MAX_POS_WEIGHT).astype(np.float32)
print("Clipped pos_weight:", pos_weight)

# convert to torch tensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pos_weight_tensor = torch.tensor(pos_weight, dtype=torch.float32).to(device)

# -------- dataset & dataloader ----------
from read_data import ChestXrayDataSet
train_ds = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=TRAIN_LIST, transform=train_transform)
val_ds   = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=VAL_LIST, transform=val_transform)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

print("Train/Val sizes:", len(train_ds), len(val_ds))

# -------- model setup ----------
import torch.nn as nn
class DenseNet121(nn.Module):
    def __init__(self, out_size):
        super().__init__()
        self.densenet121 = torchvision.models.densenet121(pretrained=True)
        num_ftrs = self.densenet121.classifier.in_features
        self.densenet121.classifier = nn.Linear(num_ftrs, out_size)
    def forward(self, x):
        return self.densenet121(x)

model = DenseNet121(N_CLASSES).to(device)

# if checkpoint exists, load initial weights
for cand in ["model_head_epoch1.pth.tar", "model.pth.tar"]:
    if os.path.exists(cand):
        ck = torch.load(cand, map_location=device)
        sd = ck.get("state_dict", ck)
        new_sd = OrderedDict((k[len("module."):] if k.startswith("module.") else k, v) for k,v in sd.items())
        model.load_state_dict(new_sd, strict=False)
        print("Loaded initial weights from:", cand)
        break

# freeze backbone
for name, p in model.named_parameters():
    if "densenet121.classifier" not in name:
        p.requires_grad = False

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

# -------- training loop ----------
best_val = -1.0
for epoch in range(NUM_EPOCHS):
    model.train()
    running = total = 0
    for i, (imgs, targets) in enumerate(train_loader, 1):
        imgs, targets = imgs.to(device), targets.to(device)
        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        running += loss.item() * imgs.size(0)
        total += imgs.size(0)
        if i % LOG_EVERY == 0:
            print(f"Epoch {epoch+1}/{NUM_EPOCHS} [{i}/{len(train_loader)}] avg_loss_so_far={running/total:.4f}")
    print(f"Epoch {epoch+1} train avg loss: {running/total:.4f}")

    model.eval()
    all_probs, all_gt = [], []
    with torch.no_grad():
        for imgs, targets in val_loader:
            imgs = imgs.to(device)
            logits = model(imgs)
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            all_gt.append(targets.cpu().numpy())
    all_probs, all_gt = np.vstack(all_probs), np.vstack(all_gt)
    mean_auc = np.nanmean([roc_auc_score(all_gt[:,i], all_probs[:,i]) if len(np.unique(all_gt[:,i]))>1 else np.nan for i in range(N_CLASSES)])
    print(f"Epoch {epoch+1} VAL mean AUROC: {mean_auc:.4f}")
    if mean_auc > best_val:
        best_val = mean_auc
        torch.save({'epoch': epoch+1, 'state_dict': model.state_dict(), 'val_mean_auc': float(mean_auc)}, CKPT_BEST)
        print("Saved new best head checkpoint:", CKPT_BEST, "val_mean_auc:", best_val)

print("✅ FAST head training complete. Best val mean AUROC:", best_val)


Computing pos_weight directly from label file (fast)...
Found 78468 samples.
Raw pos_weight (first 6): [1.13463503e-01 8.81340670e+00 7.84680000e+12 7.84680000e+12
 7.84680000e+12 7.84680000e+12]
Clipped pos_weight: [ 0.11346351  8.813407   50.         50.         50.         50.
 50.         50.         50.         50.         50.         50.
 50.         50.        ]
Train/Val sizes: 78468 11219




Loaded initial weights from: model_head_epoch1.pth.tar
Epoch 1/3 [200/4905] avg_loss_so_far=1.6469
Epoch 1/3 [400/4905] avg_loss_so_far=1.5675
Epoch 1/3 [600/4905] avg_loss_so_far=1.5489
Epoch 1/3 [800/4905] avg_loss_so_far=1.5333
Epoch 1/3 [1000/4905] avg_loss_so_far=1.5144
Epoch 1/3 [1200/4905] avg_loss_so_far=1.5153
Epoch 1/3 [1400/4905] avg_loss_so_far=1.5070
Epoch 1/3 [1600/4905] avg_loss_so_far=1.4975
Epoch 1/3 [1800/4905] avg_loss_so_far=1.4979
Epoch 1/3 [2000/4905] avg_loss_so_far=1.4902
Epoch 1/3 [2200/4905] avg_loss_so_far=1.4876
Epoch 1/3 [2400/4905] avg_loss_so_far=1.4839
Epoch 1/3 [2600/4905] avg_loss_so_far=1.4824
Epoch 1/3 [2800/4905] avg_loss_so_far=1.4805
Epoch 1/3 [3000/4905] avg_loss_so_far=1.4793
Epoch 1/3 [3200/4905] avg_loss_so_far=1.4746
Epoch 1/3 [3400/4905] avg_loss_so_far=1.4752
Epoch 1/3 [3600/4905] avg_loss_so_far=1.4724
Epoch 1/3 [3800/4905] avg_loss_so_far=1.4730
Epoch 1/3 [4000/4905] avg_loss_so_far=1.4722
Epoch 1/3 [4200/4905] avg_loss_so_far=1.4707
Epoc

In [12]:
# ## we DONT EXECUTE THIS YET , INSTEAD WE EXECUTE THE CELL BELOW IT FIRST --> AVOID THIS CELL





















# # ---- REPLACE current DenseNet121 class with this ---- 
# class DenseNet121(nn.Module):
#     """DenseNet121 backbone with a linear multi-label head (no Sigmoid)."""
    
#     def __init__(self, out_size):
#         super(DenseNet121, self).__init__()
#         # Using pretrained weights; newer versions may warn about 'pretrained', but this works
#         self.densenet121 = torchvision.models.densenet121(pretrained=True)
        
#         # Get the number of input features to the classifier
#         num_ftrs = self.densenet121.classifier.in_features
        
#         # Replace the classifier with a linear layer that outputs logits
#         # (No sigmoid activation here — handle it in the loss function if needed)
#         self.densenet121.classifier = nn.Linear(num_ftrs, out_size)

#     def forward(self, x):
#         x = self.densenet121(x)
#         return x
# # -----------------------------------------------------

# if __name__ == '__main__':
#     main()


  return t.to(


=> no checkpoint found


  with autocast():


KeyboardInterrupt: 

In [9]:
# # ---- REPLACE current DenseNet121 class with this ----
# class DenseNet121(nn.Module):
#     """DenseNet121 backbone with a linear multi-label head (no Sigmoid)."""
    
#     def __init__(self, out_size):
#         super(DenseNet121, self).__init__()
#         # Using pretrained weights; newer versions may warn about 'pretrained', but this works
#         self.densenet121 = torchvision.models.densenet121(pretrained=True)
        
#         # Get the number of input features to the classifier
#         num_ftrs = self.densenet121.classifier.in_features
        
#         # Replace the classifier with a linear layer that outputs logits
#         # (No sigmoid activation here — handle it in the loss function if needed)
#         self.densenet121.classifier = nn.Linear(num_ftrs, out_size)

#     def forward(self, x):
#         x = self.densenet121(x)
#         return x
# # -----------------------------------------------------




# # ---------- TRAINING SMOKE-TEST CELL ----------
# import os, random, numpy as np, torch
# import torch.nn as nn, torch.optim as optim
# from torch.utils.data import DataLoader
# from sklearn.metrics import roc_auc_score
# import torchvision

# # ---------- 1) basic env + reproducibility ----------
# seed = 42
# random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
# if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Device:", device)

# # ---------- 2) ensure train/val lists exist (split test_list.txt if needed) ----------
# labels_dir = os.path.join(".", "ChestX-ray14", "labels")
# os.makedirs(labels_dir, exist_ok=True)
# master_list = os.path.join(labels_dir, "test_list.txt")
# train_list = os.path.join(labels_dir, "train_list.txt")
# val_list = os.path.join(labels_dir, "val_list.txt")

# if not os.path.exists(train_list) or not os.path.exists(val_list):
#     if os.path.exists(master_list):
#         with open(master_list, 'r') as f:
#             lines = [l.strip() for l in f if l.strip()]
#         random.shuffle(lines)
#         n = len(lines)
#         n_train = int(0.8*n); n_val = int(0.1*n)
#         train_lines = lines[:n_train]
#         val_lines = lines[n_train:n_train+n_val]
#         test_lines = lines[n_train+n_val:]
#         with open(train_list, 'w') as f: f.write("\n".join(train_lines))
#         with open(val_list, 'w') as f: f.write("\n".join(val_lines))
#         with open(os.path.join(labels_dir, 'test_list_split.txt'), 'w') as f: f.write("\n".join(test_lines))
#         print("Created train/val/test split:", len(train_lines), len(val_lines), len(test_lines))
#     else:
#         raise FileNotFoundError(f"No master list found at {master_list}. Place a list or tell me the correct path.")
# else:
#     print("train_list.txt and val_list.txt already present.")

# # ---------- 3) transforms ----------
# train_transform = torchvision.transforms.Compose([
#     torchvision.transforms.RandomResizedCrop(224),
#     torchvision.transforms.RandomHorizontalFlip(),
#     torchvision.transforms.ToTensor(),
#     torchvision.transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
# ])
# val_transform = torchvision.transforms.Compose([
#     torchvision.transforms.Resize(256),
#     torchvision.transforms.CenterCrop(224),
#     torchvision.transforms.ToTensor(),
#     torchvision.transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
# ])

# # ---------- 4) dataset & dataloader (uses your read_data.ChestXrayDataSet) ----------
# try:
#     from read_data import ChestXrayDataSet
# except Exception as e:
#     print("ERROR importing ChestXrayDataSet from read_data:", e)
#     raise

# DATA_DIR = globals().get('DATA_DIR', os.path.join(".", "ChestX-ray14", "images"))
# N_CLASSES = globals().get('N_CLASSES', 14)
# CKPT_PATH = globals().get('CKPT_PATH', os.path.join(".", "model.pth.tar"))

# train_dataset = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=train_list, transform=train_transform)
# val_dataset   = ChestXrayDataSet(data_dir=DATA_DIR, image_list_file=val_list, transform=val_transform)

# BATCH_SIZE = 16   # lower if OOM on your machine
# train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=False)
# val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=False)

# print("Train/Val sizes:", len(train_dataset), len(val_dataset))

# # ---------- 5) model (assumes DenseNet121 class already redefined without Sigmoid) ----------
# model = DenseNet121(N_CLASSES).to(device)

# # freeze everything except classifier head
# for name, p in model.named_parameters():
#     if "densenet121.classifier" not in name:
#         p.requires_grad = False

# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

# # ---------- 6) train 1 epoch (smoke test) ----------
# model.train()
# running_loss = 0.0
# for imgs, targets in train_loader:
#     imgs = imgs.to(device); targets = targets.to(device)
#     optimizer.zero_grad()
#     logits = model(imgs)             # logits
#     loss = criterion(logits, targets)
#     loss.backward()
#     optimizer.step()
#     running_loss += loss.item() * imgs.size(0)
# print("Train epoch done, avg loss:", running_loss / max(1, len(train_loader.dataset)))

# # ---------- 7) quick validation ----------
# model.eval()
# all_logits, all_gt = [], []
# with torch.no_grad():
#     for imgs, targets in val_loader:
#         imgs = imgs.to(device); targets = targets.to(device)
#         logits = model(imgs)
#         all_logits.append(logits.cpu()); all_gt.append(targets.cpu())

# all_logits = torch.cat(all_logits, dim=0)
# all_gt = torch.cat(all_gt, dim=0)
# probs = torch.sigmoid(all_logits).numpy()
# gt_np = all_gt.numpy()

# aucs = []
# for i in range(N_CLASSES):
#     if len(np.unique(gt_np[:,i])) < 2:
#         aucs.append(float('nan'))
#     else:
#         try:
#             aucs.append(float(roc_auc_score(gt_np[:,i], probs[:,i])))
#         except Exception:
#             aucs.append(float('nan'))

# mean_auc = np.nanmean(aucs)
# print("Validation mean AUROC (smoke-test):", mean_auc)
# print("Per-class AUROC (first 6):", aucs[:6])

# # ---------- 8) save checkpoint ----------
# state = {'epoch': 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_mean_auc': float(mean_auc)}
# torch.save(state, CKPT_PATH)
# print("Saved checkpoint to", CKPT_PATH)
# # ---------- END ----------


Device: cuda
train_list.txt and val_list.txt already present.
Train/Val sizes: 78468 11219


  return t.to(


Train epoch done, avg loss: 0.1777671042022862
Validation mean AUROC (smoke-test): 0.737075715617487
Per-class AUROC (first 6): [0.7167493076385387, 0.7312389941403284, 0.773260474840952, 0.646305061855538, 0.6773198414196715, 0.6412038155016444]
Saved checkpoint to model.pth.tar


In [8]:
##this is to see that whether the gpu is being used or the cpu is being used-

import torch, os
print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    try:
        print("CUDA device count:", torch.cuda.device_count())
        print("CUDA current device index:", torch.cuda.current_device())
        print("CUDA device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
        print("Memory allocated (bytes):", torch.cuda.memory_allocated())
        print("Memory reserved (bytes):", torch.cuda.memory_reserved())
    except Exception as e:
        print("Error querying CUDA device:", e)

# check where the model lives (if model is defined in this kernel)
try:
    p = next(model.parameters())
    print("Sample model parameter device:", p.device)
except Exception as e:
    print("Model not found in this scope or error:", e)

# quick check: are we on the device variable you printed earlier?
try:
    print("device variable in env (if exists):", globals().get('device', None))
except:
    pass


torch version: 2.7.1+cu118
CUDA available: True
CUDA device count: 1
CUDA current device index: 0
CUDA device name: NVIDIA GeForce RTX 4060 Laptop GPU
Memory allocated (bytes): 0
Memory reserved (bytes): 0
Model not found in this scope or error: name 'model' is not defined
device variable in env (if exists): None


In [None]:
# # the below cell is code for creating a backup safely 

In [2]:
# # Backup the notebook file (edit src_name if your file has a different name)
# import os, shutil

# # set these to match your environment / notebook filename
# notebook_dir = r"C:\Users\Ansh\Desktop\ml project"
# os.chdir(notebook_dir)
# print("Changed CWD to:", os.getcwd())

# src_name = "mlproject.ipynb"   # change if your file has a different name
# if not os.path.exists(src_name):
#     print(f"File not found: {src_name}. Files in dir:", os.listdir()[:50])
# else:
#     dst_name = "mlproject_backup.ipynb"
#     shutil.copy(src_name, dst_name)
#     print("Backup created:", os.path.join(os.getcwd(), dst_name))


Changed CWD to: C:\Users\Ansh\Desktop\ml project
Backup created: C:\Users\Ansh\Desktop\ml project\mlproject_backup.ipynb
