In [2]:
# Cell 1: Project Header and Imports
"""
DenseNet-121 Baseline Model for NIH Chest X-ray Classification
COSC 4368 Final Project - Group 13
Yla Herrera, Nicolas Mangilit, Kiriti Padavala, and Matt Tindall
"""

import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Sanity marker: this line only runs if every import above succeeded, since a
# failed import raises and stops the cell before reaching it.
print("All libraries imported successfully!")

All libraries imported successfully!


In [3]:
# Cell 2: Set Random Seeds and Check Device
# One seed shared by every random number generator seeded below, so the value
# only has to be changed in a single place.
SEED = 42

# Python's built-in RNG is independent of NumPy's and PyTorch's generators, so
# it has to be seeded separately for `random`-based code to be repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible GPU, not only the currently selected one.
    torch.cuda.manual_seed_all(SEED)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Branch on the device chosen above instead of querying CUDA a second time.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cpu


In [4]:
# Cell 3: Define Paths and Load Metadata
# Define paths (UPDATE THIS TO YOUR LOCAL PATH)
DATA_DIR = "data"  # Update this!

# The dataset has images split across multiple folders: images_001 to images_012
# We'll handle this in the Dataset class

# Load metadata
print("Loading metadata CSV...")
csv_path = os.path.join(DATA_DIR, "Data_Entry_2017.csv")
if not os.path.isfile(csv_path):
    # Fail early with an actionable message: the most likely cause is that
    # DATA_DIR above has not been pointed at the local copy of the dataset.
    raise FileNotFoundError(
        f"Metadata CSV not found at '{csv_path}'. "
        "Update DATA_DIR to the folder that contains Data_Entry_2017.csv."
    )
df = pd.read_csv(csv_path)

print(f"✓ Total number of images: {len(df)}")
print("\nDataset columns:")
print(df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

# Check which image folders exist
print("\nChecking for image folders...")
image_folders = []
for i in range(1, 13):  # images_001 to images_012
    folder_name = f"images_{i:03d}"
    folder_path = os.path.join(DATA_DIR, folder_name)
    # isdir (not exists): a regular file that happens to be named images_XXX
    # would make os.listdir below raise NotADirectoryError.
    if os.path.isdir(folder_path):
        # Count PNG entries directly in this folder without building a
        # throwaway list of their names.
        num_images = sum(1 for f in os.listdir(folder_path) if f.endswith('.png'))
        image_folders.append(folder_name)
        print(f"  ✓ Found {folder_name} with {num_images} images")

print(f"\nTotal image folders found: {len(image_folders)}")
if not image_folders:
    # Surface the problem here rather than when images are first loaded.
    print(f"  ⚠ No images_XXX folders found under '{DATA_DIR}' - check DATA_DIR.")


Loading metadata CSV...
✓ Total number of images: 112120

Dataset columns:
['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11']

First few rows:
        Image Index          Finding Labels  Follow-up #  Patient ID  \
0  00000001_000.png            Cardiomegaly            0           1   
1  00000001_001.png  Cardiomegaly|Emphysema            1           1   
2  00000001_002.png   Cardiomegaly|Effusion            2           1   
3  00000002_000.png              No Finding            0           2   
4  00000003_000.png                  Hernia            0           3   

   Patient Age Patient Gender View Position  OriginalImage[Width  Height]  \
0           58              M            PA                 2682     2749   
1           58              M            PA                 2894     2729   
2           58              M            PA  