In [1]:
import os
import pandas as pd
import config
from utils.dataset import chest_xray_datasplit, ChestXrayDataset, ChestXrayDatasetWithMask
from utils.model import ChestXrayDenseNet121
from utils.train import train, validate, compute_pos_weight

In [2]:
# Read the metadata table that lists every image together with its findings.
metadata_csv = os.path.join(config.DATASET_DIR, 'Data_Entry_2017_v2020.csv')
full_df = pd.read_csv(metadata_csv)

# Split the pipe-separated 'Finding Labels' string into one 0/1 indicator
# column per distinct finding.
expanded_df = full_df['Finding Labels'].str.get_dummies(sep='|')

# Place the image filename column side by side with the indicator columns.
final_df = pd.concat(
    [full_df[['Image Index']], expanded_df],
    axis='columns',
)
final_df.head()

Unnamed: 0,Image Index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2,00000001_002.png,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,00000003_001.png,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [3]:
# Keep only the indicator columns that are used as prediction targets:
# the filename column and the 'No Finding' indicator are both removed.
classes_df = final_df.drop(['Image Index', 'No Finding'], axis='columns')

In [4]:
# Build the network with one output per target column in classes_df.
model = ChestXrayDenseNet121(num_classes=classes_df.shape[1])

In [5]:
import torch
def compute_pos_weight_from_df(full_df, label_cols):
    """Compute one positive-class weight (negatives / positives) per label column.

    The result is suitable wherever a per-class positive weight is expected,
    e.g. the ``pos_weight`` argument of ``torch.nn.BCEWithLogitsLoss``.

    Args:
        full_df: DataFrame with one row per sample and a 0/1 indicator column
            for every name in ``label_cols``.
        label_cols: Column names to weight; the output follows this order.

    Returns:
        A 1-D ``torch.float32`` tensor with ``len(label_cols)`` entries.
        A class without any positive sample is treated as if it had exactly
        one, so its weight is capped at ``len(full_df)`` instead of exploding
        to roughly ``len(full_df) * 1e6`` as an additive epsilon would cause.
    """
    num_samples = len(full_df)

    # Sum positives per class; float64 keeps the ratio exact for large counts.
    pos_counts = full_df[label_cols].sum().to_numpy(dtype="float64")
    neg_counts = num_samples - pos_counts

    # Floor the denominator at one positive: this avoids division by zero
    # without biasing the weights of classes that do have positives.
    pos_weight = neg_counts / pos_counts.clip(min=1.0)
    return torch.tensor(pos_weight, dtype=torch.float32)

# Derive the per-class positive weights from every target column of classes_df.
pos_weight = compute_pos_weight_from_df(classes_df, classes_df.columns.tolist())

In [6]:
from torch.utils.data import DataLoader
# The dataset and the splitter both read from the same image folder.
image_dir = os.path.join(config.DATASET_DIR, 'cxr', 'images')
full_dataset = ChestXrayDataset(dataframe=final_df, img_dir=image_dir)

train_dataset, val_dataset, test_dataset = chest_xray_datasplit(
    final_df, full_dataset, dataset_dir=image_dir
)

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=96, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
import torch
# GPUs handed to the trainer for multi-GPU training.
device_ids = [1, 2, 3]

# Derive the primary device from the GPU list instead of hard-coding a separate
# index: torch.nn.DataParallel requires the module on device_ids[0], and an
# unrelated index such as 6 is invalid on hosts with fewer GPUs.
# NOTE(review): assumes `train` wraps the model in DataParallel when
# multi_gpu=True -- confirm against utils.train.
device = torch.device(f"cuda:{device_ids[0]}" if torch.cuda.is_available() else "cpu")

# Let cuDNN pick the fastest kernels; this trades determinism for speed.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

# Train
train(model, train_loader, val_loader, device, epochs=10, lr=1e-4, save_path=config.MODEL_FOLDER, file_name="PulmoScanX_Densenet_v2", pos_weight=pos_weight,
      multi_gpu=True, device_ids=device_ids)

Using 3 GPUs: [1, 2, 3]


Epoch 1/10:   1%|          | 5/818 [00:18<34:28,  2.54s/it, loss=1.45]  

In [9]:
from utils.evaluate import evaluate, evaluate_per_class
# Score the model on the test loader, passing the target column names so the
# per-class report can be labelled.
class_names = classes_df.columns.tolist()
evals_per_class = evaluate_per_class(model, test_loader, device, class_names)


Per-Class F1 Scores:
                 Pathology  F1 Score
                  Effusion  0.505556
    Subcutaneous Emphysema  0.468314
                 Emphysema  0.413209
              Infiltration  0.405411
                    Hernia  0.400000
              Pneumothorax  0.379723
               Atelectasis  0.374114
         Pneumomediastinum  0.366864
                      Mass  0.335202
              Cardiomegaly  0.290840
                    Nodule  0.243829
                     Edema  0.217028
          Pneumoperitoneum  0.205405
             Consolidation  0.195868
        Pleural Thickening  0.188668
            Tortuous Aorta  0.165581
                  Fibrosis  0.149390
                 Pneumonia  0.091616
Calcification of the Aorta  0.085603


In [None]:
# Display the target column names in the order used for the model outputs.
classes_df.columns.tolist()

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural Thickening',
 'Pneumonia',
 'Pneumothorax',
 'Pneumoperitoneum',
 'Pneumomediastinum',
 'Subcutaneous Emphysema',
 'Tortuous Aorta',
 'Calcification of the Aorta']