In [None]:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import datetime
from pathlib import Path
import gc
import warnings
warnings.filterwarnings("ignore")
import sys
import time
####
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
####
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch import optim
import torchvision.transforms as transforms
import torchvision
##
from fastprogress import master_bar, progress_bar
from PIL import Image
gc.collect()

## Device: pick GPU/CPU and move batches onto it

In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)


def to_device(data, device):
    """Recursively move a tensor, or a list/tuple of tensors, to ``device``.

    Lists and tuples are both returned as plain lists; any other object must
    provide ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Thin wrapper that moves every batch of an underlying loader to a device."""

    def __init__(self, dl, device):
        # Keep both references; nothing is moved until iteration starts.
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches reported by the wrapped loader."""
        batch_count = len(self.dl)
        return batch_count

    def __iter__(self):
        """Iterate the wrapped loader, yielding each batch after ``to_device``."""
        for batch in self.dl:
            moved_batch = to_device(batch, self.device)
            yield moved_batch

In [None]:
# The data folder lives in the parent of the current working directory;
# the label tables are read from its `archive` sub-folder.
data_directory = os.path.join(os.path.dirname(os.getcwd()), "data")
data_path = Path(data_directory)
archive_dir = data_path / 'archive'
full_train_df = pd.read_csv(archive_dir / 'train.csv')
full_valid_df = pd.read_csv(archive_dir / 'valid.csv')

In [None]:
# Columns whose uncertain label (-1) is mapped to 1, and those mapped to 0.
u_one_features = ['Atelectasis', 'Edema']
u_zero_features = ['Cardiomegaly', 'Consolidation', 'Pleural Effusion']

for feature in u_zero_features:
    full_train_df[feature] = full_train_df[feature].replace(-1, 0)

for feature in u_one_features:
    full_train_df[feature] = full_train_df[feature].replace(-1, 1)

# Every -1 still left in the frame (i.e. in the other columns) becomes NaN.
full_train_df = full_train_df.replace(-1, np.nan)

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed), then release
# the two full tables and trigger a garbage-collection pass.
train_data, val_data = train_test_split(
    full_train_df,
    test_size=0.20,
    random_state=2021,
)
del full_train_df, full_valid_df
gc.collect()

## Model: DenseNet121 and ResNet50 classifiers


In [None]:
class DenseNet121(nn.Module):
    """torchvision DenseNet-121 whose classifier is swapped for Linear + Sigmoid.

    Parameters
    ----------
    num_classes: int
        Number of output units of the replacement classifier.
    is_trained: bool
        Forwarded to ``torchvision.models.densenet121`` as ``pretrained``.
    """

    def __init__(self, num_classes=14, is_trained=False):
        super().__init__()
        backbone = torchvision.models.densenet121(pretrained=is_trained)
        in_features = backbone.classifier.in_features
        # New head: one sigmoid output per class.
        backbone.classifier = nn.Sequential(
            nn.Linear(in_features, num_classes),
            nn.Sigmoid(),
        )
        self.net = backbone

    def forward(self, input):
        """Run ``input`` through the wrapped network and return its output."""
        predictions = self.net(input)
        return predictions

class ResNet50(nn.Module):
    """torchvision ResNet-50 with its first eight child modules frozen and a new head.

    Parameters
    ----------
    num_classes: int
        Number of output units of the replacement ``fc`` head.
    is_trained: bool
        Forwarded to ``torchvision.models.resnet50`` as ``pretrained``.
    """

    def __init__(self, num_classes=14, is_trained=False):
        # `super` has to be *called*; the previous `super.__init__()` raised a
        # TypeError, so the class could never be instantiated.
        super().__init__()
        self.net = torchvision.models.resnet50(pretrained=is_trained)

        # Freeze the parameters of the first eight child modules.
        # NOTE(review): in torchvision's ResNet these are presumably the stem
        # and the four residual stages — confirm against the installed version.
        for position, child in enumerate(self.net.children(), start=1):
            if position <= 8:
                for param in child.parameters():
                    param.requires_grad = False

        # Replace last layer: 128-unit hidden layer, then one sigmoid output
        # per class. The new layers are created after the freeze loop, so
        # they remain trainable.
        self.net.fc = nn.Sequential(
            nn.Linear(self.net.fc.in_features, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),
            nn.Sigmoid(),
        )

    def forward(self, inputs):
        """
        Forward the network with the inputs
        """
        return self.net(inputs)

In [None]:
class ChestXrayDataset(Dataset):
    """Dataset of chest X-ray images and their 14 binary labels.

    Parameters
    ----------
    folder_dir: str
        Root folder; images are looked up under ``<folder_dir>/archive/...``.
    dataframe: pandas.DataFrame
        One row per image. ``Path`` holds a '/'-separated path whose first
        component is replaced by ``archive``; label columns are read from
        position 5 onwards.
    image_size: int
        Images are resized to ``image_size`` x ``image_size``.
    normalization:
        Accepted for interface compatibility; currently NOT used by this class.
    data_limit: int
        Accepted for interface compatibility; currently NOT used by this class.
    """

    def __init__(self, folder_dir, dataframe, image_size, normalization, data_limit=10000):

        self.image_paths = []   # List of image paths
        self.image_labels = []  # List of image labels (one 0/1 list per image)

        # Image transformations applied in __getitem__
        self.image_transformation = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
        ])

        # Get all image paths and image labels from dataframe
        for _, row in dataframe.iterrows():
            # Drop the first directory of the recorded path and put 'archive'
            # in its place. Joining with os.path.join (not a literal '\\')
            # keeps the path valid on every OS.
            relative_parts = ['archive'] + row.Path.split('/')[1:]
            self.image_paths.append(os.path.join(folder_dir, *relative_parts))

            if len(row) < 14:
                # Row too short to carry labels: treat every finding as negative.
                labels = [0] * 14
            else:
                # Only an explicit 1 counts as positive; 0, NaN and any other
                # value become 0. `.iloc` makes the positional slice explicit.
                labels = [1 if value == 1 else 0 for value in row.iloc[5:]]
            self.image_labels.append(labels)

    def __len__(self):
        """Number of images indexed by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, index):
        """
        Read image at index and convert to torch Tensor

        Returns the transformed image and its labels as a FloatTensor.
        """
        image_path = self.image_paths[index]
        # Convert image to RGB channels before applying the transformations
        image_data = Image.open(image_path).convert("RGB")
        image_data = self.image_transformation(image_data)

        return image_data, torch.FloatTensor(self.image_labels[index])
    

In [None]:
IMAGE_SIZE = 224                              # Images are resized to IMAGE_SIZE x IMAGE_SIZE (224x224)
BATCH_SIZE = 150                              # Batch size passed to the train and validation DataLoaders
LEARNING_RATE = 0.001                         # Learning rate of the Adam optimizer built for DenseNet121
LEARNING_RATE_SCHEDULE_FACTOR = 0.1           # `factor` passed to ReduceLROnPlateau when reducing the learning rate
LEARNING_RATE_SCHEDULE_PATIENCE = 5           # `patience` passed to ReduceLROnPlateau when reducing the learning rate
MAX_EPOCHS = 30                               # Upper bound on training epochs (100 was noted as an alternative)

In [None]:
# Index the training split; the trailing bare expression displays how many
# images the dataset holds.
train_data_set = ChestXrayDataset(
    folder_dir=data_directory,
    dataframe=train_data,
    image_size=IMAGE_SIZE,
    normalization=True,
)
len(train_data_set)

In [None]:
# Only take 5% of the data: draw a random subset of indices for the training
# and validation sets and wrap each in a SubsetRandomSampler.
from torch.utils.data import SubsetRandomSampler

subset_size = 0.05  # 5% of the dataset
num_samples = len(train_data_set)
num_val_sample = len(val_data)

# Draw the permutations from a seeded generator (same seed as the train/val
# split) so the same subset is selected on every Restart & Run All. The
# previous unseeded calls picked different images on every run.
subset_generator = torch.Generator().manual_seed(2021)
subset_indices = torch.randperm(num_samples, generator=subset_generator)[:int(subset_size * num_samples)]
subset_indices_val = torch.randperm(num_val_sample, generator=subset_generator)[:int(subset_size * num_val_sample)]

subset_sampler = SubsetRandomSampler(subset_indices)
subset_val_sampler = SubsetRandomSampler(subset_indices_val)

In [None]:
# Build the train / validation loaders over the sampled subsets and wrap both
# so that every batch is moved to the selected device.
device = get_default_device()

train_data_loader = DeviceDataLoader(
    DataLoader(dataset=train_data_set, sampler=subset_sampler, batch_size=BATCH_SIZE, shuffle=False),
    device,
)

val_dataset = ChestXrayDataset(data_directory, val_data, IMAGE_SIZE, True)
val_dataloader = DeviceDataLoader(
    DataLoader(dataset=val_dataset, sampler=subset_val_sampler, batch_size=BATCH_SIZE, shuffle=False),
    device,
)

In [None]:
def multi_label_auroc(y_gt, y_pred):
    """ Calculate AUROC for each class

    Parameters
    ----------
    y_gt: torch.Tensor
        groundtruth
    y_pred: torch.Tensor
        prediction

    Returns
    -------
    list
        AUROC of each class, in column order. A class for which
        ``roc_auc_score`` raises ``ValueError`` gets ``nan``, so position ``i``
        always refers to column ``i``.
    """
    gt_np = y_gt.to("cpu").numpy()
    pred_np = y_pred.to("cpu").numpy()
    assert gt_np.shape == pred_np.shape, "y_gt and y_pred should have the same size"

    auroc = []
    for i in range(gt_np.shape[1]):
        try:
            score = roc_auc_score(gt_np[:, i], pred_np[:, i])
        except ValueError:
            # Previously the class was skipped, which shifted every later
            # entry. NaN keeps the alignment and is ignored by np.nanmean.
            score = np.nan
        auroc.append(score)
    return auroc

def multi_label_accuracy(y_gt, y_pred):
    """ Calculate accuracy for each class

    Predictions are turned into 0/1 decisions with a 0.5 threshold before
    being compared with the groundtruth.

    Parameters
    ----------
    y_gt: torch.Tensor
        groundtruth
    y_pred: torch.Tensor
        prediction

    Returns
    -------
    list
        accuracy of each class
    """
    targets = y_gt.to("cpu").numpy()
    scores = y_pred.to("cpu").numpy()
    assert targets.shape == scores.shape, "y_gt and y_pred should have the same size"

    # Threshold once for the whole matrix, then score column by column.
    decisions = np.where(scores >= 0.5, 1, 0)
    n_classes = targets.shape[1]
    return [accuracy_score(targets[:, c], decisions[:, c]) for c in range(n_classes)]

def multi_label_f1(y_gt, y_pred):
    """ Calculate F1 for each class

    Predictions are turned into 0/1 decisions with a 0.5 threshold before
    being scored.

    Parameters
    ----------
    y_gt: torch.Tensor
        groundtruth
    y_pred: torch.Tensor
        prediction

    Returns
    -------
    list
        F1 of each class
    """
    targets = y_gt.to("cpu").numpy()
    scores = y_pred.to("cpu").numpy()
    assert targets.shape == scores.shape, "y_gt and y_pred should have the same size"

    # Threshold once for the whole matrix, then score column by column.
    decisions = np.where(scores >= 0.5, 1, 0)
    n_classes = targets.shape[1]
    return [f1_score(targets[:, c], decisions[:, c]) for c in range(n_classes)]


def multi_label_precision_recall(y_gt, y_pred):
    """ Calculate precision and recall for each class

    Predictions are turned into 0/1 decisions with a 0.5 threshold before
    being scored with ``average='binary'``.

    Parameters
    ----------
    y_gt: torch.Tensor
        groundtruth
    y_pred: torch.Tensor
        prediction

    Returns
    -------
    tuple of (list, list)
        precision of each class, recall of each class
    """
    targets = y_gt.to("cpu").numpy()
    scores = y_pred.to("cpu").numpy()
    assert targets.shape == scores.shape, "y_gt and y_pred should have the same size"

    # Threshold once for the whole matrix, then score column by column.
    decisions = np.where(scores >= 0.5, 1, 0)
    per_class = [
        precision_recall_fscore_support(targets[:, c], decisions[:, c], average='binary')
        for c in range(targets.shape[1])
    ]
    precision_out = [result[0] for result in per_class]
    recall_out = [result[1] for result in per_class]
    return precision_out, recall_out

In [None]:
def epoch_training(epoch, model, train_data_loader, loss_criteria, optimizer, mb):
    """Train ``model`` for one pass over ``train_data_loader``.

    Parameters
    ----------
    epoch: int
        Current epoch index (not used inside the function).
    model: torch.nn.Module
        Network to train; switched to train mode.
    train_data_loader: iterable
        Yields ``(images, labels)`` batches and supports ``len()``.
    loss_criteria: callable
        Called as ``loss_criteria(output, labels)``; must return a scalar tensor.
    optimizer: torch.optim.Optimizer
        Optimizer stepped once per batch.
    mb:
        Parent bar handed to ``progress_bar``.

    Returns
    -------
    float
        Sum of the batch losses divided by the number of batches, or ``nan``
        when the loader is empty.
    """
    model.train()
    training_loss = 0.0
    images = labels = output = loss = None

    for images, labels in progress_bar(train_data_loader, parent=mb):
        # Zero the previous grad
        optimizer.zero_grad()

        # Run forward pass and compute the loss
        output = model(images)
        loss = loss_criteria(output, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        training_loss += loss.item()

    # Drop the references to the last batch before emptying the CUDA cache.
    # The previous `del images, labels, loss` named variables that were never
    # bound (the loop used `image` / `label`) and raised UnboundLocalError.
    images = labels = output = loss = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    num_batches = len(train_data_loader)
    if num_batches == 0:
        # Nothing was trained; avoid a ZeroDivisionError.
        return float("nan")
    return training_loss / num_batches


def evaluating(epoch, model, val_loader, loss_criteria, mb):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Parameters
    ----------
    epoch: int
        Current epoch index (not used inside the function).
    model: torch.nn.Module
        Network to evaluate; switched to eval mode.
    val_loader: iterable
        Yields ``(images, labels)`` batches and supports ``len()``.
    loss_criteria: callable
        Called as ``loss_criteria(predictions, labels)``; must return a scalar tensor.
    mb:
        Parent bar handed to ``progress_bar``.

    Returns
    -------
    tuple
        ``(val_loss_mean, auroc_mean, acc_mean, f1_mean)`` where the loss is a
        Python float and each metric is the NaN-ignoring mean over classes.

    Raises
    ------
    ValueError
        If ``val_loader`` yields no batches.
    """
    model.eval()
    val_loss = 0.0          # Total loss of model on validation set
    batch_preds = []        # Predictions of every batch
    batch_targets = []      # Groundtruth of every batch

    with torch.no_grad():
        for images, labels in progress_bar(val_loader, parent=mb):
            ps = model(images)
            # .item() keeps the running loss a plain float, not a tensor on
            # the model's device.
            val_loss += loss_criteria(ps, labels).item()
            batch_targets.append(labels)
            batch_preds.append(ps)

    if not batch_preds:
        raise ValueError("val_loader yielded no batches; nothing to evaluate")

    # Concatenate once rather than re-allocating the result on every batch;
    # this also removes the need for the global `device`.
    out_gt = torch.cat(batch_targets, 0)
    out_pred = torch.cat(batch_preds, 0)

    del images, labels, ps, batch_targets, batch_preds
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    val_loss_mean = val_loss / len(val_loader)
    auroc_mean = np.nanmean(np.array(multi_label_auroc(out_gt, out_pred)))
    acc_mean = np.nanmean(np.array(multi_label_accuracy(out_gt, out_pred)))
    f1_mean = np.nanmean(np.array(multi_label_f1(out_gt, out_pred)))

    return val_loss_mean, auroc_mean, acc_mean, f1_mean



In [None]:
def get_opt(modeltxt,model):
    """Build the optimizer used for the model named ``modeltxt``.

    Parameters
    ----------
    modeltxt: str
        ``"DenseNet121"`` or ``"ResNet50"``.
    model: torch.nn.Module
        Model whose parameters are optimized.

    Returns
    -------
    torch.optim.Adam
        For DenseNet121: Adam with ``LEARNING_RATE`` and explicit betas, eps
        and weight decay. For ResNet50: Adam with its default settings.

    Raises
    ------
    ValueError
        If ``modeltxt`` is not one of the supported names (previously the
        function returned ``None`` and the failure surfaced later).
    """
    if modeltxt == "DenseNet121":
        return optim.Adam(model.parameters(), lr=LEARNING_RATE, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-5)
    if modeltxt == "ResNet50":
        return optim.Adam(model.parameters())
    raise ValueError(f"No optimizer configured for model {modeltxt!r}")

In [None]:

# Model classes to train and their display names.
model_list = [DenseNet121, ResNet50]
mName_list = ['DenseNet121', 'ResNet50']

# Prefix for saved files: <parent of cwd>/model/image<ModelName>.pth.
# Built with os.path.join instead of a hard-coded backslash so it is a real
# sub-folder on every OS. `data_directory` is deliberately no longer
# reassigned here: the old chained assignment pointed it at the model folder,
# so re-running any dataset cell afterwards looked for images there.
out = os.path.join(os.path.dirname(os.getcwd()), "model", "image")
# Make sure the target folder exists before anything is saved into it.
os.makedirs(os.path.dirname(out), exist_ok=True)
def actual_training(modelname,loss_criteria,modeltxt):
    """Train one model for up to ``MAX_EPOCHS`` epochs and keep its best weights.

    Parameters
    ----------
    modelname: type
        Model class; instantiated as ``modelname(14, is_trained=True)``.
    loss_criteria: callable
        Loss used for both training and validation.
    modeltxt: str
        Model name; selects the optimizer and the time budget, and is
        appended to ``out`` to form the checkpoint path.

    Returns
    -------
    tuple
        ``(best_score, best_score_acc, best_score_f1, best_model)``: the best
        validation AUROC, the accuracy and F1 of that same epoch, and the
        model loaded with that epoch's weights. If no epoch improved on the
        initial score of 0, the model is returned as last trained.

    Notes
    -----
    Training stops early after more than 5 consecutive epochs without an
    AUROC improvement, or once the wall-clock budget (6 h for DenseNet121,
    4 h otherwise) is exceeded. Relies on the notebook-level ``device``,
    ``train_data_loader``, ``val_dataloader`` and ``out``.
    """
    model = modelname(14, is_trained=True).to(device)

    optimizer = get_opt(modeltxt, model)

    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = LEARNING_RATE_SCHEDULE_FACTOR,
                                                        patience = LEARNING_RATE_SCHEDULE_PATIENCE, mode = 'max', verbose=True)

    model_path = out+modeltxt+".pth"

    best_score = 0
    best_score_acc = 0
    best_score_f1 = 0
    best_state = None   # CPU copy of the weights of the best epoch so far

    training_losses = []
    validation_losses = []
    validation_score = []
    validation_acc = []
    validation_f1 = []

    mb = master_bar(range(MAX_EPOCHS))
    mb.names = ['Train loss', 'Val loss', 'AUROC', 'Accuracy', 'f1 score']
    x = []

    nonimproved_epoch = 0
    start_time = time.time()
    # Wall-clock budget in hours; it does not change between epochs.
    t2 = 6 if modeltxt == 'DenseNet121' else 4

    # training
    for epoch in mb:
        mb.main_bar.comment = f'Best AUROC score: {best_score}'
        x.append(epoch)

        train_loss = epoch_training(epoch, model, train_data_loader, loss_criteria, optimizer,mb)
        mb.write('Finish training epoch {} with loss {:.4f}'.format(epoch, train_loss))
        training_losses.append(train_loss)

        # evaluating takes (epoch, model, val_loader, loss_criteria, mb); the
        # previous call also passed `device`, which raised a TypeError.
        val_loss, new_score, new_score_acc, new_score_f1 = evaluating(epoch, model, val_dataloader, loss_criteria, mb)
        # Plotting needs a plain number even if a 0-dim tensor came back.
        val_loss = float(val_loss)

        validation_losses.append(val_loss)
        validation_score.append(new_score)
        validation_acc.append(new_score_acc)
        validation_f1.append(new_score_f1)

        gc.collect()
        # Update learning rate
        lr_scheduler.step(new_score)

        mb.update_graph([[x, training_losses], [x, validation_losses], [x, validation_score] , [x, validation_acc] ,
                         [x, validation_f1]],
                        [0,epoch+1+round(epoch*0.3)], [0,1])

        if best_score < new_score:
            best_score = new_score
            best_score_acc = new_score_acc
            best_score_f1 = new_score_f1
            nonimproved_epoch = 0
            # Snapshot the weights: `model` keeps training, so a plain
            # reference would not preserve this epoch's state.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save({"model": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "best_score": best_score,
                        "epoch": epoch,
                        "lr_scheduler": lr_scheduler.state_dict()}, model_path)
        else:
            nonimproved_epoch += 1

        # The messages are printed before `break`; they used to come after it
        # and were unreachable.
        if nonimproved_epoch > 5:
            print("Early stopping")
            break
        if time.time() - start_time > 3600*t2:
            print("Out of time")
            break

    # The `return` used to sit inside the loop, ending training after the
    # first epoch. Restore the best weights, if any, before returning.
    if best_state is not None:
        model.load_state_dict(best_state)
    best_model = model
    return best_score,best_score_acc,best_score_f1,best_model

# Train every model in `model_list` and collect one
# [name, best AUROC, accuracy, F1] row per model.
eval_df_train = []
for m in model_list:
    # Read the name from the class itself; instantiating the network just to
    # get its name built a full model and threw it away.
    mName = m.__name__
    print("Processing Model ",mName)
    result = actual_training(modelname=m,loss_criteria=nn.BCELoss(),modeltxt=mName)
    # Keep publishing the per-model notebook globals
    # (best_score_<name>, best_score_acc_<name>, best_score_f1_<name>, best_model_<name>).
    globals()[f"best_score_{mName}"], globals()[f"best_score_acc_{mName}"], \
        globals()[f"best_score_f1_{mName}"], globals()[f"best_model_{mName}"] = result
    eval_df_train.append([mName, result[0], result[1], result[2]])

