Before running this code, make sure you have <br>
1) images/  folder which as all the images (raw data; unzipped from the tar file) <br>
2) images/train_val_filtered.pkl : list of non blacklisted training images <br>
3) images/test_filtered.pkl : list of non blacklisted test images

In [1]:
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torch
import os
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.autograd import Variable

In [2]:
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.metrics.ranking import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
import pandas as pd
import numpy as np
import glob
import time

  (fname, cnt))
  (fname, cnt))


In [4]:
set_labels =['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema',
                 'Fibrosis','Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening',
                 'Pneumonia','Pneumothorax']
    
## convert to Multilabels 
mlb = MultiLabelBinarizer(classes=set_labels)
mlb.fit(set_labels)

MultiLabelBinarizer(classes=['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax'],
          sparse_output=False)

In [5]:
def label2vector(label):
    
    ## INPUT : ['Atelectasis', 'Cardiomegaly']
    ## OUTPUT : [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    
    label = [label]
    test_labels_str = [i.split('|') for i in label]
    test_labels_str = [ x if 'Finding' not in x[0] else [] for x in (test_labels_str)]
    return mlb.transform(test_labels_str)[0]

def get_vector_labels(path, is_test):
    
    ## INPUT : ('../data/images/, True)
    ## OUTPUT : 
    
#         img_filename	             vector
# 44187   00013774_026.png    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# 44188   00013774_028.png    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    
    
    ## read list of non-blacklisted images; train and test 
    if not is_test:
        train_img_list = pd.read_pickle(path + "train_val_filtered.pkl")
    else:
        train_img_list = pd.read_pickle(path + "test_filtered.pkl")
        
    ## read images in our images folder 
    single_tar_images = [i.split('/')[1] for i in glob.glob(path + '*.png')]
    ## subset 
    train_img_list = train_img_list[train_img_list['img_filename'].isin(single_tar_images)]
    
    train_img_list['vector'] = train_img_list['text_label'].apply(label2vector)
    train_img_list = train_img_list[['img_filename', 'vector']]
    return train_img_list

def train_val_splitter(train_df, percentage = 0.1):
    
    assert percentage <= 1
    assert percentage > 0
    
    ## INPUT :  training_df of the shape (n,2), 0 <float <=1
    ## OUTPUT : validation_df, train_df; no patient overlap in the two dfs
                # validation_Df.shape = ~(n*percentage,2), 
                # train_df.shape = ~(n*(1-percentage), 2)
    
    col_list = train_df.columns.tolist()
    
    train_df['patient'] = train_df.img_filename.apply(lambda x : x.split("_")[0])
    valid_patients = np.random.choice(train_df.patient.unique(), 
                 int(train_df.patient.unique().shape[0]*percentage),
                 replace=False)
    
    valid_df = train_df[train_df['patient'].isin(valid_patients)]
    dummy = train_df[~train_df['patient'].isin(valid_patients)]
    
    # assert no rows are missed 
    assert valid_df.shape[0] + dummy.shape[0] == train_df.shape[0]
    # assert intersection is null
    assert np.intersect1d(valid_df.patient.values, dummy.patient.values).tolist() == []
    
    return valid_df[col_list], dummy[col_list]

In [6]:
train_df = get_vector_labels(path = 'images/', is_test = False)
train_df.shape

(8412, 2)

In [7]:
test_df = get_vector_labels(path = 'images/', is_test = True)
test_df.shape

(1469, 2)

In [8]:
valid_df, train_df = train_val_splitter(train_df, 0.1)
print (valid_df.shape, train_df.shape, test_df.shape)

(859, 2) (7553, 2) (1469, 2)


In [9]:
train_df.head()

Unnamed: 0,img_filename,vector
20744,00006587_000.png,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
20745,00006588_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
20746,00006588_001.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
20747,00006588_002.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
20748,00006588_003.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [10]:
CLASSES = 14
WIDTH = 224
HEIGHT = 224
LR = 0.0001
EPOCHS = 2 #100
# Can scale to max for inference but for training LR will be affected
# Prob better to increase this though on P100 since LR is not too low
# Easier to see when plotted
BATCHSIZE = 16 #64*2
IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]
IMAGENET_RGB_SD = [0.229, 0.224, 0.225]

In [11]:
assert torch.cuda.is_available()

In [12]:
torch.cuda.get_device_capability(torch.cuda.current_device())

(7, 0)

In [13]:
!nvidia-smi
!cat /usr/local/cuda-8.0/version.txt
!cat /usr/local/cuda/version.txt

Tue Apr  3 19:38:29 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   38C    P0    29W / 300W |     10MiB / 16152MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [14]:
class XRayDataset(Dataset):

    def __init__(self, train_df, root_dir, transform=None):
        self.df = train_df
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir,
                                self.df.iloc[idx, 0])
        image = Image.open(img_name).convert('RGB')
        labels = torch.FloatTensor(train_df[train_df['img_filename'] == train_df.iloc[idx, 0]]['vector'].values[0])
        #labels = torch.FloatTensor(np.array([0]*14))

        if self.transform:
            image = self.transform(image)

        return image, labels

In [15]:
normalize = transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)

train_dataset = XRayDataset(train_df,
                                    root_dir= os.path.join(os.getcwd() + '/images/'), 
                           transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),
                           normalize]))

test_dataset = XRayDataset(test_df,
                                    root_dir= os.path.join(os.getcwd() + '/images/'), 
                           transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),
                           normalize]))

valid_dataset = XRayDataset(valid_df,
                                    root_dir= os.path.join(os.getcwd() + '/images/'), 
                           transform=transforms.Compose([
                           transforms.Resize(WIDTH),
                           transforms.ToTensor(),
                           normalize]))

In [16]:
def get_symbol(out_features=CLASSES):
    model = models.densenet.densenet121(pretrained=True)
    # Replace classifier (FC-1000) with (FC-14)
    model.classifier = nn.Sequential(
        nn.Linear(model.classifier.in_features, out_features), 
        nn.Sigmoid())
    # CUDA
    model.cuda()  
    return model

In [17]:
def init_symbol(sym, lr=LR):
    # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    opt = optim.Adam(sym.parameters(), lr=lr, betas=(0.9, 0.999))
    criterion = nn.BCELoss()
    scheduler = ReduceLROnPlateau(opt, factor = 0.1, patience = 5, mode = 'min')
    return opt, criterion, scheduler

In [18]:
def compute_roc_auc(data_gt, data_pd, mean=True, classes=CLASSES):
    roc_auc = []
    data_gt = data_gt.cpu().numpy()
    data_pd = data_pd.cpu().numpy()
    for i in range(classes):
        roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))
    if mean:
        roc_auc = np.mean(roc_auc)
    return roc_auc

In [19]:
def train_epoch(model, dataloader, optimizer, criterion, epoch, batch=BATCHSIZE):
    model.train()
    print("Training epoch {}".format(epoch+1))
    loss_val = 0
    loss_cnt = 0
    for data, target in tqdm(dataloader):
        # Get samples
        data = Variable(torch.FloatTensor(data).cuda())
        target = Variable(torch.FloatTensor(target).cuda())
        # Init
        optimizer.zero_grad()
        # Forwards
        output = model(data)
        # Loss
        loss = criterion(output, target)
        # Back-prop
        loss.backward()
        optimizer.step()   
         # Log the loss
        loss_val += loss.data[0]
        loss_cnt += 1
    print("Training loss: {0:.4f}".format(loss_val/loss_cnt))


def valid_epoch(model, dataloader, criterion, epoch, phase='valid', batch=BATCHSIZE):
    model.eval()
    if phase == 'testing':
        print("Testing epoch {}".format(epoch+1))
    else:
        print("Validating epoch {}".format(epoch+1))
    out_pred = torch.FloatTensor().cuda()
    out_gt = torch.FloatTensor().cuda()
    loss_val = 0
    loss_cnt = 0
    for data, target in dataloader:
        # Get samples
        data = Variable(torch.FloatTensor(data).cuda(), volatile=True)
        target = Variable(torch.FloatTensor(target).cuda(), volatile=True)
         # Forwards
        output = model(data)
        # Loss
        loss = criterion(output, target)
        # Log the loss
        loss_val += loss.data[0]
        loss_cnt += 1
        # Log for AUC
        out_pred = torch.cat((out_pred, output.data), 0)
        out_gt = torch.cat((out_gt, target.data), 0)
    loss_mean = loss_val/loss_cnt
    if phase == 'testing':
        print("Test-Dataset loss: {0:.4f}".format(loss_mean))
        print("Test-Dataset AUC: {0:.4f}".format(compute_roc_auc(out_gt, out_pred)))

    else:
        print("Validation loss: {0:.4f}".format(loss_mean))
        print("Validation AUC: {0:.4f}".format(compute_roc_auc(out_gt, out_pred)))
    return loss_mean

def print_learning_rate(opt):
    for param_group in opt.param_groups:
        print("Learining rate: ", param_group['lr'])

In [20]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,
                          shuffle=True, num_workers=0, pin_memory=False)

valid_loader = DataLoader(dataset=valid_dataset, batch_size=8*BATCHSIZE,
                          shuffle=False, num_workers=0, pin_memory=False)

test_loader = DataLoader(dataset=test_dataset, batch_size=8*BATCHSIZE,
                         shuffle=False, num_workers=0, pin_memory=False)

In [21]:
%%time
# Load symbol
azure_chest_xray_sym = get_symbol()

CPU times: user 2.62 s, sys: 492 ms, total: 3.11 s
Wall time: 3.12 s


In [22]:
%%time
# Load optimiser, loss
optimizer, criterion, scheduler = init_symbol(azure_chest_xray_sym)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.88 ms


In [23]:
loss_min = float("inf")    

# No-training
valid_epoch(azure_chest_xray_sym, valid_loader, criterion, -1)

# Main train/val/test loop
for j in (range(EPOCHS)):
    stime = time.time()
    train_epoch(azure_chest_xray_sym, train_loader, optimizer, criterion, j)
    loss_val = valid_epoch(azure_chest_xray_sym, valid_loader, criterion, j)
#     test_loss_val = valid_epoch(azure_chest_xray_sym, test_loader, criterion, j, 'testing')
    # LR Schedule
    scheduler.step(loss_val)
    print_learning_rate(optimizer)
    # todo: tensorboard hooks
    # Logging
    if loss_val < loss_min:
        print("Loss decreased. Saving ...")
        loss_min = loss_val
        torch.save({'epoch': j + 1, 
                    'state_dict': azure_chest_xray_sym.state_dict(), 
                    'best_loss': loss_min, 
                    'optimizer' : optimizer.state_dict()}, 'best_azure_chest_xray_model_v2.pth.tar')
    etime = time.time()
    print("Epoch time: {0:.0f} seconds".format(etime-stime))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

Validating epoch 0


  0%|          | 0/473 [00:00<?, ?it/s]

Validation loss: 0.7077
Validation AUC: 0.5059
Training epoch 1


100%|██████████| 473/473 [03:38<00:00,  2.16it/s]


Training loss: 0.1684
Validating epoch 1
Validation loss: 0.1901
Validation AUC: 0.5183
Learining rate:  0.0001
Loss decreased. Saving ...


  0%|          | 0/473 [00:00<?, ?it/s]

Epoch time: 265 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training epoch 2


100%|██████████| 473/473 [03:37<00:00,  2.17it/s]


Training loss: 0.1278
Validating epoch 2
Validation loss: 0.2061
Validation AUC: 0.5307
Learining rate:  0.0001
Epoch time: 506 seconds
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [24]:
# # Load model for testing
# azure_chest_xray_sym_test = get_symbol()
# chkpt = torch.load("best_azure_chest_xray_model_v2.pth.tar")
# azure_chest_xray_sym_test.load_state_dict(chkpt['state_dict'])

# valid_loss = valid_epoch(azure_chest_xray_sym_test, valid_loader, criterion, -1)
# test_loss = valid_epoch(azure_chest_xray_sym_test, test_loader, criterion, -1, 'testing')