In [1]:
import os, glob
import json_tricks as json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import collections
import cv2
import tqdm
from PIL import Image
import torch

import torchvision.models

In [2]:
import sys
# NOTE(review): absolute, user-specific paths — the project-local modules
# imported below only resolve on a machine with this exact directory layout.
sys.path.append('/home/users/jsoelter/Code/ChestImageAI/utils/')
sys.path.append('/home/users/jsoelter/Code/big_transfer/')

# Project-local helper modules, found through the sys.path entries added above.
import data_loader, evaluations, model_setup

### Model Setup

In [3]:
# Keyword arguments forwarded to model_setup.instantiate_model; the same dict
# is stored in the training ledger under 'model' when a fresh run is started.
model_dict = {
    'architecture': 'BiT-M-R50x3',  # commented-out alternative: 'densenet121'
    'num_classes': 5,
    # commented-out alternative for 'pretrained':
    # '/home/users/jsoelter/models/chexpert/fullmeta_503/step05000.pt'
    'pretrained': 'imagenet',
    'fresh_head_weights': False,
}

In [4]:
# Directory that receives the 'step*.pt' checkpoints and 'train_ledger.json'.
model_out = '/home/users/jsoelter/models/chexpert/dense/april_base7'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(model_out, exist_ok=True)

model = model_setup.instantiate_model(**model_dict)

# Resume from the newest checkpoint in `model_out` if there is one,
# otherwise start a fresh run with an empty ledger.
saved_models = glob.glob(os.path.join(model_out, 'step*.pt'))
if not saved_models:
    checkpoint = None
    ledger = collections.defaultdict(list)
    ledger['model'] = model_dict
    step = 0
else:
    # Checkpoints are written as f'step{step:05d}.pt', so the lexicographically
    # largest file name is the most recent one.
    last_model = max(saved_models)
    print(f"Resume training for saved model '{last_model}'")
    checkpoint = torch.load(last_model, map_location="cpu")
    # Keep only the part of each parameter name that follows the last
    # 'module.' so the weights load into the not-yet-wrapped model.
    re_keyed = {k.split('module.')[-1]: v for k, v in checkpoint['model'].items()}
    model.load_state_dict(re_keyed)

    # Close the ledger file deterministically and re-wrap the loaded mapping
    # in a defaultdict(list), so that `ledger[<new key>].append(...)` behaves
    # the same after a resume as it does for a fresh ledger.
    with open(os.path.join(model_out, 'train_ledger.json')) as ledger_file:
        ledger = collections.defaultdict(list, json.load(ledger_file))
    step = checkpoint["step"]


# Lets cuDNN benchmark conv implementations and choose the fastest.
# Only good if sizes stay the same within the main loop!
torch.backends.cudnn.benchmark = True

# Use the first GPU when available; the model is wrapped in DataParallel
# before being moved to that device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.nn.DataParallel(model)
model = model.to(device)

BIT


### Data Setup

In [8]:
# Keyword arguments for data_loader.ChexpertData (training table); the dict is
# also logged in the training ledger.
# NOTE(review): label_value_map presumably maps raw label values to soft
# training targets — confirm against data_loader.ChexpertData.
data_setup = {
    # commented-out alternative: ['Sex', 'AP/PA', 'Frontal/Lateral']
    'include_meta': [],
    'label_value_map': {
        0: 0.01,
        'nan': 0.05,
        -1.: 0.5,
        1: 0.9,
    },
    # Currently empty; commented-out entries were:
    #   'Enlarged Cardiomediastinum': ['Cardiomegaly'],
    #   'Consolidation': ['Pneumonia'],
    #   'Lung Opacity': ['Edema', 'Pneumonia', 'Consolidation', 'Lung Lesion', 'Atelectasis']
    'fill_hierachy': {},
    'labels': ['Cardiomegaly', 'Edema',  'Consolidation', 'Atelectasis', 'Pleural Effusion'],
    'subset': {},  # Define subsetting of data
}

# Ordered (transform name, keyword arguments) pairs; turned into a pipeline by
# data_loader.transform_pipeline_from_dict and logged in the training ledger.
transforms = [
    ('ToPILImage', {}),
    # commented-out alternative: ('Resize', {'size': 320}), smaller edge mapped to x
    ('Resize', {'size': 342}),
    ('RandomRotation', {'degrees': 5}),
    ('RandomCrop', {'size': (320, 320)}),
    ('ToTensor', {}),
    # commented-out alternative statistics:
    #   mean [0.485, 0.456, 0.406], std [0.229, 0.224, 0.225]
    ('Normalize', {'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5)}),
]

In [9]:
# Build the preprocessing pipeline from the `transforms` specification.
preprocess = data_loader.transform_pipeline_from_dict(transforms)

# Training table; 1000 samples are split off as the internal validation set
# with a fixed generator seed (42) so the split is reproducible.
data = data_loader.ChexpertData('CheXpert-v1.0/train.csv', transform=preprocess, **data_setup)
split_sizes = [1000, len(data) - 1000]
split_rng = torch.Generator().manual_seed(42)
internal_valid_data, train_data = torch.utils.data.random_split(data, split_sizes, generator=split_rng)

# External validation table, restricted to the same labels / meta columns.
# NOTE(review): both validation sets use `preprocess`, which contains
# RandomRotation and RandomCrop, so validation inputs are randomly augmented
# as well — confirm this is intended.
external_valid_data = data_loader.ChexpertData(
    'CheXpert-v1.0/valid.csv',
    transform=preprocess,
    labels=data_setup['labels'],
    include_meta=data_setup['include_meta'],
)

Removed 0 entries
Removed 0 entries


In [10]:
real_batch_size = 256  # samples contributing to one optimizer step (previously 128)
batch_split = 16  # number of forward passes before optimization is performed (previously 8)

# Samples per forward pass for the training and internal-validation loaders.
micro_batch_size = int(real_batch_size / batch_split)

train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=micro_batch_size,
    num_workers=8,
    shuffle=True,
    drop_last=False,
)

valid_int_loader = torch.utils.data.DataLoader(
    internal_valid_data,
    batch_size=micro_batch_size,
    num_workers=8,
    shuffle=True,
)
valid_ext_loader = torch.utils.data.DataLoader(
    external_valid_data,
    batch_size=16,
    num_workers=8,
)

# Optimizer steps per pass over the training data (shown as the cell output).
len(train_loader) / batch_split

868.8125

### Optimizer Setup

In [13]:
# Name of the torch.optim class to instantiate (commented-out alternative: 'SGD').
opt = 'Adam'
# Keyword arguments for that optimizer; 'lr' is also the base learning rate of
# the schedule (commented-out alternative entry: momentum=0.9).
opt_param = {
    'lr': 3E-4,
}

In [14]:
# Look up the optimizer class by name and build it over all model parameters.
# (Earlier, commented-out variants used torch.optim.SGD with lr=0.003 and
# momentum=0.9, either on all parameters or only on those with 'head' in
# their name.)
optimizer_cls = getattr(torch.optim, opt)
optim = optimizer_cls(model.parameters(), **opt_param)

if checkpoint is None:
    # Fresh run: make sure no gradients are set before training starts.
    optim.zero_grad()
else:
    # Resumed run: restore the optimizer state stored in the checkpoint.
    optim.load_state_dict(checkpoint["optim"])

In [15]:
# Step positions passed to model_setup.get_lr as `supports` in the training loop.
# NOTE(review): presumably the warm-up/decay boundaries of the schedule, with
# training ending at the last value — confirm in model_setup.get_lr.
supports = [100, 2000, 3000, 3500, 4000]

### Loss

In [16]:
# Optional list of target names to train on, e.g. ['Cardiomegaly'] or any of
# 'Consolidation', 'Atelectasis', 'Pleural Effusion', 'Edema',
# 'Support Devices', 'AP/PA', 'Sex', 'Frontal/Lateral'.
train_on = None

# Boolean mask over data.targets marking the selected names; None when no
# selection is given.
if train_on:
    train_cols = [target_name in train_on for target_name in data.targets]
else:
    train_cols = None

In [17]:
weighting = 'equal'

if weighting == 'equal':
    # Per-label ratio of entries that are not > 0.5 to entries that are > 0.5,
    # later passed to the loss as `pos_weight`.
    # NOTE(review): `train_data.dataset` is the full dataset underlying the
    # random split, so the counts include the internal-validation samples.
    label_table = train_data.dataset.meta_df
    positive_masks = [label_table[label] > 0.5 for label in data_setup['labels']]
    w_pos = [np.logical_not(mask).sum() / mask.sum() for mask in positive_masks]
else:
    w_pos = None

In [18]:
# Re-import the local helper modules so that edits made to their source files
# are picked up without restarting the kernel.
import importlib
importlib.reload(model_setup)
importlib.reload(evaluations)

<module 'evaluations' from '/home/users/jsoelter/Code/ChestImageAI/utils/evaluations.py'>

In [19]:
# Loss object used in the training loop (called as `crit(logits, y)`, returning
# a loss and a sample count) and for the internal-validation criterion.
crit = model_setup.maskedBCE(train_cols, device, pos_weight=w_pos)

### Initial errors

In [20]:
# Baseline before any training: AUC on the external validation set ...
preds, targets = evaluations.batch_prediction(model, valid_ext_loader)
initial_auc = evaluations.eval_auc(preds, targets, train_cols)
print(f'AUC on {train_on}: {initial_auc:.3f}')

# ... and the training criterion on the internal validation split.
initial_crit = evaluations.eval_crit(model, valid_int_loader, crit)
print(f'Crit on {train_on}: {initial_crit:.3f}')

AUC on None: 0.500
Crit on None: 0.693


### Training Loop

In [21]:
# The training loop evaluates on the validation sets whenever the optimizer
# step count is a multiple of `eval_intervall`, and writes a checkpoint plus
# the ledger whenever it is a multiple of `save_intervall`.
eval_intervall = 50
save_intervall = 2000

In [None]:
# Training loop with gradient accumulation: gradients of `batch_split`
# forward/backward passes are accumulated before each optimizer step, and
# `step` counts optimizer steps. Training ends when model_setup.get_lr
# returns None for the current step.
accum_steps = 0
batch_loss, batch_samples = 0, 0
lr = opt_param['lr']

# Log the configuration that is in effect starting at the current step.
train_setup = ledger.setdefault('train_setup', {})
train_setup[step] = {
    'transforms': transforms,
    'data_setup': data_setup,
    'optimizer': {
        'name': opt,
        'param': opt_param
    },
    'real_batch_size': real_batch_size,
    'batch_split': batch_split
}

while lr:
    for x, y, _ in train_loader:

        # Set training mode on every pass (evaluation helpers run in between).
        model.train()

        # Schedule sending to GPU(s)
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        # Update learning-rate, including stop training if over.
        lr = model_setup.get_lr(step, supports=supports, base_lr=opt_param['lr'])
        if lr is None:
            break
        for param_group in optim.param_groups:
            param_group["lr"] = lr

        logits = model(x)
        loss, n_samples = crit(logits, y)
        if loss != 0:
            # Accumulate grads, scaled by the number of accumulation passes
            # and by the sample count returned by the criterion.
            (loss / batch_split / n_samples).backward()

        batch_loss += float(loss.data.cpu().numpy())  # Also ensures a sync point.
        batch_samples += n_samples.cpu().numpy()

        accum_steps += 1

        # Update params
        if accum_steps == batch_split:
            optim.step()
            optim.zero_grad()
            train_loss = batch_loss/batch_samples
            # setdefault keeps the appends working when the ledger was loaded
            # from JSON (a plain mapping) instead of being a defaultdict(list).
            ledger.setdefault('train_loss', []).append(train_loss)
            batch_loss, batch_samples = 0, 0
            ledger.setdefault('lr', []).append(lr)
            step += 1
            accum_steps = 0

            # Evaluate
            if (step % eval_intervall) == 0:
                preds, targets = evaluations.batch_prediction(model, valid_ext_loader)
                auc_bench = evaluations.eval_auc_percol(preds, targets)
                # list.extend returns None, so build the [step, auc...] entry
                # directly instead of appending the result of .extend().
                ledger.setdefault('external', []).append([step - 1, *auc_bench])
                val = evaluations.eval_crit(model, valid_int_loader, crit)
                ledger.setdefault('internal', []).append((step-1, val.cpu().numpy()))
                score = ', '.join([f'{a:.2f}' for a in auc_bench])
                print(f'step {step} ->, train: {train_loss:.3f}, val internal : {val:.3f}, auc bench: ' + score) # FULL: 

            # Checkpoint model, optimizer state and ledger.
            if (step % save_intervall) == 0:
                torch.save({
                        "step": step,
                        "model": model.module.state_dict(),
                        "optim" : optim.state_dict(),
                    },
                    os.path.join(model_out, f'step{step:05d}.pt')
                )
                # Context manager guarantees the ledger is flushed and closed.
                with open(os.path.join(model_out, 'train_ledger.json'), 'w') as ledger_file:
                    json.dump(ledger, ledger_file)

step 50 ->, train: 0.666, val internal : 0.674, auc bench: 0.76, 0.78, 0.85, 0.79, 0.81
step 100 ->, train: 0.656, val internal : 0.654, auc bench: 0.72, 0.77, 0.88, 0.81, 0.82
step 150 ->, train: 0.644, val internal : 0.657, auc bench: 0.78, 0.81, 0.90, 0.81, 0.81
step 200 ->, train: 0.668, val internal : 0.649, auc bench: 0.78, 0.83, 0.90, 0.59, 0.82
step 250 ->, train: 0.652, val internal : 0.642, auc bench: 0.76, 0.83, 0.89, 0.68, 0.84
step 300 ->, train: 0.647, val internal : 0.628, auc bench: 0.81, 0.84, 0.93, 0.62, 0.85
step 350 ->, train: 0.620, val internal : 0.624, auc bench: 0.80, 0.86, 0.92, 0.84, 0.88
step 400 ->, train: 0.611, val internal : 0.625, auc bench: 0.78, 0.87, 0.91, 0.77, 0.89


In [23]:
# Save a final checkpoint (step counter, unwrapped model weights, optimizer
# state) together with the training ledger.
torch.save({
        "step": step,
        "model": model.module.state_dict(),
        "optim" : optim.state_dict(),
    },
    os.path.join(model_out, f'step{step:05d}.pt')
)
# Context manager guarantees the ledger file is flushed and closed; the
# original left the handle returned by open() dangling.
with open(os.path.join(model_out, 'train_ledger.json'), 'w') as ledger_file:
    json.dump(ledger, ledger_file)

In [24]:
# Display the optimizer step count reached by the run.
step

4000