In [1]:
import os, glob, json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import collections
import cv2
import tqdm
from PIL import Image
import torch

import torchvision.models

In [2]:
import sys
# Extend the module search path with two hard-coded checkout locations so that
# the project-local modules imported below can be resolved.
# NOTE(review): these absolute, user-specific paths make the notebook
# non-portable — consider deriving them from a configurable base directory.
sys.path.append('/home/users/jsoelter/Code/ChestImageAI/utils/')
sys.path.append('/home/users/jsoelter/Code/big_transfer/')

# Project-local helpers used throughout this notebook for data loading,
# evaluation and model construction.
import data_loader, evaluations, model_setup

### Model Setup

In [3]:
# Keyword arguments forwarded to `model_setup.instantiate_model`.
model_dict = {
    # Alternative backbone: 'BiT-M-R50x3'
    'architecture': 'densenet121',
    'num_classes': 5,
    # Alternative: a checkpoint path such as
    # '/home/users/jsoelter/models/chexpert/fullmeta_503/step05000.pt'
    'pretrained': 'imagenet',
    'fresh_head_weights': False,
}

In [16]:
# Directory holding this run's checkpoints (`step*.pt`) and its training ledger.
model_out = '/home/users/jsoelter/models/chexpert/densenet/pain_lowres3'

# `exist_ok=True` makes this cell safe to re-run and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(model_out, exist_ok=True)

model = model_setup.instantiate_model(**model_dict)

# Checkpoints are written as `step{step:05d}.pt`, so a plain lexicographic sort
# puts the most recent one last (as long as the step count stays below 100000).
saved_models = sorted(glob.glob(os.path.join(model_out, 'step*.pt')))
if not saved_models:
    # Fresh run: nothing to restore, start an empty ledger.
    checkpoint = None
    ledger = collections.defaultdict(list)
    ledger['model'] = model_dict
    step = 0
else:
    last_model = saved_models[-1]
    print(f"Resume training for saved model '{last_model}'")
    checkpoint = torch.load(last_model, map_location="cpu")
    # Drop a `module.` prefix (as added by DataParallel wrappers) from the
    # parameter names so they match the bare model built above.
    re_keyed = {k.split('module.')[-1]: v for k, v in checkpoint['model'].items()}
    model.load_state_dict(re_keyed)

    # Close the ledger file deterministically, and restore the
    # `defaultdict(list)` behaviour so that appending to a key that is missing
    # from the saved ledger does not raise KeyError after a resume.
    with open(os.path.join(model_out, 'train_ledger.json')) as ledger_file:
        ledger = collections.defaultdict(list, json.load(ledger_file))
    step = checkpoint["step"]


# Lets cuDNN benchmark conv implementations and choose the fastest.
# Only good if sizes stay the same within the main loop!
torch.backends.cudnn.benchmark = True

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = torch.nn.DataParallel(model)
model = model.to(device)

### Data Setup

In [17]:
# Keyword arguments forwarded to `data_loader.ChexpertData` for the training CSV.
data_setup = {
    # Alternative: ['Sex', 'AP/PA', 'Frontal/Lateral']
    'include_meta': [],
    # Presumably maps raw label values to the target values used for training
    # — confirm against `data_loader.ChexpertData`.
    'label_value_map': {
        0: 0.05,
        'nan': 0.1,
        -1.: 0.5,
        1: 0.9,
    },
    # Currently empty; candidate entries kept for reference:
    #   'Enlarged Cardiomediastinum': ['Cardiomegaly'],
    #   'Consolidation': ['Pneumonia'],
    #   'Lung Opacity': ['Edema', 'Pneumonia', 'Consolidation', 'Lung Lesion', 'Atelectasis']
    'fill_hierachy': {},
    'labels': ['Cardiomegaly', 'Edema', 'Consolidation', 'Atelectasis', 'Pleural Effusion'],
    # Define subsetting of data
    'subset': {},
}

# Ordered (transform name, keyword arguments) pairs; turned into a pipeline by
# `data_loader.transform_pipeline_from_dict`.
transforms = [
    ('ToPILImage', {}),
    ('Resize', {'size': 544}),
    ('RandomRotation', {'degrees': 5}),
    ('RandomCrop', {'size': (512, 512)}),
    ('Resize', {'size': 136}),  # smaller edge is mapped to this size
    ('ToTensor', {}),
    # Alternative normalisation: mean (0.5, 0.5, 0.5), std (0.5, 0.5, 0.5)
    ('Normalize', {
        'mean': [0.485, 0.456, 0.406],
        'std': [0.229, 0.224, 0.225],
    }),
]

In [18]:
# Build the image pipeline from the `transforms` specification.
preprocess = data_loader.transform_pipeline_from_dict(transforms)

# Training CSV: 1000 samples are split off as an internal validation set and
# the remainder is used for training.
data = data_loader.ChexpertData('CheXpert-v1.0/train.csv', transform=preprocess, **data_setup)
split_sizes = [1000, len(data)-1000]
split_rng = torch.Generator().manual_seed(42)  # fixed seed keeps the split identical between runs
internal_valid_data, train_data = torch.utils.data.random_split(data, split_sizes, generator=split_rng)

# Separate `valid.csv` split, loaded with the same transform, labels and
# metadata columns as the training data.
external_valid_data = data_loader.ChexpertData(
    'CheXpert-v1.0/valid.csv',
    transform=preprocess,
    labels=data_setup['labels'],
    include_meta=data_setup['include_meta'],
)

Removed 0 entries
Removed 0 entries


In [19]:
real_batch_size = 32  # alternative: 128
batch_split = 1  # alternative: 8; number of forward passes before optimization is performed

# Samples per forward pass: the effective batch divided over `batch_split` passes.
loader_batch_size = int(real_batch_size/batch_split)

train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=loader_batch_size,
    num_workers=8,
    shuffle=True,
    drop_last=False,
)

valid_int_loader = torch.utils.data.DataLoader(
    internal_valid_data,
    batch_size=loader_batch_size,
    num_workers=8,
    shuffle=True,
)
valid_ext_loader = torch.utils.data.DataLoader(
    external_valid_data,
    batch_size=16,
    num_workers=8,
)

# Displayed as the cell output: training batches divided by `batch_split`.
len(train_loader)/batch_split

6951.0

### Optimizer Setup

In [20]:
# Name of the optimizer class (looked up on `torch.optim`) and its keyword arguments.
opt = 'Adam'  # alternative: 'SGD'
opt_param = {
    'lr': 1E-4,
    # 'momentum': 0.9
}

In [21]:
# Alternatives kept for reference:
#optim = torch.optim.SGD(model.parameters(), lr=0.003, momentum=0.9)
#optim = torch.optim.SGD([p for n,p in model.named_parameters() if 'head' in n], lr=0.003, momentum=0.9)

# Resolve the optimizer class by name and build it over all model parameters.
optimizer_cls = getattr(torch.optim, opt)
optim = optimizer_cls(model.parameters(), **opt_param)

if checkpoint is None:
    # Fresh run: start from cleared gradients.
    optim.zero_grad()
else:
    # Resumed run: restore the optimizer state stored in the checkpoint.
    optim.load_state_dict(checkpoint["optim"])

In [22]:
# Step supports for the learning-rate schedule (consumed by `model_setup.get_lr`
# in the training loop), expressed as multiples of the number of training batches.
# Alternative fixed schedule: [3000, 7000, 9000, 10000]
n_train_batches = len(train_loader)
supports = [-1, n_train_batches, int(1.5*n_train_batches), int(2*n_train_batches)]

### Loss

In [23]:
# Optional list of target names to restrict the loss/AUC columns to.
# Candidates: 'Consolidation', 'Cardiomegaly', 'Atelectasis', 'Pleural Effusion',
# 'Edema', 'Support Devices', 'AP/PA', 'Sex', 'Frontal/Lateral'
# (e.g. train_on = ['Cardiomegaly']).
train_on = None

# Boolean mask over `data.targets`, or None when no selection is given.
if train_on:
    train_cols = [target_name in train_on for target_name in data.targets]
else:
    train_cols = None

In [24]:
# Loss criterion built from the column mask and the target device. The training
# loop unpacks its result as `(loss, n_samples)`.
crit = model_setup.maskedBCE(train_cols, device)

### Initial errors

In [25]:
# Baseline metrics before any training step: AUC on the external validation
# loader, criterion value on the internal validation loader.
preds, targets = evaluations.batch_prediction(model, valid_ext_loader)
initial_auc = evaluations.eval_auc(preds, targets, train_cols)
print(f'AUC on {train_on}: {initial_auc:.3f}')
initial_crit = evaluations.eval_crit(model, valid_int_loader, crit)
print(f'Crit on {train_on}: {initial_crit:.3f}')

AUC on None: 0.556
Crit on None: 0.691


### Training Loop

In [26]:
# The training loop evaluates whenever `step % eval_intervall == 0` and writes a
# checkpoint whenever `step % save_intervall == 0`.
eval_intervall, save_intervall = 50, 3500

In [None]:
# Gradient-accumulation bookkeeping: forward passes since the last optimizer
# step, plus the summed loss / sample count over those passes.
accum_steps = 0
batch_loss, batch_samples = 0.0, 0.0
lr = opt_param['lr']

# Record the configuration this (re)start of training uses, keyed by the step
# it starts at. The key is a string because JSON object keys are always
# strings: an int key would sit next to its string twin after a resume and
# produce duplicate keys in the dumped ledger.
train_setup = ledger.setdefault('train_setup', {})
train_setup[str(step)] = {
    'transforms': transforms,
    'data_setup': data_setup,
    'optimizer': {
        'name': opt,
        'param': opt_param
    },
    'real_batch_size': real_batch_size,
    'batch_split': batch_split
}

# Loop-invariant values, computed once instead of on every evaluation/save.
benchmark_cols = [i in ['Consolidation',  'Cardiomegaly', 'Atelectasis', 'Pleural Effusion', 'Edema'] for i in data.targets]
ledger_path = os.path.join(model_out, 'train_ledger.json')

# Runs until the schedule returns None (which breaks the inner loop and leaves
# `lr` falsy for the outer one).
while lr:
    for x, y in train_loader:

        # Switch (back) to training mode before every forward pass.
        model.train()

        # Schedule sending to GPU(s)
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        # Update learning-rate, including stop training if over.
        lr = model_setup.get_lr(step, supports=supports, base_lr=opt_param['lr'])
        if lr is None: break
        for param_group in optim.param_groups:
            param_group["lr"] = lr

        logits = model(x)
        loss, n_samples = crit(logits, y)
        if loss != 0:
            # Accumulate grads
            (loss / batch_split / n_samples).backward()

        # `.item()` yields plain Python numbers (JSON-serialisable) and also
        # ensures a sync point.
        batch_loss += loss.item()
        batch_samples += n_samples.item()

        accum_steps += 1

        # Update params
        if accum_steps == batch_split:
            optim.step()
            optim.zero_grad()
            # Guard against an accumulated batch without any counted samples.
            train_loss = batch_loss / batch_samples if batch_samples else float('nan')
            # `setdefault` keeps this working when the ledger is a plain dict
            # (e.g. one loaded from JSON).
            ledger.setdefault('train_loss', []).append(train_loss)
            batch_loss, batch_samples = 0.0, 0.0
            ledger.setdefault('lr', []).append(lr)
            step += 1
            accum_steps = 0

            # Evaluate
            if (step % eval_intervall) == 0:
                preds, targets = evaluations.batch_prediction(model, valid_ext_loader)
                # Convert metrics to Python floats so the ledger stays JSON-serialisable.
                auc_selection = float(evaluations.eval_auc(preds, targets, train_cols))
                auc_bench = float(evaluations.eval_auc(preds, targets, benchmark_cols))
                # Entries are keyed by `step-1`, the index of the latest `train_loss` entry.
                ledger.setdefault('external', []).append((step-1, auc_selection))
                val = float(evaluations.eval_crit(model, valid_int_loader, crit))
                ledger.setdefault('internal', []).append((step-1, val))
                print(f'step {step} ->, train: {train_loss:.3f}, val internal : {val:.3f}, auc select: {auc_selection:.3f}, auc bench: {auc_bench:.3f}') # FULL: 

            # Periodic checkpoint: model/optimizer state plus the ledger.
            if (step % save_intervall) == 0:
                torch.save({
                        "step": step,
                        "model": model.module.state_dict(),
                        "optim" : optim.state_dict(),
                    },
                    os.path.join(model_out, f'step{step:05d}.pt')
                )
                # Context manager closes (and thereby flushes) the ledger file.
                with open(ledger_path, 'w') as ledger_file:
                    json.dump(ledger, ledger_file)

step 50 ->, train: 0.592, val internal : 0.562, auc select: 0.731, auc bench: 0.731
step 100 ->, train: 0.574, val internal : 0.551, auc select: 0.739, auc bench: 0.739
step 150 ->, train: 0.512, val internal : 0.544, auc select: 0.750, auc bench: 0.750
step 200 ->, train: 0.535, val internal : 0.547, auc select: 0.777, auc bench: 0.777
step 250 ->, train: 0.608, val internal : 0.543, auc select: 0.769, auc bench: 0.769
step 300 ->, train: 0.539, val internal : 0.542, auc select: 0.776, auc bench: 0.776
step 350 ->, train: 0.502, val internal : 0.538, auc select: 0.798, auc bench: 0.798
step 400 ->, train: 0.579, val internal : 0.538, auc select: 0.783, auc bench: 0.783
step 450 ->, train: 0.520, val internal : 0.539, auc select: 0.807, auc bench: 0.807
step 500 ->, train: 0.510, val internal : 0.536, auc select: 0.796, auc bench: 0.796
step 550 ->, train: 0.526, val internal : 0.536, auc select: 0.783, auc bench: 0.783
step 600 ->, train: 0.545, val internal : 0.534, auc select: 0.806

In [None]:
# Manual checkpoint of the current state (same format as the periodic
# checkpoints written inside the training loop).
torch.save({
        "step": step,
        "model": model.module.state_dict(),
        "optim" : optim.state_dict(),
    },
    os.path.join(model_out, f'step{step:05d}.pt')
)
# Write the ledger through a context manager so the file is closed and fully
# flushed to disk before the cell finishes.
with open(os.path.join(model_out, 'train_ledger.json'), 'w') as ledger_file:
    json.dump(ledger, ledger_file)

In [None]:
# Scratch expression; it assigns nothing and has no effect on the training state.
# NOTE(review): looks like a leftover kernel sanity check — consider removing.
1+1

In [None]:
# Store the current model configuration in the ledger; when the ledger was
# loaded from disk on resume, this overwrites its saved 'model' entry.
ledger['model'] = model_dict

In [None]:
# Ask PyTorch to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()