In [1]:
# Torch
import torch
import torchvision
from torch.backends import cudnn
from torch.nn import functional as F
from torch.utils.data import sampler
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler
# Turn on cuDNN benchmark mode (lets cuDNN auto-select convolution algorithms;
# per the PyTorch docs this mainly pays off when input sizes stay constant).
cudnn.benchmark = True
# Data Handling
import numpy as np
import pandas as pd
# System
import os
import time
import pprint
import itertools
# Custom
import models
import datasets
from helpers import utils

# Model Training Framework

This notebook works for all model configurations detailed in the experiment settings below.

**TODO:** The experiment settings below could eventually be removed from the notebook; defining them inline as a dictionary is not the best approach right now.


In [2]:
####################### EXPERIMENT SETTINGS ################################
def _denmark_group(model_name, dataset_size):
    """Return the settings grid for one experiment group on the Denmark data.

    All groups share the dataset, base network, batch size, epoch budget,
    optimizer and learning rate; they differ only in the model name and in
    how many samples are used per split.

    Args:
        model_name: value stored under ``model['name']`` (e.g. 'lcfcn', 'cob').
        dataset_size: dict with 'train' and 'val' entries ('all' or a count).

    Returns:
        A dict whose list-valued entries are the options handed to
        ``utils.cartesian``.
    """
    return {"dataset": {'name': 'denmark',
                        'transform': 'rgb_normalize'},
            "model": {'name': model_name,
                      'base': "fcn8_vgg16"},
            "batch_size": [1],
            "max_epoch": [5],
            'dataset_size': [dataset_size],
            'optimizer': ['adam'],
            'lr': [1e-5]}


EXP_GROUPS = {
    'denmark': _denmark_group('lcfcn', {'train': 'all', 'val': 'all'}),
    'denmark_debug': _denmark_group('lcfcn', {'train': 10, 'val': 5}),
    'denmark_debug_cob': _denmark_group('cob', {'train': 'all', 'val': 'all'}),
}

# utils.cartesian turns each grid into a list of concrete experiment dicts
# (the printed exp_dict shows scalar values instead of one-element lists).
EXP_GROUPS = {group_name: utils.cartesian(grid)
              for group_name, grid in EXP_GROUPS.items()}

# Collect the experiments of every selected group; only the first one is run
# by this notebook.
exp_group_list = ["denmark_debug_cob"]
exp_list = []
for exp_group_name in exp_group_list:
    exp_list.extend(EXP_GROUPS[exp_group_name])
exp_dict = exp_list[0]

########################## FILE SYSTEM SETTINGS ###########################

# Root folder for run outputs and location of the input images.
savedir_base = "/home/jovyan/work/runs/LCFCN"
# Alternative dataset location kept for reference:
#datadir = "/home/jovyan/work/data/TRANCOS"
datadir = "/home/jovyan/work/DENMARK/250x250"

############################### PRINTS ####################################

pprint.pprint(exp_dict)

{'batch_size': 1,
 'dataset': {'name': 'denmark', 'transform': 'rgb_normalize'},
 'dataset_size': {'train': 'all', 'val': 'all'},
 'lr': 1e-05,
 'max_epoch': 5,
 'model': {'base': 'fcn8_vgg16', 'name': 'cob'},
 'optimizer': 'adam'}


## Saving Location

Create a new folder for the selected experiment and save the experiment dictionary in it.

In [3]:
# The experiment ID is derived by hashing exp_dict, so the run folder is
# determined by the experiment settings.
exp_id = utils.hashDict(exp_dict)
savedir = os.path.join(savedir_base, exp_id)
exp_dict_path = os.path.join(savedir, "exp_dict.json")

# Back up and overwrite any previous experiment stored under the same name.
# NOTE(review): presumably this removes savedir; if so, the resume check later
# in the notebook cannot find an old score_list after a full run - confirm in
# helpers.utils.
utils.deleteExperiment(savedir, backup_flag=True)
print("Cleared previous experiment...")

# Recreate the folder and store the settings next to the results.
os.makedirs(savedir, exist_ok=True)
utils.saveJSON(exp_dict_path, exp_dict)
print("Experiment saved in %s" % savedir)

Cleared previous experiment...
Experiment saved in /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720


## Data

Introduce datasets and dataloaders

In [4]:
# Both splits are built from the same experiment configuration; only the
# split name differs between the two calls.
dataset_kwargs = dict(dataset_dict=exp_dict["dataset"],
                      datadir=datadir,
                      exp_dict=exp_dict,
                      dataset_size=exp_dict['dataset_size'])
train_set = datasets.getDataset(split="train", **dataset_kwargs)
val_set = datasets.getDataset(split="val", **dataset_kwargs)

# Training samples are drawn in random order; incomplete final batches are
# dropped.
# Open question (untried alternative): sample with replacement, e.g.
# RandomSampler(train_set, replacement=True, num_samples=2*len(val_set)).
train_sampler = torch.utils.data.RandomSampler(train_set)
train_loader = DataLoader(
    train_set,
    sampler=train_sampler,
    batch_size=exp_dict["batch_size"],
    drop_last=True,
    num_workers=2,
)

# Validation walks the split in its stored order, one image per batch.
val_sampler = torch.utils.data.SequentialSampler(val_set)
val_loader = DataLoader(
    val_set,
    sampler=val_sampler,
    batch_size=1,
    num_workers=2,
)

In [None]:
# Per-channel mean and standard deviation of the training images
# (exploratory; maybe try these values out for normalisation).
#
# The statistics are accumulated over every pixel seen, instead of averaging
# per-batch values: the average of per-batch standard deviations ignores the
# variation between batches and therefore differs from the standard deviation
# of the whole set.
channel_sum = 0.0      # becomes an array of per-channel sums after the first batch
channel_sq_sum = 0.0   # per-channel sums of squared pixel values
pixel_count = 0        # number of pixels per channel accumulated so far

for data in train_loader:
    # assumed shape (batch_size, channels, height, width); float64 keeps the
    # running sums accurate
    numpy_image = data['images'].numpy().astype(np.float64)

    # reduce over batch, height and width -> one value per channel
    channel_sum = channel_sum + numpy_image.sum(axis=(0, 2, 3))
    channel_sq_sum = channel_sq_sum + np.square(numpy_image).sum(axis=(0, 2, 3))
    pixel_count += numpy_image.size // numpy_image.shape[1]

if pixel_count == 0:
    raise ValueError("train_loader yielded no images; cannot compute channel statistics")

# Var[x] = E[x^2] - E[x]^2; clamp at zero to absorb rounding error.
means = channel_sum / pixel_count
stds = np.sqrt(np.maximum(channel_sq_sum / pixel_count - means ** 2, 0.0))
print(f"Means: {means} \n SDs: {stds}")

## Model

Load Model and underlying base model

In [5]:
# Build the model described by exp_dict['model'], then move it to the GPU.
model = models.getModel(
    model_dict=exp_dict['model'],
    exp_dict=exp_dict,
    train_set=train_set,
)
model = model.cuda()

## Experiment Run Management 

Resume experiment if a previous score_list exists or start a new one from epoch 0 if not

In [6]:
# Locations of the most recent checkpoint and of the per-epoch score history.
model_path = os.path.join(savedir, "model.pth")
score_list_path = os.path.join(savedir, "score_list.pkl")

# Resuming needs both files: the score history tells us which epoch to
# continue from and the checkpoint provides the matching weights.
can_resume = os.path.exists(score_list_path) and os.path.exists(model_path)
score_list = utils.loadPKL(score_list_path) if can_resume else []

if score_list: #resume
    model.loadStateDict(utils.loadTorch(model_path))
    # continue with the epoch after the last recorded one
    s_epoch = score_list[-1]['epoch'] + 1
    print(f"Resuming previous experiment from epoch {s_epoch}")
else: #restart (nothing saved yet, checkpoint missing, or empty history)
    score_list = []
    s_epoch = 0
    print(f"Beginning new experiment from epoch {s_epoch}")

Beginning new experiment from epoch 0


## Main Epoch Loop

Each epoch consists of training, validation, updating the statistics, and saving both the best and the most recent model together with the validation statistics.

In [7]:
# Run the remaining epochs. Every epoch trains, validates, appends its scores
# to score_list, saves the latest checkpoint and, when the validation score
# improved, also saves a "best" copy.
for e in range(s_epoch, exp_dict['max_epoch']):
    score_dict = {}

    # Train the model
    train_dict = model.trainOnLoader(model, train_loader)
    print("Training done...")

    # Validate and Visualize the model
    val_dict = model.valOnLoader(val_loader, savedir_images=os.path.join(savedir, "images"), n_images=30)
    print("Validation done..")

    # Update score_dict and add to score_list
    score_dict.update(val_dict)
    score_dict.update(train_dict)
    score_dict["epoch"] = len(score_list)
    score_list += [score_dict]

    # Report score_list
    score_df = pd.DataFrame(score_list)
    print("\n", score_df.tail(), "\n")

    # Save Model and score_list
    utils.saveTorch(model_path, model.getStateDict())
    utils.savePKL(score_list_path, score_list)
    print("Checkpoint Saved: %s" % savedir)

    # Save best Checkpoint.
    # Missing scores are skipped rather than treated as 0: the recorded
    # val_score values are negative, so a 0 placeholder could never be beaten
    # (and a missing current score would always count as the best).
    is_best = e == 0
    if not is_best:
        previous_best = score_df["val_score"].iloc[:-1].max()
        current_score = score_dict.get("val_score", float("-inf"))
        is_best = pd.isna(previous_best) or current_score > previous_best
    if is_best:
        utils.savePKL(os.path.join(savedir, "score_list_best.pkl"), score_list)
        utils.saveTorch(os.path.join(savedir, "model_best.pth"), model.getStateDict())
        print("Saved Best: %s" % savedir)

    # e is an absolute epoch index, so report it against the absolute total
    # (this stays correct when the run was resumed from a later epoch).
    print(f"Epoch {e+1} of {exp_dict['max_epoch']} completed.")

print("Experiment completed!")

Training. Loss: 7.0418: 100%|██████████| 439/439 [00:30<00:00, 14.47it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

Training done...


Validating. MAE: 2.2302: 100%|██████████| 126/126 [00:04<00:00, 29.69it/s]


Validation done..

     val_mae  val_score  train_loss  epoch
0  2.230159  -2.230159    7.041775      0 

Checkpoint Saved: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720


  0%|          | 0/439 [00:00<?, ?it/s]

Saved Best: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720
Epoch 1 of 5 completed.


Training. Loss: 6.8420: 100%|██████████| 439/439 [00:29<00:00, 15.01it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

Training done...


Validating. MAE: 2.3492: 100%|██████████| 126/126 [00:12<00:00, 10.47it/s]


Validation done..

     val_mae  val_score  train_loss  epoch
0  2.230159  -2.230159    7.041775      0
1  2.349206  -2.349206    6.841991      1 



  0%|          | 0/439 [00:00<?, ?it/s]

Checkpoint Saved: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720
Epoch 2 of 5 completed.


Training. Loss: 6.3875: 100%|██████████| 439/439 [00:29<00:00, 14.94it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

Training done...


Validating. MAE: 1.4286: 100%|██████████| 126/126 [00:10<00:00, 11.51it/s]


Validation done..

     val_mae  val_score  train_loss  epoch
0  2.230159  -2.230159    7.041775      0
1  2.349206  -2.349206    6.841991      1
2  1.428571  -1.428571    6.387503      2 

Checkpoint Saved: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720


  0%|          | 0/439 [00:00<?, ?it/s]

Saved Best: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720
Epoch 3 of 5 completed.


Training. Loss: 5.5910: 100%|██████████| 439/439 [00:29<00:00, 14.64it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

Training done...


Validating. MAE: 1.4603: 100%|██████████| 126/126 [00:13<00:00,  9.05it/s]


Validation done..

     val_mae  val_score  train_loss  epoch
0  2.230159  -2.230159    7.041775      0
1  2.349206  -2.349206    6.841991      1
2  1.428571  -1.428571    6.387503      2
3  1.460317  -1.460317    5.591024      3 



  0%|          | 0/439 [00:00<?, ?it/s]

Checkpoint Saved: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720
Epoch 4 of 5 completed.


Training. Loss: 5.1228: 100%|██████████| 439/439 [00:30<00:00, 14.59it/s]
  0%|          | 0/126 [00:00<?, ?it/s]

Training done...


Validating. MAE: 2.0794: 100%|██████████| 126/126 [00:04<00:00, 29.33it/s]


Validation done..

     val_mae  val_score  train_loss  epoch
0  2.230159  -2.230159    7.041775      0
1  2.349206  -2.349206    6.841991      1
2  1.428571  -1.428571    6.387503      2
3  1.460317  -1.460317    5.591024      3
4  2.079365  -2.079365    5.122836      4 

Checkpoint Saved: /home/jovyan/work/runs/LCFCN/8bd70e101945e82afea8a5021b195720
Epoch 5 of 5 completed.
Experiment completed!
