In [1]:
# Torch
import torch
import torchvision
from torch.backends import cudnn
from torch.nn import functional as F
from torch.utils.data import sampler
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler
cudnn.benchmark = True
# Data Handling
import numpy as np
import pandas as pd
# System
import os
import time
import pprint
import itertools
# Custom
import models
import datasets
from helpers import utils

# LCFCN Model

Locates Objects with Point Supervision Training

Based on https://github.com/ElementAI/LCFCN

Experiment Settings below could eventually be removed


In [2]:
####################### EXPERIMENT SETTINGS ################################
def _lcfcn_group(dataset_name, max_epoch, n_train, n_val):
    """Return the hyper-parameter grid of one LCFCN experiment group.

    List-valued entries are the axes that `utils.cartesian` expands below;
    the key order matches the hand-written dictionaries this replaces.
    """
    return {
        "dataset": {'name': dataset_name, 'transform': 'rgb_normalize'},
        "model": {'name': 'lcfcn', 'base': "fcn8_vgg16"},
        "batch_size": [1, 5, 10],
        "max_epoch": [max_epoch],
        'dataset_size': [{'train': n_train, 'val': n_val}],
        'optimizer': ['adam'],
        'lr': [1e-5],
    }


# One entry per experiment group: (dataset, epochs, train size, val size).
_group_specs = {
    'trancos': _lcfcn_group('trancos', 100, 'all', 'all'),
    'trancos_debug': _lcfcn_group('trancos', 5, 5, 5),
    'denmark_debug': _lcfcn_group('denmark', 5, 1, 1),
}

# Expand every group into its list of concrete experiment dictionaries.
EXP_GROUPS = {
    group_name: utils.cartesian(group_spec)
    for group_name, group_spec in _group_specs.items()
}

# Groups to run; only the first expanded experiment is used by this notebook.
exp_group_list = ["trancos_debug"]
exp_list = []
for exp_group_name in exp_group_list:
    exp_list.extend(EXP_GROUPS[exp_group_name])
exp_dict = exp_list[0]

########################## FILE SYSTEM SETTINGS ###########################

# Root folder for experiment outputs and location of the dataset on disk.
savedir_base = "/home/jovyan/work/jannis/saves/LCFCN"
datadir = "/home/jovyan/work/jannis/data/TRANCOS"

############################### PRINTS ####################################

pprint.pprint(exp_dict)

{'batch_size': 1,
 'dataset': {'name': 'trancos', 'transform': 'rgb_normalize'},
 'dataset_size': {'train': 5, 'val': 5},
 'lr': 1e-05,
 'max_epoch': 5,
 'model': {'base': 'fcn8_vgg16', 'name': 'lcfcn'},
 'optimizer': 'adam'}


## Saving Location

Create new folder for the selected experiment and save the experiment dict

In [3]:
# Experiment ID generated by hashing the experiment dict; the same settings
# therefore always map to the same sub-folder of `savedir_base`.
exp_id = utils.hashDict(exp_dict)
savedir = os.path.join(savedir_base, exp_id)

# Backup and overwrite a previous experiment with the same name.
# While this is True the folder is reset on every run, so the resume logic
# further down (which looks for score_list.pkl inside `savedir`) presumably
# never finds a previous run. Set it to False to continue an interrupted run.
reset_experiment = True
if reset_experiment:
    utils.deleteExperiment(savedir, backup_flag = True)
    print("Cleared previous experiment...")

# Create the folder (no error if it already exists) and store the settings
# next to the results so the run can be identified later.
os.makedirs(savedir, exist_ok=True)
utils.saveJSON(os.path.join(savedir, "exp_dict.json"), exp_dict)
print("Experiment saved in %s" % savedir)

Experiment saved in /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732


## Data

Introduce datasets and dataloaders

In [4]:
# Arguments common to the training and validation datasets.
dataset_kwargs = dict(dataset_dict = exp_dict["dataset"],
                      datadir = datadir,
                      exp_dict = exp_dict,
                      dataset_size = exp_dict['dataset_size'])
train_set = datasets.getDataset(split = "train", **dataset_kwargs)
val_set = datasets.getDataset(split = "val", **dataset_kwargs)

# Training draws with replacement; the number of draws per epoch is tied to
# the validation set size (twice its length), not to the training set size.
samples_per_epoch = 2 * len(val_set)
train_sampler = RandomSampler(train_set,
                              replacement=True,
                              num_samples=samples_per_epoch)
train_loader = DataLoader(train_set,
                          batch_size = exp_dict["batch_size"],
                          sampler = train_sampler,
                          num_workers = 2,
                          drop_last = True)

# Validation walks through the set in order, one image per batch.
val_sampler = sampler.SequentialSampler(val_set)
val_loader = DataLoader(val_set,
                        batch_size = 1,
                        sampler = val_sampler,
                        num_workers = 2)

## Model

Load Model and underlying base model

In [5]:
# Build the model selected by exp_dict['model'], then call .cuda() on it.
model = models.getModel(
    model_dict = exp_dict['model'],
    exp_dict = exp_dict,
    train_set = train_set,
)
model = model.cuda()

# Disabled: explicit optimizer construction.
# model.opt = optimizers.get_optim(exp_dict['opt'], model)

## Experiment Run Management 

Resume experiment if a previous score_list exists or start a new one from epoch 0 if not

In [6]:
# Files holding the most recent model weights and the per-epoch score history.
model_path = os.path.join(savedir, "model.pth")
score_list_path = os.path.join(savedir, "score_list.pkl")

if os.path.exists(score_list_path): #resume
    # Restore the weights and the score history of the previous run.
    model.loadStateDict(utils.loadTorch(model_path))
    score_list = utils.loadPKL(score_list_path)
    # Continue after the last recorded epoch; an empty history means no
    # epoch was completed, so start at 0 instead of failing on [-1].
    s_epoch = score_list[-1]['epoch'] + 1 if score_list else 0
    print(f"Resuming previous experiment from epoch {s_epoch}")
else: #restart
    score_list = []
    s_epoch = 0
    print(f"Beginning new experiment from epoch {s_epoch}")

Beginning new experiment from epoch 0


## Main Epoch Loop

Each epoch consists of training, validation, updating the statistics, and saving both the best and the most recent model together with the validation statistics.

In [7]:
# Runs the remaining epochs, from `s_epoch` (0 for a new experiment) up to
# the configured maximum.
for e in range(s_epoch, exp_dict['max_epoch']):
    # Collects the training and validation metrics of this epoch
    score_dict = {}
    # Train the model
    train_dict = model.trainOnLoader(model, train_loader)
    print("Training done...")
    # Validate and Visualize the model
    val_dict = model.valOnLoader(val_loader, savedir_images=os.path.join(savedir, "images"), n_images=3)
    # model.visOnLoader(vis_loader, savedir=os.path.join(savedir, "images"))
    print("Validation done..")

    # Update score_dict and add to score_list
    score_dict.update(val_dict)
    score_dict.update(train_dict)
    score_dict["epoch"] = len(score_list)
    score_list += [score_dict]

    # Report score_list
    score_df = pd.DataFrame(score_list)
    print("\n", score_df.tail(), "\n")

    # Save Model and score_list
    utils.saveTorch(model_path, model.getStateDict())
    utils.savePKL(score_list_path, score_list)
    print("Checkpoint Saved: %s" % savedir)

    # Save best Checkpoint: higher val_score is better. Missing scores are
    # treated as -inf rather than 0, because the logged scores are negative
    # and a 0 placeholder would wrongly outrank every real score.
    worst_score = float("-inf")
    current_score = score_dict.get("val_score", worst_score)
    if e == 0 or (current_score > score_df["val_score"].iloc[:-1].fillna(worst_score).max()):
        utils.savePKL(os.path.join(savedir, "score_list_best.pkl"), score_list)
        utils.saveTorch(os.path.join(savedir, "model_best.pth"), model.getStateDict())
        print("Saved Best: %s" % savedir)
    # `e` is the absolute epoch index, so report it against the absolute
    # total; this stays consistent when a run is resumed with s_epoch > 0.
    print(f"Epoch {e+1} of {exp_dict['max_epoch']} completed.")

print("Experiment completed!")

Training. Loss: 50.1538: 100%|██████████| 10/10 [00:13<00:00,  1.38s/it]
  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A

Training done...


Validating. MAE: 41.0000:  20%|██        | 1/5 [00:01<00:04,  1.18s/it]
Validating. MAE: 43.5000:  40%|████      | 2/5 [00:02<00:03,  1.04s/it]
Validating. MAE: 41.0000:  60%|██████    | 3/5 [00:02<00:01,  1.29it/s]
Validating. MAE: 44.0000:  80%|████████  | 4/5 [00:03<00:00,  1.43it/s]
Validating. MAE: 44.6000: 100%|██████████| 5/5 [00:04<00:00,  1.26it/s]
100%|██████████| 5/5 [00:04<00:00,  1.19it/s][A
Validating. MAE: 44.6000: 100%|██████████| 5/5 [00:04<00:00,  1.19it/s]


Validation done..

    val_mae  val_score  train_loss  epoch
0     44.6      -44.6   50.153827      0 

Checkpoint Saved: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732


  0%|          | 0/10 [00:00<?, ?it/s]

Saved Best: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732
Epoch 1 of 5 completed.


Training. Loss: 45.1515: 100%|██████████| 10/10 [00:01<00:00,  5.10it/s]
  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A

Training done...


Validating. MAE: 25.0000:  20%|██        | 1/5 [00:00<00:01,  3.38it/s]
Validating. MAE: 28.0000:  40%|████      | 2/5 [00:00<00:00,  4.33it/s]
Validating. MAE: 28.0000:  60%|██████    | 3/5 [00:00<00:00,  4.86it/s]
100%|██████████| 5/5 [00:00<00:00,  5.47it/s]5 [00:00<00:00,  5.57it/s]
Validating. MAE: 32.8000: 100%|██████████| 5/5 [00:00<00:00,  5.43it/s]


Validation done..

    val_mae  val_score  train_loss  epoch
0     44.6      -44.6   50.153827      0
1     32.8      -32.8   45.151526      1 

Checkpoint Saved: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732


  0%|          | 0/10 [00:00<?, ?it/s]

Saved Best: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732
Epoch 2 of 5 completed.


Training. Loss: 44.6693: 100%|██████████| 10/10 [00:02<00:00,  4.11it/s]
  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A

Training done...


Validating. MAE: 29.0000:  20%|██        | 1/5 [00:00<00:01,  3.42it/s]
Validating. MAE: 29.5000:  40%|████      | 2/5 [00:00<00:00,  4.00it/s]
Validating. MAE: 27.3333:  60%|██████    | 3/5 [00:00<00:00,  4.34it/s]
100%|██████████| 5/5 [00:01<00:00,  4.84it/s]5 [00:00<00:00,  4.72it/s]
Validating. MAE: 28.8000: 100%|██████████| 5/5 [00:01<00:00,  4.84it/s]


Validation done..

    val_mae  val_score  train_loss  epoch
0     44.6      -44.6   50.153827      0
1     32.8      -32.8   45.151526      1
2     28.8      -28.8   44.669345      2 

Checkpoint Saved: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732


  0%|          | 0/10 [00:00<?, ?it/s]

Saved Best: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732
Epoch 3 of 5 completed.


Training. Loss: 42.0703: 100%|██████████| 10/10 [00:02<00:00,  3.78it/s]
  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A

Training done...


Validating. MAE: 30.0000:  20%|██        | 1/5 [00:00<00:01,  3.55it/s]
Validating. MAE: 27.5000:  40%|████      | 2/5 [00:00<00:00,  3.68it/s]
Validating. MAE: 24.6667:  60%|██████    | 3/5 [00:00<00:00,  3.79it/s]
100%|██████████| 5/5 [00:01<00:00,  4.42it/s]5 [00:01<00:00,  4.10it/s]
Validating. MAE: 25.0000: 100%|██████████| 5/5 [00:01<00:00,  4.42it/s]


Validation done..

    val_mae  val_score  train_loss  epoch
0     44.6      -44.6   50.153827      0
1     32.8      -32.8   45.151526      1
2     28.8      -28.8   44.669345      2
3     25.0      -25.0   42.070331      3 

Checkpoint Saved: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732


  0%|          | 0/10 [00:00<?, ?it/s]

Saved Best: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732
Epoch 4 of 5 completed.


Training. Loss: 38.7645: 100%|██████████| 10/10 [00:02<00:00,  3.55it/s]
  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A

Training done...


Validating. MAE: 22.0000:  20%|██        | 1/5 [00:00<00:01,  3.68it/s]
Validating. MAE: 21.5000:  40%|████      | 2/5 [00:00<00:00,  3.41it/s]
Validating. MAE: 20.0000:  60%|██████    | 3/5 [00:00<00:00,  3.36it/s]
100%|██████████| 5/5 [00:01<00:00,  4.04it/s]5 [00:01<00:00,  3.71it/s]
Validating. MAE: 22.8000: 100%|██████████| 5/5 [00:01<00:00,  4.03it/s]


Validation done..

    val_mae  val_score  train_loss  epoch
0     44.6      -44.6   50.153827      0
1     32.8      -32.8   45.151526      1
2     28.8      -28.8   44.669345      2
3     25.0      -25.0   42.070331      3
4     22.8      -22.8   38.764454      4 

Checkpoint Saved: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732
Saved Best: /home/jovyan/work/jannis/saves/LCFCN/b64501300fd5500efb7d3ef455b67732
Epoch 5 of 5 completed.
Experiment completed!
