In [1]:
import os, sys
# Root directory of the project
ROOT_DIR = os.path.abspath("../")

# To find local version of the library
sys.path.append(ROOT_DIR)

import dateutil.tz
import datetime
import pprint

import skopt
from skopt import gp_minimize


import numpy as np
import torch
from torchvision import transforms


from utils.dataloader import SVHNDataset
from utils.config import cfg, cfg_from_file
from utils.dataloader import prepare_dataloaders
from utils.misc import mkdir_p
from utils.transforms import FirstCrop, Rescale, RandomCrop, ToTensor
from utils.misc import load_obj
from utils.visualization import visualize_sample

from models.vgg import VGG
from trainer.trainer import train_model

from utils.checkpointer import CheckpointSaver

import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2


In [6]:
cfg_from_file("../config/base_config.yml")

vgg19 = VGG('VGG19', num_classes_length=7, num_classes_digits=10)

checkpoint = CheckpointSaver("../tmp_results")
checkpoint.save(vgg19, 11)

model = checkpoint.load("checkpoint_epoch11")

print(model)
print(cfg)

Init VGG
Checkpointing new model ...
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14)

In [4]:
# Wrapper function for bayesian optimization
# The space contains the parameters to explore
def train_model_opt(space):
    print(space)
    (train_loader,
     valid_loader) = prepare_dataloaders(
        dataset_split='train',
        dataset_path='../data/SVHN/train',
        metadata_filename='../data/SVHN/train_metadata.pkl',
        batch_size=32,
        sample_size=1000,
        valid_split=0.8)
    
    vgg19 = VGG('VGG19', num_classes_length=7, num_classes_digits=10)


    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device used: ", device)
    return -train_model(vgg19,
                train_loader=train_loader,
                valid_loader=valid_loader,
                num_epochs=2,
                device=device,
                lr=space[0],
                checkpoint_dir="../checkpoints",
                output_dir="../tmp_results")


# Define the range 
space = [skopt.space.Real(10**-5, 10**0, "log-uniform", name='lr'), 
         skopt.space.Categorical(["VGG11", "VGG13", "VGG16", "VGG19"])]

mkdir_p("../tmp_results")


res_gp = gp_minimize(train_model_opt, space, n_calls=10,
                     random_state=0)

print("Best score: {0}".format(-res_gp.fun))
print("Best lr: {0}".format(res_gp.x[0]))

[0.009209225155490905, 'VGG19']
Init VGG
Device used:  cuda:0
Learning rate is: 0.009209225155490905
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.9502(avg) : 100%|██████████| 25/25 [00:11<00:00,  1.37it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:04<00:00,  1.28s/it]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.9502
	Valid Loss: 437.2933
	Valid Accuracy: 0.0250


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 10.2367(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.23it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.81it/s]



Epoch: 2/2
	Train Loss: 10.2367
	Valid Loss: 10.3013
	Valid Accuracy: 0.0200


Training complete in 0m 38s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.19486241836466403, 'VGG19']
Init VGG
Device used:  cuda:0
Learning rate is: 0.19486241836466403
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 12.6815(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.29it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.82it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 12.6815
	Valid Loss: 647.2395
	Valid Accuracy: 0.0100


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.6314(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.28it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.74it/s]



Epoch: 2/2
	Train Loss: 9.6314
	Valid Loss: 9.1601
	Valid Accuracy: 0.0150


Training complete in 0m 21s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.013116515715358098, 'VGG13']
Init VGG
Device used:  cuda:0
Learning rate is: 0.013116515715358098
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.6186(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.27it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.89it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.6186
	Valid Loss: 37.7453
	Valid Accuracy: 0.0250


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.9379(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.23it/s] 
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.81it/s]



Epoch: 2/2
	Train Loss: 9.9379
	Valid Loss: 9.2596
	Valid Accuracy: 0.0200


Training complete in 0m 26s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.0003073781785362612, 'VGG11']
Init VGG
Device used:  cuda:0
Learning rate is: 0.0003073781785362612
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.3703(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.26it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.71it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.3703
	Valid Loss: 9.9126
	Valid Accuracy: 0.0100


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.6000(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.29it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.74it/s]



Epoch: 2/2
	Train Loss: 9.6000
	Valid Loss: 9.8740
	Valid Accuracy: 0.0150


Training complete in 0m 26s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.00023082427114609125, 'VGG13']
Init VGG
Device used:  cuda:0
Learning rate is: 0.00023082427114609125
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.1036(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.25it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.76it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.1036
	Valid Loss: 9.5982
	Valid Accuracy: 0.0050


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.5762(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.25it/s] 
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.62it/s]



Epoch: 2/2
	Train Loss: 9.5762
	Valid Loss: 9.7597
	Valid Accuracy: 0.0200


Training complete in 0m 20s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.11503861485898605, 'VGG13']
Init VGG
Device used:  cuda:0
Learning rate is: 0.11503861485898605
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 13.4745(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.29it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.76it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 13.4745
	Valid Loss: 2420.9654
	Valid Accuracy: 0.0000


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.6304(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.28it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.73it/s]



Epoch: 2/2
	Train Loss: 9.6304
	Valid Loss: 10.0869
	Valid Accuracy: 0.0150


Training complete in 0m 20s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.0009202884691104562, 'VGG19']
Init VGG
Device used:  cuda:0
Learning rate is: 0.0009202884691104562
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.1567(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.26it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.60it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.1567
	Valid Loss: 9.3616
	Valid Accuracy: 0.0350


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.5285(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.25it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.82it/s]



Epoch: 2/2
	Train Loss: 9.5285
	Valid Loss: 9.3645
	Valid Accuracy: 0.0300


Training complete in 0m 23s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.0004863857046189474, 'VGG16']
Init VGG
Device used:  cuda:0
Learning rate is: 0.0004863857046189474
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.7158(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.28it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.56it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.7158
	Valid Loss: 9.1116
	Valid Accuracy: 0.0200


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 10.0473(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.28it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.56it/s]



Epoch: 2/2
	Train Loss: 10.0473
	Valid Loss: 9.2955
	Valid Accuracy: 0.0200


Training complete in 0m 21s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[0.0006937575128144787, 'VGG19']
Init VGG
Device used:  cuda:0
Learning rate is: 0.0006937575128144787
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.1941(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.27it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  4.16it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.1941
	Valid Loss: 9.9013
	Valid Accuracy: 0.0300


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.8545(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.22it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.61it/s]



Epoch: 2/2
	Train Loss: 9.8545
	Valid Loss: 9.2507
	Valid Accuracy: 0.0350


Training complete in 0m 21s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
[5.0321537523063144e-05, 'VGG19']
Init VGG
Device used:  cuda:0
Learning rate is: 5.0321537523063144e-05
Directory run already exists
# Start training #


Iterating over training data...


[TRAIN] - EPOCH 1/ 2 - BATCH LOSS: 10.5590(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.26it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.51it/s]


Checkpointing new model ...

Epoch: 1/2
	Train Loss: 10.5590
	Valid Loss: 10.1405
	Valid Accuracy: 0.0000


Iterating over training data...


[TRAIN] - EPOCH 2/ 2 - BATCH LOSS: 9.4754(avg) : 100%|██████████| 25/25 [00:06<00:00,  4.25it/s]
  0%|          | 0/7 [00:00<?, ?it/s]



Iterating over validation data...


100%|██████████| 7/7 [00:01<00:00,  3.67it/s]



Epoch: 2/2
	Train Loss: 9.4754
	Valid Loss: 9.0634
	Valid Accuracy: 0.0300


Training complete in 0m 20s
Saving model ...
Best model saved to : ../tmp_results/best_model.pth
Best score: 0.035
Best lr: 0.0006937575128144787
