In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pprint

import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms

from config import config
from config import extra
from function import train
from function import valid
from dataset import create_dataset
from models import create_model
from utils import create_optimizer
from utils import create_logger

In [2]:
# Put the shared config object into training mode, then call `extra()`
# (imported from the project's `config` module) with that mode already set.
config.MODE = 'train'
extra()

# Logger for this run; the first record is a pretty-printed dump of the
# complete configuration so the settings are captured alongside the results.
logger = create_logger('train')
logger.info(pprint.pformat(config))

# cuDNN switches, all taken from the CUDNN section of the config and applied
# through the `cudnn` alias (the same module as `torch.backends.cudnn`).
cudnn.enabled = config.CUDNN.ENABLED
cudnn.deterministic = config.CUDNN.DETERMINISTIC
cudnn.benchmark = config.CUDNN.BENCHMARK

{'ADV': {'LINF_NORM': 0.03, 'TYPE': 'FGSM'},
 'CUDNN': {'BENCHMARK': True, 'DETERMINISTIC': False, 'ENABLED': True},
 'DATASET': {'CLASSNUM': 10,
             'DATASET': 'CIFAR10',
             'ROOT': '/m/shibf/dataset/cifar10'},
 'GPUS': '4, 5, 6, 7',
 'GPU_NUM': 4,
 'MODE': 'train',
 'MODEL': {'INPUT_DIM': 256, 'TYPE': 'ConvNet'},
 'OUTPUT_DIR': 'experiments/CIFAR10/train',
 'TEST': {'PRINT_EVERY': 1,
          'STATE_DICT': 'experiments/CIFAR10/train/model_FC_conj_angle_0.5606.pth',
          'TEST_EVERY': 5},
 'TRAIN': {'BATCH_SIZE': 32,
           'BEGIN_EPOCH': 0,
           'CONJREG': 1,
           'END_EPOCH': 120,
           'IF_CONJREG': True,
           'IF_L1REG': False,
           'L1REG': 0,
           'LR': 0.001,
           'LR_DECAY_RATE': 0.5,
           'LR_MILESTONES': [30, 60, 90],
           'MOMENTUM': 0.9,
           'NESTEROV': False,
           'OPTIMIZER': 'sgd',
           'PRINT_EVERY': 1,
           'WD': 0.0001},
 'WORKERS': 4}


In [6]:
# create a model
#
# `config.GPUS` is a comma-separated string of physical GPU ids and may
# contain spaces (e.g. '4, 5, 6, 7'). Parse it once and write a normalised,
# whitespace-free list into CUDA_VISIBLE_DEVICES rather than the raw string.
# NOTE(review): this only takes effect if CUDA has not been initialised yet in
# this process — confirm nothing earlier in the notebook touches the GPU.
physical_gpu_ids = [int(token) for token in config.GPUS.split(',')]
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(gpu_id) for gpu_id in physical_gpu_ids)

# Once the devices are masked, they are addressed by local index 0..N-1, so
# only the *count* of configured GPUs matters from here on. A concrete list is
# used (not a lazy `range`) so it can be indexed and passed as `device_ids`.
gpus = list(range(len(physical_gpu_ids)))
model = create_model()

# Place the model on the first visible device, then replicate it across all
# visible devices with DataParallel.
model = model.cuda(gpus[0])
model = torch.nn.DataParallel(model, device_ids=gpus)


In [7]:
# Build the optimizer for the model from the experiment configuration.
optimizer = create_optimizer(config, model)

# Multi-step learning-rate schedule: the milestone epochs and the decay
# factor both come from the TRAIN section of the config.
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=config.TRAIN.LR_MILESTONES,
    gamma=config.TRAIN.LR_DECAY_RATE,
)

# `create_dataset()` returns both datasets and both loaders in one call.
(train_dataset,
 test_dataset,
 train_loader,
 test_loader) = create_dataset()

In [None]:
# training and validating
#
# Runs epochs BEGIN_EPOCH..END_EPOCH-1. Every TEST_EVERY epochs the model is
# validated; whenever the returned score beats the best seen so far, a
# checkpoint named after that score is written to OUTPUT_DIR. After the loop
# the final weights are saved, named after the most recent validation score
# (which is not necessarily the best one).
best_perf = 0
# Bind `perf_indicator` before the loop: it is otherwise only assigned inside
# the validation branch, so a run with no validation pass (empty epoch range,
# or fewer epochs than TEST_EVERY) would hit a NameError at the final save.
perf_indicator = best_perf
for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
    # NOTE(review): the scheduler is stepped *before* training the epoch.
    # PyTorch >= 1.1.0 documents that `scheduler.step()` should be called
    # after the optimizer has stepped — confirm the PyTorch version in use
    # and move this call after `train(...)` if it is 1.1.0 or newer.
    lr_scheduler.step()

    # train for one epoch
    train(train_loader, model, optimizer, epoch)

    # evaluate on validation set
    if (epoch + 1) % config.TEST.TEST_EVERY == 0:
        perf_indicator = valid(test_loader, model)

        if perf_indicator > best_perf:
            # Build the checkpoint path once and reuse it for the log line
            # and the save so the two can never disagree.
            checkpoint_path = os.path.join(
                config.OUTPUT_DIR, 'checkpoint_{}.pth'.format(perf_indicator)
            )
            logger.info("=> saving checkpoint into {}".format(checkpoint_path))
            best_perf = perf_indicator
            torch.save(model.state_dict(), checkpoint_path)

# save the final model
final_model_path = os.path.join(
    config.OUTPUT_DIR, 'model_{}.pth'.format(perf_indicator)
)
logger.info("=> saving final model into {}".format(final_model_path))
torch.save(model.state_dict(), final_model_path)

Epoch: [0][0/391]	Time 3.718s (3.718s)	Speed 34.4 samples/s	Data 0.904s (0.904s)	Loss 2.30582 (2.30582)	Accuracy 0.117 (0.117)
Epoch: [0][1/391]	Time 0.040s (1.879s)	Speed 3205.9 samples/s	Data 0.008s (0.456s)	Loss 2.31367 (2.30974)	Accuracy 0.062 (0.090)
Epoch: [0][2/391]	Time 0.015s (1.258s)	Speed 8414.1 samples/s	Data 0.002s (0.305s)	Loss 2.30687 (2.30879)	Accuracy 0.094 (0.091)
Epoch: [0][3/391]	Time 0.014s (0.947s)	Speed 9099.7 samples/s	Data 0.002s (0.229s)	Loss 2.30683 (2.30830)	Accuracy 0.133 (0.102)
Epoch: [0][4/391]	Time 0.017s (0.761s)	Speed 7617.8 samples/s	Data 0.005s (0.184s)	Loss 2.30397 (2.30743)	Accuracy 0.070 (0.095)
Epoch: [0][5/391]	Time 0.012s (0.636s)	Speed 10491.5 samples/s	Data 0.001s (0.154s)	Loss 2.30367 (2.30680)	Accuracy 0.086 (0.094)
Epoch: [0][6/391]	Time 0.014s (0.547s)	Speed 9219.2 samples/s	Data 0.004s (0.132s)	Loss 2.30630 (2.30673)	Accuracy 0.094 (0.094)
Epoch: [0][7/391]	Time 0.010s (0.480s)	Speed 12377.1 samples/s	Data 0.001s (0.116s)	Loss 2.30381 (