## trixi PyTorch Experiment

Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torchvision import datasets, transforms

from trixi.util import Config
from trixi.experiment import PytorchExperiment

Using torch multi processing


Build config

In [2]:
!ls experiment_dir/

20180622-165237_experiment	   data
20180622-165352_resume_experiment  test-experiment


In [3]:
!rm -rf experiment_dir/20*

In [4]:
!du -sh experiment_dir/

106M	experiment_dir/


In [5]:
c = Config()

c.batch_size = 64
c.batch_size_test = 1000
c.n_epochs = 10
c.learning_rate = 0.01
c.momentum = 0.9
if torch.cuda.is_available():
    c.use_cuda = True
else:
    c.use_cuda = False
c.data_loader_kwargs = {'num_workers': 1, 'pin_memory': True} if c.use_cuda else {}
c.rnd_seed = 1
c.log_interval = 200

c.train_loader = {
    torch.utils.data.DataLoader: {
        'dataset': {
            datasets.MNIST: {
                'root': 'experiment_dir/data/',
                'train': True,
                'download': True,
                'transform': {
                    transforms.ToTensor: {}
                }
            }
        },
        'batch_size': c.batch_size,
        **c.data_loader_kwargs
    }
}

c.test_loader = {
    torch.utils.data.DataLoader: {
        'dataset': {
            datasets.MNIST: {
                'root': 'experiment_dir/data/',
                'train': False,
                'download': True,
                'transform': {
                    transforms.ToTensor: {}
                }
            }
        },
        'batch_size': c.batch_size_test,
        **c.data_loader_kwargs
    }
}

In [6]:
# build a simple cnn model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

Build net

In [7]:
class MNIST_experiment(PytorchExperiment):
    def setup(self):
        self.train_data_loader = self.config.train_loader
        self.test_data_loader = self.config.test_loader
        self.model = Net()
        if self.config.use_cuda:
            self.model.cuda()
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.config.learning_rate,
                                               momentum=self.config.momentum)
        self.save_checkpoint(name="checkpoint_start")
        self.vlog.plot_model_structure(self.model,
                                       [self.config.batch_size, 1, 28, 28], 
                                       name='Model Structure')
        self.elog.print('Experiment set up.')
        self.batch_counter = 0
    
    def train(self, epoch):
        self.model.train()
        for batch_idx, (data, target) in enumerate(self.train_data_loader):
            self.batch_counter += 1
            if self.config.use_cuda:
                data, target = data.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(data)
            self.loss = F.nll_loss(output, target)
            self.loss.backward()
            self.optimizer.step()
            if batch_idx % self.config.log_interval == 0:
                # plot train loss
                self.vlog.show_value(value=self.loss.item(), name='Loss',
                                     count=self.batch_counter, tag='Train Loss')
                # log train batch loss and progress
                self.elog.print(
                    'Train Epoch: {} [{}/{} samples ({:.0f}%)]\t Batch Loss: {:.6f}'
                    .format(epoch, batch_idx * len(data),
                            len(self.train_data_loader.dataset),
                            100. * batch_idx / len(self.train_data_loader),
                            self.loss.item()))
                self.save_checkpoint(name="checkpoint", n_iter=batch_idx)
                
    def validate(self, epoch):
        self.model.eval()
        validation_loss = 0
        correct = 0
        for data, target in self.test_data_loader:
            if self.config.use_cuda:
                data, target = data.cuda(), target.cuda()
            output = self.model(data)
            validation_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()
        validation_loss /= len(self.test_data_loader.dataset)
        # plot the test loss
        self.vlog.show_value(value=validation_loss, name='Loss',
                             count=self.batch_counter, tag='Validation Loss')
        # log validation loss and accuracy
        self.elog.print(
            '\nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
            .format(validation_loss, correct, len(self.test_data_loader.dataset),
                    100. * correct / len(self.test_data_loader.dataset)))

In [None]:
exp = MNIST_experiment(config=c, name='experiment', n_epochs=c.n_epochs, 
                       seed=42, base_dir='./experiment_dir')

In [9]:
exp.run()

Experiment set up.
Experiment started.

Validation set: Average loss: 0.1424, Accuracy: 9568/10000 (95%)


Validation set: Average loss: 0.0941, Accuracy: 9700/10000 (97%)


Validation set: Average loss: 0.0731, Accuracy: 9772/10000 (97%)


Validation set: Average loss: 0.0646, Accuracy: 9800/10000 (98%)


Validation set: Average loss: 0.0581, Accuracy: 9819/10000 (98%)


Validation set: Average loss: 0.0541, Accuracy: 9829/10000 (98%)


Validation set: Average loss: 0.0499, Accuracy: 9844/10000 (98%)


Validation set: Average loss: 0.0472, Accuracy: 9844/10000 (98%)


Validation set: Average loss: 0.0450, Accuracy: 9850/10000 (98%)


Validation set: Average loss: 0.0422, Accuracy: 9872/10000 (98%)

Training complete.
Experiment ended. Checkpoints stored =)
Experiment ended.


In [10]:
import os
last_experiment = 'experiment_dir/' + sorted([d for d in os.listdir('experiment_dir/') if '20' in str(d)], reverse=True)[0]

In [11]:
!ls experiment_dir/

20180622-165631_experiment  data  test-experiment


In [12]:
last_experiment

'experiment_dir/20180622-165631_experiment'

In [13]:
!ls experiment_dir/20180622-164650_experiment/checkpoint/

ls: Zugriff auf 'experiment_dir/20180622-164650_experiment/checkpoint/' nicht möglich: Datei oder Verzeichnis nicht gefunden


In [None]:
from trixi.experiment import PytorchExperiment
exp_resume = MNIST_experiment(config=c, name='resume_experiment', 
                              n_epochs=c.n_epochs, seed=42, base_dir='./experiment_dir', 
                              resume=last_experiment, resume_save_types=('model',
                                                                         'simple',
                                                                         'th_vars',
                                                                         'results'))

In [15]:
exp_resume.run()

Experiment set up.
Loaded existing config from: experiment_dir/20180622-165631_experiment
Loaded existing checkpoint from: experiment_dir/20180622-165631_experiment/checkpoint/checkpoint_last.pth.tar
Experiment started.

Validation set: Average loss: 0.0450, Accuracy: 9862/10000 (98%)


Validation set: Average loss: 0.0413, Accuracy: 9870/10000 (98%)


Validation set: Average loss: 0.0392, Accuracy: 9870/10000 (98%)


Validation set: Average loss: 0.0374, Accuracy: 9874/10000 (98%)


Validation set: Average loss: 0.0405, Accuracy: 9887/10000 (98%)


Validation set: Average loss: 0.0379, Accuracy: 9878/10000 (98%)


Validation set: Average loss: 0.0344, Accuracy: 9892/10000 (98%)


Validation set: Average loss: 0.0369, Accuracy: 9884/10000 (98%)


Validation set: Average loss: 0.0368, Accuracy: 9889/10000 (98%)


Validation set: Average loss: 0.0415, Accuracy: 9888/10000 (98%)

Training complete.
Experiment ended. Checkpoints stored =)
Experiment ended.


In [16]:
!ls experiment_dir

20180622-165631_experiment	   data
20180622-165742_resume_experiment  test-experiment
