In [1]:
import sys
sys.path.append('..')

In [2]:
import torch
import torchvision

from torch import nn, autograd, optim
from torchvision import datasets, transforms

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from time import time

In [3]:
from NeuralODE.neural_ode_solvers import AdjointODE
from NeuralODE.odenet import ResidualBlock, ODEBlock, MNISTClassifier

In [4]:
transform = transforms.Compose([transforms.ToTensor()])

In [5]:
trainset = datasets.MNIST('../PATH_TO_STORE_TRAINSET', train=True, transform=transform)
testset = datasets.MNIST('../PATH_TO_STORE_TESTSET', train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)

In [6]:
dataiter = iter(trainloader)
images, labels = dataiter.next()

print(images.shape)
print(labels.shape)

torch.Size([64, 1, 28, 28])
torch.Size([64])


In [7]:
device = 'cuda:2'

In [8]:
criterion = nn.NLLLoss()

In [9]:
# Function that optimizes a model
def train(model, epochs, print_epochs=False):
    optimizer = optim.Adam(model.parameters())
    time0 = time()
    losses = []
    for e in tqdm(range(epochs)):
        running_loss = 0
        for images, labels in trainloader:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()

            output = model(images)
            loss = criterion(output, labels)
            #This is where the model learns by backpropagating
            loss.backward()

            #And optimizes its weights here
            optimizer.step()

            running_loss += loss.item()
        losses.append(running_loss/len(trainloader))
        if print_epochs:
            print("Epoch {} - Training loss: {}".format(e, losses[-1]))
    print("\nTraining Time (in minutes) =",(time()-time0)/60)

In [10]:
# Function that returns loss and accuracy of a model on the test set
def test(model):
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for image, labels in testloader:
            image = image.to(device)
            labels = labels.to(device)
            output = model(image)
            test_loss += criterion(output, labels)
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(labels.data.view_as(pred)).sum()
        test_loss /= len(testloader.dataset)
        test_loss = float(test_loss)
        correct = 100. * float(correct) / len(testloader.dataset)
    return test_loss, correct

## 1. ResNet

In our architechture we have a structure of a convolutional layer, two sequential resnet blocks and a maxpool layer repeating 4 times. 

In [11]:
resnet = MNISTClassifier(ResidualBlock).to(device)

In [12]:
train(resnet, epochs=15)

100%|██████████| 15/15 [06:15<00:00, 24.39s/it]


Training Time (in minutes) = 6.250414184729258





In [13]:
test(resnet)

(0.0005774503806605935, 99.26)

## 2. ODENet
The same architechture as the ResNet model, but with ODENet blocks instead of ResNet blocks.

In [14]:
odenet = MNISTClassifier(ODEBlock).to(device)

In [15]:
train(odenet, epochs=15)

100%|██████████| 15/15 [13:52<00:00, 55.66s/it]


Training Time (in minutes) = 13.878390876452128





In [16]:
test(odenet)

(0.0008329695556312799, 98.78)

## Conclusion

We can see that ODENet shows relatively the same quality as ResNet with the same architechture on the MNIST dataset. We can draw the conclusion that substituding discrete time in a generative model with a continues time flow doesn't affect the quality and can be used. 

In our case the ODENet model did take more time to train, but theoretical estimations show that both models have the same training time of O(L).

Authors of https://arxiv.org/pdf/1806.07366.pdf have also experimented with MNIST dataset and, same as in our case, ODENet didn't outperform ResNet but showed similar accuracy. The signinficant difference of the models is that ODENet requires O(1) memory whereas ResNet takes O(L), where L is the number of layers.