# PyTorch debugging

In [1]:
from fastai.vision import *

In [2]:
torch.__version__

'1.0.1.post2'

In [3]:
__version__

'1.0.50.post1'

In [4]:
import pdb

## Python debugger pdb

Instead of scattering many print() calls through your code, use pdb.set_trace() to pause execution and inspect the program state interactively (see the [pdb cheatsheet](https://github.com/nblock/pdb-cheatsheet/releases) for the available commands):

In [5]:
y = 0

In [6]:
for x in range(10):
    y += x
    pdb.set_trace()  # enters the debugger on every iteration; at the prompt: n = next line, c = continue to the next set_trace(), u = up one stack frame, q = quit

> <ipython-input-6-31dc6367e2f1>(1)<module>()
-> for x in range(10):
(Pdb) n
> <ipython-input-6-31dc6367e2f1>(2)<module>()
-> y += x
(Pdb) n
> <ipython-input-6-31dc6367e2f1>(3)<module>()
-> pdb.set_trace()
(Pdb) print(y)
1
(Pdb) n
> <ipython-input-6-31dc6367e2f1>(1)<module>()
-> for x in range(10):
(Pdb) c
> <ipython-input-6-31dc6367e2f1>(1)<module>()
-> for x in range(10):
(Pdb) c
> <ipython-input-6-31dc6367e2f1>(1)<module>()
-> for x in range(10):
(Pdb) print(y)
6
(Pdb) print(x)
3
(Pdb) u
> /home/mmp/anaconda3/envs/fastai/lib/python3.7/site-packages/IPython/core/interactiveshell.py(3291)run_code()
-> exec(code_obj, self.user_global_ns, self.user_ns)
(Pdb) u
> /home/mmp/anaconda3/envs/fastai/lib/python3.7/site-packages/IPython/core/interactiveshell.py(3209)run_ast_nodes()
-> if (yield from self.run_code(code, result)):
(Pdb) u
> /home/mmp/anaconda3/envs/fastai/lib/python3.7/site-packages/IPython/core/interactiveshell.py(3044)run_cell_async()
-> interactivity=interactivity, compiler=co

BdbQuit: 

## Tensor data type mismatch

Make sure to always use the right data type.

In [7]:
input = torch.randn(10, 5) # random scores of shape (bs, number of classes); note: this name shadows Python's built-in input()

In [10]:
input

tensor([[ 1.9750, -1.3547, -0.7392, -1.3551,  0.9476],
        [ 0.0473, -0.7427,  0.9868,  1.3153, -0.5540],
        [ 0.0207,  1.1429, -0.6156,  0.2084,  0.0083],
        [-0.2268,  0.5686, -0.3392,  2.0113,  1.3083],
        [ 0.4994, -1.8223, -0.8370,  1.0272, -0.2712],
        [ 1.0959,  0.6956, -0.3961, -0.0597, -0.5974],
        [-1.2216,  1.1892, -0.5037,  0.9391, -0.5234],
        [ 1.0192, -0.8703,  1.7618, -0.2617, -0.0212],
        [ 1.7314, -0.7544,  2.0450,  1.0137, -0.4833],
        [-0.7080,  1.7692, -1.0644,  0.9657, -0.5552]])

In [8]:
# class indices 0-4 repeated twice (one per row of the batch), cast to float,
# which F.cross_entropy rejects for class-index targets (see the error below)
target = torch.cat((torch.arange(5),torch.arange(5))).float()

In [11]:
target

tensor([0., 1., 2., 3., 4., 0., 1., 2., 3., 4.])

In [12]:
F.cross_entropy(input, target)

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target'

In [13]:
target

tensor([0., 1., 2., 3., 4., 0., 1., 2., 3., 4.])

In [14]:
target.type()

'torch.FloatTensor'

The target variable has the wrong data type: it needs to be a torch.LongTensor (= signed 64-bit integer):

In [22]:
target.long()

tensor([0, 1, 2, 3, 4, 0, 1, 2, 3, 4])

In [23]:
target = target.long()

In [24]:
target.type()

'torch.LongTensor'

In [25]:
F.cross_entropy(input, target)

tensor(1.5498)

[See PyTorch tensor datatypes.](https://pytorch.org/docs/stable/tensors.html)

## Tensor size mismatch

Make sure that your tensors have the right dimensions.

### Single layer

In [27]:
# Conv2d layer with 1 input channel and 16 output channels and a 3x3 kernel:
conv_layer = nn.Conv2d(1,16,3)

In [32]:
# random input tensor of shape (bs, h, w) - note that the channel dimension is missing:
x = torch.randn(10, 28, 28)

In [29]:
# running it through the Conv2d results in an error:
conv_layer(x).shape

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [16, 1, 3, 3], but got 3-dimensional input of size [10, 28, 28] instead

In [33]:
# add channel dimension to satisfy Conv2d input shape prerequisites:
x = x.view(-1,1,28,28); x.shape

torch.Size([10, 1, 28, 28])

In [34]:
conv_layer(x).shape

torch.Size([10, 16, 26, 26])

### Network

In [98]:
# Small conv net that is deliberately broken: nn.AdaptiveAvgPool2d(1) produces a
# 4D tensor of shape (bs, 256, 1, 1), but nn.Linear(256, 10) needs the last
# dimension to be 256, so the forward pass fails with a size mismatch.
# NOTE(review): Debugger and View are not defined in the visible notebook -
# presumably small custom nn.Module helpers; define them before uncommenting.
model = nn.Sequential(nn.Conv2d(3, 64, 5), nn.ReLU(),
                      nn.Conv2d(64, 128, 3), nn.ReLU(),
                      nn.Conv2d(128, 256, 3), nn.ReLU(),
                      nn.Conv2d(256, 256, 3), nn.ReLU(),
                      nn.Conv2d(256, 256, 3),
                      nn.AdaptiveAvgPool2d(1),
                      #Debugger(), # uncomment this debugger layer to start pdb at this point of the forward pass
                      #View(-1,256), # would flatten to (bs, 256); left commented out here to trigger the bug
                      nn.Linear(256,10)
                     )

In [99]:
model(torch.randn(64,3,28,28)).shape # input is (bs, c, h, w)

RuntimeError: size mismatch, m1: [16384 x 1], m2: [256 x 10] at /opt/conda/conda-bld/pytorch_1549636813070/work/aten/src/TH/generic/THTensorMath.cpp:940

### Conv helper functions

From https://discuss.pytorch.org/t/utility-function-for-calculating-the-shape-of-a-conv-output/11173/7; have a look there for more helper functions. For the theory, see http://cs231n.github.io/convolutional-networks/.

In [163]:
import math

def num2tuple(num):
    """Return `num` unchanged if it is already a tuple, else the pair `(num, num)`."""
    if isinstance(num, tuple):
        return num
    return (num, num)

def conv2d_output_shape(h_w, kernel_size=1, stride=1, pad=0, dilation=1):
    """Compute the (height, width) of a 2D convolution's output.

    Every argument may be a single number, which is then used for both the
    height and the width, or a `(height, width)` tuple. Each entry of `pad`
    may itself be a pair giving the padding on the two sides of that
    dimension; a single number is used for both sides.

    Returns:
        A tuple `(h, w)` with the output height and width.
    """
    h_w = num2tuple(h_w)
    kernel_size = num2tuple(kernel_size)
    stride = num2tuple(stride)
    pad = num2tuple(pad)
    dilation = num2tuple(dilation)
    # Expand the padding to one (side_a, side_b) pair per spatial dimension.
    pad_per_dim = (num2tuple(pad[0]), num2tuple(pad[1]))

    def _out_size(dim):
        # Extent covered by the dilated kernel beyond its first element.
        span = dilation[dim] * (kernel_size[dim] - 1)
        return math.floor((h_w[dim] + sum(pad_per_dim[dim]) - span - 1) / stride[dim] + 1)

    return _out_size(0), _out_size(1)

In [164]:
conv2d_output_shape(28, 3)

(26, 26)

## Tensorboard visualisation

In [5]:
from fastai.callbacks.tensorboard import *

In [6]:
path = untar_data(URLs.MNIST_SAMPLE)

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


In [7]:
data = ImageDataBunch.from_folder(path)

In [8]:
learn = cnn_learner(data, models.resnet18, metrics=[accuracy])

In [9]:
# Give this run its own log directory so it can be told apart from other runs in TensorBoard.
proj_id = 'standard_lr'
tboard_path = Path('data/tensorboard/' + proj_id)
# Register the TensorBoard writer as a callback factory on the learner, passing the run directory as base_dir.
learn.callback_fns.append(partial(LearnerTensorboardWriter, base_dir=tboard_path, name='Learner'))

In [10]:
# Baseline run: no learning rate is passed, so the library default is used.
# NOTE(review): 20 epochs are requested but the recorded output below lists only epochs 0-9 - confirm.
learn.fit(20)

epoch,train_loss,valid_loss,accuracy,time
0,0.150627,0.092322,0.968597,00:02
1,0.085459,0.069342,0.975466,00:02
2,0.054751,0.038325,0.987242,00:02
3,0.035391,0.037705,0.987242,00:02
4,0.039326,0.029945,0.987242,00:02
5,0.023949,0.035706,0.989696,00:02
6,0.025465,0.025775,0.99264,00:02
7,0.018803,0.020634,0.995584,00:02
8,0.025024,0.019474,0.995584,00:02
9,0.013619,0.022533,0.995093,00:02


In [11]:
learn = cnn_learner(data, models.resnet18, metrics=[accuracy])

In [12]:
# Separate log directory for the run with a too-high learning rate.
proj_id = 'too_high_lr'
tboard_path = Path('data/tensorboard/' + proj_id)
# Register the TensorBoard writer as a callback factory on the fresh learner, passing the run directory as base_dir.
learn.callback_fns.append(partial(LearnerTensorboardWriter, base_dir=tboard_path, name='Learner'))

In [14]:
# Same training with lr=1; the recorded losses below are huge and erratic.
# NOTE(review): 20 epochs are requested but the recorded output below lists only epochs 0-9 - confirm.
learn.fit(20, lr=1)

epoch,train_loss,valid_loss,accuracy,time
0,8736754.0,110.347351,0.649166,00:02
1,173508.4375,27.968611,0.770854,00:02
2,3524.548584,744759.0,0.504416,00:02
3,82.290344,2.554286,0.904318,00:02
4,8.307852,564.973633,0.495584,00:02
5,8.27118,94.207603,0.941609,00:02
6,11.314397,7.316569,0.86212,00:02
7,9.801261,92.336212,0.746811,00:02
8,8.133174,14077.842773,0.504416,00:02
9,12.748188,210.775955,0.515211,00:02


In [15]:
learn = cnn_learner(data, models.resnet18, metrics=[accuracy])

In [16]:
# Separate log directory for the run with a too-low learning rate.
proj_id = 'too_low_lr'
tboard_path = Path('data/tensorboard/' + proj_id)
# Register the TensorBoard writer as a callback factory on the fresh learner, passing the run directory as base_dir.
learn.callback_fns.append(partial(LearnerTensorboardWriter, base_dir=tboard_path, name='Learner'))

In [17]:
# Same training with lr=1e-6; the recorded losses below decrease only slowly.
# NOTE(review): 20 epochs are requested but the recorded output below lists only epochs 0-9 - confirm.
learn.fit(20, lr=1e-6)

epoch,train_loss,valid_loss,accuracy,time
0,0.970415,0.813136,0.499509,00:02
1,0.91627,0.775494,0.516192,00:02
2,0.885066,0.742918,0.546614,00:02
3,0.863296,0.708381,0.577527,00:02
4,0.814499,0.678808,0.59421,00:02
5,0.795118,0.654919,0.634446,00:02
6,0.776424,0.640722,0.633955,00:02
7,0.750561,0.614644,0.660451,00:02
8,0.714594,0.591084,0.6737,00:02
9,0.691315,0.575833,0.699706,00:02


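Start TensorBoard with `tensorboard --logdir=data/tensorboard` (the parent directory of the run directories used above; in general `tensorboard --logdir=path/to/log-directory`) to compare the three runs, see https://www.tensorflow.org/guide/summaries_and_tensorboard.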