# Using the pytorch example for CIFAR10

#### Note on downloading the dataset:
Downloaded the cifar10 dataset manually using curl, but the code below will download it if it missing from the datasets folder.

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import v2
import matplotlib.pyplot as plt
import numpy as np
import multiprocessing

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

d = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
workers = multiprocessing.cpu_count()-1
print(f'Workers: {workers}')

Workers: 11


In [3]:
transform = transforms.Compose(
    [transforms.RandomHorizontalFlip(p=0.5),
     transforms.RandomPerspective(distortion_scale=0.5, p=0.5),
     transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
     transforms.ToTensor()])

batch_size = 100
workers = multiprocessing.cpu_count()-1
print(f'Workers: {workers}')
trainset = torchvision.datasets.CIFAR10(root='./datasets', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=workers)
testset = torchvision.datasets.CIFAR10(root='./datasets', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=workers)
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Workers: 11


100%|██████████| 170M/170M [00:01<00:00, 102MB/s]


In [35]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(   3, 2**4, 3, padding = 1)
        self.conv2 = nn.Conv2d(2**4, 2**5, 3, padding = 1)

        self.conv3 = nn.Conv2d(2**5, 2**6, 3, padding = 1)
        self.conv4 = nn.Conv2d(2**6, 2**7, 3, padding = 1)

        self.conv5 = nn.Conv2d(2**7, 2**8, 3, padding = 1)
        self.conv6 = nn.Conv2d(2**8, 2**9, 3, padding = 1)

        self.bn1 = nn.BatchNorm2d(2**4)
        self.bn2 = nn.BatchNorm2d(2**6)
        self.bn3 = nn.BatchNorm2d(2**8)

        self.dropout = nn.Dropout2d(0.2)
        self.shortcut1 = nn.Sequential(nn.Conv2d(3, 2**5, kernel_size=3, padding = 1, bias=False),
                                       nn.BatchNorm2d(2**5))
        self.shortcut2 = nn.Sequential(nn.Conv2d(2**5, 2**7, kernel_size=3, padding = 1, bias=False),
                                       nn.BatchNorm2d(2**7))
        self.shortcut3 = nn.Sequential(nn.Conv2d(2**7, 2**9, kernel_size=3, padding = 1, bias=False),
                                       nn.BatchNorm2d(2**9))

        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc0 = nn.Linear(2**13, 2**13)
        self.fc1 = nn.Linear(2**13, 2**12)
        self.fc2 = nn.Linear(2**12, 2**11)
        self.fc3 = nn.Linear(2**11, 10)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.conv2(out)
        out += self.shortcut1(x)
        out = F.relu(out)
        x = self.pool2(out)

        out = F.relu(self.bn2(self.conv3(x)))
        out = self.conv4(out)
        out += self.shortcut2(x)
        out = F.relu(out)
        x = self.pool2(out)

        out = F.relu(self.bn3(self.conv5(x)))
        out = self.conv6(out)
        out += self.shortcut3(x)
        out = F.relu(out)
        x = self.pool2(out)

        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc0(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x


In [36]:
loss_list = []

cutmix = v2.CutMix(num_classes=10)
mixup = v2.MixUp(num_classes=10)
cutmix_or_mixup = v2.RandomChoice([cutmix, mixup])


transform = transforms.Compose(
    [transforms.RandomHorizontalFlip(p=0.5),
     transforms.RandomCrop(32, padding=4),
     transforms.RandomPerspective(distortion_scale=0.5, p=0.5),
     transforms.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.15, hue=0.15),
     transforms.ToTensor()])

batch_size = 80
workers = multiprocessing.cpu_count()-2
trainset = torchvision.datasets.CIFAR10(root='./datasets', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=workers)
testset = torchvision.datasets.CIFAR10(root='./datasets', train=False,
                                      download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                        shuffle=False, num_workers=workers)
classes = ('plane', 'car', 'bird', 'cat',
          'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

net = Net().to(d)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
#lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.1, patience=10,min_lr=0.00001)
import time
t1 = time.time()
correct, total = 0,1
for epoch in range(300):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        data = cutmix_or_mixup(data[0], data[1])
        inputs, labels = data[0].to(d), data[1].to(d)

        optimizer.zero_grad()

        outputs = net(inputs).to(d)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    if epoch % 10 == 5:
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                images, labels = data[0].to(d), data[1].to(d)
                outputs = net(images).to(d)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Accuracy: {round(100.0 * float(correct / total),2)}, Loss: {round(running_loss / 2000,2)}, time: {round(time.time()-t1, 2)}s')

    #lr_scheduler.step(running_loss / i)
    print(f'epoch: {epoch}, Loss: {round(running_loss / 2000,2)}, time: {round(time.time()-t1, 2)}s')

print('Finished Training')



epoch: 0, Loss: 0.67, time: 12.28s
epoch: 1, Loss: 0.63, time: 24.51s
epoch: 2, Loss: 0.61, time: 36.96s
epoch: 3, Loss: 0.59, time: 49.18s
epoch: 4, Loss: 0.58, time: 61.53s
Accuracy: 53.38, Loss: 0.57, time: 75.81s
epoch: 5, Loss: 0.57, time: 75.81s
epoch: 6, Loss: 0.56, time: 88.07s
epoch: 7, Loss: 0.55, time: 100.5s
epoch: 8, Loss: 0.55, time: 112.91s
epoch: 9, Loss: 0.54, time: 125.18s
epoch: 10, Loss: 0.53, time: 137.58s
epoch: 11, Loss: 0.53, time: 149.8s
epoch: 12, Loss: 0.52, time: 162.15s
epoch: 13, Loss: 0.51, time: 174.31s
epoch: 14, Loss: 0.52, time: 186.58s
Accuracy: 64.65, Loss: 0.5, time: 200.93s
epoch: 15, Loss: 0.5, time: 200.93s
epoch: 16, Loss: 0.51, time: 213.31s
epoch: 17, Loss: 0.51, time: 225.78s
epoch: 18, Loss: 0.5, time: 238.12s
epoch: 19, Loss: 0.5, time: 250.47s
epoch: 20, Loss: 0.49, time: 262.85s
epoch: 21, Loss: 0.5, time: 275.16s
epoch: 22, Loss: 0.49, time: 287.48s
epoch: 23, Loss: 0.49, time: 299.6s
epoch: 24, Loss: 0.48, time: 311.91s
Accuracy: 70.64

KeyboardInterrupt: 

In [None]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

Accuracy of the network on the 10000 test images: 73 %


# Failed experiments

#### Experiment 1

```python
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 3, padding = 1)
        self.conv2 = nn.Conv2d(6, 16, 3, padding = 1)
        self.conv3 = nn.Conv2d(16, 32, 3, padding = 1)
        self.conv4 = nn.Conv2d(32, 64, 3, padding = 1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.pool4 = nn.MaxPool2d(4, 4)
        self.fc1 = nn.Linear(64, 30)
        self.fc2 = nn.Linear(30, 10)

    def forward(self, x):
        x = self.pool2(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool2(F.relu(self.conv3(x)))
        x = self.pool4(F.relu(self.conv4(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


net = Net()
```
After 10 epochs got to 65% on test. 16mins on cpu.
After 20 epochs got to 66% on test. 23mins on cpu.

#### Test 2

I made it bigger, and got no decrease in computation speed, but now in 10 epochs i got 73%! the loss began platouing at the end anyway, so I dont think more iterations will increase the test test by that much.
```python
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding = 1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding = 1)
        self.conv3 = nn.Conv2d(32, 64, 3, padding = 1)
        self.conv4 = nn.Conv2d(64, 128, 3, padding = 1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.pool4 = nn.MaxPool2d(4, 4)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 10)

    def forward(self, x):
        x = self.pool2(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool2(F.relu(self.conv3(x)))
        x = self.pool4(F.relu(self.conv4(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
```

#### Test 3

I Added more linear layers at the end, and ran it for 20 epochs. Interestingly this did nothing whatsoever, still got a test accuracy of 73%, thought this time ~30 minutes on cpu.

#### Test4 - Have been getting the best results on the bigger models, so now to compensate for overfitting, I'll add some data augmentation techniques.

this got it up to 83% Which is nice, but not quite there.

#### Test5 - you dont actually need to max pool every conv layer.

This lets you add more conv layers cause you dont reduce the size of the image till you cant do convolutions

```python
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(   3, 2**4, 3, padding = 1)
        self.conv2 = nn.Conv2d(2**4, 2**5, 3, padding = 1)
        self.conv3 = nn.Conv2d(2**5, 2**6, 3, padding = 1)

        self.conv4 = nn.Conv2d(2**6, 2**7, 3, padding = 1)
        self.conv5 = nn.Conv2d(2**7, 2**8, 3, padding = 1)
        self.conv6 = nn.Conv2d(2**8, 2**9, 3, padding = 1)

        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(2**13, 2**12)
        self.fc2 = nn.Linear(2**12, 2**11)
        self.fc3 = nn.Linear(2**11, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        x = self.pool2(x)

        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))

        x = self.pool2(x)

        x = F.relu(self.conv5(x))
        x = F.relu(self.conv6(x))

        x = self.pool2(x)
        
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net().to(d)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.0004, momentum=0.9)
```
Got up to 77% after 20 epochs, ~1hr on google colab.

#### Added batch normalization, and increased learning rate

Got up to 86% after 23 epochs, ~1.5 hours on google colab, inctreased batch size to 100 and got %91 after 90 epochs, 29mins.

```python
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(   3, 2**4, 3, padding = 1)
        self.conv2 = nn.Conv2d(2**4, 2**5, 3, padding = 1)

        self.conv3 = nn.Conv2d(2**5, 2**6, 3, padding = 1)
        self.conv4 = nn.Conv2d(2**6, 2**7, 3, padding = 1)

        self.conv5 = nn.Conv2d(2**7, 2**8, 3, padding = 1)
        self.conv6 = nn.Conv2d(2**8, 2**9, 3, padding = 1)

        self.bn1 = nn.BatchNorm2d(2**4)
        self.bn2 = nn.BatchNorm2d(2**6)
        self.bn3 = nn.BatchNorm2d(2**8)

        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(2**13, 2**12)
        self.fc2 = nn.Linear(2**12, 2**11)
        self.fc3 = nn.Linear(2**11, 10)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.conv2(x))

        x = self.pool2(x)

        x = F.relu(self.bn2(self.conv3(x)))
        x = F.relu(self.conv4(x))

        x = self.pool2(x)

        x = F.relu(self.bn3(self.conv5(x)))
        x = F.relu(self.conv6(x))

        x = self.pool2(x)
        
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
```

In [17]:
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split

import torch
import numpy as np
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

d = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
transform = transforms.Compose(
    [transforms.ToTensor()])

# Download the data, if not already on disk and load it as numpy arrays
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

X = lfw_people.images
Y = lfw_people.target
# Verify the value range of X_train. No normalization is necessary in this case,
# as the input values already fall within the range of 0.0 to 1.0.
print("X_min:",X.min(),"X_train_max:", X.max())
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
X_train = X_train[:, np.newaxis, :, :]
X_test = X_test[:, np.newaxis, :, :]
print("X_train shape:", X_train.shape)

class zipped_ds:
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    def __getitem__(self, i):
        return self.data[i], self.labels[i]
    def __len__(self):
        return len(self.data)
batch_size = 8
trainloader = torch.utils.data.DataLoader(zipped_ds(X_train, y_train), batch_size=batch_size,
                                          shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(zipped_ds(X_test, y_test), batch_size=batch_size,
                                         shuffle=False, num_workers=2)

X_min: 0.0 X_train_max: 1.0
X_train shape: (966, 1, 50, 37)


In [25]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(   1, 32, 3, padding = 1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding = 1)
        self.conv3 = nn.Conv2d(64, 96, 3, padding = 1)

        self.pool2 = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(2304, 1024)
        self.fc2 = nn.Linear(1024, 7)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool2(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool2(x)

        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


net = Net().to(d)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [26]:
import matplotlib.pyplot as plt
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data[0].to(d), data[1].to(d)
        optimizer.zero_grad()

        outputs = net(inputs).to(d)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(d), data[1].to(d)
            outputs = net(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'epoch: {epoch}, loss: {round(running_loss / i,2)}, Accuracy: {100 * correct // total} %')

print('Finished Training')

epoch: 0, loss: 1.77, Accuracy: 45 %
epoch: 1, loss: 1.73, Accuracy: 45 %
epoch: 2, loss: 1.72, Accuracy: 45 %
epoch: 3, loss: 1.72, Accuracy: 45 %
epoch: 4, loss: 1.68, Accuracy: 46 %
epoch: 5, loss: 1.56, Accuracy: 47 %
epoch: 6, loss: 1.41, Accuracy: 59 %
epoch: 7, loss: 1.12, Accuracy: 70 %
epoch: 8, loss: 0.9, Accuracy: 77 %
epoch: 9, loss: 0.67, Accuracy: 73 %
Finished Training
