## UNDERSTANDING DEEP LEARNING REQUIRES RE-THINKING GENERALIZATION

Chiyuan Zhang∗
Massachusetts Institute of Technology
chiyuan@mit.edu

Samy Bengio
Google Brain
bengio@google.com

Moritz Hardt
Google Brain
mrtz@google.com

Benjamin Recht†
University of California, Berkeley
brecht@berkeley.edu

Oriol Vinyals
Google DeepMind
vinyals@google.com

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
input_channels = 3
num_classes = 10

device = 'cuda'
lr = 0.001
epochs = 20
batch_size = 32

In [3]:
img_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.CenterCrop(28)
])

def norm_image(data_sample):
    img_tensor = data_sample[0]
    label = data_sample[1]

    img_means = img_tensor.mean(axis=[1,2])
    img_sds = img_tensor.std(axis=[1,2])

    mean_sub = img_tensor - img_means.unsqueeze(1).unsqueeze(2)
    img_norm = mean_sub.true_divide(img_sds.unsqueeze(1).unsqueeze(2))

    return (img_norm, label)

In [4]:
# training set

all_train = list(datasets.CIFAR10(root = 'data/', transform=img_transform, train = True, download=True))

random.shuffle(all_train)

train_data = all_train[:40000]
train_transformed = list(map(norm_image, train_data))
train_loader = DataLoader(dataset=train_transformed, batch_size=batch_size, shuffle=True)

val_data = all_train[40000:]
val_transformed = list(map(norm_image, val_data))
val_loader = DataLoader(dataset=val_transformed, batch_size=batch_size, shuffle=True)

test_data = datasets.CIFAR10(root='data/', transform=img_transform, train=False, download=True)
test_transformed = list(map(norm_image, val_data))
test_loader = DataLoader(dataset=val_transformed, batch_size=batch_size)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 43081727.19it/s]


Extracting data/cifar-10-python.tar.gz to data/
Files already downloaded and verified


In [5]:
class conv_block(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(conv_block, self).__init__()
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.batchnorm(x)
        x = self.relu(x)
        return x

class inception_block(nn.Module):
    def __init__(self, in_channels, out_ch1, out_ch3):
        super(inception_block, self).__init__()
        self.ch1 = conv_block(in_channels=in_channels, out_channels=out_ch1, kernel_size=(3,3), stride=(1,1), padding='same')
        self.ch3 = conv_block(in_channels=in_channels, out_channels=out_ch3, kernel_size=(3,3), stride=(1,1), padding='same')

    def forward(self,x):
        return torch.cat([self.ch1(x),self.ch3(x)],1)

class downsample_block(nn.Module):
    def __init__(self, in_channels, conv_out):
        super(downsample_block, self).__init__()
        self.convblock = conv_block(in_channels, conv_out, kernel_size=(3,3), stride=(2,2))
        self.maxpool = nn.MaxPool2d(kernel_size=(3,3), stride=(2,2))

    def forward(self,x):
        return torch.cat([self.convblock(x),self.maxpool(x)],1)

class mini_GoogLeNet(nn.Module):
    def __init__(self,in_channels=3, num_classes=10, dropout_prob=0):
        super(mini_GoogLeNet, self).__init__()
        self.conv1 = conv_block(in_channels=3, out_channels=96, kernel_size=(3,3), stride=(1,1))
        self.inception1 = inception_block(96,32,32)
        self.inception2 = inception_block(64,32,48)
        self.downsample1 = downsample_block(80,80)
        self.inception3 = inception_block(160,112,48)
        self.inception4 = inception_block(160,96,64)
        self.inception5 = inception_block(160,80,80)
        self.inception6 = inception_block(160,48,96)
        self.downsample2 = downsample_block(144,96)
        self.inception7 = inception_block(240,176,160)
        self.inception8 = inception_block(336,176,160)
        self.avgpool = nn.AvgPool2d(kernel_size=(7,7), padding=(1,1))
        self.dropout = nn.Dropout(p = dropout_prob)
        self.fc = nn.Linear(336,10)

    def forward(self,x):
        x = self.conv1(x)
        x = self.inception1(x)
        x = self.inception2(x)
        x = self.downsample1(x)
        x = self.inception3(x)
        x = self.inception4(x)
        x = self.inception5(x)
        x = self.inception6(x)
        x = self.downsample2(x)
        x = self.inception7(x)
        x = self.inception8(x)
        x = self.avgpool(x)
        x = x.reshape(x.shape[0],-1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [6]:
model = mini_GoogLeNet(in_channels=3, num_classes=10, dropout_prob=0).to(device=device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.LinearLR(optimizer)

In [7]:
def train(epoch):
    model.train()
    curr_loss_train = 0
    correct_train = 0
    total_train = 0

    for ind, (data_train, true_labels_train) in enumerate(train_loader):
        data_train = data_train.to(device=device)
        true_labels_train = true_labels_train.to(device=device)

        out_train = model(data_train)
        loss_train = loss_function(out_train, true_labels_train)

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

        curr_loss_train += loss_train.item()
        ix, predicted_train = out_train.max(1)
        correct_train += predicted_train.eq(true_labels_train).sum().item()
        total_train += true_labels_train.size(0)

    train_loss = curr_loss_train/len(train_loader)
    acc_train_val = (correct_train/total_train)*100

    train_acc.append(acc_train_val)
    train_all_loss.append(train_loss)

def test(epoch):
    model.eval()
    curr_loss_test = 0
    correct_test = 0
    total_test = 0

    num_class = 10
    confusion_matrix = torch.zeros(num_class, num_class)
    with torch.no_grad():
        for data_test, true_labels_test in test_loader:

            data_test = data_test.to(device=device)
            true_labels_test = true_labels_test.to(device=device)

            out_test = model(data_test)
            loss_test = loss_function(out_test, true_labels_test)

            # metrics
            curr_loss_test += loss_test.item()
            ix, predicted_test = out_test.max(1)
            correct_test += predicted_test.eq(true_labels_test).sum().item()
            total_test += true_labels_test.size(0)


    test_loss = curr_loss_test/len(test_loader)
    acc_test_val = (correct_test/total_test)*100

    test_acc.append(acc_test_val)
    test_all_loss.append(test_loss)
    con_mats.append(confusion_matrix)

In [8]:
train_acc = []
train_all_loss = []

test_acc = []
test_all_loss = []

con_mats = []

times = []
train_times = []

In [9]:
train_start = time.time()
for epoch in range(epochs):
    ep_start = time.time()
    print(f"Epoch {epoch}")
    train(epoch)
    test(epoch)
    scheduler.step()

    epoch_time = time.time() - ep_start
    print(f"Epoch time: {epoch_time:0.2f} seconds")
    times.append(epoch_time)

    train_time = time.time() - train_start
    train_times.append(train_time)

print(f"Total training time: {train_time:0.2f} seconds")

Epoch 0
Epoch time: 31.04 seconds
Epoch 1
Epoch time: 30.34 seconds
Epoch 2
Epoch time: 30.30 seconds
Epoch 3
Epoch time: 29.86 seconds
Epoch 4
Epoch time: 30.13 seconds
Epoch 5
Epoch time: 30.22 seconds
Epoch 6
Epoch time: 30.30 seconds
Epoch 7
Epoch time: 30.08 seconds
Epoch 8
Epoch time: 30.14 seconds
Epoch 9
Epoch time: 30.15 seconds
Epoch 10
Epoch time: 30.14 seconds
Epoch 11
Epoch time: 30.19 seconds
Epoch 12
Epoch time: 30.19 seconds
Epoch 13
Epoch time: 30.25 seconds
Epoch 14
Epoch time: 30.19 seconds
Epoch 15
Epoch time: 30.17 seconds
Epoch 16
Epoch time: 30.21 seconds
Epoch 17
Epoch time: 30.16 seconds
Epoch 18
Epoch time: 30.17 seconds
Epoch 19
Epoch time: 30.15 seconds
Total training time: 604.39 seconds


In [10]:
df_res = pd.DataFrame()
df_res['TrainAccuracy'] = train_acc
df_res['TrainLoss'] = train_all_loss
df_res['TestAccuracy'] = test_acc
df_res['TestLoss'] = test_all_loss
df_res['EpochTime'] = times
df_res['Totaltime'] = train_times

df_res.to_csv('results.csv', index=False)

In [11]:
with open('19.pickle','wb') as handle:
    pickle.dump(con_mats, handle)

In [12]:
pd.set_option('display.max_rows', None)
df_res

Unnamed: 0,TrainAccuracy,TrainLoss,TestAccuracy,TestLoss,EpochTime,Totaltime
0,20.335,2.237839,29.28,2.151448,31.041313,31.041621
1,32.5325,2.067018,37.3,1.938996,30.337848,61.379571
2,38.645,1.867816,43.68,1.740216,30.299116,91.678797
3,45.0175,1.663938,49.76,1.522678,29.861329,121.540226
4,51.1125,1.469453,54.53,1.341906,30.134783,151.675122
5,56.03,1.311972,52.37,1.381297,30.224611,181.900501
6,60.3625,1.186065,63.07,1.101262,30.304548,212.205151
7,63.855,1.086796,63.77,1.05322,30.083076,242.288325
8,66.195,1.01053,63.21,1.070389,30.135294,272.423751
9,69.0125,0.937463,66.81,0.945797,30.146601,302.570465


In [13]:
max_accuracy = df_res['TestAccuracy'].max()
print("Best Accuracy: ", max_accuracy)

Best Accuracy:  72.08
