In [0]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from PIL import Image
from torch.autograd import Variable as var
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data.sampler import SubsetRandomSampler


# 1: Data preprocessing

Read the CSV files into pandas DataFrames, then create custom PyTorch training and validation datasets to load into DataLoaders.


In [0]:
#Write a custom PyTorch map-style dataset by implementing the __len__() and __getitem__() methods
#Assume the csv files are in the same directory as the notebook, for simplicity's sake
class ds(Dataset):
    """Map-style dataset backed by a CSV of flattened 28x28 grayscale images.

    With ``train_val=True`` the first CSV column is read as the label and the
    remaining columns as pixels; items are ``(image, label)`` pairs.  With
    ``train_val=False`` every column is a pixel and items are bare images.

    Args:
        csv_file: Anything ``pandas.read_csv`` accepts (path or file-like).
        transform: Callable applied to the PIL image of each row, or ``None``
            to get the PIL image back untransformed.
        train_val: ``True`` for labelled data, ``False`` for unlabelled data.
    """

    # Each CSV row holds one image flattened to 28 * 28 = 784 pixel values.
    IMAGE_SHAPE = (28, 28)

    def __init__(self, csv_file, transform, train_val):
        frame = pd.read_csv(csv_file)
        if train_val:
            self.train_df = frame
            self.image_arr = np.asarray(frame.iloc[:, 1:])
            self.label_arr = np.asarray(frame.iloc[:, 0])
            self.train_df_len = len(frame)
        else:
            self.test_df = frame
            self.image_arr = np.asarray(frame)
            # Read from outside the class to size the submission frame.
            self.test_df_len = len(frame)
        self.train_val = train_val
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the backing CSV."""
        return self.train_df_len if self.train_val else self.test_df_len

    def _load_image(self, index):
        """Build the mode-'L' PIL image stored in row ``index``."""
        pixels = np.asarray(self.image_arr[index]).reshape(self.IMAGE_SHAPE).astype('uint8')
        return Image.fromarray(pixels).convert('L')

    def __getitem__(self, index):
        """Return ``(image, label)`` in labelled mode, or just ``image`` otherwise.

        The image is passed through ``self.transform`` when one was given.
        Without a transform the PIL image itself is returned; previously the
        labelled branch raised ``UnboundLocalError`` and the unlabelled branch
        returned the raw flat CSV row.
        """
        img = self._load_image(index)
        if self.transform is not None:
            img = self.transform(img)
        if self.train_val:
            return (img, self.label_arr[index])
        return img
  


One way to improve the model's accuracy is to artificially increase the number of training examples.
https://www.kaggle.com/cdeotte/25-million-images-0-99757-mnist generated 25 million additional images for 15
CNNs to train on, obtaining the highest known MNIST accuracy of 99.75%! For now, we keep everything simple.

In [0]:
# Preprocessing pipeline handed to the datasets: only tensor conversion for now.
# Augmentations left disabled: transforms.RandomHorizontalFlip(), transforms.RandomRotation(30).
transform = transforms.Compose([transforms.ToTensor()])

A validation set is incredibly important for Kaggle competitions, since the test dataset we are given contains no labels!

In [0]:
# Load the labelled data once; both loaders read from it through disjoint index subsets.
dataset = ds('train.csv', transform=transform, train_val=True)

batch_size = 16
validation_split = .2
shuffle_dataset = True
random_seed = 42

# Optionally shuffle the row indices (seeded), then take the first `split` of
# them for validation and the remainder for training.
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
val_indices = indices[:split]
train_indices = indices[split:]

# Each sampler draws only from its own index subset.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
validation_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

# 2. Define Model

For this we just use a pretty simple CNN architecture.

In [0]:
class myCNN(nn.Module):
    """Two-convolution CNN producing 10 class scores per image.

    Both convolutions are unpadded, so a 1x28x28 input shrinks to 3x26x26 and
    then 2x22x22, which flattens to the 968 features ``linear1`` expects.
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict key prefixes; keep them stable.
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3)
        self.cnn2 = nn.Conv2d(in_channels=3, out_channels=2, kernel_size=5)
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(in_features=968, out_features=200)
        self.linear2 = nn.Linear(in_features=200, out_features=10)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of class scores."""
        batch = x.size(0)
        feats = self.relu(self.cnn1(x))
        feats = self.relu(self.cnn2(feats))
        # Flatten everything except the batch dimension for the dense layers.
        hidden = self.linear1(feats.view(batch, -1))
        # Dropout is applied to the pre-activation, then ReLU, then the output layer.
        hidden = self.dropout(hidden)
        hidden = self.relu(hidden)
        return self.linear2(hidden)
        

# 3. Train CNN

In [0]:
# Hyperparameters and the objects used by the training loop.
learning_rate = 1e-3

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mycnn = myCNN().to(device)

cec = nn.CrossEntropyLoss()
optimizer = optim.Adam(mycnn.parameters(), lr=learning_rate)

n_epoch = 30    # full passes over the training loader
n_print = 1000  # report loss / validation accuracy every n_print batches
# NOTE(review): n_batch is not referenced by the cells shown here; the loaders
# are built with batch_size = 16. Confirm whether it can be dropped.
n_batch = 64

In [0]:
def validate(model, data):
    """Return the classification accuracy of ``model`` over ``data`` in percent.

    The model is switched to eval mode for the duration of the call (so
    dropout does not perturb the predictions), gradients are not tracked, and
    the model's previous train/eval mode is restored before returning.

    Args:
        model: ``nn.Module`` mapping an image batch to per-class scores.
        data: Iterable of ``(images, labels)`` batches, e.g. a ``DataLoader``.

    Returns:
        0-dim float tensor holding ``correct * 100 / total``; callers may wrap
        it in ``float(...)``.

    Raises:
        ZeroDivisionError: if ``data`` yields no samples.
    """
    # Run on whatever device the model lives on rather than assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()
    total = 0
    correct = 0
    try:
        with torch.no_grad():
            for images, labels in data:
                if device is not None:
                    images = images.to(device)
                scores = model(images)
                # Predicted class = index of the highest score in each row.
                pred = scores.argmax(dim=1).cpu()
                total += scores.size(0)
                correct += int((pred == labels.cpu()).sum())
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)
    return torch.tensor(correct * 100. / total)

In [59]:
# Train for n_epoch passes over train_loader, reporting the current batch loss
# and the validation accuracy every n_print batches.
# Batches follow the model's device instead of assuming CUDA.
device = next(mycnn.parameters()).device
for e in range(n_epoch):
    mycnn.train()  # make sure dropout is active while training
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        pred = mycnn(images)
        loss = cec(pred, labels)
        loss.backward()
        optimizer.step()
        if (i + 1) % n_print == 0:
            accuracy = float(validate(mycnn, validation_loader))
            print('Epoch :',e+1,'Batch :',i+1,'Loss :',loss.item(),'Accuracy :',accuracy,'%')

Epoch : 1 Batch : 1000 Loss : 0.061713606119155884 Accuracy : 94.75 %
Epoch : 1 Batch : 2000 Loss : 0.05642193555831909 Accuracy : 95.44047546386719 %
Epoch : 2 Batch : 1000 Loss : 0.0018565952777862549 Accuracy : 96.52381134033203 %
Epoch : 2 Batch : 2000 Loss : 0.09067606925964355 Accuracy : 96.94047546386719 %
Epoch : 3 Batch : 1000 Loss : 0.004040926694869995 Accuracy : 97.28571319580078 %
Epoch : 3 Batch : 2000 Loss : 0.6924932599067688 Accuracy : 96.82142639160156 %
Epoch : 4 Batch : 1000 Loss : 0.062066853046417236 Accuracy : 97.3452377319336 %
Epoch : 4 Batch : 2000 Loss : 0.008477121591567993 Accuracy : 96.47618865966797 %
Epoch : 5 Batch : 1000 Loss : 0.004570364952087402 Accuracy : 97.19047546386719 %
Epoch : 5 Batch : 2000 Loss : 0.004425138235092163 Accuracy : 97.38095092773438 %
Epoch : 6 Batch : 1000 Loss : 0.0014649033546447754 Accuracy : 97.26190185546875 %
Epoch : 6 Batch : 2000 Loss : 0.009091228246688843 Accuracy : 97.32142639160156 %
Epoch : 7 Batch : 1000 Loss : 0

In [0]:
# Save only the learned parameters (state_dict). The path is relative to the
# notebook's working directory, like the CSV paths used elsewhere, rather than
# the filesystem root, which is normally not writable.
save_path = 'model.pth'
torch.save(mycnn.state_dict(), save_path)

# 4. Test model and submit predictions

In [0]:
# Unlabelled test images, served one per batch with no sampler or shuffling requested.
test_dataset = ds('test.csv', transform=transform, train_val=False)
test_loader = DataLoader(test_dataset, batch_size=1)

In [0]:
# Empty submission frame with one row per test image.
columns = ['ImageId', 'Label']
index = range(test_dataset.test_df_len)
df = pd.DataFrame(columns=columns, index=index)

In [0]:
# Predict a label for every test image and fill the submission frame.
# Inference runs in eval mode (dropout disabled) and without gradient tracking;
# the model's previous train/eval mode is restored afterwards.
device = next(mycnn.parameters()).device
was_training = mycnn.training
mycnn.eval()
predictions = []
try:
    with torch.no_grad():
        for images in test_loader:
            out = mycnn(images.to(device))
            # Works for any batch size: one predicted class index per row.
            predictions.extend(out.argmax(dim=1).cpu().tolist())
finally:
    mycnn.train(was_training)

# ImageId is 1-based and follows the order in which the loader yielded images.
df['ImageId'] = np.arange(1, len(predictions) + 1)
df['Label'] = predictions

In [87]:
# Show the two submission columns as plain text.
print(df.loc[:, ['ImageId', 'Label']])

       ImageId  Label
0            1      2
1            2      0
2            3      9
3            4      0
4            5      3
...        ...    ...
27995    27996      9
27996    27997      7
27997    27998      3
27998    27999      9
27999    28000      2

[28000 rows x 2 columns]


In [0]:
# Write the submission file without the DataFrame index column.
df.to_csv('submission.csv', index=False)