In [278]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
import pandas as pd
import torch.nn as nn
from PIL import Image
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler

# 1. Data preprocessing

- Read the CSV files into pandas DataFrames, then build custom PyTorch training and validation datasets to load into DataLoaders.

In [274]:
# Custom PyTorch map-style dataset: implements __len__() and __getitem__().
# Assumes the CSV files are in the same directory as the notebook, for simplicity's sake.
class ds(Dataset):
    """Map-style dataset backed by a CSV file of flattened 28x28 images.

    Column 0 of the CSV is read as the label; all remaining columns are read
    as the pixel values of one image, which ``__getitem__`` reshapes to
    (28, 28) and casts to ``uint8``.

    Args:
        csv_file: Path (or any source accepted by ``pandas.read_csv``) of the CSV.
        transform: Optional callable applied to the PIL image of each sample.
            When ``None`` the PIL image itself is returned.
    """

    def __init__(self, csv_file, transform=None):
        self.train_df = pd.read_csv(csv_file)
        self.transform = transform
        # Every column after the first holds pixel data; the first is the label.
        self.image_arr = np.asarray(self.train_df.iloc[:, 1:])
        self.label_arr = np.asarray(self.train_df.iloc[:, 0])
        self.train_df_len = len(self.train_df)

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return self.train_df_len

    def __getitem__(self, index):
        """Return ``(image, label)`` for row ``index``.

        ``image`` is ``self.transform(pil_image)`` when a transform was given,
        otherwise the greyscale PIL image.
        """
        label = self.label_arr[index]
        img_as_np = np.asarray(self.image_arr[index]).reshape(28, 28).astype('uint8')
        img_as_img = Image.fromarray(img_as_np).convert('L')
        # Previously the result was only bound inside the `transform is not None`
        # branch, so a None transform raised UnboundLocalError at the return.
        if self.transform is None:
            return (img_as_img, label)
        return (self.transform(img_as_img), label)


In [283]:
# Per-sample preprocessing pipeline handed to the dataset; its single step
# converts the PIL image into a tensor.
transform = transforms.Compose([transforms.ToTensor()])

A validation set is incredibly important for Kaggle competitions: the test dataset we are given contains no labels, so held-out training data is the only way to measure accuracy ourselves!

In [284]:
# Split / loading configuration.
batch_size = 16
validation_split = .2   # fraction of samples held out for validation
shuffle_dataset = True
random_seed = 42

dataset = ds('train.csv', transform=transform)

# Build the index list and work out how many samples go to validation.
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))

# Seeded in-place shuffle so the train/validation partition is repeatable.
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)

# The first `split` indices are held out; the rest are used for training.
val_indices = indices[:split]
train_indices = indices[split:]

# Both loaders share one dataset object; the samplers restrict each loader
# to its own subset of indices.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
validation_loader = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)

# 2. Define Model

In [225]:
class myCNN(nn.Module):
    """Small CNN: two conv+ReLU stages followed by one linear layer with 10 outputs.

    With the default stride and no padding, a 1x28x28 input becomes 3x26x26
    after ``cnn1`` and 2x22x22 after ``cnn2``, i.e. 968 features per sample,
    which is the input width ``linear1`` expects.
    """

    def __init__(self):
        super().__init__()
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3)
        self.cnn2 = nn.Conv2d(in_channels=3, out_channels=2, kernel_size=5)
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(in_features=968, out_features=10)

    def forward(self, x):
        """Map a batch ``x`` to a ``(batch, 10)`` tensor of raw class scores."""
        batch = x.size(0)
        features = self.relu(self.cnn2(self.relu(self.cnn1(x))))
        # Flatten everything except the batch dimension before the linear layer.
        return self.linear1(features.view(batch, -1))
        

In [280]:
import torch.optim as optim
# Training hyperparameters.
learning_rate = 1e-3
n_epoch = 1     # number of passes over the training loader
n_print = 10    # report loss/accuracy every n_print batches
n_batch = 64    # NOTE(review): not referenced in the visible cells - confirm whether still needed

# Model, loss function and optimizer.
mycnn = myCNN()
cec = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=mycnn.parameters(), lr=learning_rate)

In [281]:
def validate(model, data):
    """Return the classification accuracy of ``model`` over ``data`` in percent.

    Accuracy is ``correct / total * 100`` where a sample counts as correct
    when the index of its highest score equals its label.

    The model is put in eval mode and gradients are disabled for the duration
    of the call, so no autograd graph is built for validation batches; the
    model's previous train/eval mode is restored before returning.

    Args:
        model: ``nn.Module`` mapping a batch of images to per-class scores.
        data: Iterable yielding ``(images, labels)`` batches.

    Returns:
        A 0-dim tensor holding the accuracy in percent, or ``tensor(0.)`` when
        ``data`` yields no samples (instead of dividing by zero).
    """
    total = 0
    correct = 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in data:
                scores = model(images)
                _, pred = torch.max(scores, 1)
                total += scores.size(0)
                correct = correct + torch.sum(pred == labels)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    if total == 0:
        return torch.tensor(0.0)
    return correct * 100. / total

In [282]:
for e in range(n_epoch):
    for i, (images, labels) in enumerate(train_loader):
        # One optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        pred = mycnn(images)
        loss = cec(pred, labels)
        loss.backward()
        optimizer.step()

        # Only report on every n_print-th batch.
        if (i + 1) % n_print != 0:
            continue
        # Accuracy is measured over the whole validation loader each time.
        accuracy = float(validate(mycnn, validation_loader))
        print('Epoch :', e + 1, 'Batch :', i + 1, 'Loss :', loss.item(), 'Accuracy :', accuracy, '%')

Epoch : 1 Batch : 10 Loss : 2.282987594604492 Accuracy : 15.0 %
Epoch : 1 Batch : 20 Loss : 2.160853385925293 Accuracy : 19.0 %
Epoch : 1 Batch : 30 Loss : 2.0959384441375732 Accuracy : 38.0 %
Epoch : 1 Batch : 40 Loss : 1.8996684551239014 Accuracy : 64.0 %
Epoch : 1 Batch : 50 Loss : 1.2505484819412231 Accuracy : 67.0 %
Epoch : 1 Batch : 60 Loss : 0.9471915364265442 Accuracy : 74.0 %
Epoch : 1 Batch : 70 Loss : 0.8029947876930237 Accuracy : 79.0 %
Epoch : 1 Batch : 80 Loss : 0.8752391934394836 Accuracy : 79.0 %
Epoch : 1 Batch : 90 Loss : 0.8643077611923218 Accuracy : 82.0 %
Epoch : 1 Batch : 100 Loss : 0.6359731554985046 Accuracy : 83.0 %
Epoch : 1 Batch : 110 Loss : 0.7015008926391602 Accuracy : 83.0 %
Epoch : 1 Batch : 120 Loss : 0.4983919858932495 Accuracy : 83.0 %
Epoch : 1 Batch : 130 Loss : 0.7775877714157104 Accuracy : 84.0 %
Epoch : 1 Batch : 140 Loss : 0.8596277236938477 Accuracy : 83.0 %
Epoch : 1 Batch : 150 Loss : 0.7116053104400635 Accuracy : 84.0 %
Epoch : 1 Batch : 160

KeyboardInterrupt: 