In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Dropout
from tensorflow.keras import Model

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing data

In [None]:
# Load the competition CSVs: the labelled training set, then the test set.
train_data, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/digit-recognizer/train.csv",
        "/kaggle/input/digit-recognizer/test.csv",
    )
)

In [None]:
# Separate the pixel columns (features) from the "label" column (target).
X = train_data.drop(columns = "label")
y = train_data.loc[:, "label"]

# Preprocessing

Each image is 784 pixels (28 × 28) in grayscale, so the input to the model is 28 × 28 × 1 (a single channel).
Scaling the pixel values from [0, 255] to [0, 1] also helps: the model converges faster and does not start out with massive gradients at initialization.

In [None]:
def _to_image_batch(pixels):
    """Convert a frame of flat pixel rows into a scaled (N, 28, 28, 1) float64 tensor.

    Values are divided by 255.0, which maps 0-255 pixel intensities to [0, 1].
    """
    scaled = tf.convert_to_tensor(pixels, tf.float64) / 255.0
    return tf.reshape(scaled, (-1, 28, 28, 1))


# Features become image tensors; labels are converted as-is.
X = _to_image_batch(X)
X_test = _to_image_batch(X_test)
y = tf.convert_to_tensor(y)

In [None]:
# Positional 90/10 split: the first 90% of rows train, the remainder validates.
n1 = int(.9 * X.shape[0])
X_train, X_valid = X[:n1], X[n1:]
y_train, y_valid = y[:n1], y[n1:]

# Model training

A basic CNN with 2 conv layers and 2 hidden (dense) layers. I use dropout with p = 0.3 on every layer except the input, where I have found that it hinders performance.

There is also some data augmentation: a random rotation of up to about ±11° (`RandomRotation(.03)`, i.e. 3% of a full turn) and a random translation of up to ±10% of the image size in both dimensions.


In [None]:
# Convolution kernel size and dropout probability used by the model definition.
kernel_size, dropout_rate = 3, 0.3

In [None]:
class MNIST(Model):
    """Small convolutional classifier producing 10 logits per input image.

    Pipeline: random translation -> random rotation -> conv -> dropout ->
    conv -> dropout -> flatten -> dense -> dropout -> dense -> dropout ->
    dense(10). The final layer has no activation, so the outputs are raw
    scores (logits).

    Whether the augmentation and dropout layers are active is governed by the
    plain ``training`` attribute, not by an argument to ``call``: set
    ``model.training`` before invoking the model.
    """

    def __init__(self):
        super().__init__()
        # Mode flag read by `call` and forwarded to every stochastic layer.
        self.training = True

        # Augmentation layers
        self.translate = tf.keras.layers.RandomTranslation(height_factor = .1, width_factor = .1)
        self.rotate = tf.keras.layers.RandomRotation(factor = .03)

        # Convolutional feature extractor
        self.conv1 = Conv2D(filters = 64, kernel_size = kernel_size, activation = "relu")
        self.dropout1 = Dropout(rate = dropout_rate)

        self.conv2 = Conv2D(filters = 30, kernel_size = kernel_size, activation = "relu")
        self.flatten = Flatten()
        self.dropout2 = Dropout(rate = dropout_rate)

        # Fully connected head
        self.ln1 = Dense(units = 1024, activation = "relu")
        self.dropout3 = Dropout(rate = dropout_rate)

        self.ln2 = Dense(units = 2048, activation = "relu")
        self.dropout4 = Dropout(rate = dropout_rate)

        # One logit per digit class
        self.out = Dense(units = 10)

    def call(self, x):
        """Run the forward pass on a batch ``x`` and return the class logits."""
        mode = self.training

        h = self.rotate(self.translate(x, training = mode), training = mode)

        h = self.dropout1(self.conv1(h), training = mode)

        # Dropout is applied to the conv feature map before it is flattened.
        h = self.dropout2(self.conv2(h), training = mode)
        h = self.flatten(h)

        h = self.dropout3(self.ln1(h), training = mode)
        h = self.dropout4(self.ln2(h), training = mode)

        return self.out(h)
    

In [None]:
# Instantiate the network and build it for batches of 28x28x1 inputs,
# then print the layer/parameter summary.
model = MNIST()
model.build(input_shape = (None, 28, 28, 1))
model.summary()

The dropout paper claims that a decaying learning rate works best for NNs with dropout, which is why I chose exponential decay. As for the actual values, several were tested and these seemed to work best.

In [None]:
# Learning rate starts at 5e-4 and is multiplied by 0.9 every 500 optimizer steps.
lr = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = 5e-4,
    decay_steps = 500,
    decay_rate = .9,
)
optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
# The model emits raw logits, hence from_logits = True.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

I use the training and testing steps straight from the tensorflow docs with minor changes.

In [None]:
@tf.function
def train_step(X:tf.Tensor, y:tf.Tensor):
    """Run one optimisation step on a batch and return that batch's loss.

    Args:
        X: batch of input images.
        y: integer class labels for ``X``.

    Returns:
        The scalar loss computed by ``loss_fn`` for this batch.
    """
    # NOTE: under @tf.function this assignment is a Python side effect, so it
    # only runs while the function is being traced; the value is then baked
    # into the traced graph.
    model.training = True

    with tf.GradientTape() as tape:
        batch_loss = loss_fn(y, model(X))

    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return batch_loss

In [None]:
@tf.function
def test_step(images, labels):
    """Compute the loss on a batch with the model switched to inference mode.

    Args:
        images: batch of input images.
        labels: integer class labels for ``images``.

    Returns:
        The scalar loss computed by ``loss_fn``; no weights are updated.
    """
    # NOTE: executed at trace time only (Python side effect inside @tf.function).
    model.training = False
    return loss_fn(labels, model(images))

I used a fairly typical training loop.

In [None]:
# Mini-batch size and number of passes over the training data.
batch_size, EPOCHS = 128, 60

In [None]:
"""
for epoch in range(1, EPOCHS+1):
    
    dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    dataset = dataset.shuffle(buffer_size = len(X_train), seed = 10)
    dataset = dataset.batch(batch_size, drop_remainder = False)
    iterator = iter(dataset)
    loss = 0
    
    for i in range(0, X_train.shape[0] - batch_size, batch_size): 
        X_batch, y_batch = next(iterator)
        loss += train_step(X_batch, y_batch)

    loss *= (batch_size/len(X))

    val_loss = test_step(X_valid, y_valid)
    val_accuracy = (np.argmax(model(X_valid), 1) == np.array(y_valid)).mean()  
    
    print(f"Epoch {epoch}/{EPOCHS}: loss: {loss}, val_loss: {val_loss}, val_accuracy: {val_accuracy}")
"""

This is the final training loop (without validation).

In [None]:
# Build the input pipeline once, outside the epoch loop. `shuffle` reshuffles
# on every pass over the dataset by default, so each epoch gets a different
# (but still seed-reproducible) batch order. Re-creating the dataset with the
# same seed inside the loop would replay the identical order every epoch.
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.shuffle(buffer_size = len(X), seed = 10)
dataset = dataset.batch(batch_size, drop_remainder = False)

for epoch in range(EPOCHS):
    loss_sum = 0.0     # sum of per-sample losses seen this epoch
    samples_seen = 0

    # Iterating the dataset directly consumes every batch, including the final
    # partial one that drop_remainder = False keeps.
    for X_batch, y_batch in dataset:
        n_in_batch = int(X_batch.shape[0])
        # train_step returns the batch-mean loss; weight it by the batch size
        # so the epoch figure is a true per-sample average even when the last
        # batch is smaller.
        loss_sum += train_step(X_batch, y_batch) * n_in_batch
        samples_seen += n_in_batch

    # max(..., 1) guards against an empty dataset.
    loss = float(loss_sum) / max(samples_seen, 1)
    print(f"Epoch {epoch+1}/{EPOCHS}: loss: {loss}")

Put the model in inference mode and pass the test data through in batches to stay within the GPU's memory limits, then create a submission that complies with the Kaggle competition format.

In [None]:
# Switch augmentation and dropout off for prediction.
model.training = False

# Predict in fixed-size chunks to bound device memory use.
pred_batch_size = 100

# Stepping up to len(X_test) (not len(X_test) - 100 + 1) makes sure a trailing
# chunk shorter than pred_batch_size is still predicted instead of dropped.
y_pred_chunks = [
    np.argmax(model(X_test[start:start + pred_batch_size]), axis = 1)
    for start in range(0, len(X_test), pred_batch_size)
]

# concatenate copes with a shorter final chunk, unlike np.array(...).reshape(-1),
# which requires every chunk to have the same length.
y_pred = np.concatenate(y_pred_chunks) if y_pred_chunks else np.empty(0, dtype = np.int64)

In [None]:
# Image ids are 1-based and follow the order of the predictions.
image_id = np.arange(1, y_pred.shape[0] + 1)

# Build the frame with "ImageId" as the index so it is written as the first CSV column.
submission = pd.DataFrame(
    {"label": y_pred},
    index = pd.Index(image_id, name = "ImageId"),
)
submission.to_csv("submission.csv")