In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Dropout, MaxPooling2D, BatchNormalization, ReLU
from tensorflow.keras import Model

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


# Importing data

In [2]:
# Load the labelled training table and the unlabelled test table from the competition input.
train_data, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/digit-recognizer/train.csv",
        "/kaggle/input/digit-recognizer/test.csv",
    )
)

In [3]:
# Separate the pixel columns (features) from the "label" column (target).
X = train_data.drop(columns="label")
y = train_data["label"]

# Preprocessing

Each image consists of 784 pixels (28 × 28) in grayscale, so the input to the model has shape 28 × 28 × 1 (a single channel).
It also helps to scale the pixel values from [0, 255] to [0, 1]: the model converges faster and does not start out with massive gradients at initialization.

In [4]:
# Convert the pixel tables to float64 tensors, scale them from [0, 255] to [0, 1],
# and reshape every flat row into a 28x28 image with a single channel.
# NOTE: this cell rebinds X and X_test, so re-running it without re-running the
# cells above would divide the already-scaled values by 255 a second time.
X = tf.reshape(tf.convert_to_tensor(X, dtype=tf.float64) / 255.0, (-1, 28, 28, 1))
X_test = tf.reshape(tf.convert_to_tensor(X_test, dtype=tf.float64) / 255.0, (-1, 28, 28, 1))

# Labels are converted as-is (no scaling, no reshape).
y = tf.convert_to_tensor(y)

In [5]:
# Hold out the last 10% of the rows for validation; the first 90% are used for
# training. The rows are taken in their existing order (no shuffling here).
n1 = int(X.shape[0] * 0.9)
X_train, X_valid = X[:n1], X[n1:]
y_train, y_valid = y[:n1], y[n1:]

# Model training

A basic CNN with 2 conv layers and 2 hidden (dense) layers. I use dropout with p = 0.3 on every single layer except the input, where I have found that it hinders performance.

There is some data augmentation as well: a random rotation (`RandomRotation(.05)`, i.e. up to ±5% of a full turn, about ±18°) and a random translation of up to ±5% of the image size in both dimensions (`RandomTranslation(.05, .05)`).


In [6]:
# Drop probability used by the Dropout layers of the model (.25 in prev experiment).
dropout_rate = 0.3
# NOTE(review): kernel_size is not referenced by any of the cells shown in this notebook.
kernel_size = 3

In [7]:
class MNIST(Model):
    """Small CNN mapping 28x28x1 digit images to 10 raw class scores (logits).

    Pipeline: random translation -> random rotation -> Conv(6, 5x5, relu) ->
    max-pool -> dropout -> Conv(16, 3x3, relu) -> dropout -> flatten ->
    Dense(256, relu) -> dropout -> Dense(64, relu) -> dropout -> Dense(10).
    All dropout layers use the module-level ``dropout_rate``.

    The augmentation and dropout layers are only active in training mode. The
    mode is selected per call through the ``training`` argument of ``call``;
    when that argument is ``None`` the ``self.training`` attribute is used
    instead, so code that toggles ``model.training`` keeps working.
    """

    def __init__(self):
        super().__init__()
        # Fallback mode used by ``call`` when no explicit ``training`` value is given.
        self.training = True

        # Data augmentation
        self.translate = tf.keras.layers.RandomTranslation(.05, .05)
        self.rotate = tf.keras.layers.RandomRotation(.05)

        self.conv1 = Conv2D(6, 5, activation = "relu")
        self.pool1 = MaxPooling2D()
        self.dropout1 = Dropout(dropout_rate)

        # No pooling layer after the second convolution.
        self.conv2 = Conv2D(16, 3, activation = "relu")
        self.flatten = Flatten()
        self.dropout2 = Dropout(dropout_rate)

        self.ln1 = Dense(256, activation = "relu")
        self.dropout3 = Dropout(dropout_rate)

        self.ln2 = Dense(64, activation = "relu")
        self.dropout4 = Dropout(dropout_rate)

        # Final layer has no activation: the model returns logits.
        self.out = Dense(10)

    def call(self, x, training=None):
        """Run the forward pass.

        Args:
            x: batch of images; the notebook feeds tensors reshaped to
                (-1, 28, 28, 1).
            training: ``True`` enables augmentation and dropout, ``False``
                disables them. ``None`` (the default) falls back to
                ``self.training``.

        Returns:
            The output of the final ``Dense(10)`` layer (one logit per class).
        """
        # An explicit flag (e.g. supplied by Keras or by the caller) wins;
        # otherwise keep the attribute-based behaviour.
        if training is None:
            training = self.training

        x = self.translate(x, training = training)
        x = self.rotate(x, training = training)

        x = self.conv1(x)
        x = self.pool1(x)
        x = self.dropout1(x, training = training)

        x = self.conv2(x)
        x = self.dropout2(x, training = training)
        x = self.flatten(x)

        x = self.ln1(x)
        x = self.dropout3(x, training = training)

        x = self.ln2(x)
        x = self.dropout4(x, training = training)

        out = self.out(x)

        return out

In [8]:
# Instantiate the network, build it for 28x28 single-channel inputs (batch
# dimension left unspecified), and print the layer table.
model = MNIST()
model.build(input_shape=(None, 28, 28, 1))
model.summary()

Model: "mnist"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 random_translation (RandomT  multiple                 0         
 ranslation)                                                     
                                                                 
 random_rotation (RandomRota  multiple                 0         
 tion)                                                           
                                                                 
 conv2d (Conv2D)             multiple                  156       
                                                                 
 max_pooling2d (MaxPooling2D  multiple                 0         
 )                                                               
                                                                 
 dropout (Dropout)           multiple                  0         
                                                             

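The dropout paper claims that a decaying learning rate works best for NNs with dropout, which is why I chose exponential decay. As for the actual values, several were tested and these seemed to work best. (Note: the optimizer cell below uses a constant learning rate of 5e-4; no decay schedule is applied in the code shown.)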

In [9]:
# Adam optimizer with a constant learning rate (no decay schedule is attached here).
lr = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# The model outputs raw logits and the labels are integer class ids.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

I use the training and testing steps straight from the tensorflow docs with minor changes.

In [10]:
@tf.function
def train_step(X:tf.Tensor, y:tf.Tensor):
    """Run one optimisation step on a single batch and return the batch loss.

    Uses the module-level ``model``, ``loss_fn`` and ``optimizer``.

    Args:
        X: batch of input images.
        y: labels for ``X``.

    Returns:
        The value of ``loss_fn`` on this batch (computed before the update).
    """
    # Plain Python attribute assignment: inside a tf.function this runs while
    # the function is being traced, not on every call of the compiled graph.
    model.training = True

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        batch_loss = loss_fn(y, model(X))

    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return batch_loss

In [11]:
@tf.function
def test_step(images, labels):
    """Return the loss of the module-level ``model`` on one batch, without any update.

    Args:
        images: batch of input images.
        labels: labels for ``images``.

    Returns:
        The value of ``loss_fn`` for the batch.
    """
    # Plain Python attribute assignment: inside a tf.function this runs while
    # the function is being traced, not on every call of the compiled graph.
    model.training = False
    return loss_fn(labels, model(images))

I used a fairly typical training loop.

In [12]:
# Number of passes over the training split.
EPOCHS = 500
# Number of examples per gradient step.
batch_size = 512

In [13]:
# Pair every training image with its label and shuffle with a buffer as large
# as the training split, reshuffling on each iteration with a fixed seed.
data = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train), seed=10, reshuffle_each_iteration=True)
)

In [14]:
# Training loop: one pass over the shuffled training split per epoch, followed
# by an evaluation on the validation split.
for epoch in range(1, EPOCHS + 1):

    dataset = data.batch(batch_size, drop_remainder=False)

    # Weight each batch loss by its batch size before dividing by the number of
    # training examples, because the last batch may be smaller than the others.
    loss = 0
    for X_batch, y_batch in dataset:
        loss += train_step(X_batch, y_batch) * X_batch.shape[0]
    loss /= len(X_train)

    # Validation loss via the compiled step; accuracy via a separate eager forward pass.
    val_loss = test_step(X_valid, y_valid)
    val_accuracy = np.mean(np.argmax(model(X_valid), axis=1) == np.array(y_valid))

    print(f"Epoch {epoch}/{EPOCHS}: loss: {loss:.5f}, val_loss: {val_loss:.5f}, val_accuracy: {val_accuracy:.5f}")

2023-12-03 15:50:22.726100: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmnist/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2023-12-03 15:50:31.540639: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmnist/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 1/500: loss: 1.52443, val_loss: 0.39706, val_accuracy: 0.90119
Epoch 2/500: loss: 0.74127, val_loss: 0.20583, val_accuracy: 0.94071
Epoch 3/500: loss: 0.52166, val_loss: 0.14377, val_accuracy: 0.95500
Epoch 4/500: loss: 0.41157, val_loss: 0.10754, val_accuracy: 0.96524
Epoch 5/500: loss: 0.34664, val_loss: 0.08998, val_accuracy: 0.96905
Epoch 6/500: loss: 0.29924, val_loss: 0.07976, val_accuracy: 0.97310
Epoch 7/500: loss: 0.27395, val_loss: 0.07401, val_accuracy: 0.97452
Epoch 8/500: loss: 0.25506, val_loss: 0.06641, val_accuracy: 0.97714
Epoch 9/500: loss: 0.23107, val_loss: 0.06231, val_accuracy: 0.97810
Epoch 10/500: loss: 0.21631, val_loss: 0.06105, val_accuracy: 0.97976
Epoch 11/500: loss: 0.20558, val_loss: 0.05623, val_accuracy: 0.98048
Epoch 12/500: loss: 0.19616, val_loss: 0.04935, val_accuracy: 0.98381
Epoch 13/500: loss: 0.18697, val_loss: 0.04780, val_accuracy: 0.98357
Epoch 14/500: loss: 0.18055, val_loss: 0.04543, val_accuracy: 0.98333
Epoch 15/500: loss: 0.17166, 

Put the model in inference mode and pass the test data through it in batches to stay within the GPU's memory limits, then create a submission that complies with the Kaggle competition format.

In [15]:
# Switch the model to inference mode (disables augmentation and dropout).
model.training = False

# Predict in fixed-size chunks so the whole test set is never pushed through
# the model at once.
pred_batch_size = 100

# Iterate up to len(X_test) so that a final, shorter chunk is still predicted
# when the test size is not a multiple of the chunk size.
partial_predictions = [
    np.argmax(model(X_test[start:start + pred_batch_size]), axis = 1)
    for start in range(0, len(X_test), pred_batch_size)
]

# concatenate (unlike np.array(...).reshape(-1)) also works when the chunks
# have different lengths; an empty test set yields an empty label array.
if partial_predictions:
    y_pred = np.concatenate(partial_predictions)
else:
    y_pred = np.empty(0, dtype = np.int64)

In [16]:
# Image ids are numbered 1..N, one per prediction, in prediction order.
image_id = np.arange(1, y_pred.shape[0] + 1)

# NOTE(review): confirm that the "label" header (including its casing) matches
# the header used in sample_submission.csv.
submission = pd.DataFrame({"ImageId": image_id, "label": y_pred}).set_index("ImageId")
submission.to_csv("submission.csv")