In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Dropout, MaxPooling2D, BatchNormalization, ReLU
from tensorflow.keras import Model

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Importing data

In [2]:
# Competition files as mounted in the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/digit-recognizer/train.csv"
TEST_CSV = "/kaggle/input/digit-recognizer/test.csv"

# train_data carries the "label" column plus the pixel columns; X_test is the
# raw test DataFrame here (the preprocessing cell later rebinds the name to a tensor).
train_data, X_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [3]:
# Separate the pixel features from the digit label.
X = train_data.drop(columns=["label"])
y = train_data["label"]

# Preprocessing

Each image is 784 pixels (28 × 28) in grayscale, so the input to the model has shape 28 × 28 × 1 (a single channel).
Scaling the pixel values from [0, 255] down to [0, 1] also helps: the model converges faster and avoids massive gradients at initialization.

In [4]:
def to_image_batch(pixels):
    """Turn flat pixel rows into an image batch scaled to [0, 1].

    Args:
        pixels: 2-D table (DataFrame or array) holding one row of 784 pixel
            intensities in the 0-255 range per image.

    Returns:
        A ``tf.float32`` tensor of shape ``(num_images, 28, 28, 1)``.
    """
    # float32 is Keras' default compute dtype; a float64 copy would take twice
    # the memory only to be cast back down inside the layers.
    scaled = tf.convert_to_tensor(pixels, tf.float32) / 255.0
    # Restore the 2-D layout and add the single grayscale channel axis.
    return tf.reshape(scaled, (-1, 28, 28, 1))


# Converting dataframes to tensorflow tensors and scaling the pixels to [0,1].
# The names X / X_test are rebound from DataFrames to tensors here.
X = to_image_batch(X)
X_test = to_image_batch(X_test)
y = tf.convert_to_tensor(y)

In [5]:
# Index of the first validation row: the leading 90% of the rows are used for
# training and the remaining tail for validation (split follows row order).
n1 = int(X.shape[0] * .9)
(X_train, X_valid), (y_train, y_valid) = (
    (part[:n1], part[n1:]) for part in (X, y)
)

# Model training

A basic CNN with 2 conv layers and 2 hidden layers. I use dropout with p = 0.25 on every single layer except for the input, where I have found that it hinders performance.

In [6]:
# Hyper-parameters for the network.
# NOTE(review): kernel_size is not referenced by any visible cell — the conv
# layers in MNIST hard-code their kernel sizes (5 and 3). Confirm before relying on it.
kernel_size = 3
# Drop probability shared by every Dropout layer in MNIST.
dropout_rate = 0.25

In [7]:
class MNIST(Model):
    """Small CNN mapping 28x28x1 digit images to 10 unnormalised class logits.

    Pipeline: random translation -> random rotation (data augmentation) ->
    Conv(6, 5x5) -> max-pool -> dropout -> Conv(16, 3x3) -> dropout ->
    flatten -> Dense(256) -> dropout -> Dense(64) -> dropout -> Dense(10).

    The augmentation and dropout layers receive an explicit training flag.
    The flag comes from the ``training`` argument of ``call`` when one is
    given, and otherwise from the ``self.training`` attribute (``True`` after
    construction), so code that toggles the attribute keeps working.
    """

    def __init__(self):
        super().__init__()
        # Fallback mode used whenever ``call`` is invoked without ``training``.
        self.training = True

        # Data augmentation
        self.translate = tf.keras.layers.RandomTranslation(.05, .05)
        self.rotate = tf.keras.layers.RandomRotation(.05)

        # Convolutional feature extractor
        self.conv1 = Conv2D(6, 5, activation = "relu")
        self.pool1 = MaxPooling2D()
        self.dropout1 = Dropout(dropout_rate)

        self.conv2 = Conv2D(16, 3, activation = "relu")
        self.flatten = Flatten()
        self.dropout2 = Dropout(dropout_rate)

        # Fully connected head
        self.ln1 = Dense(256, activation = "relu")
        self.dropout3 = Dropout(dropout_rate)

        self.ln2 = Dense(64, activation = "relu")
        self.dropout4 = Dropout(dropout_rate)

        # No activation: the model returns raw logits.
        self.out = Dense(10)

    def call(self, x, training=None):
        """Compute class logits for a batch of images.

        Args:
            x: Batch of images; the notebook builds the model for inputs of
                shape ``(batch, 28, 28, 1)``.
            training: Optional bool selecting training (``True``) or inference
                (``False``) behaviour for the augmentation and dropout layers
                in this call. When ``None`` the ``self.training`` attribute
                is used instead.

        Returns:
            Tensor with 10 logits per input image (no softmax applied).
        """
        # Honour the standard Keras ``training`` argument, but stay compatible
        # with callers that only flip ``model.training``.
        if training is None:
            training = self.training

        x = self.translate(x, training = training)
        x = self.rotate(x, training = training)

        x = self.conv1(x)
        x = self.pool1(x)
        x = self.dropout1(x, training = training)

        x = self.conv2(x)
        x = self.dropout2(x, training = training)
        x = self.flatten(x)

        x = self.ln1(x)
        x = self.dropout3(x, training = training)

        x = self.ln2(x)
        x = self.dropout4(x, training = training)

        out = self.out(x)

        return out

In [8]:
# Instantiate the network, build it for batches of 28x28 single-channel
# images (batch dimension left open), then print the layer overview.
model = MNIST()
model.build(input_shape=(None, 28, 28, 1))
model.summary()

Model: "mnist"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 random_translation (RandomT  multiple                 0         
 ranslation)                                                     
                                                                 
 random_rotation (RandomRota  multiple                 0         
 tion)                                                           
                                                                 
 conv2d (Conv2D)             multiple                  156       
                                                                 
 max_pooling2d (MaxPooling2D  multiple                 0         
 )                                                               
                                                                 
 dropout (Dropout)           multiple                  0         
                                                             

In [9]:
# Learning rate handed to Adam.
lr = 5e-4
# from_logits=True because the model's last Dense layer has no activation;
# labels are integer class ids (sparse), not one-hot vectors.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

I use the training and testing steps straight from the TensorFlow docs, with minor changes.

In [10]:
@tf.function
def train_step(X: tf.Tensor, y: tf.Tensor):
    """Run one optimisation step on a single batch.

    Args:
        X: Batch of input images.
        y: Integer class labels for ``X``.

    Returns:
        The loss of the batch as computed by ``loss_fn`` before the update.
    """
    # Plain Python attribute assignment: inside a tf.function it is executed
    # while the function is being traced, not on every call of the graph.
    model.training = True

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as grad_tape:
        logits = model(X)
        loss = loss_fn(y, logits)

    weights = model.trainable_variables
    grads = grad_tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return loss

In [11]:
@tf.function
def test_step(images, labels):
    """Evaluate the model on one batch without touching the weights.

    Args:
        images: Batch of input images.
        labels: Integer class labels for ``images``.

    Returns:
        The loss of the batch as computed by ``loss_fn``.
    """
    # Plain Python attribute assignment: inside a tf.function it is executed
    # while the function is being traced, not on every call of the graph.
    model.training = False
    return loss_fn(labels, model(images))

I used a fairly typical training loop.

In [12]:
# Number of samples per gradient step.
batch_size = 512
# Number of full passes over the training split.
EPOCHS = 100

In [13]:
# Pair every training image with its label, then shuffle with a buffer that
# spans the whole training split. The order is re-drawn on every iteration
# over the dataset; the fixed seed keeps runs repeatable.
data = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train), seed=10, reshuffle_each_iteration=True)
)

In [14]:
# Training loop
# Batching does not change between epochs, so build the batched view once.
# Each new iteration over it still gets a fresh shuffle order because `data`
# was created with reshuffle_each_iteration=True.
dataset = data.batch(batch_size, drop_remainder = False)

for epoch in range(1, EPOCHS+1):

    # Accumulate per-batch mean losses weighted by batch size so the (shorter)
    # final batch contributes proportionally to the epoch average.
    loss = 0
    for X_batch, y_batch in dataset:
        loss += train_step(X_batch, y_batch) * len(X_batch)
    loss /= len(X_train)

    # The eager `model(X_valid)` call below reads `model.training` directly.
    # The assignments inside the tf.function steps only run while those
    # functions are traced, so select inference mode explicitly here instead
    # of relying on whichever step happened to be traced last.
    model.training = False
    val_loss = test_step(X_valid, y_valid)
    val_accuracy = (np.argmax(model(X_valid), 1) == np.array(y_valid)).mean()

    print(f"Epoch {epoch}/{EPOCHS}: loss: {loss:.5f}, val_loss: {val_loss:.5f}, val_accuracy: {val_accuracy:.5f}")

2025-11-08 22:08:11.819059: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmnist/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2025-11-08 22:08:16.923849: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmnist/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 1/100: loss: 1.41062, val_loss: 0.32593, val_accuracy: 0.90905
Epoch 2/100: loss: 0.64632, val_loss: 0.18660, val_accuracy: 0.94143
Epoch 3/100: loss: 0.45624, val_loss: 0.12441, val_accuracy: 0.96048
Epoch 4/100: loss: 0.35870, val_loss: 0.09748, val_accuracy: 0.96810
Epoch 5/100: loss: 0.30483, val_loss: 0.08157, val_accuracy: 0.97333
Epoch 6/100: loss: 0.26633, val_loss: 0.07729, val_accuracy: 0.97524
Epoch 7/100: loss: 0.23980, val_loss: 0.06686, val_accuracy: 0.97690
Epoch 8/100: loss: 0.22462, val_loss: 0.05873, val_accuracy: 0.97976
Epoch 9/100: loss: 0.21116, val_loss: 0.05295, val_accuracy: 0.98167
Epoch 10/100: loss: 0.19569, val_loss: 0.05216, val_accuracy: 0.98119
Epoch 11/100: loss: 0.18377, val_loss: 0.05033, val_accuracy: 0.98262
Epoch 12/100: loss: 0.17458, val_loss: 0.04529, val_accuracy: 0.98357
Epoch 13/100: loss: 0.16267, val_loss: 0.04627, val_accuracy: 0.98381
Epoch 14/100: loss: 0.15730, val_loss: 0.03723, val_accuracy: 0.98881
Epoch 15/100: loss: 0.15683, 

Put the model in inference mode, pass the test data through it in batches to stay within the GPU's memory limits, and then create a submission that complies with the Kaggle competition format.

In [15]:
# Inference mode for the eager forward passes below.
model.training = False

# Predict in fixed-size chunks so the whole test set is never pushed through
# the model in a single call.
pred_batch_size = 100

# Step over *every* row: the final chunk may be shorter than pred_batch_size
# and must not be dropped, otherwise the submission would miss predictions.
y_pred = [
    np.argmax(model(X_test[start:start + pred_batch_size]), axis = 1)
    for start in range(0, len(X_test), pred_batch_size)
]

# Chunks can differ in length, so join them end to end rather than stacking
# them into a 2-D array and reshaping.
y_pred = np.concatenate(y_pred) if y_pred else np.empty(0, dtype = np.int64)

In [16]:
# Ids are assigned 1..N in the order the predictions were produced.
image_id = np.arange(1, y_pred.shape[0] + 1)

# One row per prediction, indexed by ImageId so the id becomes the first CSV column.
# NOTE(review): the header written here is "ImageId,label" — confirm the exact
# column capitalisation the competition expects.
submission = pd.DataFrame({"label": y_pred}, index=pd.Index(image_id, name="ImageId"))
submission.to_csv("submission.csv")