# Digit Recognizer Data Exploration Notebook

From the overview of this [Kaggle competition](https://www.kaggle.com/competitions/digit-recognizer/overview):

>MNIST ("Modified National Institute of Standards and Technology") is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.

>In this competition, your goal is to correctly identify digits from a dataset of tens of thousands of handwritten images.

## Install necessary packages

We use the requirements.txt file to list all the dependencies and then run pip install on it.

In [None]:
# Install the dependencies listed in requirements.txt into the user site-packages, with minimal output
%pip install -r requirements.txt --user --quiet

If this is the first time running this pip command, restart the kernel.

We need to install graphviz for Keras Visualizer (`keras_visualizer`) to work. The following command installs it. However, if you are following the labs, this tool has already been preinstalled in the container hosting this notebook.

In [None]:
#!sudo apt-get install -y graphviz

## Imports

In this section, we import the packages needed in this example.  It is good practice to gather your imports into a single place.  

In [None]:
# Imports
import sys, os, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile
from datetime import datetime
import seaborn as sns
from IPython.display import Image

import tensorflow as tf
from tensorflow import keras, optimizers
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import layers
print("tensorflow version: ", tf.__version__)

import kfp
import kfp.dsl as dsl

from keras_visualizer import visualizer 

from netapp_dataops.k8s import clone_volume, create_volume, \
delete_volume, list_volumes, create_volume_snapshot, \
delete_volume_snapshot, list_volume_snapshots, restore_volume_snapshot

## Data Volumes

The data exists in 4 volumes with data defined by the data prep notebook (1_digits-dataprep.ipynb):
- digits-train
- digits-valid
- digits-test
- digits-prod

There is an additional volume for the model:
- digits-model

These are all mounted at the path represented by DATA_ROOT. 

In [None]:
# Base directory of the notebook workspace; every other path is built from it
ROOT_DIR = "/home/jovyan"

In [None]:
# Designate a root folder for the data: <ROOT_DIR>/data
DATA_DIR = 'data'
DATA_ROOT = os.path.join(ROOT_DIR, DATA_DIR)
# Create the folder when it is missing, then sanity-check that it is there
os.makedirs(DATA_ROOT, exist_ok=True)
assert os.path.exists(DATA_ROOT)

In [None]:
def _volume_root(pvc_name):
    """Return the folder for volume ``pvc_name`` under DATA_ROOT, creating it if missing."""
    root = os.path.join(DATA_ROOT, pvc_name)
    # makedirs raises on failure, so no separate existence check is needed.
    os.makedirs(root, exist_ok=True)
    return root


def _require_file(path):
    """Return ``path`` unchanged, raising FileNotFoundError if it is not an existing file.

    An explicit raise is used instead of ``assert`` because assertions are
    skipped when Python runs with -O.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Expected data file is missing: {path}")
    return path


# Training data paths
DATA_TRAIN_PVC = 'digits-train'
DATA_TRAIN_ROOT = _volume_root(DATA_TRAIN_PVC)
DATA_TRAIN_FILE = os.path.join(DATA_TRAIN_ROOT, 'train.csv')
print(DATA_TRAIN_FILE)
_require_file(DATA_TRAIN_FILE)

# Testing data paths
DATA_TEST_PVC = 'digits-test'
DATA_TEST_ROOT = _volume_root(DATA_TEST_PVC)
DATA_TEST_FILE = _require_file(os.path.join(DATA_TEST_ROOT, 'test.csv'))

# Validation data paths
DATA_VALID_PVC = 'digits-valid'
DATA_VALID_ROOT = _volume_root(DATA_VALID_PVC)
DATA_VALID_FILE = _require_file(os.path.join(DATA_VALID_ROOT, 'valid.csv'))

# Production data paths
DATA_PROD_PVC = 'digits-prod'
DATA_PROD_ROOT = _volume_root(DATA_PROD_PVC)
DATA_PROD_FILE = _require_file(os.path.join(DATA_PROD_ROOT, 'prod.csv'))

In [None]:
# Model data paths: the model volume lives at <DATA_ROOT>/digits-model
DATA_MODEL_PVC = 'digits-model'
DATA_MODEL_ROOT = os.path.join(DATA_ROOT, DATA_MODEL_PVC)
# Create the folder when it is missing, then sanity-check that it is there
os.makedirs(DATA_MODEL_ROOT, exist_ok=True)
assert os.path.exists(DATA_MODEL_ROOT)

## Training Data Preparation

In [None]:
# Load the training CSV into a pandas DataFrame
TRAIN_DF = pd.read_csv(DATA_TRAIN_FILE)

In [None]:
# View the first rows of the training data (head() defaults to 5 rows)
TRAIN_DF.head()

In [None]:
# Initial shape of the training data as (rows, columns)
TRAIN_DF.shape

In [None]:
# Split the training frame: the 'label' column becomes the target (TRAIN_Y)
# and every remaining column is kept as image data (TRAIN_X).
TRAIN_Y = TRAIN_DF['label']
TRAIN_X = TRAIN_DF.drop(columns=['label'])

# Keras expects image batches shaped (samples, height, width, channels):
# 28px x 28px with a single channel.
TRAIN_X = TRAIN_X.values.reshape(-1, 28, 28, 1)

# Pixel values range from 0 to 255; dividing by 255 rescales them to 0-1.
TRAIN_X = TRAIN_X / 255.0

TRAIN_X.shape, TRAIN_Y.shape

## Model Building


In [None]:
# Hyperparameters: number of passes over the training data used by model.fit
EPOCHS = 3

In [None]:
# Seed every random number generator this notebook imports (Python's
# `random`, NumPy and TensorFlow) so repeated runs start from the same state.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Emit TensorFlow log messages at INFO level and above.
# NOTE: this makes the output more verbose; it does not silence warnings.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

In [None]:

# Build the network as one ordered stack of layers:
# a convolutional feature extractor followed by a small dense classifier.
model = keras.models.Sequential([
    # Three identical stages: 64 filters of 3x3 with ReLU, then max pooling.
    # The first layer also declares the 28x28 single-channel input.
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPool2D(2, 2),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPool2D(2, 2),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPool2D(2, 2),
    # Flatten the feature maps into one vector per image
    layers.Flatten(),
    # Fully connected layers
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    # Output layer: 10 classes, one per digit 0-9
    layers.Dense(10, activation='softmax'),
])

# Print a layer-by-layer overview of the model
model.summary()

In [None]:
# Base name (without extension) of the model diagram, placed under ROOT_DIR
PNG_MODEL_FILE = os.path.join(ROOT_DIR, 'digits-model')

# Render the model architecture as a PNG without opening a viewer
visualizer(
    model,
    file_name=PNG_MODEL_FILE,
    file_format='png',
    view=False,
)

In [None]:
# Display the rendered model diagram inline
Image(f"{PNG_MODEL_FILE}.png")


In [None]:
# Compile the model for multi-class classification: the labels are integer
# class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train on the prepared training set for EPOCHS passes; the returned History
# object is kept in `history`.
history = model.fit(TRAIN_X, TRAIN_Y, epochs=EPOCHS)

In [None]:
# Version the model by the current date and time, and create a subfolder of
# that name inside the model volume.
now = datetime.now()
DATA_MODEL_VERSION = f"{now:%Y-%m-%d-%H-%M-%S}"
DATA_MODEL_VERSION_PATH = os.path.join(DATA_MODEL_ROOT, DATA_MODEL_VERSION)
os.makedirs(DATA_MODEL_VERSION_PATH, exist_ok=True)
print(f"Path to the model: {DATA_MODEL_VERSION_PATH}")

In [None]:
# Save the trained model into the versioned folder on the model volume
model.save(DATA_MODEL_VERSION_PATH)

## Model Access

In [None]:
# Reload the trained model from the versioned folder it was saved to
model = keras.models.load_model(DATA_MODEL_VERSION_PATH)

## Model Validation

In [None]:
# Load the validation CSV into a pandas DataFrame
VALID_DF = pd.read_csv(DATA_VALID_FILE)

In [None]:
# View the first rows of the validation data (head() defaults to 5 rows)
VALID_DF.head()

In [None]:
# Shape of the validation data as (rows, columns)
VALID_DF.shape

In [None]:
# Split the validation frame: the 'label' column becomes the target (VALID_Y)
# and every remaining column is kept as image data (VALID_X).
VALID_Y = VALID_DF['label']
VALID_X = VALID_DF.drop(columns=['label'])

# Reshape to (samples, height = 28px, width = 28px, channel = 1)
VALID_X = VALID_X.values.reshape(-1, 28, 28, 1)

# Pixel values range from 0 to 255; dividing by 255 rescales them to 0-1.
VALID_X = VALID_X / 255.0

In [None]:
# Shape of the reshaped validation images
VALID_X.shape

In [None]:
# Evaluate the model on the validation set.
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
model_loss, model_accuracy = model.evaluate(VALID_X, VALID_Y, verbose=0)
print(f"Test_loss: {model_loss}, Test_accuracy: {model_accuracy} ")

In [None]:
# Confusion matrix on the validation set

# The model outputs 10 values per image; the index of the largest one is the
# predicted digit.
test_predictions = model.predict(VALID_X).argmax(axis=1)

# Count (actual, predicted) label pairs
confusion_matrix = tf.math.confusion_matrix(
    labels=VALID_Y,
    predictions=test_predictions,
)

# Plot the counts as an annotated heatmap
h = sns.heatmap(
    confusion_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    cbar=False,
)
h.set(xlabel='Predicted', ylabel='Actual', title="Confusion Matrix")

## Create a snapshot of the model volume

Creating a snapshot of the model volume protects its current contents and allows the volume to be cloned in the future.

In [None]:
# Kubernetes namespace that holds the volumes
USER_NAMESPACE = "kubeflow-user-example-com"
# Name the snapshot after the model version it captures
DATA_MODEL_SNAP = f"digits-model-snap-{DATA_MODEL_VERSION}"

In [None]:
# Create a VolumeSnapshot named DATA_MODEL_SNAP for the volume attached to the
#   PersistentVolumeClaim (PVC) DATA_MODEL_PVC in the namespace USER_NAMESPACE.
#   NOTE: if snapshot_name is not specified, the snapshot name will be set to 'ntap-dsutil.<timestamp>'
create_volume_snapshot(
    pvc_name=DATA_MODEL_PVC,
    namespace=USER_NAMESPACE,
    snapshot_name=DATA_MODEL_SNAP,
    print_output=True,
)

In [None]:
# List the VolumeSnapshots in the USER_NAMESPACE namespace
list_volume_snapshots(namespace=USER_NAMESPACE)