# Digit Recognizer Data Pipeline Notebook

This notebook builds a data pipeline for the Kaggle [Digit Recognizer competition](https://www.kaggle.com/competitions/digit-recognizer/overview), which is described as follows:

>MNIST ("Modified National Institute of Standards and Technology") is the de facto “hello world” dataset of computer vision. Since its release in 1999, this classic dataset of handwritten images has served as the basis for benchmarking classification algorithms. As new machine learning techniques emerge, MNIST remains a reliable resource for researchers and learners alike.

>In this competition, your goal is to correctly identify digits from a dataset of tens of thousands of handwritten images.

## Install necessary packages

We list all the dependencies in the `requirements.txt` file and then run `pip install` on that file.

In [None]:
%pip install -r requirements.txt --user --quiet

If this is the first time running this pip command, restart the kernel.

## Imports

In this section, we import the packages needed in this example.  It is good practice to gather your imports into a single place.  

In [None]:
# Imports
import kfp
import kfp.dsl as dsl
import kfp.components as components
from typing import NamedTuple



In [None]:
# Pipeline configuration with default values, grouped by pipeline step.

# Kubernetes namespace used by the pipeline
user_namespace: str = "kubeflow-user-example-com"

# --- Clone step: container image and the existing (source) volumes ---
clone_step_container_image: str = "curtisab/ndot-jupyter-scipy:v1alpha1"
clone_step_train_pvc_existing: str = "digits-train"
clone_step_valid_pvc_existing: str = "digits-valid"

# --- Train step: container image, cloned data volumes and model volume ---
train_step_container_image: str = "curtisab/ndot-jupyter-scipy:v1alpha1"
# Each clone is named after its source volume with a "-clone" suffix
train_step_train_pvc: str = clone_step_train_pvc_existing + "-clone"
train_step_train_mountpoint: str = "/mnt/train"
train_step_valid_pvc: str = clone_step_valid_pvc_existing + "-clone"
train_step_valid_mountpoint: str = "/mnt/valid"
train_step_model_pvc_existing: str = "digits-model"
train_step_model_mountpoint: str = "/mnt/model"

# --- Serve step: container image and model volume ---
serve_step_container_image: str = "curtisab/ndot-jupyter-scipy:v1alpha1"
serve_step_model_pvc_existing: str = "digits-model"
serve_step_model_mountpoint: str = "/mnt/model"

In [None]:
# GPU limits per step. Due to SDK limitations these must be hardcoded
# rather than passed in as pipeline parameters.
train_step_num_gpu = valid_step_num_gpu = 0

## Clone the data volumes
This step runs in a separate container that clones the existing data volumes.

In [None]:
def clone_step(
    source_train_pvc: str = "digits-train",
    source_valid_pvc: str = "digits-valid",
    clone_train_pvc: str = "digits-train-clone",
    clone_valid_pvc: str = "digits-valid-clone",
    namespace: str = "kubeflow-user-example-com",
):
    """Clone the existing training and validation volumes.

    The defaults repeat the values of the notebook's pipeline variables as
    literals: a KFP function component is executed from a copy of this
    function's source only, so notebook-level variables are not available
    inside the container.

    Args:
        source_train_pvc: Name of the existing training PVC to clone.
        source_valid_pvc: Name of the existing validation PVC to clone.
        clone_train_pvc: Name of the PVC created as clone of the training PVC.
        clone_valid_pvc: Name of the PVC created as clone of the validation PVC.
        namespace: Kubernetes namespace that holds the PVCs.
    """
    print("Data Clone Step")

    # Imported inside the function so the component is self-contained.
    from netapp_dataops.k8s import clone_volume

    clone_pairs = (
        (source_train_pvc, clone_train_pvc),
        (source_valid_pvc, clone_valid_pvc),
    )
    for source_pvc, clone_pvc in clone_pairs:
        # netapp_dataops.k8s (2.x) names its keyword arguments in snake_case.
        clone_volume(
            new_pvc_name=clone_pvc,
            source_pvc_name=source_pvc,
            namespace=namespace,
        )


## Model generation step
This step will execute in a separate container.  It will save the model to the model persistent volume claim.  

In [None]:
def train_step(
    no_epochs: int = 1,
    optimizer: str = "adam"
) -> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata'),('mlpipeline_metrics', 'Metrics')]):
    """Train a Keras CNN digit classifier, evaluate it and upload it to MinIO.

    The arrays ``x_train``, ``y_train``, ``x_test`` and ``y_test`` are
    downloaded from the MinIO bucket, the model is fitted and evaluated, a
    confusion matrix and a markdown summary are assembled for the pipeline
    UI, and the saved model is uploaded below ``models/detect-digits/1``.

    Args:
        no_epochs: Number of epochs passed to ``model.fit``.
        optimizer: Optimizer name passed to ``model.compile``.

    Returns:
        A named tuple ``(mlpipeline_ui_metadata, mlpipeline_metrics)``; both
        fields are JSON strings.
    """
    # All imports are local so the function can run as a stand-alone component.
    import glob
    import json
    import os
    from collections import namedtuple

    import numpy as np
    import pandas as pd
    import tensorflow as tf
    from minio import Minio
    from tensorflow import keras

    print("Model Generation Step")

    num_classes = 10  # digits 0-9

    # The object-store client must exist before the first download; creating
    # it only at the end of the function made every earlier use fail with
    # UnboundLocalError.
    # NOTE(review): endpoint and credentials are hardcoded - consider
    # injecting them (e.g. from a Kubernetes secret) instead.
    minio_client = Minio(
            "100.65.11.110:9000",
            access_key="minio",
            secret_key="minio123",
            secure=False
        )
    minio_bucket = "mlpipeline"

    def load_array(object_name):
        """Download ``object_name`` from the bucket and load it with numpy."""
        local_path = "/tmp/{}.npy".format(object_name)
        minio_client.fget_object(minio_bucket, object_name, local_path)
        return np.load(local_path)

    def upload_local_directory_to_minio(local_path, bucket_name, minio_path):
        """Recursively upload the directory ``local_path`` below ``minio_path``."""
        if not os.path.isdir(local_path):
            raise NotADirectoryError(local_path)

        # Drop trailing slashes so that joining never produces "//" in a key.
        prefix = minio_path.rstrip("/")
        for local_file in glob.glob(local_path + '/**'):
            local_file = local_file.replace(os.sep, "/")  # Replace \ with / on Windows
            entry_name = os.path.basename(local_file)
            remote_path = prefix + "/" + entry_name if prefix else entry_name
            if os.path.isfile(local_file):
                minio_client.fput_object(bucket_name, remote_path, local_file)
            else:
                upload_local_directory_to_minio(local_file, bucket_name, remote_path)

    # Construct the model structure
    model = keras.models.Sequential()
    model.add(keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(28,28,1)))
    model.add(keras.layers.MaxPool2D(2, 2))

    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation='relu'))

    model.add(keras.layers.Dense(32, activation='relu'))

    # Output layer: one unit per class, numbers from 0-9
    model.add(keras.layers.Dense(num_classes, activation='softmax'))

    # Capture the model summary as text for the markdown report
    summary_lines = []
    model.summary(print_fn=summary_lines.append)
    metric_model_summary = "\n".join(summary_lines)

    # Compile the model for multi-class classification
    model.compile(optimizer=optimizer,
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

    # Get the training data
    x_train = load_array("x_train")
    y_train = load_array("y_train")

    # Fit the model; the returned training history is not used further
    model.fit(
      x=x_train,
      y=y_train,
      epochs=no_epochs,
      batch_size=20,
    )

    # Get the test data
    x_test = load_array("x_test")
    y_test = load_array("y_test")

    # Test the model against the test dataset
    # Returns the loss value & metrics values for the model in test mode.
    model_loss, model_accuracy = model.evaluate(x=x_test,y=y_test)

    # Confusion Matrix

    # The model outputs one score per class; the index of the highest score
    # is the predicted digit.
    test_predictions = np.argmax(model.predict(x=x_test), axis=1)

    # Fix the matrix size to num_classes so that rows/columns always line up
    # with the labels below, even if a digit is absent from y_test.
    confusion_matrix = tf.math.confusion_matrix(
        labels=y_test,
        predictions=test_predictions,
        num_classes=num_classes,
    ).numpy()
    data = [
        (target, predicted, count)
        for target, target_row in enumerate(confusion_matrix)
        for predicted, count in enumerate(target_row)
    ]

    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    cm_csv = df_cm.to_csv(header=False, index=False)

    # NOTE(review): "target_col" says "actual" while the schema column is
    # named "target" - confirm whether these keys are read for this viewer.
    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {'name': 'target', 'type': 'CATEGORY'},
                    {'name': 'predicted', 'type': 'CATEGORY'},
                    {'name': 'count', 'type': 'NUMBER'},
                  ],
                "target_col" : "actual",
                "predicted_col" : "predicted",
                "source": cm_csv,
                "storage": "inline",
                "labels": list(range(num_classes))
            },
            {
                'storage': 'inline',
                'source': '''# Model Overview
## Model Summary

```
{}
```

## Model Performance

**Accuracy**: {}
**Loss**: {}

'''.format(metric_model_summary,model_accuracy,model_loss),
                'type': 'markdown',
            }
        ]
    }

    metrics = {
      'metrics': [{
          'name': 'model_accuracy',
          'numberValue':  float(model_accuracy),
          'format' : "PERCENTAGE"
        },{
          'name': 'model_loss',
          'numberValue':  float(model_loss),
          # The loss is not a ratio, so report it as a raw number
          'format' : "RAW"
        }]}

    ### Save model to minIO

    keras.models.save_model(model,"/tmp/detect-digits")

    upload_local_directory_to_minio("/tmp/detect-digits",minio_bucket,"models/detect-digits/1/") # 1 for version 1

    print("Saved model to minIO")

    output = namedtuple('output', ['mlpipeline_ui_metadata', 'mlpipeline_metrics'])
    return output(json.dumps(metadata),json.dumps(metrics))
    


In [None]:
def serve_step(model_pvc: str = "digits-model"):
    """Create a KServe InferenceService that serves the model from a PVC.

    The service is named ``digits-recognizer-<timestamp>`` and is created in
    the namespace returned by ``kserve.utils.get_default_target_namespace``.

    The default repeats the notebook's model PVC name as a literal: a KFP
    function component is executed from a copy of this function's source
    only, so notebook-level variables are not available inside the container.

    Args:
        model_pvc: Name of the PVC that holds the model to serve.
    """
    print("Model Serve Step")

    # Imported inside the function so the component is self-contained.
    from datetime import datetime

    from kubernetes import client
    from kserve import KServeClient
    from kserve import constants
    from kserve import utils
    from kserve import V1beta1InferenceService
    from kserve import V1beta1InferenceServiceSpec
    from kserve import V1beta1PredictorSpec
    from kserve import V1beta1TFServingSpec

    namespace = utils.get_default_target_namespace()

    # A timestamp suffix gives every run its own service name
    timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
    name = 'digits-recognizer-{}'.format(timestamp)

    kserve_version = 'v1beta1'
    api_version = constants.KSERVE_GROUP + '/' + kserve_version

    # TODO(review): open question from the original author - does the
    # storage URI need a model path inside the volume after the PVC name?
    storage_uri = "pvc://" + model_pvc

    predictor = V1beta1PredictorSpec(
        tensorflow=V1beta1TFServingSpec(storage_uri=storage_uri)
    )
    isvc = V1beta1InferenceService(
        api_version=api_version,
        kind=constants.KSERVE_KIND,
        metadata=client.V1ObjectMeta(
            name=name,
            namespace=namespace,
            annotations={'sidecar.istio.io/inject':'false'},
        ),
        spec=V1beta1InferenceServiceSpec(predictor=predictor),
    )

    kserve_client = KServeClient()
    kserve_client.create(isvc)

In [None]:
# Generate components
from kfp.onprem import mount_pvc

comp_clone = components.create_component_from_func(
    clone_step,
    base_image=clone_step_container_image,
    packages_to_install=['netapp-dataops-k8s==2.4.0'],
)

# create_component_from_func returns a task factory (a plain function), which
# has no .apply(); volumes can only be mounted on the task the factory
# creates. The wrappers below keep the comp_train(...) / comp_serve() call
# style while attaching the mounts to each created task.
_train_op = components.create_component_from_func(
    train_step,
    base_image=train_step_container_image,
)
_serve_op = components.create_component_from_func(
    serve_step,
    base_image=serve_step_container_image,
    packages_to_install=['kserve==0.10.1'],
)


def comp_train(*args, **kwargs):
    """Create the train task with the data and model volumes mounted."""
    task = _train_op(*args, **kwargs)
    task.apply(mount_pvc(train_step_train_pvc, 'train', train_step_train_mountpoint))
    # The validation volume is configured alongside the training volume, so
    # it is mounted as well.
    task.apply(mount_pvc(train_step_valid_pvc, 'valid', train_step_valid_mountpoint))
    task.apply(mount_pvc(train_step_model_pvc_existing, 'model', train_step_model_mountpoint))
    return task


def comp_serve(*args, **kwargs):
    """Create the serve task with the model volume mounted."""
    task = _serve_op(*args, **kwargs)
    task.apply(mount_pvc(serve_step_model_pvc_existing, 'model', serve_step_model_mountpoint))
    return task

In [None]:
# The decorator and the function it decorates must live in the same cell;
# a decorator on its own is a syntax error.
@dsl.pipeline(
    name='digits-recognizer-pipeline',
    description='Detect digits'
)
def create_pipe(no_epochs, optimizer):
    """Run the clone, train and serve steps strictly one after another.

    Args:
        no_epochs: Number of training epochs, forwarded to the train step.
        optimizer: Optimizer name, forwarded to the train step.
    """
    clone_task = comp_clone()

    train_task = comp_train(no_epochs, optimizer)
    train_task.after(clone_task)

    serve_task = comp_serve()
    serve_task.after(train_task)


OLD

In [None]:
# Base directory from which the paths in the following cells are built
ROOT_DIR = '/home/jovyan'

In [None]:
# Designate a root folder for the data
DATA_DIR = 'data'
DATA_ROOT = os.path.join(ROOT_DIR, DATA_DIR)
# Create the folder if it is missing, then stop here if it still does not exist
os.makedirs(DATA_ROOT, exist_ok=True)
assert os.path.exists(DATA_ROOT)

In [None]:
def _volume_paths(pvc_name, file_name):
    """Ensure the folder for `pvc_name` exists under DATA_ROOT.

    Returns the folder path and the path of `file_name` inside it.
    """
    volume_root = os.path.join(DATA_ROOT, pvc_name)
    os.makedirs(volume_root, exist_ok=True)
    assert os.path.exists(volume_root)
    return volume_root, os.path.join(volume_root, file_name)


# Training data paths
DATA_TRAIN_PVC = 'digits-train'
DATA_TRAIN_ROOT, DATA_TRAIN_FILE = _volume_paths(DATA_TRAIN_PVC, 'train.csv')
print(DATA_TRAIN_FILE)
assert os.path.exists(DATA_TRAIN_FILE)

# Testing data paths
DATA_TEST_PVC = 'digits-test'
DATA_TEST_ROOT, DATA_TEST_FILE = _volume_paths(DATA_TEST_PVC, 'test.csv')
assert os.path.exists(DATA_TEST_FILE)

# Validation data paths
DATA_VALID_PVC = 'digits-valid'
DATA_VALID_ROOT, DATA_VALID_FILE = _volume_paths(DATA_VALID_PVC, 'valid.csv')
assert os.path.exists(DATA_VALID_FILE)

# Production data paths
DATA_PROD_PVC = 'digits-prod'
DATA_PROD_ROOT, DATA_PROD_FILE = _volume_paths(DATA_PROD_PVC, 'prod.csv')
assert os.path.exists(DATA_PROD_FILE)

In [None]:
# Model data paths
DATA_MODEL_PVC = 'digits-model'
DATA_MODEL_ROOT = os.path.join(DATA_ROOT, DATA_MODEL_PVC)
# Create the model folder if it is missing, then stop here if it still does not exist
os.makedirs(DATA_MODEL_ROOT, exist_ok=True)
assert os.path.exists(DATA_MODEL_ROOT)

## Copy the data volumes
We will not touch the original volumes but instead will work with cloned volumes only.  

In [None]:
#USER_NAMESPACE = "kubeflow-user-example-com"

In [None]:
# New persistentvolumeclaims names for the clone volumes
#CLONE_TRAIN_PVC = 'digits-train-clone' 
#CLONE_VALID_PVC = 'digits-valid-clone'
#CLONE_TEST_PVC = 'digits-test-clone'
#CLONE_PROD_PVC = 'digits-prod-clone'

In [None]:
# Clone the training volume 
# in the USER_NAMESPACE namespace and create a new persistentvolumeclaim
#cloneVolume(sourcePvcName=DATA_TRAIN_PVC, newPvcName=CLONE_TRAIN_PVC, namespace=USER_NAMESPACE)

In [None]:
#CLONE_DIR = 'clone'
#CLONE_ROOT = os.path.join(ROOT_DIR, CLONE_DIR)
#os.makedirs(CLONE_ROOT, exist_ok=True)
#assert os.path.exists(CLONE_ROOT)

In [None]:
# Mount the new clone volume under the DATA_ROOT
#import subprocess
#nfs_server = "192.168.0.71"
#nfs_export = "/trident_pvc_a9abdc63_4840_493b_bdfe_45f2238dcc15"
#CLONE_TRAIN_ROOT = os.path.join(CLONE_ROOT, CLONE_TRAIN_PVC)
#os.makedirs(CLONE_TRAIN_ROOT, exist_ok=True)
#assert os.path.exists(CLONE_TRAIN_ROOT)

#subprocess.run(['mount', '-t', 'nfs', f'{nfs_server}:{nfs_export}', CLONE_TRAIN_ROOT])

In [None]:
# Clone the validation volume 
# in the USER_NAMESPACE namespace and create a new persistentvolumeclaim
#cloneVolume  --source-pvc-name=DATA_VALID_PVC --new-pvc-name=CLONE_VALID_PVC --namespace=USER_NAMESPACE

In [None]:
# Clone the test volume 
# in the USER_NAMESPACE namespace and create a new persistentvolumeclaim
#cloneVolume  --source-pvc-name=DATA_TEST_PVC --new-pvc-name=CLONE_TEST_PVC --namespace=USER_NAMESPACE

In [None]:
# Clone the production volume 
# in the USER_NAMESPACE namespace and create a new persistentvolumeclaim
#cloneVolume  --source-pvc-name=DATA_PROD_PVC --new-pvc-name=CLONE_PROD_PVC --namespace=USER_NAMESPACE

## Training Data Preparation

In [None]:
# Load the training CSV file into a pandas DataFrame
TRAIN_DF = pd.read_csv(DATA_TRAIN_FILE)

In [None]:
# Preview the first rows of the training data (head() shows 5 rows by default)
TRAIN_DF.head()

In [None]:
# Shape (rows, columns) of the training data before any preparation
TRAIN_DF.shape

In [None]:
# The label column becomes TRAIN_Y; every other column is image data
TRAIN_Y = TRAIN_DF['label']
train_pixels = TRAIN_DF.drop(columns='label').values
# The Keras API needs one (height = 28px, width = 28px, channel = 1) array per image.
# Pixel values range from 0-255, so dividing by 255 scales them to 0-1.
TRAIN_X = train_pixels.reshape(-1, 28, 28, 1) / 255.0
# Show the resulting shapes
TRAIN_X.shape, TRAIN_Y.shape

## Model Building


In [None]:
# Hyperparameters: number of training epochs passed to model.fit below
EPOCHS = 3

In [None]:
# Set the TensorFlow and NumPy random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# Set the verbosity of TensorFlow's (v1-compat) logger to INFO
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

In [None]:

# Describe the network as one ordered stack of layers
model = keras.models.Sequential([
    # Three convolution + max-pooling stages
    keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(28,28,1)),
    keras.layers.MaxPool2D(2, 2),
    keras.layers.Conv2D(64, (3, 3), activation='relu'),
    keras.layers.MaxPool2D(2, 2),
    keras.layers.Conv2D(64, (3, 3), activation='relu'),
    keras.layers.MaxPool2D(2, 2),
    # Flatten the feature maps before the dense layers
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    # Most important: the output layer has 10 classes, numbers from 0-9
    keras.layers.Dense(10, activation='softmax'),
])

# Print an overview of the layers
model.summary()

In [None]:
# Render a diagram of the model to a file named after PNG_MODEL_FILE, without opening a viewer
# NOTE(review): `visualizer` is not imported in the visible cells - presumably keras_visualizer; confirm
PNG_MODEL_FILE = os.path.join(ROOT_DIR,'digits-model')
visualizer(model, file_name=PNG_MODEL_FILE, file_format='png', view=False)

In [None]:
# Display the rendered model diagram inline in the notebook
from IPython.display import Image
Image(PNG_MODEL_FILE+ '.png')


In [None]:
# Compile the model for multi-class classification (10 digit classes)
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

In [None]:
# Fit the model on the training data for EPOCHS epochs and keep the training history
history = model.fit(
  x=TRAIN_X,
  y=TRAIN_Y,
  epochs=EPOCHS
)

In [None]:
# Save the model to the model volume folder
keras.models.save_model(model, DATA_MODEL_ROOT)

## Model Access

In [None]:
# Load the previously saved model from the model volume folder
# (allows skipping the training cells when a trained model already exists)
model = keras.models.load_model(DATA_MODEL_ROOT)

## Model Validation

In [None]:
# Load the validation CSV file into a pandas DataFrame
VALID_DF = pd.read_csv(DATA_VALID_FILE)

In [None]:
# Preview the first rows of the validation data
VALID_DF.head()

In [None]:
# Shape (rows, columns) of the validation data before any preparation
VALID_DF.shape

In [None]:
# Split the validation data: the label column becomes VALID_Y,
# every other column is image data
VALID_Y = VALID_DF['label']
valid_pixels = VALID_DF.drop(columns='label').values

# One (height = 28px, width = 28px, channel = 1) array per image.
# Pixel values range from 0-255, so dividing by 255 scales them to 0-1.
VALID_X = valid_pixels.reshape(-1, 28, 28, 1) / 255.0

In [None]:
# Shape of the prepared validation images
VALID_X.shape

In [None]:
# Evaluate the model against the validation dataset
# Returns the loss value & metrics values for the model in test mode.
model_loss, model_accuracy = model.evaluate(x=VALID_X,y=VALID_Y, verbose=0)
print("Test_loss: {}, Test_accuracy: {} ".format(model_loss,model_accuracy))

In [None]:
# Confusion Matrix

# Generate output predictions for the validation images.
test_predictions = model.predict(x=VALID_X)

# Take the index of the maximum value along axis 1.
test_predictions = np.argmax(test_predictions,axis=1) # the prediction outputs 10 values, we take the index number of the highest value, which is the prediction of the model

# Generate the confusion matrix from the validation labels and the predictions
confusion_matrix = tf.math.confusion_matrix(labels=VALID_Y,predictions=test_predictions)

# Plot the confusion matrix as an annotated heatmap
h = sns.heatmap(confusion_matrix, fmt='g', cbar=False, annot=True,cmap='Blues')
h.set(xlabel='Predicted', ylabel='Actual', title="Confusion Matrix")

## Create a snapshot of the model volume

Creating a snapshot of the model volume protects the volume and also allows it to be cloned in the future.

In [None]:
# Namespace of the model PVC and the name to give its snapshot
USER_NAMESPACE = "kubeflow-user-example-com"
DATA_MODEL_SNAP = 'digits-model-snap'  

In [None]:
# Create a VolumeSnapshot for the volume attached to the
#   PersistentVolumeClaim (PVC) named in the variable DATA_MODEL_PVC, in the namespace named in USER_NAMESPACE.
#   NOTE: if snapshotName is not specified, the snapshot name will be set to 'ntap-dsutil.<timestamp>'
createVolumeSnapshot(pvcName=DATA_MODEL_PVC, namespace=USER_NAMESPACE, snapshotName=DATA_MODEL_SNAP, printOutput=True)

## Restore the snapshots of the volumes used
We will restore the train and valid volumes from their snapshots to ensure that nothing has changed during the model creation process.

In [None]:
# Names of the snapshots that the following cells restore
DATA_TRAIN_SNAP = 'digits-train-snap'
DATA_VALID_SNAP = 'digits-valid-snap'

In [None]:
# Restore the training volume from its VolumeSnapshot.
# The keyword is `snapshotName` (as used for createVolumeSnapshot); the
# misspelled `snapshoptName` is rejected as an unexpected keyword argument.
restoreVolumeSnapshot(snapshotName=DATA_TRAIN_SNAP, namespace=USER_NAMESPACE,  printOutput=True)

In [None]:
# Restore the validation volume from its VolumeSnapshot.
# The keyword is `snapshotName` (as used for createVolumeSnapshot); the
# misspelled `snapshoptName` is rejected as an unexpected keyword argument.
restoreVolumeSnapshot(snapshotName=DATA_VALID_SNAP, namespace=USER_NAMESPACE,  printOutput=True)