# Lab 3 Modeling 
1. Define a model 
2. Experiment and tune hyperparameters
3. Train model on training cluster
4. Save model back into project repo 

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import random as ran
from datetime import datetime 
import os

# Restore the STUDENT variable from IPython's %store persistence
# (presumably saved by an earlier lab notebook - confirm). Later cells use it to build paths.
%store -r STUDENT

In [None]:
# Short alias for the Keras MNIST dataset module; its load_data() is called further down in this cell
mnist = tf.keras.datasets.mnist

# Project repo path function - file system mount available to all app containers
def ProjectRepo(path):
    """Return the absolute location of `path` inside the shared project repo.

    Args:
        path: Location relative to the project repo root, as a string. A
            leading '/' is accepted and ignored, so "data/x" and "/data/x"
            produce the same result.

    Returns:
        str: The repo root and `path` joined by exactly one '/'.
    """
    repo_root = "/bd-fs-mnt/project_repo"
    # Strip the slashes that meet at the join so a caller passing "/data/..."
    # no longer gets ".../project_repo//data/...".
    return repo_root.rstrip('/') + '/' + path.lstrip('/')

# Path of this student's MNIST archive inside the project repo
MNIST_LOC = ProjectRepo("/data/" + STUDENT + "_MNIST/mnist.npz")

# Read the train and test splits from that archive
(train_images, train_labels), (test_images, test_labels) = mnist.load_data(
    path=MNIST_LOC
)

# Divide both image arrays by 255.0 (presumably to bring 8-bit pixel
# intensities into the [0, 1] range - confirm against the dataset)
train_images = train_images / 255.0
test_images = test_images / 255.0

# Before running the next couple of cells for training, please shut down all other kernels to free up resources.

# Model Development 1 
- Define a small model and train it for 2 epochs

In [None]:
# Experiment 1: a small fully connected classifier.
# Flatten the 28x28 input -> Dense(512, ReLU) -> Dropout(0.2) -> Dense(10, softmax).
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer='Adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

# Time the 2-epoch training run (Keras' default batch size is used here)
TrainstartTime = datetime.now()
model.fit(train_images, train_labels, epochs=2)
TrainendTime = datetime.now()
print("\nTraining Time:", TrainendTime - TrainstartTime)

# Time the evaluation on the held-out test split. The evaluate() call was
# previously commented out, so the reported "Evaluate Time" measured nothing.
print("\nEvaluate Test Images:")
EvalstartTime = datetime.now()
model.evaluate(test_images, test_labels)
EvalendTime = datetime.now()
print("\nEvaluate Time:", EvalendTime - EvalstartTime)

# Reset Keras' global state before the next experiment builds a new model
tf.keras.backend.clear_session()

# Model Development 2 
- Introduce a batch-size variable

In [None]:
# Experiment 2: same architecture as experiment 1, but with an explicit batch size
BATCH_SIZE = 128

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer='Adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

# Time the 2-epoch training run with BATCH_SIZE samples per gradient step
TrainstartTime = datetime.now()
model.fit(train_images, train_labels, epochs=2,batch_size=BATCH_SIZE)
TrainendTime = datetime.now()
print("\nTraining Time:", TrainendTime - TrainstartTime)

# The cell used to stop right after this banner without evaluating anything;
# run and time the evaluation the same way the first experiment does.
print("\nEvaluate Test Images:")
EvalstartTime = datetime.now()
model.evaluate(test_images, test_labels)
EvalendTime = datetime.now()
print("\nEvaluate Time:", EvalendTime - EvalstartTime)

# Reset Keras' global state, matching the cleanup at the end of experiment 1
tf.keras.backend.clear_session()

## Train the model on the remote shared training cluster

In general, data scientists use their local Jupyter Notebook to **experiment** with several learning algorithms and a variety of parameters. They do so to determine the ML model that works best for the business problem they are trying to address and to develop the model that yields the best prediction results. Then, from within their notebooks, they submit their code to a large-scale training cluster environment to train and test their full ML models in a reasonable time, typically against larger training and test datasets. The output of this step is a trained model ready for deployment in production.

>**Note:** _This workshop is not intended to teach you about AI/ML model experimentation and development. It is intended to present a use case for an end-to-end data science ML workflow with HPE Ezmeral ML Ops. Therefore, we will assume that the experimentation step has already been done and that the data science team has shared the best-performing ML model in a notebook in the GitHub version control repository set up by the Operations team for the data science team. That notebook is this very notebook, pulled from the GitHub repository by the local Jupyter Notebook cluster. Here you will submit the ML model code to the tenant-shared training cluster environment to train and test your model against the train/test dataset._

In [None]:
# NOTE(review): platform-provided line magic; presumably lists the training
# clusters attached to this notebook - confirm against the platform docs.
%attachments

## Here we will train for 5 epochs
- Don't forget to fill in your <b>STUDENT</b> variable and your training cluster

In [None]:
%%training

import tensorflow as tf
import matplotlib.pyplot as plt
import random as ran
from datetime import datetime 
import os
# Short alias for the Keras MNIST dataset module; its load_data() is called later in this cell
mnist = tf.keras.datasets.mnist

# TODO: fill in your student identifier before running this cell - it is
# embedded in both the dataset path and the saved-model path below.
STUDENT = ""

# Project repo path function - file system mount available to all app containers
# (defined again in this cell, presumably because the cell body is submitted to
# the training cluster and cannot see the notebook kernel's names - confirm)
def ProjectRepo(path):
    """Return the absolute location of `path` inside the shared project repo.

    Args:
        path: Location relative to the project repo root, as a string. A
            leading '/' is accepted and ignored, so "data/x" and "/data/x"
            produce the same result.

    Returns:
        str: The repo root and `path` joined by exactly one '/'.
    """
    repo_root = "/bd-fs-mnt/project_repo/"
    # The root ends in '/' and callers may pass a leading '/', which used to
    # produce ".../project_repo///data/..."; strip both sides of the join.
    return repo_root.rstrip('/') + '/' + path.lstrip('/')

# Fail fast if the STUDENT placeholder was left empty: both the dataset path
# and the saved-model path embed it, so an empty value points at "_MNIST".
if not STUDENT:
    raise ValueError("STUDENT is empty - set it to your student identifier before running this cell")

# Locations of mnist data
MNIST_LOC = ProjectRepo("/data/" + STUDENT + "_MNIST/mnist.npz")

# Load the train/test splits and divide the image arrays by 255.0
# (presumably scaling 8-bit pixel intensities into [0, 1] - confirm)
(train_images, train_labels),(test_images, test_labels) = mnist.load_data(path=MNIST_LOC)
train_images, test_images = train_images / 255.0, test_images / 255.0
BATCH_SIZE = 128

# Same architecture as the local experiments:
# Flatten the 28x28 input -> Dense(512, ReLU) -> Dropout(0.2) -> Dense(10, softmax).
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer='Adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

# Time the full 5-epoch training run
TrainstartTime = datetime.now()
model.fit(train_images, train_labels, epochs=5,batch_size=BATCH_SIZE)
TrainendTime = datetime.now()
print("\nTraining Time:", TrainendTime - TrainstartTime)

# The script used to print this banner without evaluating anything; run and
# time the evaluation on the test split so the logs show test metrics.
print("\nEvaluate Test Images:")
EvalstartTime = datetime.now()
model.evaluate(test_images, test_labels)
EvalendTime = datetime.now()
print("\nEvaluate Time:", EvalendTime - EvalstartTime)

#save model in h5 format
model.save(ProjectRepo('models/' + STUDENT + '_MNIST/mnist_digits.h5'))

In [None]:
# Replace the <your history url> placeholder with the history URL printed in the
# output of the previous (training) cell before running this line.
# NOTE(review): %logs presumably fetches the log of that submitted job - confirm.
%logs --url <your history url>