## What we will cover:
- Preprocess the data
- Load the data
- Train the model and store the artifacts & metrics in MLflow
- Prediction test to validate the results

# Prerequisites 
<div class="alert alert-block alert-danger">
<b>Important</b> Make sure the parameters in the cell below are valid for your environment before running the rest of the notebook.
</div>


In [2]:
%ls /mnt/shared/end2end-main-exercises/exercises

 01.exploring_data_with_spark.ipynb   exercise5.ipynb
 02.query_with_ezpreso.ipynb          exercise6.ipynb
 03.visualizing_data_superset.ipynb   exercise7.ipynb
 create_csv.py                        final_challenge.ipynb
 currency_conversion.json             images/
 data/                                old/
'end2end application'/                requirements.txt
 exercise4.ipynb


In [None]:
# Set parameters

# Adapt to your EZUA domain name
EZAF_ENV = "i007ua.tryezmeral.com"
# Path to the end2end demo (not data)
end2end_path = '/mnt/datasources/datafabric/ezua/end2end/' 
# Path to the data for model training, etc.
# Keep the trailing '/': sub-directory names are appended by plain string concatenation.
path = '/mnt/datasources/datafabric/ezua/end2end-data/fruits/' 
# Path to the GROUP INDIVIDUAL data for model training, etc.
# Keep the trailing '/': the group name is appended by plain string concatenation.
group_data_path = '/mnt/datasources/datafabric/ezua/end2end-group-data/' 
# Experiment name prefix for MLflow
experiment_name = "end2end-retail-demo"
model_name = "end2end-retail-demo"
# NOTE(review): group_name is not defined in the visible part of this notebook;
# it must already exist when this cell runs -- TODO confirm where it is set.
g_model_name = group_name + "-" + model_name
# artifact_path = "end2end-retail-demo"
# Artifact sub-path used for the model inside an MLflow run
artifact_path = "model"

### Validating the setup 
<div class="alert alert-block alert-danger">
<b>Important</b> DO NOT CHANGE
</div>


In [None]:
import os

# Create the per-group working directories if they do not exist yet.
# exist_ok=True makes the calls idempotent and avoids the race between a
# separate os.path.exists() check and the directory creation.
group_dir = group_data_path + group_name
os.makedirs(group_dir, exist_ok=True)
for sub_dir in ("train", "test", "validation"):
    os.makedirs(group_dir + "/" + sub_dir, exist_ok=True)

# Check that the directories needed for model training exist.
# NOTE(review): group_data_path is a parent of the directories created above,
# so its check cannot fail at this point -- confirm whether it was meant to be
# verified *before* creating the group directories.
dirExist_path = os.path.exists(path)
dirExist_end2end_path = os.path.exists(end2end_path)
dirExist_group_data_path = os.path.exists(group_data_path)

if all((dirExist_path, dirExist_end2end_path, dirExist_group_data_path)):
    print()
else:
    print("Do not run this notebook further, dataset is missing...")

### Import required libraries & refresh token
- Ignore the warnings

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
import tensorflow as tf
import mlflow
import mlflow.tensorflow  # Choose either mlflow.tensorflow or mlflow.keras based on your needs
import os
import urllib3
import time
import requests
import json
from keras.callbacks import CSVLogger
from IPython.display import display
from PIL import Image
from io import BytesIO
from tensorflow.keras.preprocessing.image import load_img, img_to_array

`%update_token` refreshes your access token for the platform. Run it again if the exercise takes so long that the token expires.

In [None]:
%update_token

# 1. Preprocessing & Loading<a class="anchor" id="1"></a>

### Load images from the folder and use folder name as Label

In [None]:
# File extensions (compared in lower case) that are treated as images
IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')


def _list_image_files(directory):
    """Return every image file found recursively below *directory*.

    A file counts as an image when its lower-cased suffix is one of
    IMAGE_SUFFIXES. Files whose name starts with '.' are skipped.
    """
    return [
        p for p in directory.glob('**/*')
        if p.suffix.lower() in IMAGE_SUFFIXES and not p.name.startswith('.')
    ]


# Training split: directory and list of all image files below it
train_dir = Path(path + 'train')
train_filepaths = _list_image_files(train_dir)

# Testing split: directory and list of all image files below it
test_dir = Path(path + 'test')
test_filepaths = _list_image_files(test_dir)

# Validation split: directory and list of all image files below it
val_dir = Path(path + 'validation')
val_filepaths = _list_image_files(val_dir)

# Define a function to create a DataFrame with filepaths and labels for a given list of filepaths
def proc_img(filepath):
    """Create a shuffled DataFrame with the filepath and label of each picture.

    The label of a picture is the name of the directory that contains it.

    Parameters
    ----------
    filepath : iterable of str or pathlib.Path
        Paths of the image files.

    Returns
    -------
    pandas.DataFrame
        One row per kept picture with the string columns 'Filepath' and
        'Label', in random order and with a fresh 0..n-1 index.
    """
    # Pair every path with its label first, then filter the pairs. Filtering
    # paths and labels together keeps them aligned; filtering only the labels
    # would shift them against the filepaths and mislabel pictures.
    pairs = [
        (p, Path(p).parent.name)
        for p in filepath
        # Skip copies created by Jupyter
        if ".ipynb_checkpoints" not in str(p)
    ]
    # Skip pictures stored in hidden directories
    pairs = [(p, label) for p, label in pairs if not label.startswith('.')]

    # Convert filepaths and labels to pandas Series objects
    filepath = pd.Series([p for p, _label in pairs], name='Filepath', dtype=str)
    labels = pd.Series([label for _p, label in pairs], name='Label', dtype=str)

    # Concatenate the filepaths and labels into a DataFrame
    df = pd.concat([filepath, labels], axis=1)

    # Shuffle the DataFrame and reset the index
    df = df.sample(frac=1).reset_index(drop=True)

    return df

# Build one DataFrame per split and drop rows with a missing filepath or label
train_df = proc_img(train_filepaths)
train_df.dropna(inplace=True)

test_df = proc_img(test_filepaths)
test_df.dropna(inplace=True)

val_df = proc_img(val_filepaths)
val_df.dropna(inplace=True)

# All three splits must contain the same number of distinct labels.
# Collecting the counts in a set catches *any* mismatch; a chained
# `a != b != c` would only fire when both neighbouring pairs differ.
label_counts = {
    len(train_df.Label.unique()),
    len(test_df.Label.unique()),
    len(val_df.Label.unique()),
}
if len(label_counts) != 1:
    print('incorrect amount of Labels, please do not continue...')

# Print the same summary for each split
for header, title, split_df in (
    ('#### Training set ####', '-- Training set --\n', train_df),
    ('#### Test set ####', '-- Test set --\n', test_df),
    ('#### Validation set ####', '-- Validate set --\n', val_df),
):
    print()
    print(header)
    print()
    print(title)
    print(f'Number of pictures: {split_df.shape[0]}\n')
    print(f'Number of different labels: {len(split_df.Label.unique())}\n')
    print(f'Labels: {split_df.Label.unique()}')


In [None]:
# Preview the first five rows of the training DataFrame:
# the file paths are in the 'Filepath' column, the labels in the 'Label' column
train_df.head(5)

### Drop duplicates and display 1 item of each label

In [None]:
# Keep the first row of every label so each category appears exactly once
df_unique = train_df.drop_duplicates(subset=["Label"]).reset_index()

# One row of five subplots without axis ticks
fig, axes = plt.subplots(
    nrows=1,
    ncols=5,
    figsize=(15, 10),
    subplot_kw={'xticks': [], 'yticks': []},
)

# Fill the subplots left to right; zip() stops as soon as either the
# subplots or the available labels run out
for ax, (row_index, sample) in zip(axes.flat, df_unique.iterrows()):
    ax.imshow(plt.imread(sample['Filepath']))
    ax.set_title(sample['Label'], fontsize=12)

plt.tight_layout(pad=0.5)

# 2. Load the Images with a generator and Data Augmentation<a class="anchor" id="2"></a>

### Setting up data generators for training, validation, and testing image datasets using the MobileNetV2 preprocessing function
- Utilizes tf.keras.preprocessing.image.ImageDataGenerator for real-time data augmentation during training.
- The training images are loaded from a Pandas DataFrame using the specified 'Filepath' column for input data and 'Label' column for output data.
- Images are resized to (224, 224) pixels, and the MobileNetV2 preprocessing function is applied.
- Data augmentation techniques like rotation, zoom, shift, shear, and horizontal flip are employed to enhance the diversity of the training dataset.
- Batches of 32 images are generated, and the order is shuffled for each epoch.

In [None]:
# MobileNetV2 preprocessing, shared by all three generators
mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

# Training generator with real-time data augmentation.
# The augmentation options belong to the ImageDataGenerator constructor;
# flow_from_dataframe() does not apply them.
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
    rotation_range=30, # Randomly rotate images up to 30 degrees
    zoom_range=0.15, # Randomly zoom images up to 15%
    width_shift_range=0.2, # Randomly shift images horizontally up to 20%
    height_shift_range=0.2, # Randomly shift images vertically up to 20%
    shear_range=0.15, # Randomly apply shearing transformations to images
    horizontal_flip=True, # Randomly flip images horizontally
    fill_mode="nearest", # Fill pixels exposed by a transformation with the nearest pixel
)

# Generate a flow of images and labels from a Pandas dataframe for training
train_images = train_generator.flow_from_dataframe(
    dataframe=train_df, # Use the specified Pandas dataframe
    x_col='Filepath', # Use the 'Filepath' column as the input (x) data
    y_col='Label', # Use the 'Label' column as the output (y) data
    target_size=(224, 224), # Resize the images to the specified dimensions
    color_mode='rgb', # Use RGB color mode
    class_mode='categorical', # Use categorical classification
    batch_size=32, # Generate batches of 32 images at a time
    shuffle=True, # Shuffle the order of the images
    seed=0, # Use a fixed seed for reproducibility
)

# Validation generator: preprocessing only. Validation images are not
# augmented so that the validation metrics describe the unmodified pictures.
val_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess
)

# Generate a flow of images and labels from a Pandas dataframe for validation
val_images = val_generator.flow_from_dataframe(
    dataframe=val_df, # Use the specified Pandas dataframe
    x_col='Filepath', # Use the 'Filepath' column as the input (x) data
    y_col='Label', # Use the 'Label' column as the output (y) data
    target_size=(224, 224), # Resize the images to the specified dimensions
    color_mode='rgb', # Use RGB color mode
    class_mode='categorical', # Use categorical classification
    batch_size=32, # Generate batches of 32 images at a time
    shuffle=True, # Shuffle the order of the images
    seed=0, # Use a fixed seed for reproducibility
)

# Test generator: preprocessing only
test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess
)

# Generate a flow of images and labels from a Pandas dataframe for testing
test_images = test_generator.flow_from_dataframe(
    dataframe=test_df, # Use the specified Pandas dataframe
    x_col='Filepath', # Use the 'Filepath' column as the input (x) data
    y_col='Label', # Use the 'Label' column as the output (y) data
    target_size=(224, 224), # Resize the images to the specified dimensions
    color_mode='rgb', # Use RGB color mode
    class_mode='categorical', # Use categorical classification
    batch_size=32, # Generate batches of 32 images at a time
    shuffle=False, # Keep the order so predictions line up with test_images.classes
)

### Load the pretrained model
- Load MobileNetV2 model pretrained on ImageNet with input size (224, 224, 3).
- Configure the model for feature extraction (excluding top layer) and freeze its weights for transfer learning.

In [None]:
# Options for the pretrained MobileNetV2 base network:
# 224x224 RGB input, no classification head, ImageNet weights
mobilenet_options = {
    'input_shape': (224, 224, 3),
    'include_top': False,
    'weights': 'imagenet',
    'pooling': 'avg',
}

# Load the pretrained model and freeze its weights for transfer learning
pretrained_model = tf.keras.applications.MobileNetV2(**mobilenet_options)
pretrained_model.trainable = False

# 3. Train the model<a class="anchor" id="3"></a>

### 3.0 Set parameters for model training - feel free to adjust for the best results in terms of training duration but also accuracy of the result

In [None]:
# Maximum number of epochs to train the model (early stopping may end training sooner)
param_epoch = 15

# Batch size passed to model.fit() and logged to MLflow.
# NOTE(review): the image generators are created with a hard-coded
# batch_size=32 -- confirm the two values are meant to stay in sync.
param_batch_size = 32

# Number of epochs to wait before early stopping if the validation loss does not improve
param_patience = 4

### 3.1 MLflow config setup 

In [None]:
%update_token

In [None]:
# Imported here because this cell is the only user and the notebook's import
# cell does not provide these names.
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# The experiment name is prefixed with the group name
exp_name = group_name + "-" + experiment_name

# Look up the experiment by name. ViewType.ALL includes deleted
# experiments, which is needed for the restore step below.
experiments = mlflow.search_experiments(
    view_type=ViewType.ALL,
    order_by=["experiment_id"],
    filter_string=f"name = '{exp_name}'",
)

# An experiment that was deleted must be restored before it can be used again.
# The list may be empty, so check it before accessing the first element.
if experiments and experiments[0].lifecycle_stage == 'deleted':
    MlflowClient().restore_experiment(experiments[0].experiment_id)

# Set/create the MLflow experiment
mlflow.set_experiment(exp_name)

# Define the MLflow run name (fixed prefix plus local timestamp)
run_name = "end2end-retail-demo-" + time.strftime("%Y%m%d-%H%M%S", time.localtime())

### 3.2 Model training (using Keras and Tensorflow) .... the execution of this cell takes a while ;)

- Extends the pretrained MobileNetV2 model with additional dense layers for transfer learning.
- Creates a new Keras model with specified architecture and compiles it using Adam optimizer and categorical crossentropy loss.

In [None]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# The new model takes the same input as the pretrained network
inputs = pretrained_model.input

# Stack two dense layers (128 units, ReLU) on the output of the pretrained model
x = pretrained_model.output
for layer_number in range(2):
    x = tf.keras.layers.Dense(128, activation='relu')(x)

# The output layer has one softmax unit per distinct label of the training set
labels = len(train_df.Label.unique())
outputs = tf.keras.layers.Dense(labels, activation='softmax')(x)

# Assemble and compile the model
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Class mapping of the training generator (label name -> class index).
# This rebinds `labels`, which held the number of labels while the model was built.
labels = (train_images.class_indices)

In [None]:
# Show the current value of `labels`
print(labels)

- Initiates an MLflow run, logs model architecture, and records hyperparameters.
- Trains the model using the specified hyperparameters and early stopping callback.
- Saves the trained model in TensorFlow Serving format.
- Logs the saved model as an MLflow artifact and logs metrics for each epoch.
- Metrics include training and validation accuracy and loss.
- The model is saved with a specific version (1) for TensorFlow Serving.
- The validation dataset is used to evaluate the model's performance on unseen data and prevent overfitting.

In [None]:
# End a run that is still active (e.g. from an interrupted earlier execution)
if mlflow.active_run():
    mlflow.end_run()

# Start a new MLflow run
with mlflow.start_run(run_name=run_name) as active_run:

    # Enable MLflow autologging for the training below
    mlflow.autolog()

    # Log the hyperparameters
    mlflow.log_param("batch_size", param_batch_size)
    mlflow.log_param("epochs", param_epoch)
    mlflow.log_param("patience", param_patience)

    # Resolve the artifact locations from the active run instead of
    # hard-coding bucket and experiment id.
    # get_artifact_uri() takes an artifact path (not a run id) and always
    # refers to the currently active run.
    run_id = active_run.info.run_id
    artifact_uri = mlflow.get_artifact_uri()
    test_uri = mlflow.get_artifact_uri(artifact_path)

    # Train the model with the specified hyperparameters and callbacks
    history = model.fit(
        train_images,
        validation_data=val_images,
        batch_size=param_batch_size,
        epochs=param_epoch,
        verbose='auto',
        callbacks=[
            # Stop when val_loss has not improved for `param_patience`
            # epochs and restore the weights of the best epoch
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=param_patience,
                restore_best_weights=True
            )
        ]
    )

    # Save the model in TensorFlow Serving format (version directory "1")
    tf.saved_model.save(model, "tf_serving_model/1")

    # Log the saved model as an artifact
    mlflow.log_artifact("tf_serving_model")

    # Log the values of every metric from all epochs in MLflow.
    # The epoch number is passed as `step`; without it every value would be
    # recorded at step 0.
    for metric_name, epoch_values in history.history.items():
        for epoch, value in enumerate(epoch_values):
            mlflow.log_metric(metric_name, value, step=epoch)
    

## Validate Accuracy by predicting on the test dataset

In [None]:
# Predict the label of the test_images
pred = model.predict(test_images)
pred = np.argmax(pred,axis=1)

# Map the label
labels = (train_images.class_indices)
labels = dict((v,k) for k,v in labels.items())
pred = [labels[k] for k in pred]

y_test = [labels[k] for k in test_images.classes]

In [None]:
# Show the (class index, label name) pairs used to decode the predictions
labels.items()

In [None]:
from sklearn.metrics import accuracy_score

# Share of test images whose predicted label equals the true label
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Accuracy on the test set: {100*acc:.2f}%')

## Try to predict yourself using a picture from the web
- `Fill in` the missing parts

In [None]:
def predict(url):
    """Download the picture at *url*, display it and return its predicted label.

    Parameters
    ----------
    url : str
        HTTP(S) address of the image.

    Returns
    -------
    The entry of the global `labels` mapping for the most likely class.

    Raises
    ------
    requests.RequestException
        If the download fails, times out or returns an HTTP error status.
    """
    # Download the image; fail early on HTTP errors instead of handing an
    # error page to the image decoder, and do not wait forever
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Force three colour channels: grayscale, palette or RGBA pictures would
    # otherwise not match the (224, 224, 3) model input
    img = Image.open(BytesIO(response.content)).convert('RGB')
    img = img.resize((224, 224))

    # Apply the same MobileNetV2 preprocessing that the data generators used
    # during training
    img_array = img_to_array(img)
    img_array = tf.keras.applications.mobilenet_v2.preprocess_input(img_array)
    # Add the batch dimension expected by model.predict
    img_array = np.expand_dims(img_array, axis=0)

    # Make a prediction and map the most likely class index to its label
    prediction = model.predict(img_array)
    predicted_class = np.argmax(prediction, axis=-1)
    result = labels[predicted_class[0]]

    # Display the image
    display(img)

    return result

In [None]:
# Example usage with an image URL
image_url = "https://www.telegraph.co.uk/multimedia/archive/01834/orange_1834038b.jpg?imwidth=1280"

predicted_label = predict(image_url)
print("The model predicts: " + str(predicted_label))

## END