# 5. Textbook Enhancements 

> Start TensorBoard

References: [Get started with TensorBoard](https://www.tensorflow.org/tensorboard/get_started)

Clear any logs from previous runs with:

```bash
# rm -rf ./logs
```

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

> Set TensorFlow environment variables

In [None]:
import os
# These variables are set here, before TensorFlow is imported later in the notebook.
# Tell XLA where the CUDA toolkit lives.
# NOTE(review): the path is hard-coded to a local CUDA 11.1 install — adjust per machine.
os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/usr/local/cuda-11.1"
# Request the `cuda_malloc_async` GPU allocator (presumably to reduce memory
# fragmentation — confirm against the TensorFlow version in use).
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

> Import Dependencies

In [None]:
import datetime
import pandas as pd
import numpy as np
import gc

from IPython.display import clear_output, Markdown
from numba import cuda

from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score

import matplotlib.pyplot as plt
import seaborn as sns
import yellowbrick as yb

import tensorflow as tf

from keras import models
from keras import layers
from keras import optimizers
from keras import callbacks
from keras import backend as K

## 5.1. Data Collection

In [None]:
# load the source data
df_source = pd.read_csv('data/raw/DataCoSupplyChainDataset.csv', encoding='unicode_escape')
df_source.drop_duplicates(inplace=True)

print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head())

## 5.2 Data Preparation & Cleaning

### 5.2.1 Add the Target Variable

In [None]:
def add_is_fraud(df_data: pd.DataFrame) -> pd.DataFrame:
    """
    Add the binary target column ``is_fraud`` to the dataframe.

    A row is labelled 1 (Fraud) when its ``Order Status`` is exactly
    ``'SUSPECTED_FRAUD'`` and 0 (No Fraud) otherwise, including rows
    whose status is missing.

    Parameters
    ----------
    df_data : pd.DataFrame
        The source dataframe. It must contain an ``Order Status`` column
        and is modified in place.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with the ``is_fraud`` column added.
    """
    # vectorized comparison instead of a per-row Python lambda; the explicit
    # int64 cast keeps the label dtype stable even for an empty dataframe
    is_fraud = df_data['Order Status'] == 'SUSPECTED_FRAUD'
    df_data['is_fraud'] = is_fraud.astype('int64')
    return df_data

df_data = df_source.reset_index(drop=True)
add_is_fraud(df_data)
df_data['is_fraud'].value_counts()

### 5.2.2 Hour-Month

This variable is calculated from the `order date (DateOrders)` column as (day of the month × 24) + hour of the day.

In [None]:
def add_hour_month(df_data: pd.DataFrame) -> pd.DataFrame:
    """
    Add the parsed order timestamp and the derived ``hour_month`` feature.

    ``order_date`` is the ``order date (DateOrders)`` column parsed with
    ``pd.to_datetime``. ``hour_month`` is the day of the month multiplied
    by 24 plus the hour of the day, stored as a float.

    Parameters
    ----------
    df_data : pd.DataFrame
        The source dataframe. It must contain an ``order date (DateOrders)``
        column and is modified in place.

    Returns
    -------
    pd.DataFrame
        The same dataframe object, with ``order_date`` and ``hour_month`` added.
    """
    timestamps = pd.to_datetime(df_data['order date (DateOrders)'])
    df_data['order_date'] = timestamps

    # the 24.0 factor makes the resulting column float64
    day_part = timestamps.dt.day * 24.0
    df_data['hour_month'] = day_part + timestamps.dt.hour
    return df_data

add_hour_month(df_data)
df_data['hour_month'].describe()

### 5.2.3 Data Selection

Select the subset of columns to use for training the model.

In [None]:
df_data = df_data \
    .filter([
        'hour_month',
        'is_fraud',
        'Type',
        'Sales per customer',
        'Customer State',
        'Order State',
        'order_date',
    ]) \
    .rename(columns={
        'Type': 'payment_type',
        'Sales per customer': 'sales_per_customer',
        'Customer State': 'customer_state',
        'Order State': 'order_state',
    })

print(df_data.shape)
with pd.option_context('display.max_columns', None):
    display(df_data.head())    

### 5.2.4 Set Categorical Columns

In [None]:
cat_columns = {
        'payment_type',
        'customer_state',
        'order_state'
}

for col in cat_columns:
    df_data[col] = df_data[col].astype('category')

df_data.dtypes

## 5.3 Feature Engineering

### 5.3.1 One Hot Encode Categorical Columns

In [None]:
def one_hot_encode(df_data: pd.DataFrame) -> pd.DataFrame:
    """
    Replace every ``category`` dtype column with its one-hot indicator columns.

    The indicator columns are named ``<column>_<category>`` and are appended
    after the remaining columns, in the order the categorical columns appear.
    The input dataframe is left untouched.

    Parameters
    ----------
    df_data : pd.DataFrame
        The source dataframe

    Returns
    -------
    pd.DataFrame
        A new dataframe with the categorical columns one-hot encoded.
    """
    encoded = df_data.copy()
    categorical = list(encoded.select_dtypes(include='category').columns)

    # nothing to encode: hand back the copy as-is
    if not categorical:
        return encoded

    # a single call encodes all categorical columns and drops the originals
    return pd.get_dummies(encoded, columns=categorical)
    
df_features = one_hot_encode(df_data) \
    .drop(columns=['order_date'])

print(df_features.shape)
with pd.option_context('display.max_columns', None):
    display(df_features.head())

### 5.3.2 Normalize the Data

In [None]:
def scale_features(df_data: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize the ``float64`` columns of the dataframe.

    Each ``float64`` column is passed through a freshly fitted
    ``StandardScaler``; columns of any other dtype are left unchanged.
    The input dataframe is not modified.

    Parameters
    ----------
    df_data : pd.DataFrame
        The source dataframe

    Returns
    -------
    pd.DataFrame
        A copy of the dataframe with the float columns scaled. When there
        are no ``float64`` columns or no rows, the copy is returned unscaled.
    """
    df_output = df_data.copy()
    float_columns = df_output.select_dtypes(include='float64').columns

    # StandardScaler raises on input with zero features or zero samples,
    # so there is nothing to do in either case
    if float_columns.empty or df_output.empty:
        return df_output

    # NOTE(review): the scaler is fit on whatever frame is passed in; fitting
    # on rows that later become validation/test data leaks their statistics
    # into training — confirm this is acceptable for the experiment.
    scaler = StandardScaler()
    df_output[float_columns] = scaler.fit_transform(df_output[float_columns])

    return df_output

df_features = scale_features(df_features)

print(df_features.shape)
with pd.option_context('display.max_columns', None):
    display(df_features.head())

## 5.4 Model Training

In [None]:
random_state = 105

### 5.4.1 Data Selection

In [None]:
# split the data into train, validation and test sets:
# first hold out 20% of the rows for testing, then take 30% of the remaining
# rows for validation (overall about 56% train / 24% validation / 20% test)
df_train, df_test = train_test_split(df_features, test_size=0.2, random_state=random_state)
df_train, df_val = train_test_split(df_train, test_size=0.3, random_state=random_state)

Markdown(f'''
| Dataset | Rows | Columns | Not Fraud | Fraud |
| ------- | ---- | ------- | --------- | ----- |
| Train | {df_train.shape[0]} | {df_train.shape[1]} | {df_train[df_train['is_fraud'] == 0].shape[0]} | {df_train[df_train['is_fraud'] == 1].shape[0]} |
| Validation | {df_val.shape[0]} | {df_val.shape[1]} | {df_val[df_val['is_fraud'] == 0].shape[0]} | {df_val[df_val['is_fraud'] == 1].shape[0]} |
| Test | {df_test.shape[0]} | {df_test.shape[1]} | {df_test[df_test['is_fraud'] == 0].shape[0]} | {df_test[df_test['is_fraud'] == 1].shape[0]} |
''')

In [None]:
# get the features and labels
x_train = df_train.drop(columns=['is_fraud'])
y_train = df_train['is_fraud']

x_val = df_val.drop(columns=['is_fraud'])
y_val = df_val['is_fraud']

x_test = df_test.drop(columns=['is_fraud'])
y_test = df_test['is_fraud']

In [None]:
# try to reclaim some memory
del df_source
del df_data
del df_features

del df_train
del df_val
del df_test


### 5.4.2 Model Definition

In [None]:
def get_model() -> models.Sequential:
    """
    Build the (uncompiled) network used for training.

    The network has two fully connected ReLU layers of 512 units followed
    by a single sigmoid output unit. The input width is read from the
    notebook-level ``x_train`` frame.

    Returns
    -------
    models.Sequential
        A new, uncompiled model.
    """
    n_features = x_train.shape[1]

    return models.Sequential([
        layers.Dense(512, activation='relu', input_shape=(n_features,)),
        layers.Dense(512, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

In [None]:
model = get_model()

### 5.4.3 Model Compilation

In [None]:
model.compile(
    optimizer=optimizers.RMSprop(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'])

### 5.4.4 Model Training

In [None]:
# configure tensorboard log dir
log_dir = 'logs/initial/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# fit the model
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=30,
    batch_size=512,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback])

In [None]:
history_dict = history.history
print(history_dict.keys())


In [None]:
def plot_loss(history: callbacks.History) -> None:
    """
    Plot the training loss and, when present, the validation loss per epoch.

    Parameters
    ----------
    history : keras.callbacks.History
        The history of the model training. ``history.history`` must contain
        a ``'loss'`` entry; ``'val_loss'`` is plotted too when available.
    """
    # size the x-axis from the series being plotted; keying on 'accuracy'
    # raised KeyError for models compiled without the accuracy metric
    epochs = range(1, len(history.history['loss']) + 1)

    # summarize history for loss
    plt.plot(epochs, history.history['loss'])

    if 'val_loss' in history.history:
        plt.plot(epochs, history.history['val_loss'])
        plt.legend(['Training loss', 'Validation loss'], loc='upper left')
        plt.title('Training and validation loss')
    else:
        plt.title('Training loss')

    plt.xlabel('Epochs')
    plt.ylabel('Loss')

    plt.show()

In [None]:
def plot_accuracy(history: callbacks.History) -> None:
    """
    Plot the training accuracy and, when present, the validation accuracy.

    Parameters
    ----------
    history : keras.callbacks.History
        The history of the model training. ``history.history`` must contain
        an ``'accuracy'`` entry; ``'val_accuracy'`` is plotted too when
        available.
    """
    recorded = history.history
    train_acc = recorded['accuracy']
    epochs = range(1, len(train_acc) + 1)

    plt.plot(epochs, train_acc)

    has_validation = 'val_accuracy' in recorded
    if has_validation:
        plt.plot(epochs, recorded['val_accuracy'])
        plt.legend(['Training acc', 'Validation acc'], loc='upper left')

    plt.title('Training and validation accuracy' if has_validation else 'Training accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')

    plt.show()

In [None]:
plot_loss(history)
plot_accuracy(history)

In [None]:
def show_loss_accuracy(model: models.Sequential) -> None:
    """
    Evaluate the model on the training, validation and test sets.

    The data comes from the notebook-level ``x_train``/``y_train``,
    ``x_val``/``y_val`` and ``x_test``/``y_test`` variables. A banner is
    printed before each evaluation; the figures themselves are whatever
    ``model.evaluate`` reports.

    Parameters
    ----------
    model : keras.models.Sequential
        The model to evaluate.
    """
    splits = (
        ('-- Training --', x_train, y_train),
        ('-- Validation --', x_val, y_val),
        ('-- Test --', x_test, y_test),
    )

    for banner, features, labels in splits:
        print(banner)
        # unpacked as a (loss, accuracy) pair; the values are not used further
        _loss, _acc = model.evaluate(features, labels)

In [None]:
# show the accuracy and loss on the data sets
show_loss_accuracy(model)

In [None]:
# open tensorboard
%tensorboard --logdir logs/initial/

## 5.5 Model Evaluation

In [None]:
def predict(x: pd.DataFrame, y: pd.Series, model: models.Sequential, threshold: float = 0.5) -> pd.DataFrame:
    """
    Score the features with the model and pair the result with the true labels.

    Parameters
    ----------
    x : pd.DataFrame
        The features
    y : pd.Series
        The labels
    model : models.Sequential
        The model
    threshold : float, optional
        Scores strictly greater than this value are predicted as positive,
        by default 0.5

    Returns
    -------
    pd.DataFrame
        One row per sample with the columns ``y_true`` (the given labels),
        ``y_score`` (the raw model output) and ``y_pred`` (boolean,
        ``y_score > threshold``).
    """
    # `use_multiprocessing` is intentionally not passed: Keras 2 documents it
    # as applying only to generator/Sequence inputs, and Keras 3 no longer
    # accepts the argument at all
    y_score = model.predict(x).ravel()

    return pd.DataFrame({
        'y_true': y,
        'y_score': y_score,
        'y_pred': y_score > threshold,
    })
    

#return pd.DataFrame(confusion_matrix(y, y_pred), columns=['Predicted Not Fraud', 'Predicted Fraud'], index=['Actual Not Fraud', 'Actual Fraud'])


In [None]:
threshold = 0.2

train_predict = predict(x_train, y_train, model, threshold)
val_predict = predict(x_val, y_val, model, threshold)
test_predict = predict(x_test, y_test, model, threshold)

### 5.5.1. Confusion Matrix

In [None]:
def plot_confusion_matrix(df_predict: pd.DataFrame, title: str, axes: plt.Axes = None):
    """
    Draw the confusion matrix of the predictions as an annotated heatmap.

    Parameters
    ----------
    df_predict : pd.DataFrame
        The predictions; the ``y_true`` and ``y_pred`` columns are used
    title : str
        The prefix of the plot title (``'<title> Confusion Matrix'``)
    axes : plt.Axes, optional
        The axes to plot on, by default None
    """
    class_labels = ['Not Fraud', 'Fraud']
    counts = confusion_matrix(df_predict['y_true'], df_predict['y_pred'])

    heatmap = sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=class_labels,
        yticklabels=class_labels,
        linewidths=0.5,
        linecolor='black',
        square=True,
        ax=axes,
    )
    heatmap.set_title(f'{title} Confusion Matrix')

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=False)

plot_confusion_matrix(train_predict, 'Training', axes=axes[0])
plot_confusion_matrix(val_predict, 'Validation', axes=axes[1])
plot_confusion_matrix(test_predict, 'Test', axes=axes[2])

### 5.5.2 Classification Result

In [None]:
def show_classification_report(df_predict: pd.DataFrame, title: str):
    """
    Print a titled classification report for the predictions.

    Parameters
    ----------
    df_predict : pd.DataFrame
        The predictions; the ``y_true`` and ``y_pred`` columns are used
    title : str
        The title for the report
    """
    header = f'--- {title} Classification Report ---'
    print(header)

    report = classification_report(df_predict['y_true'], df_predict['y_pred'])
    print(report)

In [None]:
show_classification_report(train_predict, 'Training')
show_classification_report(val_predict, 'Validation')
show_classification_report(test_predict, 'Test')

### 5.5.3 ROC Curve

The idea of using Yellowbrick to plot the ROC curve is taken from [Validating Fastai classifier with Yellowbrick](https://github.com/micstn/micstn.github.io/blob/master/nbs/fastai_yellowbrics.ipynb) and adapted to work with Keras.

In [None]:
class SklearnWrapper(BaseEstimator):
    """
    Minimal scikit-learn style adapter around an already trained Keras
    binary classifier, so it can be handed to tools that expect an
    estimator exposing ``predict_proba``.

    Parameters
    ----------
    model
        The trained model; its ``predict`` output is flattened and used as
        the probability of the second class.
    classes
        The class labels, in the order (negative, positive).
    target_type : str, optional
        Stored as ``target_type_``, by default 'multiclass'
    """
    _estimator_type = "classifier"

    def __init__(self, model, classes, target_type: str = 'multiclass'):
        self.model = model
        self.classes = classes

        # trailing-underscore attributes — presumably what scikit-learn and
        # Yellowbrick inspect to treat the wrapper as fitted; confirm
        self.classes_ = classes
        self.target_type_ = target_type

    def fit(self, X, y):
        """Do nothing; the wrapped model is expected to be trained already."""
        pass

    def score(self, X, y):
        """Do nothing and return None; no score is computed by the wrapper."""
        pass

    def predict_proba(self, X):
        """
        Return an ``(n_samples, 2)`` array of class probabilities.

        Column 1 is the flattened model output and column 0 is its complement.
        """
        # `use_multiprocessing` is intentionally not passed: Keras 2 documents
        # it as applying only to generator/Sequence inputs, and Keras 3 no
        # longer accepts the argument at all
        proba = self.model.predict(X).ravel()

        return np.column_stack((1 - proba, proba))


def plot_roc(model: models.Sequential,
             x: pd.DataFrame,
             y: pd.Series,
             classes: list,
             title: str):
    """
    Plot the ROC curve for the model's predictions on the given data.

    Parameters
    ----------
    model : models.Sequential
        The model
    x : pd.DataFrame
        The features
    y : pd.Series
        The labels
    classes : list
        The classes
    title : str
        The title for the plot
    """
    visualizer = yb.classifier.ROCAUC(SklearnWrapper(model, classes),
                                    classes=classes,
                                    size=[500,500],
                                    title=title)
    # score the data that was passed in; previously the notebook-level
    # x_test / y_test were used and the x / y arguments were ignored
    visualizer.score(x, y)
    # NOTE(review): newer Yellowbrick releases rename poof() to show() —
    # confirm against the installed version
    visualizer.poof()


# model_wrapper = SklearnWrapper(model, classes=['Not Fraud', 'Fraud'])
# model_wrapper.predict_proba(x_test)

In [None]:
plot_roc(model, x_test, y_test, ['Not Fraud', 'Fraud'], 'ROC Curves for Initial Model')

## 5.6 Parameter Tuning

### 5.6.1 Create Model

In [None]:
# create a new model to tune
tuned_model = get_model()

# compile the model
tuned_model.compile(
    optimizer=optimizers.RMSprop(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'])

### 5.6.2 Model Training

In [None]:
tuned_history = tuned_model.fit(
    pd.concat([x_train, x_val]),
    pd.concat([y_train, y_val]),
    epochs=15,
    batch_size=512)

In [None]:
plot_loss(tuned_history)
plot_accuracy(tuned_history)

In [None]:
# show the accuracy and loss on the data sets
show_loss_accuracy(tuned_model)

### 5.6.3 Model Evaluation

In [None]:
threshold = 0.2

train_predict = predict(x_train, y_train, tuned_model, threshold)
val_predict = predict(x_val, y_val, tuned_model, threshold)
test_predict = predict(x_test, y_test, tuned_model, threshold)

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=False)

plot_confusion_matrix(train_predict, 'Training', axes=axes[0])
plot_confusion_matrix(val_predict, 'Validation', axes=axes[1])
plot_confusion_matrix(test_predict, 'Test', axes=axes[2])

In [None]:
show_classification_report(train_predict, 'Training')
show_classification_report(val_predict, 'Validation')
show_classification_report(test_predict, 'Test')

In [None]:
# the title names the model actually being plotted
plot_roc(tuned_model, x_test, y_test, ['Not Fraud', 'Fraud'], 'ROC Curves for Tuned Model')

## 5.7 Custom Hyperparameter Tuning

Although the [Keras Tuner](https://keras.io/keras_tuner/) could potentially be used, the goal here is to find the hyperparameters that maximize the F1 score of the minority class. For this reason, a simple custom grid search is implemented.

In [None]:
def clear_gpu_memory():
    """
    Reset the Keras session and the default TensorFlow graph, then run the
    Python garbage collector, with the aim of releasing GPU memory held by
    previously built models.
    """
    K.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()

clear_gpu_memory()

In [None]:
def hyperparameter_search(
        x_train: pd.DataFrame,
        y_train: pd.DataFrame,
        x_val: pd.DataFrame,
        y_val: pd.DataFrame,
        x_test: pd.DataFrame,
        y_test: pd.DataFrame,
        learning_rates: list,
        epochs: int,
        thresholds: list) -> pd.DataFrame:
    """
    Generate a dataframe of the results of the hyperparameter search on
    the learning rate, epochs and threshold.

    For every learning rate a fresh model is trained one epoch at a time.
    After each epoch the test set is scored once and the per-class
    precision, recall, F-score and support are computed for every threshold.

    Parameters
    ----------
    x_train : pd.DataFrame
        The training features
    y_train : pd.DataFrame
        The training labels
    x_val : pd.DataFrame
        The validation features (currently unused)
    y_val : pd.DataFrame
        The validation labels (currently unused)
    x_test : pd.DataFrame
        The test features
    y_test : pd.DataFrame
        The test labels
    learning_rates : list
        The learning rates to try
    epochs : int
        The max number of epochs to train for
    thresholds : list
        The decision thresholds to try; a score strictly greater than the
        threshold counts as a positive prediction

    Returns
    -------
    pd.DataFrame
        One row per (learning rate, epoch, threshold, label) with the columns
        ``label``, ``precision``, ``recall``, ``f_score``, ``true_sum``,
        ``learning_rate``, ``epoch`` and ``threshold``. An empty dataframe
        is returned when no combination was evaluated.
    """
    # NOTE(review): the search is scored on the test split, so picking the
    # best row tunes the hyperparameters on test data; x_val / y_val look like
    # the intended evaluation set — confirm before reporting test metrics.

    # collect the per-combination frames and concatenate once at the end
    # instead of re-copying a growing dataframe on every iteration
    frames = []

    for learning_rate in learning_rates:
        print(f'* Learning Rate: {learning_rate}')

        # clear the session
        clear_gpu_memory()

        # create the new model
        # NOTE(review): get_model() sizes its input layer from the
        # notebook-level x_train, not from this function's x_train argument
        model = get_model()

        # compile the model
        model.compile(
            optimizer=optimizers.RMSprop(learning_rate=learning_rate),
            loss='binary_crossentropy',
            metrics=['accuracy'])

        # train the model one epoch at a time
        for epoch in range(epochs):
            print(f'** Learning Rate: {learning_rate} Epoch: {epoch+1}')
            model.fit(
                x_train,
                y_train,
                epochs=1,
                batch_size=512,
                verbose=0)

            _ = gc.collect()

            # the scores do not depend on the threshold, so run inference
            # once per epoch rather than once per threshold
            y_score = model.predict(x_test).ravel()

            for threshold in thresholds:
                print(f'*** Learning Rate: {learning_rate} Epoch: {epoch+1} Threshold: {threshold}')

                # calculate the metrics
                metrics = precision_recall_fscore_support(
                    y_test,
                    y_score > threshold,
                    zero_division=1)

                # format the metrics: one row per label
                df_metrics = pd.DataFrame(metrics) \
                    .set_axis(['precision', 'recall', 'f_score', 'true_sum'], axis=0) \
                    .T \
                    .assign(learning_rate=learning_rate, epoch=epoch+1, threshold=threshold) \
                    .reset_index() \
                    .rename(columns={'index': 'label'})

                frames.append(df_metrics)

    # pd.concat raises on an empty list
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

# get the metric results
# np.arange accumulates floating-point error (e.g. 0.30000000000000004), so the
# grid is rounded to one decimal; otherwise equality filters on the results
# such as `threshold == 0.3` silently match nothing
df_results = hyperparameter_search(
    x_train=x_train, 
    y_train=y_train, 
    x_val=x_val, 
    y_val=y_val, 
    x_test=x_test, 
    y_test=y_test, 
    learning_rates=[0.00001, 0.0001, 0.001, 0.01], 
    epochs=30,
    thresholds=np.round(np.arange(0.1, 1, 0.1), 1))

# show the results
clear_output(wait=True)

display(
    df_results \
        .query('label == 1') \
        .sort_values('f_score', ascending=False) \
        .head(10)
)

In [None]:
df_results.to_csv('grid_search_results.csv', index=False)

In [None]:
df_results.query('learning_rate == 0.001 and epoch == 23 and threshold == 0.4')

### 5.7.1 Best Model

In [None]:
# get the model
best_model = get_model()

# compile the model
best_model.compile(
    optimizer=optimizers.RMSprop(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'])

# train the model
history = best_model.fit(
    x_train,
    y_train,
    epochs=23,
    batch_size=512,
    validation_data=(x_val, y_val))

# plot the accuracy and loss
plot_loss(history)
plot_accuracy(history)

In [None]:
# get the predictions
threshold = 0.4

train_predict = predict(x_train, y_train, best_model, threshold)
val_predict = predict(x_val, y_val, best_model, threshold)
test_predict = predict(x_test, y_test, best_model, threshold)

In [None]:
# plot the confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=False)

plot_confusion_matrix(train_predict, 'Training', axes=axes[0])
plot_confusion_matrix(val_predict, 'Validation', axes=axes[1])
plot_confusion_matrix(test_predict, 'Test', axes=axes[2])

In [None]:
# show the classification reports
show_classification_report(train_predict, 'Training')
show_classification_report(val_predict, 'Validation')
show_classification_report(test_predict, 'Test')

In [None]:
# the title names the model actually being plotted
plot_roc(best_model, x_test, y_test, ['Not Fraud', 'Fraud'], 'ROC Curves for Best Model')