## Training Pipeline

### Import Libraries

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Pycaret
#from pycaret.regression import setup
# import pycaret.nlp as pycnlp
# import pycaret.classification as pyclass
# from pycaret.classification import get_config, predict_model, plot_model, pull

import pycaret
from pycaret.classification import *

# MLFlow
import mlflow
import mlflow.keras
from mlflow.tracking import MlflowClient
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID, MLFLOW_RUN_NAME

# NLP
import spacy
import category_encoders as ce
import pickle

# LSTM
import keras
from keras.preprocessing.text import one_hot, Tokenizer
# from keras_preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences


# For Coherence Score
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LdaMulticore, CoherenceModel, LsiModel, HdpModel
# For Text Vectorization
from gensim.models import word2vec

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['font.family'] = ['DejaVu Sans']

import src.helpers_preprocess as pp
import src.breach_words as breach
import src.helpers_mlflow as mlf
import src.helpers_evaluation as ev
import src.config as config

import importlib
importlib.reload(mlf)

#pd.set_option('max_colwidth', -1)

### Set Configurations

In [None]:
# Name of the target column; also used to label the MLflow runs and registry entries
target_class = 'incompliant'

# Read by the working-directory reset step further down
RESET_WORKINGDIR = True
# When True, the PyCaret training/validation frames are written to CSV
SAVE_PYCARET_DATA = True
# When True, the decile table is written to CSV
SAVE_DECILES = True
# When True, the probability distribution table is written to CSV
SAVE_PROB_DIST = True

In [None]:
# Reset the working directory.
# The directory is emptied only when RESET_WORKINGDIR is set and there is something
# to delete. The previous condition `RESET_WORKINGDIR | len(...) == 0` parsed as
# `(RESET_WORKINGDIR | len(...)) == 0`, which emptied the directory even when the
# flag was False.
output_entries = os.listdir(config.output_path)

if not RESET_WORKINGDIR or not output_entries:
    print("Not resetting the working directory.")
else:
    for filename in output_entries:
        file_path = os.path.join(config.output_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the failure and carry on with the remaining entries
            print('Failed to delete %s. Reason: %s' % (file_path, e))

    print("Reset the working directory.")

In [None]:
# Make sure every directory the pipeline writes to exists, creating it when missing.
# Explanations for the individual paths can be found in config.py.
for required_path in (
    config.main_directory,
    config.data_artifact_path,
    config.exai_artifact_path,
):
    config.create_path(required_path)

### Import features dataset

In [None]:
# Get latest feature set: load the 'full_features' CSV from the raw data directory.
# NOTE(review): how "latest" is decided lives in config.get_latest_csv — confirm there.
data_df = config.get_latest_csv(config.raw_data_path, 'full_features')

### MLFLOW Experiment

#### Setup the MYSQL database for tracking MLFLOW Model Registry

In [None]:
# Create the database if it does not exist yet.
# A database is required for the MLflow model registry, as the registry APIs are only
# supported on certain backends. Connection settings are read from src/config.py.
mlf.create_database_storage(config.dbServerName, config.dbPort, config.dbName, config.dbUser, config.dbPassword)

In [None]:
# Check if database exists
# mlf.show_databases(dbServerName, dbUser, dbPassword)

#### Setup MLFLOW to track experiments and model registry in MYSQL

In [None]:
# Build the MLflow launch command from the settings configured in src/config.py
mlflow_conn = mlf.create_mlflow_cmd(
    config.storage_filepath,
    config.dbName,
    config.dbUser,
    config.dbPassword,
    config.dbServerName,
    config.dbPort,
)

In [None]:
# ⚠️ Run this command in an Anaconda environment; if running through Jupyter, interrupt this command after a while.
!mlflow ui \
--backend-store-uri file:/./aicritic_mlflow \
--registry-store-uri mysql+pymysql://root:<dbName>@localhost:<port>/mlflow_tracking_database \
--host 127.0.0.1 --port 5000 \
--serve-artifacts

In [None]:
# Set up the MLflow experiment; yields the experiment id and the client object
# that the following cells use to create runs and log parameters.
exp_id, client = mlf.setup_mlflow(config.exp_name, config.storage_filepath)

In [None]:
# Start the parent MLflow run; the ML and LSTM runs are attached to it as children
parent_run = client.create_run(
    experiment_id=exp_id,
    run_name=f'classification_{target_class}',
)
parent_run_id = parent_run.info.run_uuid

# Record the run id as a parameter on the run itself
client.log_param(parent_run_id, "run_id", parent_run_id)

In [None]:
# Columns that are not used during ML modelling (passed to PyCaret as ignore_features);
# these features are left out because XGBoost is used.
ignored_cols = ['data_source', 'username', 'posted_on', 'content', 'cleaned_text', 'hashtags', 'mentions', 'emojis', 'ner']
# Columns to be one-hot encoded
categorical_cols = ['breach_flagwords', 'breach_hashes', 'has_nonpru_email', 'has_hyperlinks', 'has_disclaimer', 'contains_monetary']

In [None]:
# One-hot encode the categorical features on the frame indexed by 'id';
# save_dir is forwarded to the helper as the encoder artifact location.
indexed_df = data_df.set_index('id')
data_encoded, enc_cols = pp.get_onehot(
    indexed_df,
    feature_list=categorical_cols,
    save_dir=config.encoder_artifact_path,
)

### Train-Test Split
The train-test split is done outside of PyCaret so that the ML model (XGBoost) and the NN model (LSTM) are trained and validated on the same rows.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible
features = data_encoded.drop(columns=['incompliant'])
labels = data_encoded['incompliant']
X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

# Re-attach the target so each frame carries its features and label together
X_train['incompliant'] = y_train
X_valid['incompliant'] = y_valid


### Rule Based Classification Model

In [None]:
# Create child run for ML classification, attached to the parent run via its tag
ml_run = client.create_run(
    experiment_id=exp_id,
    run_name=f'ml_model_{target_class}',
    tags={MLFLOW_PARENT_RUN_ID: parent_run_id},
)

ml_run_id = ml_run.info.run_uuid
# Show the actual run id (previously only the literal label "ml_run_id" was printed)
print("ml_run_id", ml_run_id)
client.log_param(ml_run_id, "run_id", ml_run_id)

In [None]:
# Text Classification
# Initialise the PyCaret experiment on the pre-split data: X_valid is supplied as the
# test set, the columns in ignored_cols are excluded, preprocessing is switched off,
# and session_id pins the random seed.
classfication_exp = setup( data= X_train,
                        target = 'incompliant',
                        test_data = X_valid,
                        ignore_features = ignored_cols,
                        # categorical_features = enc_cols,
                        preprocess=False,
                        session_id=42,
                        n_jobs=1 )

In [None]:
# Best_model
# Compare the candidate classifiers listed in `include`. The return value is not
# assigned, so the model actually used is created explicitly in a later cell.
classfication_exp.compare_models(include = ['lr','ridge', 'lda', 'rf', 'knn','nb','svm', 'gbc', 'ada', 'et', 'qda', 'dt', 'xgboost' ])

In [None]:
# Get the datasets PyCaret used for training and testing the models.
# Copies are taken so that adding the target column cannot alter PyCaret's own frames.
training_df = get_config('X_train').copy()
training_df['incompliant'] = get_config('y_train')
validation_df = get_config('X_test').copy()
validation_df['incompliant'] = get_config('y_test')

# Save the dataset for replicating training
if SAVE_PYCARET_DATA:
    # os.path.join replaces the hard-coded "\\" so the paths are valid on any OS
    run_tag = parent_run_id[:5]
    training_csv = os.path.join(config.data_artifact_path, f"{config.today}_training_{run_tag}.csv")
    validation_csv = os.path.join(config.data_artifact_path, f"{config.today}_validation_{run_tag}.csv")

    training_df.to_csv(training_csv)
    validation_df.to_csv(validation_csv)

    print(f"Training saved as: {training_csv}")
    print(f"Validation saved as: {validation_csv}")
else:
    print("Train/Test Split not saved")

#### Choice Model - XGBoost

In [None]:
import xgboost as xgb

# Fit options forwarded to the estimator: early stopping on log-loss, evaluated
# against PyCaret's test split
fit_kwargs = {
    "early_stopping_rounds": 5,
    "eval_metric": "logloss",
    "eval_set": [(get_config('X_test'), get_config('y_test'))],
}
xgb_instance = xgb.XGBClassifier()
xgb_model = classfication_exp.create_model(xgb_instance, fit_kwargs=fit_kwargs, error_score ='raise')

# No tuning step is applied; later cells refer to the model as `tuned_model`
tuned_model = xgb_model
model_type = tuned_model.__class__.__name__
print(f'Classifier used: {model_type}')
client.log_param(ml_run_id, "model_type", model_type)

In [None]:
# PyCaret output predictions on the validation frame
predictions = predict_model(tuned_model, data=validation_df, raw_score=True)

# Prepare the result metrics that are logged into MLflow.
# They are computed from `predictions` against its own target column; the previous
# code referenced `y_full_data` and `rb_test_pred`, which are not defined in this notebook.
ml_y_true = predictions[target_class]
ml_y_label = np.array(predictions['prediction_label'])

ml_auc = roc_auc_score(ml_y_true, predictions['prediction_score_1'])
ml_acc = accuracy_score(ml_y_true, ml_y_label)
ml_prec = precision_score(ml_y_true, ml_y_label, average='binary')
ml_recall = recall_score(ml_y_true, ml_y_label, average='binary')
ml_f1 = f1_score(ml_y_true, ml_y_label, average='binary')

print('ml_auc', ml_auc, '\nml_acc', ml_acc, '\nml_prec', ml_prec, '\nml_recall', ml_recall, '\nml_f1', ml_f1)

In [None]:
# Save ML model
# The signature is inferred from X_train as input and the predicted labels as output.
# NOTE(review): at this point X_train still carries the 'incompliant' target column
# (re-attached right after the split) — confirm the signature is meant to include it.
ml_signature = mlflow.models.infer_signature( model_input = pd.DataFrame(X_train), 
                                              model_output = pd.DataFrame(predictions['prediction_label']))

# Persist the fitted model, with its signature, to the ML artifact directory
mlflow.sklearn.save_model(tuned_model, 
                            config.ml_artifact_path,
                            signature = ml_signature )

print(f'Model has been saved at: {config.ml_artifact_path}')

In [None]:
# Decile analysis of the validation predictions, using scores rounded to one decimal place
predictions['Score_1_round'] = predictions['prediction_score_1'].round(1)
decile_table = ev.get_decile_score(predictions, target_class, 'prediction_label', 'Score_1_round')
decile_table
if SAVE_DECILES:
    # Persist the decile table alongside the other explainability artifacts
    decile_csv = os.path.join(config.exai_artifact_path, f"decile_table_{config.today}.csv")
    decile_table.to_csv(decile_csv)

In [None]:
# Probability distribution table: among the correctly classified rows, count how many
# fall at each rounded score, with the score expressed as a percentage.
pred_correct = predictions[predictions[f'{target_class}'] == predictions['prediction_label']]
# NOTE(review): the rename keys (0 / 'index') depend on how the installed pandas version
# names the value_counts output — confirm the resulting column names.
prob_dist = pd.DataFrame(pd.Series([round(x*100) for x in pred_correct.Score_1_round]).value_counts()).reset_index().rename(columns={0: 'count', 'index': 'probability of incompliancy (%)'})
prob_dist

if SAVE_PROB_DIST:
    # Save probability distribution table
    prob_dist.to_csv(os.path.join(config.exai_artifact_path, f"prob_dist_{config.today}.csv"), index=False)

In [None]:
# Plot the confusion matrix of the XGBoost model ('percent': False is passed through to the plot)
plot_model(xgb_model, plot = 'confusion_matrix', plot_kwargs = {'percent' : False})

#### Log ML Model artifacts

In [None]:
# Log all metrics and artifacts to the ML child run
with mlflow.start_run(run_id=ml_run_id):
    mlflow.log_metrics(metrics={"AUC": ml_auc})
    mlflow.log_metrics(metrics={"Accuracy": ml_acc})
    mlflow.log_metrics(metrics={"Prec.": ml_prec})
    mlflow.log_metrics(metrics={"Recall": ml_recall})
    mlflow.log_metrics(metrics={"F1": ml_f1})

    # The artifact locations come from `config`, as in the LSTM logging cell;
    # the previously used `filepaths_dict` is not defined in the visible notebook.
    mlflow.log_artifacts(config.exai_artifact_path, "Results")
    mlflow.log_artifacts(config.ml_artifact_path, "model")

mlflow.end_run()

In [None]:
# Mark the ML run as finished, then fetch it again so the printed status is current
finished_ml_run_id = ml_run.info.run_id
client.set_terminated(finished_ml_run_id, status="FINISHED")
ml_run = client.get_run(finished_ml_run_id)
print(f"run_id: {ml_run.info.run_id}; status: {ml_run.info.status}")

### LSTM Classification

In [None]:
# Initialise the LSTM child run, attached to the parent run via its tag
lstm_run = client.create_run(
    experiment_id=exp_id,
    run_name=f'lstm_model_{target_class}',
    tags={MLFLOW_PARENT_RUN_ID: parent_run_id},
)

lstm_run_id = lstm_run.info.run_uuid
# Show the actual run id (previously only the literal label "lstm_run_id" was printed)
print("lstm_run_id", lstm_run_id)
client.log_param(lstm_run_id, "run_id", lstm_run_id)

In [None]:
# Prepare the data for the LSTM model: labels as NumPy arrays, inputs as cleaned text only.
# X_train / X_valid are overwritten here and no longer hold the feature frames.
y_train = np.array(X_train[target_class])
y_valid = np.array(X_valid[target_class])

X_train, X_valid = (
    frame['cleaned_text'].astype("str") for frame in (X_train, X_valid)
)

print('train_set', len(X_train), 'validation_set', len(X_valid))

#### Word Tokenizer

In [None]:
# Fit the vocabulary on the training text only
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

# Each text becomes a sequence of word indices, post-padded to a fixed length of 100
maxlen = 100
X_train = pad_sequences(word_tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen)
X_valid = pad_sequences(word_tokenizer.texts_to_sequences(X_valid), padding='post', maxlen=maxlen)

In [None]:
# Save tokenizer
# exist_ok replaces the separate existence check, and os.path.join replaces the
# Windows-only "\\" separator so the file lands inside the directory on any OS.
os.makedirs(config.tokenizer_artifact_path, exist_ok=True)
tokenizer_file = os.path.join(config.tokenizer_artifact_path, 'tokenizer.pkl')
with open(tokenizer_file, 'wb') as outfile:
    pickle.dump(word_tokenizer, outfile)

#### Word Embeddings

In [None]:
from numpy import asarray

# Load GloVe word embeddings and create an embeddings dictionary (word -> vector).
# The context manager closes the file even if a line fails to parse, and
# os.path.join replaces the Windows-only "\\" separator.
embeddings_dictionary = {}
glove_path = os.path.join('inputs', 'glove.6B.100d.txt')

with open(glove_path, encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0]
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# The vocabulary size is one more than the number of entries in the tokenizer's word index
vocab_length = len(word_tokenizer.word_index) + 1
print("vocab_length: ", vocab_length)

# Embedding matrix: row i holds the 100-dimensional GloVe vector of the word whose
# tokenizer index is i; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_length, 100))
for token, row in word_tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

#### LSTM MODEL

In [None]:
# Build the LSTM model
from keras.models import Sequential
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM, Bidirectional, Activation, Dropout, Dense

lstm_model = Sequential()
# Embedding layer initialised from the GloVe matrix; trainable=True lets the vectors be updated during training
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen , trainable=True)
lstm_model.add(embedding_layer)
# Dropout of 0.5 between the embeddings and the recurrent layer
lstm_model.add(Dropout(0.5))
# Bidirectional LSTM with 128 units
lstm_model.add(Bidirectional(LSTM(128)))
# Single sigmoid unit: the output is a score between 0 and 1 for the binary target
lstm_model.add(Dense(1, activation='sigmoid')) # Binary

# Display Model
lstm_model.summary()

# Model compiling
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
from keras.callbacks import EarlyStopping

# Train with early stopping: halt once val_loss has not improved for 10 epochs and
# restore the best weights. 30% of the training data is set aside for validation.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lstm_model_history = lstm_model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=100,
    verbose=1,
    validation_split=0.3,
    callbacks=[early_stopping],
)

#### Evaluate LSTM

In [None]:
# Plot the LSTM training history returned by fit()
ev.plot_lstm_performance(lstm_model_history)

In [None]:
# Score the validation set with the LSTM so it can be combined with the ML model.
# The "validation" scores were previously computed on X_train, which produced a frame
# whose row count matched neither the validation ids nor y_valid.
y_pred_test = lstm_model.predict(X_valid)
y_pred_valid = y_pred_test

# NOTE(review): assumes the rows of validation_df are in the same order as X_valid — confirm
valid_ids = pd.DataFrame(validation_df.index).reset_index(drop=True)

lstm_valid_pred = pd.DataFrame(y_pred_valid, columns=['lstm_pred_score'])
# Scores of 0.50 and above are labelled 1
lstm_valid_pred['lstm_pred'] = np.where(lstm_valid_pred['lstm_pred_score'] < 0.50, 0, 1)
lstm_valid_pred = lstm_valid_pred.merge(valid_ids, how='left', left_index=True, right_index=True)

# Same data as the validation frame; kept as a separate object under its original name
lstm_test_pred = lstm_valid_pred.copy()

In [None]:
# LSTM metrics against the validation labels; the hard labels are converted once and reused
lstm_scores = lstm_valid_pred['lstm_pred_score']
lstm_labels = np.array(lstm_valid_pred['lstm_pred'])

dl_auc = roc_auc_score(y_valid, lstm_scores)
dl_acc = accuracy_score(y_valid, lstm_labels)
dl_prec = precision_score(y_valid, lstm_labels, average='binary')
dl_recall = recall_score(y_valid, lstm_labels, average='binary')
dl_f1 = f1_score(y_valid, lstm_labels, average='binary')

print('dl_auc', dl_auc, '\ndl_acc', dl_acc, '\ndl_prec', dl_prec, '\ndl_recall', dl_recall, '\ndl_f1', dl_f1)

In [None]:
# Save the LSTM model with a signature inferred from the validation inputs and their scores
dl_signature = mlflow.models.infer_signature(model_input=X_valid, model_output=y_pred_test)
mlflow.keras.save_model(lstm_model, config.nn_artifact_path, signature=dl_signature)


In [None]:
# Log the LSTM metrics and artifacts to the LSTM child run
lstm_metrics = {
    "AUC": dl_auc,
    "Accuracy": dl_acc,
    "Prec.": dl_prec,
    "Recall": dl_recall,
    "F1": dl_f1,
}
lstm_artifacts = (
    (config.exai_artifact_path, "Results"),
    (config.tokenizer_artifact_path, "Tokenizer"),
    (config.nn_artifact_path, "model"),
)

with mlflow.start_run(run_id=lstm_run_id):
    for metric_name, metric_value in lstm_metrics.items():
        mlflow.log_metrics(metrics={metric_name: metric_value})

    for local_dir, artifact_dir in lstm_artifacts:
        mlflow.log_artifacts(local_dir, artifact_dir)

mlflow.end_run()

In [None]:
# Mark the LSTM run as finished, then fetch it again so the printed status is current
finished_lstm_run_id = lstm_run.info.run_id
client.set_terminated(finished_lstm_run_id, status="FINISHED")
lstm_run = client.get_run(finished_lstm_run_id)
print(f"run_id: {lstm_run.info.run_id}; status: {lstm_run.info.status}")

#### Combine Predictions

In [None]:
# Combined prediction labels - validation data
rule_based_prediction = predictions[[target_class, 'prediction_label', 'prediction_score_1']].rename(
    columns={'prediction_label': 'rb_pred', 'prediction_score_1': 'rb_pred_score'}
)
# lstm_valid_pred has a positional index and carries the record id in its 'id' column,
# whereas `predictions` is (presumably) indexed by that id. Join the id column against
# the index; the previous index-to-index join paired rows by position against ids.
combined_pred = lstm_valid_pred.merge(rule_based_prediction, how='left', left_on='id', right_index=True)

In [None]:
# Determine the split between models: sweep the LSTM weight from 0.40 to 1.00 in steps
# of 0.01 and record the accuracy of the blended prediction at each weight.
correct = []

for dec_increment in range(40, 101):
    lstm_wt = dec_increment / 100
    combined_pred['combined_score'] = (lstm_wt*combined_pred['lstm_pred_score'] + (1-lstm_wt)*combined_pred['rb_pred_score'])
    combined_pred['combined_pred'] = np.where(combined_pred['combined_score'] < 0.50, 0, 1)
    # Fraction of correct predictions. The previous value_counts().iloc[0] returned the
    # size of the larger group, i.e. the error rate whenever accuracy fell below 50%.
    right_prop = (combined_pred['combined_pred'] == combined_pred[target_class]).mean()
    correct.append((lstm_wt, right_prop))

correct = pd.DataFrame(correct, columns=['weight', 'accuracy'])
plt.plot(correct['weight'], correct['accuracy'])
plt.ylabel("Combined Accuracy (%)")
plt.xlabel("Proportion of LSTM prediction scores used")
plt.show()

In [None]:
# Take the weight that produces the best accuracy (the smallest such weight on ties)
lstm_wt = correct.loc[correct.accuracy == correct.accuracy.max(), 'weight'].min()

# Apply the chosen weight to the combined predictions
combined_pred['combined_score'] = (lstm_wt*combined_pred['lstm_pred_score'] + (1-lstm_wt)*combined_pred['rb_pred_score'])
combined_pred['combined_pred'] = np.where(combined_pred['combined_score'] < 0.5, 0, 1)

# Fraction of correct predictions; value_counts().iloc[0] would report the size of the
# larger group, which is the error rate when accuracy is below 50%.
final_acc = (combined_pred['combined_pred'] == combined_pred[target_class]).mean()

# Rearrange columns
combined_test = combined_pred[['id', target_class, 'combined_score', 'combined_pred', 'lstm_pred_score', 'lstm_pred',
                               'rb_pred_score', 'rb_pred']]
# Pass the directory and file name as separate os.path.join arguments instead of a "\\" concatenation
combined_test.to_csv(os.path.join(config.exai_artifact_path, f"prediction_{config.today}.csv"))

print(f"Combined Accuracy for [ML wt. {1-lstm_wt} | LSTM wt. {lstm_wt}]:", round(final_acc, 3))

In [None]:
# Metrics of the blended prediction against the validation labels
combined_labels = np.array(combined_test['combined_pred'])

final_auc = roc_auc_score(y_valid, combined_test['combined_score'])
final_acc = accuracy_score(y_valid, combined_labels)
final_prec = precision_score(y_valid, combined_labels, average='binary')
final_recall = recall_score(y_valid, combined_labels, average='binary')
final_f1 = f1_score(y_valid, combined_labels, average='binary')

print('final_auc', final_auc, '\nfinal_acc', final_acc, '\nfinal_prec', final_prec, '\nfinal_recall', final_recall, '\nfinal_f1', final_f1)

#### Log artifacts and metrics

In [None]:
# Log the chosen model weight and the combined metrics on the parent run
final_metrics = {
    "AUC": final_auc,
    "Accuracy": final_acc,
    "Prec.": final_prec,
    "Recall": final_recall,
    "F1": final_f1,
}

with mlflow.start_run(run_id=parent_run_id):
    mlflow.log_param("model_wt", lstm_wt)
    for metric_name, metric_value in final_metrics.items():
        mlflow.log_metrics(metrics={metric_name: metric_value})

mlflow.end_run()

In [None]:
# Close the parent run, re-fetch it, then report the status of all three MLflow runs
client.set_terminated(parent_run.info.run_id, status="FINISHED")
parent_run = client.get_run(parent_run.info.run_id)

for tracked_run in (parent_run, ml_run, lstm_run):
    print(f"run_id: {tracked_run.info.run_id}; status: {tracked_run.info.status}")

#### Register Model into MLFlow model registry

In [None]:
# Quality gate: the model counts as good only when the combined accuracy exceeds 0.6
is_model_good = 'yes' if final_acc > 0.6 else 'no'

print("is_model_good: ", is_model_good, ", final_accuracy is", final_acc)

In [None]:
# When the quality gate passed, compare each run against the existing registered model
# and let the helper decide on registration; otherwise nothing is registered.
if is_model_good == 'yes':
    registry_candidates = (
        (ml_run, f'XGBoost_{target_class}'),
        (lstm_run, f'LSTM_{target_class}'),
    )
    for candidate_run, registry_name in registry_candidates:
        mlf.mlflow_existing_model_compare_and_registry(
            client=client,
            model_run_info=candidate_run,
            registry_model_name=registry_name,
            pos_metrics_list=['Accuracy'],
            pos_metrics_thresh_diff_list=[0.01],
            model_type="Classification",
        )

### End of Script