## Inference Pipeline

### Import Libraries

In [176]:
import os
import shutil
import pickle
import pandas as pd
import numpy as np

# import pycaret
# from pycaret.classification import *

# MLFlow
import mlflow
import mlflow.keras
from mlflow.tracking import MlflowClient

# LSTM
import keras
# from keras.preprocessing.text import one_hot, Tokenizer
from keras.utils import pad_sequences

import src.helpers_preprocess as pp
import src.helpers_mlflow as mlf
import src.config as config

# import importlib
# importlib.reload(config)

### Set Configurations

In [177]:
target_class = 'incompliant'
SAVE_PREDICTIONS = True

In [178]:
# Check if filepaths exists and create filepaths if do not exist
# Explainations for filepaths can be found in config.py
config.create_path(config.main_directory)
config.create_path(config.output_path)
config.create_path(config.data_artifact_path)
config.create_path(config.tokenizer_artifact_path)
config.create_path(config.inference_output)

Filepath already exists
Filepath already exists
Filepath already exists
Filepath already exists
Filepath already exists


### Import features dataset

In [179]:
# Get latest feature set
all_feature_files = [os.path.join(config.raw_data_path, x) for x in os.listdir(config.raw_data_path) if x.startswith("full_features") and x.endswith(".csv")]
curr_features_filepath = max(all_feature_files, key = os.path.getctime)
data_df = pd.read_csv(curr_features_filepath, index_col= None)
print(f"Dataset from {curr_features_filepath}")

Dataset from C:\Users\xtanl\OneDrive - Singapore Management University\Capstone\raw_data\full_features_231106.csv


### Load MLFLOW Experiment

#### Setup the MYSQL database for tracking MLFLOW Model Registry

In [180]:
# Create database if it does not exists
# Database required for MLFlow model registry as only certain APIs are supported by Mlflow
mlf.create_database_storage(config.dbServerName, config.dbPort, config.dbName, config.dbUser, config.dbPassword)

Database created at MYSQL: mlflow_tracking_database on root:isss625@127.0.0.1/3306


In [181]:
# Check if database exists
# mlf.show_databases(dbServerName, dbUser, dbPassword)

#### Setup MLFLOW to retrieve experiments and model registry (in MYSQL)

In [182]:
# Create mlflow command run configured in src/config.py
mlflow_conn = mlf.create_mlflow_cmd(config.storage_filepath, config.dbName, config.dbUser, config.dbPassword, config.dbServerName, config.dbPort)

In [183]:
# Run mlflow command line
mlf.run_cmd(mlflow_conn, config.timeout)
print(f"Running command line with timeout of {config.timeout} seconds \n{mlflow_conn}")

Running command line with timeout of 10 seconds 
mlflow ui                     --backend-store-uri file:/./aicritic_mlflow                     --registry-store-uri mysql+pymysql://root:isss625@localhost:3306/mlflow_tracking_database                     --host 127.0.0.1 --port 5000                     --serve-artifacts


In [184]:
exp_id, client = mlf.setup_mlflow(config.exp_name, config.storage_filepath)
print("exp_id", exp_id)

MLFLOW UI is at: http://127.0.0.1:5000/
RESOURCE_ALREADY_EXISTS: Experiment 'ai_critic' already exists.
exp_id 0


### Load Rule Based Classification Model

In [185]:
# Get production ML model from Model Registry
ml_model_uri = f"models:/XGBoost_{target_class}/Production"
ml_model = mlflow.sklearn.load_model(model_uri=ml_model_uri)
input_cols = ml_model.feature_names_in_

categorical_cols = [#'contains_montary', 
                    'breach_flagwords', 'breach_hashes', 'has_nonpru_email', 'has_hyperlinks', 'has_disclaimer']

# OH Encode
data_encoded, enc_cols = pp.get_onehot(data_df, feature_list = categorical_cols)

# Align Columns
pred_data = pp.column_alignment(new_dataset=data_encoded, loaded_featset=input_cols, cat_feats=['breach_flagwords', 'breach_hashes', 'has_nonpru_email',
                                                                                                'has_hyperlinks', 'has_approvals', 'has_disclaimer'])

In [186]:
# Rule-Based Predictions
rb_predictions = ml_model.predict(pred_data)
rb_predictions = pd.DataFrame(data_encoded['id']).merge(pd.DataFrame(rb_predictions, columns= ['rb_pred']), left_index= True, right_index=True)

predict_scores = pd.DataFrame(pd.DataFrame(ml_model.predict_proba(pred_data))[1]).rename({1: 'rb_pred_score'}, axis=1)
predict_scores['rb_pred_score'] = predict_scores['rb_pred_score'].astype('float')
rb_predictions = rb_predictions.merge(predict_scores, left_index= True, right_index=True)

In [187]:
# Show probability distribution box plot
pred_correct = predictions[predictions[f'{target_class}'] == predictions['prediction_label']]
prob_dist = pd.DataFrame(pd.Series([round(x*100) for x in pred_correct.Score_1_round]).value_counts()).reset_index().rename(columns={0: 'count', 'index': 'probability of incompliancy (%)'})
prob_dist

if SAVE_PROB_DIST:
    # Save probabliity distribution table
    prob_dist.to_csv(os.path.join(config.exai_artifact_path, f"prob_dist_{config.today}.csv"), index=False)

NameError: name 'predictions' is not defined

### LSTM Classification

In [None]:
nn_model_uri = f"models:/LSTM_{target_class}/Production"
nn_model = mlflow.keras.load_model(model_uri=nn_model_uri)

In [None]:
# Get LSTM production model run_id
for mv in client.get_latest_versions(name=f"LSTM_{target_class}"):
    if dict(mv)['current_stage'] == 'Production':
      nn_model_runid = dict(mv)['run_id']

print("model_runid: ", nn_model_runid)

model_runid:  9134773ea386403d9403dc7fa8ab6690


#### Load Word Tokenizer

In [None]:
# Load the tokenizer
tokenizer_uri = f"runs:/{nn_model_runid}/Tokenizer/tokenizer.pkl"
mlflow.artifacts.download_artifacts(artifact_uri=tokenizer_uri, dst_path=config.tokenizer_artifact_path)
with open(os.path.join(config.tokenizer_artifact_path, "tokenizer.pkl"), 'rb') as outfile:
    word_tokenizer = pickle.load(outfile)

In [None]:
# Convert Text to Sequence
data_tokenized = word_tokenizer.texts_to_sequences(data_df)

# Padding all reviews to fixed length 100
maxlen = 100
data_tokenized = pad_sequences(data_tokenized, padding='post', maxlen=maxlen)

#### Word Embeddings

In [None]:
from numpy import asarray

# Load GloVe word embeddings and create an Embeddings Dictionary
embeddings_dictionary = {}
glove_file = open(config.glove_file, encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions

glove_file.close()

In [None]:
# Adding 1 to store dimensions for words for which no pretrained word embeddings exist.
vocab_length = len(word_tokenizer.word_index) + 1
print("vocab_length: ", vocab_length)

# Create Embedding matrix
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.
embedding_matrix = np.zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

vocab_length:  16355


#### Load LSTM MODEL

In [None]:
# Get production model from Model Registry
lstm_model_uri = f"models:/LSTM_{target_class}/Production"
lstm_model = mlflow.keras.load_model(model_uri=nn_model_uri)
lstm_pred = lstm_model.predict(data_tokenized)

lstm_prediction = pd.DataFrame(lstm_pred, columns=['lstm_pred_score'])
lstm_prediction['lstm_pred_score'] = lstm_prediction['lstm_pred_score'].astype('float')
lstm_prediction['lstm_pred'] = np.where(lstm_prediction['lstm_pred_score'] < 0.50, 0, 1)
lstm_prediction = lstm_prediction.merge(pd.DataFrame(data_df['id']).reset_index(drop=True), how='left', left_index=True, right_index=True)



#### Combine Predictions

In [None]:
# Combined prediction labels
rule_based_prediction = rb_predictions[['rb_pred', 'rb_pred_score']]
combined_pred = lstm_prediction.merge(rule_based_prediction, how='left', left_index=True, right_index=True)

In [None]:
# Take the weights that produce the best score
# parent_run = dict(mlflow.get_parent_run(nn_model_runid))
parent_runid = dict(dict(mlflow.get_run(nn_model_runid))['data'])['tags']['mlflow.parentRunId']
parent_run = dict(mlflow.get_run(parent_runid))
lstm_wt = float(dict(parent_run['data'])['params']['model_wt'])
combined_pred['combined_score'] = (lstm_wt*combined_pred['lstm_pred_score'] + (1-lstm_wt)* combined_pred['rb_pred_score'])
combined_pred['combined_pred'] = np.where(combined_pred['combined_score'] < 0.5, 0, 1)
# Rearrange columns
combined_pred = combined_pred[['id', 'combined_score', 'combined_pred', 'lstm_pred_score', 'lstm_pred',
                               'rb_pred_score', 'rb_pred']]

In [None]:
# Save predictions
if SAVE_PREDICTIONS:
    combined_pred.to_csv(os.path.join(config.inference_output, f"prediction_{config.today}.csv"))

In [None]:
# Preview output
# combined_pred[combined_pred['combined_pred'] == 1].head(1)

Unnamed: 0,id,combined_score,combined_pred,lstm_pred_score,lstm_pred,rb_pred_score,rb_pred
7,pfbid0cPYXbGHKhc8dDR7aersC4nyxbXCTLP5vFpsVJe1K...,0.602089,1,0.023425,0,0.987866,1


### End of Script