### Training Pipeline

In [None]:
import os
import pandas as pd
import numpy as np

# Pycaret
#from pycaret.regression import setup
import pycaret.nlp as pycnlp
import pycaret.classification as pyclass
from pycaret.classification import get_config, predict_model, plot_model, pull

# MLFlow
import mlflow
import mlflow.keras
from mlflow.tracking import MlflowClient
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID, MLFLOW_RUN_NAME

# NLP
import spacy
#spacy_path = MOUNT_PATH + 'libraries/en_core_web_sm-2.3.1'
#spacy.load(spacy_path)
import category_encoders as ce
import pickle

# For Coherence Score
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, LdaMulticore, CoherenceModel, LsiModel, HdpModel
# For Text Vectorization
from gensim.models import word2vec

import matplotlib as mpl
mpl.rcParams['font.family'] = ['DejaVu Sans']

#pd.set_option('max_colwidth', -1)

In [None]:
#### Dataset import
# Absolute location of the raw CSV export.
# NOTE(review): this is a machine-specific Windows path — consider making it configurable.
data_filepath = "C:\\Users\\xtanl\\OneDrive\\Desktop\\data_file_20230730.csv"

# Load the complete file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_filepath)

In [None]:
#### Data Preprocessing
# Drop rows whose post text is duplicated, then take an explicit copy so the
# column assignments below write to an independent frame. (Previously a copy of
# `data` was made and immediately overwritten by the un-copied de-duplicated
# result, so the copy had no effect.)
data_df = data.drop_duplicates(subset=['content']).copy()

### Features addition ###

# Fill nulls in the content column so no NaN reaches the text helpers
data_df['content'] = data_df['content'].fillna('')
# Add spaces around hashtags so that individual hashes can be identified
data_df['content'] = data_df['content'].apply(helper.add_space_hashes)
# Extract all hashtags
data_df['hashtags'] = data_df['content'].apply(helper.extract_hashtags)
# Extract all mentions
data_df['mentions'] = data_df['content'].apply(helper.extract_mentions)
# Extract all emojis
data_df['emojis'] = data_df['content'].apply(helper.extract_emojis)
# Translate emojis to text
data_df['emojis_text'] = data_df['emojis'].apply(helper.translate_emojis)

# Check if there are words to be flagged - breach class
data_df['breach_flagwords'] = data_df['content'].apply(helper.contains_flagged_words)
# Check if there are words to be flagged in the hashes - breach class
data_df['breach_hashes'] = data_df['hashtags'].apply(helper.contains_flagged_hashes)

# Create label: 1 when either breach indicator equals True, otherwise 0.
# The explicit comparison with True is kept because the helpers' return type
# is not visible from this notebook.
has_flagged_word = data_df['breach_flagwords'].eq(True)
has_flagged_hash = data_df['breach_hashes'].eq(True)
data_df['incompliant'] = np.where(has_flagged_word | has_flagged_hash, 1, 0)

In [None]:
# Narrow the frame to the raw fields plus the engineered features, and expose
# the CSV's unnamed first column under the name `id`.
selected_columns = [
    'Unnamed: 0',
    'name',
    'content',
    'hashtags',
    'mentions',
    'emojis',
    'emojis_text',
    'breach_flagwords',
    'breach_hashes',
    'incompliant',
]
data_df = data_df[selected_columns].rename(columns={'Unnamed: 0': 'id'})

#### MLFLOW Experiment

In [None]:
# Start the MLflow tracking UI through an IPython shell escape.
# NOTE(review): `mlflow ui` is normally a long-running foreground server, so this
# line may block the cell until it is interrupted — confirm that the
# `setup_mlflow()` call below is actually reached (or launch the UI separately).
!mlflow ui
# Helper defined outside this notebook; its return value is used as the
# experiment id when the child runs are created.
exp_id = helper.setup_mlflow()

In [None]:
# Create the child run that tracks the classical ML model, nested under the
# parent run through the MLflow parent-run tag.
ml_run_tags = {
    MLFLOW_PARENT_RUN_ID: parent_run_id,
    MLFLOW_RUN_NAME: f'ml_model_{target_class}',
}
ml_run = client.create_run(experiment_id=exp_id, tags=ml_run_tags)

# Read the identifier through `run_id` (`run_uuid` is its deprecated alias);
# the status-report cell near the end of the notebook already uses `run_id`.
ml_run_id = ml_run.info.run_id
client.log_param(ml_run_id, "run_id", ml_run_id)

In [None]:
# Text Classification
# Columns removed from both frames before they are handed to PyCaret
# (full text, cleaned text and the raw findings category).
non_feature_columns = [
    'post_full_text_of_the_post_profile_full_text_of_the_profile',
    'cleaned_text',
    'category_of_findings',
]

# Both frames are indexed by `serial_no` so rows stay identifiable afterwards.
pycaret_train_frame = train_data.set_index('serial_no').drop(non_feature_columns, axis=1)
pycaret_holdout_frame = validation_data.set_index('serial_no').drop(non_feature_columns, axis=1)

classfication_exp = pyclass.setup(
    data=pycaret_train_frame,
    target=target_class,
    test_data=pycaret_holdout_frame,
    preprocess=True,
    silent=True,
    session_id=42,  # fixed seed
)

In [None]:
# Best_model
# Run PyCaret's model comparison on the experiment configured above and keep
# its result in `best_model`.
# NOTE(review): `best_model` is not referenced again in the cells visible here;
# an XGBoost classifier is created explicitly further down instead.
best_model = pyclass.compare_models()

In [None]:
# Get dataset used for testing models
X_train = get_config('X')
Y_train = get_config('y')
X_test = get_config('X_test')
Y_test = get_config('y_test')

# Save the dataset
todays_date = datetime.today().strftime('%y%m%d')

# Each split is written to <data_artifact_path><yymmdd>_<suffix><first 5 chars
# of the parent run id>.csv. The paths are built once here instead of being
# re-assembled for every exists/to_csv/print call.
artifact_dir = filepaths_dict['data_artifact_path']
run_tag = parent_run_id[:5]
split_exports = [
    # (label used in the log message, file-name suffix, frame to export)
    ("X_Train", "xtrain", X_train),
    ("y_Train", "ytrain", Y_train),
    ("X_test", "xtest", X_test),
    ("y_test", "ytest", Y_test),
]
export_paths = {
    suffix: artifact_dir + f"{todays_date}_{suffix}{run_tag}.csv"
    for _, suffix, _ in split_exports
}

# Only the X-train file is probed: if it exists, the whole export is treated
# as already done for today's date and this parent run.
if os.path.exists(export_paths["xtrain"]):
    print("Files have been saved")
else:
    for _, suffix, frame in split_exports:
        frame.to_csv(export_paths[suffix])

    for label, suffix, _ in split_exports:
        print(f"{label} saved in: {export_paths[suffix]}")

#### Rule-based Classification Model

In [None]:
import xgboost as xgb

# Train an XGBoost classifier through PyCaret. Extra fit arguments enable early
# stopping on log-loss, monitored on the (X_test, Y_test) split.
# NOTE(review): that split is also the one used to report metrics later —
# confirm this is intended.
xgb_instance = xgb.XGBClassifier()
fit_kwargs = {
    "early_stopping_rounds": 5,
    "eval_metric": "logloss",
    "eval_set": [(X_test, Y_test)],
}
xgb_model = pyclass.create_model(xgb_instance, fit_kwargs=fit_kwargs, error_score='raise')

# No separate tuning step: the created model is used as the "tuned" model.
tuned_model = xgb_model
model_type_name = tuned_model.__class__.__name__
print(f'Classifier used: {model_type_name}')
client.log_param(ml_run_id, "model_type", model_type_name)

In [None]:
# Pycaret output predictions
# Score the validation rows whose serial number appears in PyCaret's test index.
holdout_rows = validation_data[validation_data['serial_no'].isin(X_test.index)]
predictions = predict_model(tuned_model, data=holdout_rows, raw_score=True)

# MLflow log result metrics
# Take the first row of PyCaret's latest result grid as {metric: value},
# leaving out the 'Model' name column.
# NOTE(review): `results_dict` is not passed to MLflow in the cells visible here.
results = pull()
results_dict = {
    metric_name: column_values[0]
    for metric_name, column_values in results.to_dict().items()
    if metric_name != 'Model'
}

#pd.DataFrame(results)

In [None]:
# Save model
# Infer the MLflow signature from the training features and the predicted
# labels held in `rb_test_pred` (a frame defined outside the cells shown here).
signature_inputs = pd.DataFrame(X_train)
signature_outputs = pd.DataFrame(rb_test_pred['Label'])
ml_signature = mlflow.models.infer_signature(
    model_input=signature_inputs,
    model_output=signature_outputs,
)

# Persist the fitted classifier in MLflow's sklearn flavour.
mlflow.sklearn.save_model(
    tuned_model,
    filepaths_dict['ml_artifact_path'],
    signature=ml_signature,
)

In [None]:
# Predictions on validation (out of time dataset)
# Round the class-1 score to one decimal place before it is passed to the
# decile helper (defined outside this notebook).
predictions['Score_1_round'] = predictions['Score_1'].round(1)
decile_table = get_decile_score(
    predictions,
    f'{target_class}',
    'Label',
    'Score_1_round',
)

# Display the table as the cell output.
decile_table

In [None]:
# Show the distribution of predicted probabilities as a frequency table
# (no plot is drawn here): keep only rows where the predicted label equals the
# true label, convert Score_1 to a whole-number percentage and count how often
# each percentage occurs.
pred_correct = rb_test_pred[rb_test_pred[f'{target_class}'] == rb_test_pred['Label']]
# NOTE(review): the rename relies on reset_index() producing columns named
# 'index' and 0, which depends on the pandas version — verify the output headers.
prob_dist = pd.DataFrame(pd.Series([round(x*100) for x in pred_correct.Score_1]).value_counts()).reset_index().rename(columns={0: 'count', 'index': 'probability of incompliancy (%)'})
prob_dist

In [None]:
# Draw PyCaret's confusion-matrix plot for the XGBoost model; 'percent': False
# is forwarded to the plot as a keyword option.
plot_model(xgb_model, plot = 'confusion_matrix', plot_kwargs = {'percent' : False})

#### LSTM Classification

In [None]:
# Create the child run that tracks the LSTM model, nested under the parent run
# through the MLflow parent-run tag.
lstm_run_tags = {
    MLFLOW_PARENT_RUN_ID: parent_run_id,
    MLFLOW_RUN_NAME: f'lstm_model_{target_class}',
}
lstm_run = client.create_run(experiment_id=exp_id, tags=lstm_run_tags)

# Read the identifier through `run_id` (`run_uuid` is its deprecated alias);
# the status-report cell near the end of the notebook already uses `run_id`.
lstm_run_id = lstm_run.info.run_id
client.log_param(lstm_run_id, "run_id", lstm_run_id)

In [None]:
# Preparing data for the LSTM model
lstm_fields = ['cleaned_text', 'Category of Findings']

# All three splits must feed the same text column to the tokenizer. Previously
# the validation and OOT inputs were read from 'category_of_findings' (the raw
# label column) while training used 'cleaned_text', so the model was scored on
# text it was never trained to read.
text_column = 'cleaned_text'
X_train = train_data[text_column]
X_valid = validation_data[text_column]
X_oot = oot_data[text_column]

# Targets: the column named by `target_class`, as plain numpy arrays.
y_train = np.array(train_data[f'{target_class}'])
y_valid = np.array(validation_data[f'{target_class}'])
y_oot = np.array(oot_data[f'{target_class}'])

print("train_set", len(X_train), "valid_set", len(X_valid), "oot_set", len(X_oot))

##### Word Tokenizer

In [None]:
from keras.preprocessing.text import one_hot, Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training texts only.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

# Every text is padded (at the end) to a fixed length of 100 tokens.
maxlen = 100

# Convert each split from text to integer sequences, then pad it.
X_train, X_valid, X_oot = [
    pad_sequences(word_tokenizer.texts_to_sequences(texts), padding='post', maxlen=maxlen)
    for texts in (X_train, X_valid, X_oot)
]

In [None]:
# Save tokenizer
# Pickle the fitted tokenizer into the tokenizer artifact directory.
tokenizer_pickle_path = filepaths_dict['tokenizer_artifact_path'] + 'tokenizer.pkl'
with open(tokenizer_pickle_path, mode='wb') as outfile:
    pickle.dump(word_tokenizer, outfile)

#### Word Embeddings

In [None]:
from numpy import asarray

# Load GloVe word embeddings and create an embeddings dictionary that maps
# each word to its float32 vector.
embeddings_dictionary = {}

# The context manager guarantees the file handle is released even if a line
# fails to parse; previously the handle leaked when the loop raised.
with open('/dbfs/mnt/datahub-apps/ai_critic/libraries/glove/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        # Each line is split on whitespace: first token is the word, the rest
        # are the vector components.
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# The matrix has one more row than the tokenizer has words.
# NOTE(review): presumably because tokenizer indices start at 1, leaving row 0
# for the padding id — confirm.
vocab_length = len(word_tokenizer.word_index) + 1
print("vocab_length: ", vocab_length)

# Create the embedding matrix: row i holds the 100-dimensional GloVe vector of
# the word with tokenizer index i. Words without a pretrained vector keep an
# all-zero row.
embedding_matrix = np.zeros((vocab_length, 100))
for token, row in word_tokenizer.word_index.items():
    pretrained_vector = embeddings_dictionary.get(token)
    if pretrained_vector is None:
        continue
    embedding_matrix[row] = pretrained_vector

#### LSTM MODEL

In [None]:
# Build the LSTM model
from keras.models import Sequential
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import LSTM ,Bidirectional

lstm_model = Sequential()

# Frozen embedding layer initialised with the GloVe matrix.
embedding_layer = Embedding(
    vocab_length,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Stack: embeddings -> dropout -> bidirectional LSTM -> single sigmoid unit.
for layer in (
    embedding_layer,
    Dropout(0.5),
    Bidirectional(LSTM(128)),
    Dense(1, activation='sigmoid'),  # Binary
):
    lstm_model.add(layer)

# Display Model
lstm_model.summary()

# Model compiling
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
from keras.callbacks import EarlyStopping

# Model Training
# Stop once val_loss has not improved for 20 epochs and roll back to the best
# weights. Validation loss is computed on a 30% split of the training arrays.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
training_options = dict(
    batch_size=128,
    epochs=100,
    verbose=1,
    validation_split=0.3,
    callbacks=[early_stopping],
)
lstm_model_history = lstm_model.fit(X_train, y_train, **training_options)

#### Evaluate LSTM

In [None]:
# Evaluate the trained LSTM on the training arrays. `evaluate_lstm` is defined
# outside the cells shown here; its return value is discarded in this call.
evaluate_lstm(lstm_model, X_train, y_train)

In [None]:
# Same helper on the validation arrays; the returned value is kept because it
# is logged to MLflow as the "Accuracy" metric of the LSTM run further down.
lstm_acc = evaluate_lstm(lstm_model, X_valid, y_valid)

In [None]:
# Hand the object returned by `fit()` to the plotting helper (defined outside
# the cells shown here).
plot_lstm_performance(lstm_model_history)

In [None]:
# Score the training and validation arrays with the LSTM so the scores can be
# combined with the ML model's predictions.
# Naming used in this notebook: "*_valid_*" holds predictions on the TRAINING
# arrays, "*_test_*" holds predictions on the VALIDATION arrays.
y_pred_valid = lstm_model.predict(X_train)
y_pred_test = lstm_model.predict(X_valid)

# Validation-set predictions: score, 0/1 label at a 0.5 cut-off, and the
# serial number of the row each prediction belongs to (joined by position).
lstm_test_pred = pd.DataFrame(y_pred_test, columns=['lstm_pred_score'])
lstm_test_pred['lstm_pred'] = np.where(lstm_test_pred['lstm_pred_score'] < 0.50, 0, 1)
lstm_test_pred = lstm_test_pred.merge(pd.DataFrame(validation_data['serial_no']).reset_index(drop=True), how='left', left_index=True, right_index=True)

# Training-set predictions. The serial numbers must come from `train_data`,
# the frame the predictions were made on; previously they were taken from
# `validation_data`, which attached unrelated ids (or NaN where the frames
# differ in length).
# NOTE(review): if these predictions were instead meant to be made on a
# different split, change the `predict` input above rather than this join.
lstm_valid_pred = pd.DataFrame(y_pred_valid, columns=['lstm_pred_score'])
lstm_valid_pred['lstm_pred'] = np.where(lstm_valid_pred['lstm_pred_score'] < 0.50, 0, 1)
lstm_valid_pred = lstm_valid_pred.merge(pd.DataFrame(train_data['serial_no']).reset_index(drop=True), how='left', left_index=True, right_index=True)

In [None]:
# Make predictions in OOT dataset for testing
y_pred_oot = lstm_model.predict(X_oot)

# Serial numbers of the OOT rows, re-indexed 0..n-1 so they line up with the
# prediction rows by position.
oot_serials = pd.DataFrame(oot_data['serial_no']).reset_index(drop=True)

# Score column, 0/1 label at a 0.5 cut-off, then the serial numbers.
lstm_oot_pred = pd.DataFrame(y_pred_oot, columns=['lstm_pred_score'])
lstm_oot_pred['lstm_pred'] = np.where(lstm_oot_pred['lstm_pred_score'] < 0.50, 0, 1)
lstm_oot_pred = lstm_oot_pred.merge(
    oot_serials,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Save model
# Infer the signature from a matching input/output pair: the OOT inputs and the
# predictions made on those same inputs. (Previously the OOT inputs were paired
# with `y_pred_valid`, which holds predictions for a different array.)
dl_signature = mlflow.models.infer_signature(
    model_input=X_oot,
    model_output=y_pred_oot,
)

# Persist the trained network in MLflow's keras flavour.
mlflow.keras.save_model(
    lstm_model,
    filepaths_dict['dl_artifact_path'],
    signature=dl_signature,
)

#### Combine Predictions

In [None]:
# Columns taken from the ML model's prediction frames and the names they get
# in the combined frames.
rb_columns = [f'{target_class}', 'Label', 'Score_1']
rb_renames = {'Label': 'rb_pred', 'Score_1': 'rb_pred_score'}

# NOTE(review): both merges below align rows on the index, so the LSTM frames
# and the `rb_*_pred` frames (built outside the cells shown here) must share
# the same index — verify.

# Combined prediction labels - validation data
rule_based_prediction = rb_valid_pred[rb_columns].rename(columns=rb_renames)
combined_pred = lstm_valid_pred.merge(
    rule_based_prediction,
    how='left',
    left_index=True,
    right_index=True,
)

# Combined prediction labels - oot data
rule_based_test = rb_test_pred[rb_columns].rename(columns=rb_renames)
combined_test = lstm_test_pred.merge(
    rule_based_test,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Determine split between models
# Sweep the LSTM weight from 0.40 to 1.00 in steps of 0.01; the ML model gets
# the remaining weight. For each weight, record the share of rows whose
# blended 0/1 prediction equals the true label.
correct = []

for dec_increment in range(40, 101):
    lstm_wt = dec_increment / 100
    combined_pred['combined_score'] = (lstm_wt*combined_pred['lstm_pred_score'] + (1-lstm_wt)*combined_pred['rb_pred_score'])
    combined_pred['combined_pred'] = np.where(combined_pred['combined_score'] < 0.50, 0, 1)
    # The mean of the boolean match vector is the accuracy. The previous
    # `value_counts().iloc[0]` returned the count of the MOST FREQUENT outcome,
    # so whenever fewer than half of the rows matched it reported the error
    # rate as the accuracy.
    right_prop = (combined_pred['combined_pred'] == combined_pred[f'{target_class}']).mean()
    correct.append((lstm_wt, right_prop))

correct = pd.DataFrame(correct, columns=['weight', 'accuracy'])

# NOTE(review): the y-axis label says "%", but the plotted values are
# fractions between 0 and 1.
plt.plot(correct['weight'], correct['accuracy'])
plt.ylabel("Combined Accuracy (%)")
plt.xlabel("Proportion of LSTM prediction scores used")
plt.show()

In [None]:
# Take the weights that produce the best score; ties are resolved in favour of
# the smallest LSTM weight.
lstm_wt = correct.loc[correct['accuracy'] == correct['accuracy'].max(), 'weight'].min()

# Validate on test data (2023)
combined_test['combined_score'] = (lstm_wt*combined_test['lstm_pred_score'] + (1-lstm_wt)*combined_test['rb_pred_score'])
combined_test['combined_pred'] = np.where(combined_test['combined_score'] < 0.5, 0, 1)

# Accuracy is the mean of the boolean match vector. The previous
# `value_counts().iloc[0]` returned the count of the most frequent outcome,
# which is the number of WRONG rows whenever accuracy is below 50%.
final_acc = (combined_test['combined_pred'] == combined_test[f'{target_class}']).mean()

# Rearrange columns
combined_test = combined_test[['serial_no', f'{target_class}', 'combined_score', 'combined_pred', 'lstm_pred_score', 'lstm_pred',
                               'rb_pred_score', 'rb_pred']]
combined_test.to_csv(filepaths_dict['exai_artifact_path'] + f"prediction_{todays_date}.csv")

print(f"Combined Accuracy for {lstm_wt} weight:", round(final_acc, 3))

#### Log artifacts and metrics

# Log all artifacts
# Re-open the LSTM child run, record its accuracy and upload each artifact
# directory under the given folder name inside the run.
with mlflow.start_run(run_id=lstm_run_id):
    mlflow.log_metrics(metrics={"Accuracy": lstm_acc})

    for path_key, artifact_folder in (
        ('exai_artifact_path', "Results"),
        ('tokenizer_artifact_path', "Tokenizer"),
        ('dl_artifact_path', "model"),
    ):
        mlflow.log_artifacts(filepaths_dict[path_key], artifact_folder)

mlflow.end_run()

In [None]:
# Log final metrics
# Re-open the parent run and record the chosen LSTM weight together with the
# combined accuracy rounded to three decimals.
with mlflow.start_run(run_id=parent_run_id):
    mlflow.log_param("model_wt", lstm_wt)
    rounded_final_acc = round(final_acc, 3)
    mlflow.log_metrics(metrics={"Accuracy": rounded_final_acc})
mlflow.end_run()

In [None]:
# Report id and status of the parent run and both child runs.
# NOTE(review): these are the run objects captured when the runs were created,
# so the printed status may not reflect later changes — confirm.
for tracked_run in (parent_run, ml_run, lstm_run):
    print(f"run_id: {tracked_run.info.run_id}; status: {tracked_run.info.status}")

#### Register Model

In [None]:
# Quality gate: the model counts as good only when the combined accuracy is
# strictly above 0.6. The verdict is stored as the string 'yes' or 'no'.
is_model_good = 'yes' if final_acc > 0.6 else 'no'

print("is_model_good: ", is_model_good, ", final_accuracy is", final_acc)

In [None]:
# Check existing model and compare; register the new models only when the
# quality gate passed. Nothing happens otherwise.
# NOTE(review): the helper is asked to compare the metric 'accuracy', while the
# runs in this notebook log the key "Accuracy" — verify the helper's lookup
# matches.
if is_model_good == 'yes':
    mlflow_existing_model_compare_and_registry(
        ml_run,
        f'XGBoost_{target_class}',
        ['accuracy'],
        [0.01],
        "Classification",
    )
    mlflow_existing_model_compare_and_registry(
        lstm_run,
        f'LSTM_{target_class}',
        ['accuracy'],
        [0.01],
        "Classification",
    )

In [None]:
# Final clean-up: end whichever MLflow run is still active.
mlflow.end_run()