# Imports

In [None]:
# !pip install --extra-index-url=https://pypi.celonis.cloud/ pycelonis==1.5.8 --user
# !pip install -r requirements.txt

In [None]:
%load_ext autoreload
%autoreload 2
# %reload_ext autoreload

# local module imports
from utils import *
from data_engineering import *
from machine_learning import *
from login import *
import config

# other imports
from pycelonis import get_celonis
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import train_test_split
import shap
import mlflow
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')
 
# mlflow configuration
# Select the mlflow experiment for this notebook; the experiment name is
# namespaced per client via config.client.
mlflow.set_experiment(f"root_cause_analysis/{config.client}")


# Login

In [None]:
# Open the Celonis connection used by the rest of the notebook.
# NOTE(review): `login` is presumably a dict of connection keyword arguments
# brought in by `from login import *` above -- confirm its keys match what
# get_celonis expects.
celonis = get_celonis(**login)

# Dataset

In [None]:
# Global parameters shared by every table builder
datamodel = celonis.datamodels.find(config.datamodel_id)
# The target column name is the first key of the config.target mapping
target_name = list(config.target.keys())[0]

# Keyword arguments for get_context_table
params_context = dict(
    datamodel=datamodel,
    case_key=config.case_key,
    target=config.target,
    filter=config.data_filter,
    tables_to_include=config.tables_to_include,
    additional_columns=config.additional_columns,
)

# Keyword arguments for get_activity_count_table
params_act_count = dict(
    datamodel=datamodel,
    case_key=config.case_key,
    filter=config.data_filter,
    activity_table=config.activity_table,
    activity_name=config.activity_name,
)

# Keyword arguments for get_days_between_table
params_days_betw = dict(
    datamodel=datamodel,
    case_key=config.case_key,
    filter=config.data_filter,
    events_days_between=config.events_days_between,
)

# Keyword arguments for get_mfeature_count_table
params_mfeature = dict(
    datamodel=datamodel,
    case_key=config.case_key,
    filter=config.data_filter,
    multiple_value_features=config.multiple_value_features,
)

In [None]:
# Create tables
# Each feature table is only built when its section of the config is
# populated; otherwise an empty DataFrame stands in for it so that the
# concat below can be written unconditionally.

# Context table: skipped only when neither extra tables nor extra columns
# are configured.
if config.tables_to_include == {} and config.additional_columns == {}:
    context_table = pd.DataFrame()
else:
    context_table = get_context_table(**params_context)

# Activity counts need both the activity table and the activity column name.
# `is None` is the identity test for the None singleton (PEP 8); `== None`
# can be overridden by a custom __eq__.
if config.activity_table is None or config.activity_name is None:
    activity_count_table = pd.DataFrame()
else:
    activity_count_table = get_activity_count_table(**params_act_count)

# Days-between-events features
if config.events_days_between == {}:
    days_between_table = pd.DataFrame()
else:
    days_between_table = get_days_between_table(**params_days_betw)

# Multiple-value feature counts
if config.multiple_value_features == {}:
    mfeature_table = pd.DataFrame()
else:
    mfeature_table = get_mfeature_count_table(**params_mfeature)

# Merge into one table: column-wise concat with an outer join on the index,
# so rows missing from one table are kept and filled with NaN.
# NOTE(review): this assumes all four tables share the same index
# (presumably the case key) -- confirm in the helper implementations.
input_table = pd.concat(
    [context_table, days_between_table, activity_count_table, mfeature_table],
    axis=1,
    join="outer",
)

# Clean table
input_table = drop_irrelevant_data(input_table, target_name)


In [None]:
# Free up memory: drop the names of the individual feature tables (their
# contents were merged into input_table) and trigger a garbage collection.
del context_table, activity_count_table, days_between_table, mfeature_table
gc.collect()

# Feature Selection (Random Forest Feature Importance)

In [None]:
# Feature lists returned by get_feature_lists: all features, categorical
# features, numerical features, and features including the target
(
    features_rf,
    cat_features_rf,
    num_features_rf,
    features_with_target_rf,
) = get_feature_lists(input_table, target_name)

# Work on a copy of input_table (RF needs encoded categorical variables,
# while input_table itself is reused unencoded later on):
# first handle missing values, then encode the categorical variables.
input_table_rf = encode_cat_features(
    handle_missing_data(input_table.copy(), target_name)
)

# Train/test split
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    input_table_rf[features_rf],
    input_table_rf[target_name],
    test_size=config.test_size,
)

# Define & train the RandomForestClassifier
rfc = RandomForestClassifier(**config.params_RandomForestClassifier)
rfc.fit(X_train_rf, y_train_rf)

# RF performance measures on the held-out test set
performance_metrics_rf = get_performance_measures(rfc, X_test_rf, y_test_rf)

# Select features based on the RF feature importance
selected_features = get_selected_features(
    rfc,
    X_train_rf,
    X_test_rf,
    y_test_rf,
    feature_importance_method=config.feature_importance_method,
    feature_number_to_select=config.feature_number_to_select,
)

# Prediction Model (CatBoost)

In [None]:
with mlflow.start_run() as run:

    # Reduce input_table to the selected features + target.
    # NOTE: the target is appended to selected_features in place, so the
    # list contains the target name after this cell has run.
    if target_name not in selected_features:
        selected_features.append(target_name)
    # .copy() makes the subset an independent DataFrame, so the fillna
    # assignment below writes to it directly instead of to a slice of the
    # previous table (pandas' SettingWithCopy situation, which the global
    # warnings filter in this notebook would otherwise hide).
    input_table = input_table[selected_features].copy()

    # Get feature lists by category
    features, cat_features, num_features, features_with_target = get_feature_lists(
        input_table, target_name
    )

    # Handle missing cat features: missing categories become the explicit
    # label 'missing'
    input_table[cat_features] = input_table[cat_features].fillna('missing')

    # Get train/test set
    X_train, X_test, y_train, y_test = train_test_split(
        input_table[features],
        input_table[target_name],
        test_size=config.test_size,
    )

    # Define train/test pools (positional order: data, label, cat_features)
    train_pool = Pool(X_train, y_train, cat_features)
    test_pool = Pool(X_test, y_test, cat_features)

    if not config.grid_search:
        # Training parameters are specified manually in the config; log them
        # to the mlflow run in one batch.
        model = CatBoostClassifier(**config.params_CatBoostClassifier)
        mlflow.log_params(config.params_CatBoostClassifier)
    else:
        # Training parameters are determined through grid search
        model, grid_search_result = get_grid_search_results(
            input_table,
            target_name,
            features,
            cat_features,
            train_pool,
            **config.params_grid_search,
        )

    # Train the model, evaluating on the test pool during fitting
    model.fit(train_pool, eval_set=test_pool, **config.params_fit_CatBoost)
    mlflow.log_params(config.params_fit_CatBoost)

    # Get CatBoost performance measures and log them as run metrics
    performance_metrics = get_performance_measures(model, X_test, y_test)
    mlflow.log_metrics(performance_metrics)

    # Store the trained model as a run artifact
    mlflow.catboost.log_model(model, 'RCA_model')

# Model Explanation (SHAP)

In [None]:
# Load JS visualization code to notebook
shap.initjs()

# Prepare variables: stack the train and test rows back into one data set.
# pd.concat is used instead of DataFrame.append / Series.append, which were
# deprecated in pandas 1.4 and removed in pandas 2.0.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
pool = Pool(X, y, cat_features)

# Create shap_table, reason_table, top_reasons, stat_table
shap_table = get_shapley_value_table(
    input_table, target_name, model, X, y, summary_plot=False
)
reason_table, top_reasons = get_reasons(
    shap_table,
    input_table,
    target_name,
    amount=config.cause_number_to_identify,
    datamodel=None,
)
stat_table = get_stat_table(input_table, target_name)

## Save/Load Data

In [None]:
# # Save DataFrame
# input_table.to_pickle('input_table.pkl')

# # Load DataFrame
# input_table = pd.read_pickle('input_table.pkl')

In [None]:
# # Save data to CSV files
# Save output tables as .csv files
# shap_table.round(4).to_csv('shap_table.csv')
# reason_table.to_csv('reason_table.csv')
# top_reasons.to_csv('top_reasons.csv')
# stat_table.to_csv('stat_table.csv')
# selected_features_df = pd.DataFrame(selected_features)
# selected_features_df.to_csv('selected_features.csv')

# END