# Group 23: Final tool

*The final tool for the process mining project.*

This tool contains five models:
1. Naive predictor: Type
2. Naive predictor: Time
3. Random Forest: Type
4. Neural Network: Time
5. Neural Network: Type

# 1. Setup

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import *
import time
# next command ensures that plots appear inside the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns  # also improves the look of plots
import scipy
sns.set()
plt.rcParams['figure.figsize'] = 10, 5  # default hor./vert. size of plots, in inches
plt.rcParams['lines.markeredgewidth'] = 1  # to fix issue with seaborn box plots; needed after import seaborn

# Init Pandas settings
pd.set_option("mode.chained_assignment", None) # to remove false positive chained assignment warnings
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", None)

# Start the timer to measure how long the notebook takes
start1 = time.time()

In [None]:
# Load the event log; the first CSV column is used as the row index.
df = pd.read_csv("BPI_Challenge_2012.xes.gz_UNPACKED.csv", index_col=[0])
df.head()

In [None]:
# Remove the resource column, then discard every row that still has a
# missing value.
df.drop(columns=['org:resource'], inplace=True)
df.dropna(inplace=True)
df['datetime'] = pd.to_datetime(df['time:timestamp'], errors='coerce', utc=True)

# Pair every row with the row directly below it (event, case and timestamp).
# The last row has no successor; its next_case is filled with 0.
df['next_event'] = df['concept:name'].shift(-1)
df['next_case'] = df['case:concept:name'].shift(-1).fillna(0)
df['next_datetime'] = df['datetime'].shift(-1)

# Time until the next row as a plain float number of seconds.
# .dt.total_seconds() is used instead of .astype('timedelta64[s]'): on
# pandas >= 2 the astype call keeps a timedelta dtype, which breaks the
# numeric sums, logs and tensors built from this column later on.
df['timedelta'] = (df['next_datetime'] - df['datetime']).dt.total_seconds()

In [None]:
# Positional 70/30 split: the first 70% of the rows form the training set,
# the remaining rows the test set.
trainlen = int(len(df)*0.7)
dftrain, dftest = df.iloc[:trainlen], df.iloc[trainlen:]

# 2. Naive baseline
## 2.1 Type prediction

In [None]:
# Get the most common next event type for the given ev_type
# This function replaces block 10 til 15 (10: "def nth_most_common(w, n):", 15: "next_common_train")
def get_most_common_next_type(n, ev_type):
    """Return the event type that most often directly follows ``ev_type``.

    A row contributes its successor only when the row positioned directly
    after it belongs to the same case, so transitions across case boundaries
    are ignored.

    Args:
        n: DataFrame with ``concept:name`` and ``case:concept:name`` columns,
            ordered so that the events of a case are adjacent.
        ev_type: Event type whose most common successor is wanted.

    Returns:
        The most frequent same-case successor of ``ev_type``, or ``None``
        when ``ev_type`` never has a same-case successor in ``n``.
    """
    names = n['concept:name']
    cases = n['case:concept:name']

    # shift(-1) pairs each row with the next row by POSITION, so the result
    # does not depend on the index labels (which have gaps after dropna()).
    # The last row is paired with NaN and therefore never matches.
    next_names = names.shift(-1)
    next_cases = cases.shift(-1)

    has_same_case_successor = (names == ev_type) & (cases == next_cases)
    successors = next_names[has_same_case_successor]
    if successors.empty:
        return None

    # value_counts() sorts by frequency; idxmax() picks the top label.
    return successors.value_counts().idxmax()


In [None]:
event_types = df['concept:name'].unique()

# Lookup table: event type -> its most common same-case successor, learned
# from the training slice only.
most_common_next_types = {
    ev_type: get_most_common_next_type(dftrain, ev_type=ev_type)
    for ev_type in event_types
}
    

## 2.2 Time prediction

In [None]:
dfs = pd.DataFrame(dftrain[['next_case', 'case:concept:name', 'concept:name', 'timedelta' ]])

# Keep only transitions that stay inside one case, then average the waiting
# time per event type in a single vectorised pass.
same_case = dfs[dfs['case:concept:name'] == dfs['next_case']]
mean_seconds = same_case.groupby('concept:name')['timedelta'].mean()

# Event types without any same-case successor in the training slice get NaN
# instead of raising ZeroDivisionError (the former a/c with c == 0).
time_till_next = {
    ev_type: mean_seconds.get(ev_type, float('nan'))
    for ev_type in event_types
}

time_till_next

## 2.3 Compiling baseline into dataframe

In [None]:
# Look up both naive predictions for every test row by its current event type.
dftest['timedelta_baseline'] = dftest['concept:name'].map(time_till_next)
dftest['next_event_baseline'] = dftest['concept:name'].map(most_common_next_types)

# 3. Random Forest: Type prediction

## 3.1 Data preprocessing

In [None]:
# Load the Random Forest input table from the project's GitHub repository.
data_rft = pd.read_csv('https://raw.githubusercontent.com/NickSot/process_mining/main/merged_files/bpi_2012_rft.csv')
data_rft.drop('Unnamed: 0.1', axis=1, inplace=True)
# Missing next_case values become 0 so the column can be cast to int.
data_rft['next_case'] = data_rft['next_case'].fillna(0).astype(int)
# Combined "<lifecycle> <event>" label, one-hot encoded further below.
data_rft['lifecycle + event'] = data_rft['lifecycle:transition'] + ' ' + data_rft['event']

In [None]:
# Columns that receive a trailing underscore.
_suffixed_columns = [
    'A_SUBMITTED',
    'A_PARTLYSUBMITTED',
    'A_PREACCEPTED',
    'W_Completeren aanvraag',
    'A_ACCEPTED',
    'O_SELECTED',
    'A_FINALIZED',
    'O_CREATED',
    'O_SENT',
    'W_Nabellen offertes',
    'O_SENT_BACK',
    'W_Valideren aanvraag',
    'A_REGISTERED',
    'A_APPROVED',
    'O_ACCEPTED',
    'A_ACTIVATED',
    'O_CANCELLED',
    'A_DECLINED',
    'A_CANCELLED',
    'W_Afhandelen leads',
    'W_Wijzigen contractgegevens',
    'O_DECLINED',
    'W_Nabellen incomplete dossiers',
]
column_renames = {name: name + '_' for name in _suffixed_columns}

# NOTE(review): the original dict literal listed 'W_Beoordelen fraude' twice;
# the later entry (mapping the name to itself) won, so this column keeps its
# name. That effective behaviour is preserved here - confirm it is intended.
column_renames['W_Beoordelen fraude'] = 'W_Beoordelen fraude'

data_rft.rename(columns=column_renames, inplace=True)

In [None]:
# Positional 70/30 split; rows with missing values are removed afterwards,
# so the two parts no longer add up to exactly len(data_rft).
split_at = int(len(data_rft) * 0.7)
train = data_rft[:split_at].dropna()
test = data_rft[split_at:].dropna()
# NOTE(review): this name shadows the builtin all(); kept for compatibility.
all = data_rft.dropna()

In [None]:
# One-hot encode the event, the lifecycle transition and their combination
# so the model can use every category as a separate feature.
# NOTE(review): train and test are encoded independently, so a category
# missing from one split yields different column sets - confirm both splits
# contain the same categories.
event_train = pd.get_dummies(train['event'])
event_test = pd.get_dummies(test['event'])

lifecycle_train_ = pd.get_dummies(train['lifecycle:transition'])
lifecycle_test_ = pd.get_dummies(test['lifecycle:transition'])

lifecycle_train = pd.get_dummies(train['lifecycle + event'])
lifecycle_test = pd.get_dummies(test['lifecycle + event'])

lifecycle_train_.head()

In [None]:
# Append the one-hot blocks column-wise in three steps:
# event dummies, lifecycle dummies, then lifecycle+event dummies.
df_train = pd.concat([train, event_train], axis=1)
df_test = pd.concat([test, event_test], axis=1)

df_train_1 = pd.concat([df_train, lifecycle_train_], axis=1)
df_test_1 = pd.concat([df_test, lifecycle_test_], axis=1)

df_train_2 = pd.concat([df_train_1, lifecycle_train], axis=1)
df_test_2 = pd.concat([df_test_1, lifecycle_test], axis=1)
# Displays the last rows of the first-stage test frame (event dummies only).
df_test.tail()

In [None]:
# Remove the columns the model must not see; the target is 'next_event'.
from sklearn.model_selection import train_test_split

# The same column list is removed from both splits.
dropped_columns = ['Unnamed: 0', 'lifecycle:transition', 'lifecycle + event', 'event', 'time:timestamp', 'case:REG_DATE', 'next_event', 'W_Valideren aanvraag', 'W_Wijzigen contractgegevens_', 'COMPLETE O_DECLINED', 'START', 'next_case']

X_train_ = df_train_2.drop(columns=dropped_columns)
X_test = df_test_2.drop(columns=dropped_columns)

y_train = df_train_2['next_event']
y_test = df_test_2['next_event']

In [None]:
# Scaling all features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn mean/variance from the training data only and reuse those statistics
# for the test data. Re-fitting on the test set (fit_transform) would scale
# the two splits differently and leak test statistics into the evaluation.
X_train = sc.fit_transform(X_train_)
X_test = sc.transform(X_test)

## 3.2 Training the RF model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed random_state so repeated runs build the same forest.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
# Predicted next-event label for every test row.
rfc_pred = rfc.predict(X_test)

In [None]:
# Store the Random Forest predictions next to the rows they belong to.
test['next_event_rfc_pred'] = rfc_pred
# pd.concat(axis=1) aligns the two frames on their index labels.
# NOTE(review): dftest comes from the local CSV and test from data_rft -
# confirm that both use the same index labels for the same events.
dff = pd.concat([dftest, test[['next_event_rfc_pred']]], axis=1)
dff.head()

## 3.3 Visualizing model performance
### 3.3.1 Confusion matrices

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

labels = list(np.unique(y_test))
cm = confusion_matrix(y_test, rfc_pred, labels=labels)

# Row-normalise: every row (actual class) is divided by its own total.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = cm.astype('float') / row_totals

fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(cmn, annot=True, fmt='.2f', cmap='Blues', ax=ax)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show(block=False)

In [None]:
# Same confusion matrix as above, wrapped in a DataFrame so the heatmap axes
# show the class labels, and saved to disk.
labels = list(np.unique(y_test))
a =  confusion_matrix(y_test, rfc_pred, labels=labels)

cmd = pd.DataFrame(a, index=labels, columns=labels)

# Divide every row (actual class) by its row total.
cmdn = cmd.div(cmd.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(20,17)) 
sns.heatmap(cmdn, annot=True, cmap='Blues', ax=ax)
plt.xlabel('Predicted')
plt.ylabel('actual')
fig.savefig('norm_confusion_matrix.png', dpi=300)

In [None]:
# Per-class precision/recall/F1, returned as a nested dict (output_dict=True).
classification_report(y_test, rfc_pred, output_dict = True)

### 3.3.2 Feature importance

In [None]:
# Feature importances of the fitted forest "rfc", plus their standard
# deviation across the individual trees (used as error bars below).
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)

In [None]:
# Bar chart of the importances, labelled with the feature names.
X_train_col = X_train_.columns
forest_importances = pd.Series(importances)
feature_names = pd.Series(X_train_col)
# Column 0 holds the importance, column 1 the matching feature name.
df_importances = pd.concat([forest_importances, feature_names], axis=1)

fig, ax = plt.subplots()
df_importances.plot.bar(x=1, y=0, yerr=std, ax=ax, figsize=(18, 10))
ax.set_ylabel("Mean decrease in impurity")
ax.set_title("Feature importances using MDI")

# 4. Neural Network: Time prediction

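For the time prediction, two neural networks have been constructed. The first makes a rough estimate for every event; events whose rough estimate falls below the median are then re-predicted by the second network, while the remaining events keep the estimate of the first network.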

This approach was chosen because the original single-network implementation performed very poorly on small values. Splitting on the rough estimate allows us to handle the events with low estimated values separately and make more accurate predictions for them.

## 4.1 Preprocessing

In [None]:
import tensorflow as tf
np.random.seed(1)

# Reload the raw log: the baseline section modified its own copy of df.
df = pd.read_csv('BPI_Challenge_2012.xes.gz_UNPACKED.csv', index_col=[0])
start = time.time()
df.drop(columns=['org:resource'], inplace=True)

# Context features taken from neighbouring rows (shift > 0 looks back,
# shift < 0 looks ahead).
df['next_event'] = df['concept:name'].shift(-1)
df['previous_event'] = df['concept:name'].shift(1)
df['pp_event'] = df['concept:name'].shift(2)
df['ppp_event'] = df['concept:name'].shift(3)
df['p_lifecycle:transition'] = df['lifecycle:transition'].shift(1)
df['next_case'] = df['case:concept:name'].shift(-1).fillna(0)
df['next_case1'] = df['next_case'].shift(-1).fillna(0)

# Calendar features of the event timestamp.
df['datetime'] = pd.to_datetime(df['time:timestamp'], errors='coerce', utc=True)
df['weekday'] = df['datetime'].dt.weekday
df['previous_weekday'] = df['weekday'].shift(1)
df['hour'] =  df['datetime'].dt.hour
df['minute'] =  df['datetime'].dt.minute
df['month'] =  df['datetime'].dt.month

# Target: seconds until the next row, as a plain float.
# .dt.total_seconds() replaces .astype('timedelta64[s]'), which keeps a
# timedelta dtype on pandas >= 2 and cannot be converted to a tensor.
df['next_datetime'] = df['datetime'].shift(-1)
df['timedelta'] = (df['next_datetime'] - df['datetime']).dt.total_seconds()

# Keep only rows whose successor belongs to the same case, then add the
# three preceding waiting times (taken after filtering).
df = df[(df['case:concept:name'].astype(int)) == (df['next_case'].astype(int))]
df['previous_timedelta'] = df['timedelta'].shift(1)
df['pp_timedelta'] = df['timedelta'].shift(2)
df['ppp_timedelta'] = df['timedelta'].shift(3)
print(len(df))
df.dropna(inplace=True)

# Numeric inputs are used as-is; categorical inputs are one-hot encoded.
dfxx = df[['hour', 'minute', 'previous_timedelta', 'pp_timedelta', 'ppp_timedelta']]
dfx = df[['concept:name', 'previous_event', 'lifecycle:transition', 'pp_event', 'p_lifecycle:transition', 'weekday']]
dfy = (df[['timedelta']])
valx = dfx
valy = dfy
# dtype=float keeps the feature matrix purely numeric; the bool dummies that
# pandas >= 2 produces by default would turn it into an object array.
dummies = pd.get_dummies(dfx.astype('str'), dtype=float)
dfx = dfxx.join(dummies)

# Positional 70/30 split.
trainlen = int(len(df)*0.7)
x_train, y_train = dfx[:trainlen], dfy[:trainlen]
x_test, y_test = dfx[trainlen:], dfy[trainlen:]

# DataFrame copies (features + target) that later receive the predictions.
df2 = x_train.join(y_train)
dftest = x_test.join(y_test)
valy = y_test
columns = y_train.columns

x_train = tf.convert_to_tensor(x_train)
y_train = tf.convert_to_tensor(y_train)
x_test = tf.convert_to_tensor(x_test)
y_test = tf.convert_to_tensor(y_test)
print(len(x_test))
df.iloc[-10:]

In [None]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

## 4.2 Creation of the first estimator model

In [None]:
# "Splitter" network: produces the rough time estimate that is later used to
# split the data into a short and a long segment.
# NOTE: lrelu is created here but not referenced by any layer in this cell.
lrelu = tf.keras.layers.LeakyReLU(alpha=0.3)
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(x_train.shape[1],1)),
  tf.keras.layers.BatchNormalization(),  
  tf.keras.layers.Dense(228, activation='swish', activity_regularizer=tf.keras.regularizers.l2(0.01)),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(128, activation='swish', activity_regularizer=tf.keras.regularizers.l2(0.01)),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(48, activation='swish', activity_regularizer=tf.keras.regularizers.l2(0.01)),
  tf.keras.layers.BatchNormalization(), 
  tf.keras.layers.Dense(48, activation='relu'),
  # Single linear output unit: the predicted time in the unit of y_train.
  tf.keras.layers.Dense(1, activation='linear')
])
model.summary()
loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")

ftrl = tf.keras.optimizers.Ftrl(
    learning_rate=0.1,
    learning_rate_power=-0.5,
    initial_accumulator_value=0.1,
    l1_regularization_strength=0.01,
    l2_regularization_strength=0.01,
    name="Ftrl",
    l2_shrinkage_regularization_strength=0.01,
    beta=0.1)

msle = tf.keras.losses.MeanSquaredLogarithmicError()
def loss_comb(y_true, y_pred):
    # Mean squared logarithmic error plus a very small share of plain MSE.
    return 1*msle(y_true, y_pred) + 0.00000001*(loss_fn(y_true, y_pred))

# This model is trained on plain MSE (loss_fn); loss_comb is defined above
# but not passed to this compile call.
model.compile(optimizer=ftrl,
              loss=loss_fn,
              metrics=['mean_absolute_error'])

## 4.3 Training the estimator model

In [None]:
# Train the splitter network on the full training set (no validation split).
model.fit(x_train, y_train, epochs=10, batch_size=512, validation_split=0.0, shuffle=True)

## 4.4 Evaluating the estimator model

In [None]:
# Loss and mean absolute error of the splitter network on the test set.
model.evaluate(x_test,  y_test, verbose=2)

### 4.4.1 Displaying predictions

In [None]:
# Forward pass over the whole test set, converted to a NumPy array.
predictions = model(x_test).numpy()

In [None]:
# Rough estimates for both splits; they drive the short/long segmentation.
predictions_train = model(x_train).numpy()
predictions_test = model(x_test).numpy()

In [None]:
# Store the rough estimates next to the features and the true timedelta.
df2['predictions_train'] = predictions_train
dftest['predictions_test'] = predictions_test

In [None]:
# Split train and test into a "short" and a "long" segment at the p-th
# percentile of the splitter network's own predictions.
p = 50
train_cut = np.nanpercentile(df2['predictions_train'], p)
test_cut = np.nanpercentile(dftest['predictions_test'], p)

df2_short = df2[df2['predictions_train'] < train_cut]
df2_long = df2[df2['predictions_train'] >= train_cut]
dftest_long = dftest[dftest['predictions_test'] >= test_cut]
dftest_short = dftest[dftest['predictions_test'] < test_cut]

print(test_cut)
print(train_cut)
df2.iloc[:10]

In [None]:
# Training tensors for the short model: features and target of the rows the
# splitter network placed in the short segment.
x = df2_short.drop(columns=['timedelta', 'predictions_train'])
y = df2_short[['timedelta']]
columns = y.columns
x_train = tf.convert_to_tensor(x)
y_train = tf.convert_to_tensor(y)
x_train.shape

## 4.5 Creation of second neural network model

In [None]:
# Second network, trained only on the short segment.
model_short = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(x_train.shape[1],1)),
  tf.keras.layers.BatchNormalization(),  
  tf.keras.layers.Dense(128, activation='relu', activity_regularizer=tf.keras.regularizers.l2(0.01)),
  tf.keras.layers.Dense(48, activation='relu', activity_regularizer=tf.keras.regularizers.l2(0.01)),
  tf.keras.layers.Dense(48, activation='relu'),
  tf.keras.layers.Dense(1, activation='linear')
])

model_short.summary()

loss_fn = tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error")

ftrl = tf.keras.optimizers.Ftrl(
    learning_rate=0.1,
    learning_rate_power=-0.7,
    initial_accumulator_value=0.1,
    l1_regularization_strength=0.01,
    l2_regularization_strength=0.01,
    name="Ftrl",
    l2_shrinkage_regularization_strength=0.01,
    beta=0.1)

# Unlike the first network, this one is trained on loss_comb (the MSLE-based
# loss defined in the estimator-model cell).
model_short.compile(optimizer=ftrl,
              loss=loss_comb,
              metrics=['mean_absolute_error'])

## 4.6 Training the second model

In [None]:
# Train the short model; 10% of the short training rows are held out for validation.
model_short.fit(x_train, y_train, epochs=5, batch_size=512, validation_split=0.1, shuffle=True)

In [None]:
# Predict the short TEST segment with the short model.
# NOTE: x_train / y_train are reused as variable names here although they
# now hold test data.
x = dftest_short.drop(columns=['timedelta', 'predictions_test'])
y = pd.DataFrame(dftest_short['timedelta'])
x_train, y_train = x, y
#print(x_train.iloc[:10])
columns = y_train.columns
x_train = tf.convert_to_tensor(x_train)
y_train = tf.convert_to_tensor(y_train)
predictions_test_short = model_short(x_train).numpy()
dftest_short['predictions'] = predictions_test_short
dftest_short.iloc[:10]

In [None]:
# Predict the long TEST segment. This uses `model` (the first network), not
# model_short, so 'predictions' is computed by the same network as
# 'predictions_test' for these rows.
# NOTE: x_train / y_train again hold test data here.
x = dftest_long.drop(columns=['timedelta', 'predictions_test'])
y = pd.DataFrame(dftest_long['timedelta'])
x_train, y_train = x, y
columns = y_train.columns
x_train = tf.convert_to_tensor(x_train)
y_train = tf.convert_to_tensor(y_train)
predictions_test_long = model(x_train).numpy()
dftest_long['predictions'] = predictions_test_long
dftest_long.iloc[:10]

## 4.7 Evaluating the second model

In [None]:
def _mean_sq_log2_error(actual, predicted):
    # Mean squared difference of log2(value + 1).
    return np.square(np.log2(actual + 1.) - np.log2(predicted + 1.)).mean()


def _mean_sq_error(actual, predicted):
    # Mean squared difference of the raw values.
    return np.square(abs(actual - predicted)).mean()


# "old" = rough estimate of the first network, "new" = final prediction.
short_actual = dftest_short['timedelta']
long_actual = dftest_long['timedelta']

print('short old test log', _mean_sq_log2_error(short_actual, dftest_short['predictions_test']))
print('short new test log',_mean_sq_log2_error(short_actual, dftest_short['predictions']), '\n')
print('short old test mse',_mean_sq_error(short_actual, dftest_short['predictions_test']))
print('short new test mse',_mean_sq_error(short_actual, dftest_short['predictions']), '\n')
print('long old test mse', _mean_sq_error(long_actual, dftest_long['predictions_test']))
print('long new test mse', _mean_sq_error(long_actual, dftest_long['predictions']), '\n')
print('long old test log', _mean_sq_log2_error(long_actual, dftest_long['predictions_test']))
print('long new test log', _mean_sq_log2_error(long_actual, dftest_long['predictions']))

In [None]:
# Recombine both segments and restore the original row order via the index.
dftest_all = pd.concat([dftest_long, dftest_short], sort=False).sort_index()
# Absolute error of the final prediction per row.
dftest_all['error'] = abs(dftest_all['predictions'] - dftest_all['timedelta'])
#dftest_all.drop(columns=['predictions_test'], inplace=True)
dftest_all[-10:]
#dff[-10:]
# NOTE(review): dfff is not referenced again in the cells visible here; the
# next cell builds dffj from dff instead.
dfff = pd.concat([dff, dftest_all[['predictions']]], axis=1)

In [None]:
# Left-join the NN time predictions onto the baseline/RF frame by index;
# rows of dff without a matching prediction get NaN.
dffj = dff.join(pd.DataFrame(dftest_all['predictions']))
dffj

# 5. Neural Network: Type prediction

In [None]:
# Reload the raw log for the type-prediction network.
df = pd.read_csv('BPI_Challenge_2012.xes.gz_UNPACKED.csv', index_col=[0])
start = time.time()
df.drop(columns=['org:resource'], inplace=True)
# Target (next_event) and context features taken from neighbouring rows.
df['next_event'] = df['concept:name'].shift(-1)
df['next_case'] = df['case:concept:name'].shift(-1)
df['previous_event'] = df['concept:name'].shift(1)
df['pp_event'] = df['concept:name'].shift(2)
df['ppp_event'] = df['concept:name'].shift(3)
df['p_lifecycle:transition'] = df['lifecycle:transition'].shift(1)
#df = df[df['case:concept:name'] == df['next_case']]
df['datetime'] = pd.to_datetime(df['time:timestamp'], errors='coerce', utc=True)
# Calendar columns are computed but not part of dfx below (the join that
# would add them is commented out).
df['day'] = df['datetime'].dt.day
df['hour'] =  df['datetime'].dt.hour
df['minute'] =  df['datetime'].dt.minute
df['month'] =  df['datetime'].dt.month
dfx = df[['concept:name', 'previous_event', 'lifecycle:transition', 'pp_event', 'ppp_event', 'p_lifecycle:transition']]
dfy = df[['next_event']]
valy = dfy
# One-hot encode inputs and target; the target columns are named
# 'next_event_<event type>'.
dummies = pd.get_dummies(dfx)
dfy = pd.get_dummies(dfy)
dfx = dummies
#dfx = dummies.join(df[['day, hour', 'minute', 'month']])
# Positional 70/30 split.
trainlen = int(len(df)*0.7)
# dfx.iloc[:10]
# dfx = dfx.to_numpy()
# dfy = dfy.to_numpy()
x_train, y_train = dfx[:trainlen], dfy[:trainlen]
x_test, y_test = dfx[trainlen:], dfy[trainlen:]
# DataFrame copy of the test split; receives the NN predictions later.
dftest2 = x_test.join(y_test)
valy = y_test
valx = x_test
#print(x_train.iloc[:10])
# Target column names, used to map output units back to event types.
columns = y_train.columns
x_train = tf.convert_to_tensor(x_train)
y_train = tf.convert_to_tensor(y_train)
x_test = tf.convert_to_tensor(x_test)
y_test = tf.convert_to_tensor(y_test)
print(len(valx))

## 5.1 Creation of the neural network

In [None]:
# One output unit per one-hot target column instead of a hard-coded 24.
n_classes = y_train.shape[1]

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(x_train.shape[1],1)),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dropout(0.1),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Dense(48, activation='relu'),
  # Softmax turns the outputs into class probabilities.
  tf.keras.layers.Dense(n_classes, activation='softmax')])

# Alternative optimizers kept for experimentation; the compile call below
# uses the string 'adam'.
ftrl = tf.keras.optimizers.Ftrl(
    learning_rate=0.1,
    learning_rate_power=-0.5,
    initial_accumulator_value=0.1,
    l1_regularization_strength=0.01,
    l2_regularization_strength=0.01,
    name="Ftrl",
    l2_shrinkage_regularization_strength=0.01,
    beta=0.01)
nadam = tf.keras.optimizers.Nadam(
    learning_rate=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name="Nadam")

# The target is one-hot over mutually exclusive classes and the network
# already outputs probabilities, so categorical cross-entropy on
# probabilities is the matching loss. The former
# BinaryCrossentropy(from_logits=True) treated every class as an independent
# binary problem and treated the softmax outputs as logits; it also made the
# 'accuracy' metric resolve to a per-unit binary accuracy.
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

model.summary()

## 5.2 Training the model

In [None]:
# Train the type-prediction network for 6 epochs on shuffled mini-batches.
model.fit(x_train, y_train, epochs=6, batch_size=512, shuffle=True)

## 5.3 Evaluating the model

In [None]:
# Loss and accuracy of the type-prediction network on the test set.
model.evaluate(x_test,  y_test, verbose=2)

In [None]:
predictions = model(x_test).numpy()
# The last layer of the model is already a softmax, so its outputs are class
# probabilities. Applying softmax a second time would flatten them (the
# arg-max stays the same, but the values are no longer the model's
# probabilities), so they are used as they are.
data = predictions
data

In [None]:
# One probability column per target class; the column with the highest
# probability is the predicted class of a row.
df = pd.DataFrame(data, columns=columns)
predictions = df.idxmax(axis=1).to_frame(name='next_event_NN_pred')
# Assigned by position (.values), because df has a fresh 0..n-1 index.
dftest2['next_event_NN_pred'] = predictions['next_event_NN_pred'].values
dftest2

# 6. Combining outputs and exporting CSV

In [None]:
# Add the NN type predictions to the combined frame (joined on the index).
dfffjj = dffj.join(dftest2['next_event_NN_pred'])
# dfffjj2 is a second name for the same frame, not a copy.
dfffjj2 = dfffjj
# Drop the first 11 characters of every label - presumably the 'next_event_'
# prefix that get_dummies put on the target columns.
dfffjj['next_event_NN_pred'] = dfffjj2['next_event_NN_pred'].str[11:]


In [None]:
# Keep only rows whose successor belongs to the same case.
dfffjj3 = dfffjj[(dfffjj['case:concept:name']) ==(dfffjj['next_case'])]
dfffjj3.rename(columns={'predictions': 'timedelta_NN_pred'}, inplace=True)
# Write all model outputs to a single CSV.
dfffjj3.to_csv('outputs.csv')

# 7. Visualizing the results

This section is mainly for producing visualizations for our poster deliverable.

In [None]:
# Reload the exported results; from here on `df` refers to this frame.
df = pd.read_csv('outputs.csv' , index_col=[0])
df.head()

In [None]:
def _format_hms(total_seconds):
    """Format a whole number of seconds as '<h>H<m>m<s>s'."""
    sign = '-' if total_seconds < 0 else ''
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{sign}{hours}H{minutes}m{seconds}s"


def hourstomin(time):
    """Format a duration given in hours as '<h>H<m>m<s>s'.

    Fractions of a second are truncated. Note that the parameter name hides
    the ``time`` module inside this function.

    Args:
        time: Duration in hours (int or float).

    Returns:
        The formatted duration, e.g. ``'1H30m0s'`` for ``1.5``.
    """
    return _format_hms(int(time * 3600))


def sectime(time):
    """Format a duration given in seconds as '<h>H<m>m<s>s'.

    The seconds are split directly instead of being converted to hours and
    back, so floating-point rounding cannot drop a second. Fractions of a
    second are truncated.

    Args:
        time: Duration in seconds (int or float).

    Returns:
        The formatted duration, e.g. ``'1H1m1s'`` for ``3661``.
    """
    return _format_hms(int(time))


def rmse(c1, c2):
    """Return the root of the mean squared difference between c1 and c2."""
    squared_diff = np.power(c1 - c2, 2)
    return np.power(squared_diff.mean(), 0.5)
    
def MARE(c1, c2):
    """Return 2 raised to the mean absolute difference of log2(value + 1).

    A result of 1.0 means both inputs are identical; 2.0 means the values
    differ by one log2 step on average.
    """
    log_gap = np.log2(c1 + 1) - np.log2(c2 + 1.)
    mean_gap = abs(log_gap).mean()
    return np.power(2, mean_gap)

def MAE(c1, c2):
    """Return the mean absolute difference between c1 and c2."""
    deviation = c1 - c2
    return abs(deviation).mean()

def r2(c1, c2):
    """Return scipy's Pearson correlation result for c1 and c2.

    Despite the name this is the Pearson correlation (statistic and
    p-value), not a coefficient of determination.
    """
    correlation = scipy.stats.pearsonr(c1, c2)
    return correlation
def logr2(c1, c2):
    """Return scipy's Pearson correlation result for log10(c1 + 1) and log10(c2 + 1)."""
    log_c1 = np.log10(c1+1)
    log_c2 = np.log10(c2+1)
    return scipy.stats.pearsonr(log_c1, log_c2)

def accuracy(c1, c2, df=None):
    """Return the percentage of rows where c1 equals c2.

    Args:
        c1: Series of actual values.
        c2: Series of predicted values, comparable element-wise with c1.
        df: Optional DataFrame the two Series belong to. When given, its
            length is the denominator and the match mask selects its rows
            (the original behaviour). When omitted, the length of the
            comparison itself is used.

    Returns:
        The share of matching rows in percent, or NaN when there are no rows
        (instead of raising ZeroDivisionError).
    """
    matches = c1 == c2
    if df is None:
        total = len(matches)
        hits = int(matches.sum())
    else:
        total = len(df)
        hits = len(df[matches])

    if total == 0:
        return float('nan')
    return hits / total * 100

In [None]:
# Short aliases: td* = time deltas, ne* = next-event labels;
# nn = neural network, bl = naive baseline, rfc = random forest.
td = df['timedelta']
tdnn = df['timedelta_NN_pred']
tdbl = df['timedelta_baseline']
ne = df['next_event']
nenn = df['next_event_NN_pred']
nerfc = df['next_event_rfc_pred']
nebl = df['next_event_baseline']

# Time prediction metrics.
print('mean absolute relative error NN:', MARE(tdnn, td))
print('mean absolute relative error baseline: ', MARE(tdbl, td))
print('mean absolute error NN: ', sectime(MAE(tdnn, td)))
print('mean absolute error baseline: ',sectime(MAE(tdbl, td)))
print('r2 score NN: ', r2(td, tdnn))
print('r2 score baseline: ', r2(td, tdbl))
print('log r2 score NN: ', logr2(td, tdnn))
print('log r2 score baseline: ', logr2(td, tdbl))

# Type prediction metrics.
print('accuracy random forest', accuracy(ne, nerfc, df))
print('accuracy NN',accuracy(ne, nenn, df))
print('accuracy baseline',accuracy(ne, nebl, df))

#sectime(rmse(df['timedelta_baseline'], df['timedelta']))

In [None]:
#mean absolute error

# "new" = final prediction, "old" = rough estimate of the first network.
actual = dftest_all['timedelta']
diff_new = actual - dftest_all['predictions']
diff_old = actual - dftest_all['predictions_test']

rmse_new = np.power(np.square(diff_new).mean(), 0.5)
rmse_old = np.power(np.square(diff_old).mean(), 0.5)

log_actual = np.log2(actual + 1.)
log_gap_new = abs(log_actual - np.log2(dftest_all['predictions'] + 1.))
log_gap_old = abs(log_actual - np.log2(dftest_all['predictions_test'] + 1.))

print('root mean square eror new:',hourstomin(rmse_new/3600))
print('root mean square eror old:',hourstomin(rmse_old/3600))
print('mean absolute eror new:',hourstomin((abs(diff_new)).mean()/3600))
print('mean absolute eror old:',hourstomin((abs(diff_old)).mean()/3600))
print('mean absolute relative error new: ', np.power(2, log_gap_new.mean()))
print('mean absolute relative error old: ', np.power(2, log_gap_old.mean()))
print('rmse ratio: ', (rmse_new / np.std(actual)))

In [None]:

# Overlaid histograms of log10(seconds + 1):
# red = NN prediction, green = actual, purple = naive baseline.
fig = np.log10(dftest_all['predictions']+1).hist(bins=50,  range=[0,6], alpha=0.8, figsize=(12,8), color='red')
fig = np.log10(dftest_all['timedelta']+1).hist(bins=50,  range=[0,6], alpha=0.5, color='green')
# The baseline gets the same +1 offset as the other two series, so all three
# share one scale and a baseline of 0 seconds maps to 0 instead of -inf.
fig = np.log10(tdbl+1).hist(bins=50,  range=[0,6], alpha=0.5, color='purple')
fig.figure.savefig('histograms.png', dpi=300)

In [None]:
# Actual vs. predicted time on a log10(value + 1) scale, saved to disk.
log_scaled = np.log10(dftest_all+1)
scatter_ax = log_scaled.plot.scatter(x='timedelta', y='predictions',alpha=0.1, figsize=(12,12), grid=True, ylim=(-.5,7), xlim=(-.5,7))
scatter_ax.figure.savefig('logscatter.png', dpi=300)


In [None]:
# Actual time vs. absolute error, both on a log10(value + 1) scale.
np.log10(dftest_all+1).plot.scatter(x='timedelta', y='error',alpha=0.1, figsize=(15,10), grid=True)

In [None]:
# Log-scaled copies of target and prediction for the regression plot.
dftest_all['timedelta_log'] = np.log10(dftest_all['timedelta']+1)
dftest_all['predictions_log'] = np.log10(dftest_all['predictions']+1)
# Changes the seaborn default figure size to 8x8 inches.
sns.set(rc = {'figure.figsize':(8,8)})
ax = sns.regplot(x='timedelta_log', y='predictions_log', data=dftest_all, scatter_kws={'alpha':0.01}, fit_reg=True)


In [None]:
# Recomputes the same two log columns as the previous cell and draws the
# regression plot again.
dftest_all['timedelta_log'] = np.log10(dftest_all['timedelta']+1)
dftest_all['predictions_log'] = np.log10(dftest_all['predictions']+1)
ax = sns.regplot(x='timedelta_log', y='predictions_log', data=dftest_all, scatter_kws={'alpha':0.01}, fit_reg=True)

In [None]:
# Pearson correlation on the log scale and on the raw scale.
# The results get their own names: assigning them to `logr2` / `r2` would
# overwrite the helper functions of the same name, so re-running the metrics
# cell afterwards would fail with "'tuple' object is not callable".
logr2_result = stats.pearsonr(x=dftest_all['timedelta_log'], y=dftest_all['predictions_log'])
r2_result = stats.pearsonr(x=dftest_all['timedelta'], y=dftest_all['predictions'])
print('logr2 = ', logr2_result)
print('r2score = ', r2_result)

In [None]:
# Stop the timer started in the setup cell and report the total runtime.
end1 = time.time()
print(f"The runtime of the whole tool is {end1 - start1} seconds.")