# Import tools

In [21]:
import numpy as np
import pandas as pd
import time
from decision_tree import DecisionTreeClassifier, DecisionTreeRegressor

# Config variables

In [22]:
# --- Input data -------------------------------------------------------------
# Relative locations of the training and testing CSV files.
training_data_path = '../datasets/bpi_2012_train_eng.csv'
testing_data_path = '../datasets/bpi_2012_test_eng.csv'

# --- Run size and tree hyper-parameters -------------------------------------
n_samples = 1_000           # number of leading rows kept from each dataset (before dropna)
n_trees = 3                 # not referenced by any cell visible in this notebook
min_split = max_depth = 12  # forwarded as min_samples_split / max_depth to both trees


# Load the data

In [23]:
# Rough timings noted by the original author ("unloaded").
# Presumably "n_samples : runtime (accuracy)", with '.' used as the thousands
# separator -- TODO confirm.
#    1.000 :  4 seconds (30%)
#    1.500 : 10 seconds (34%)
#    2.000 : 13 seconds (39%)
#    3.000 : 30 seconds (40%)
#   10.000 :  6 minutes (38%)

# One pipeline per frame: read the CSV, index by event, strip the leftover
# 'Unnamed: 0' column, remove the columns the model is not allowed to see,
# keep the first n_samples rows, remove the absolute timestamps, and finally
# discard rows that still contain missing values.
df_train = (
    pd.read_csv(training_data_path)
    .set_index('event_index')
    .drop(columns='Unnamed: 0')
    .drop(columns=['nextEventTime', 'nextEventTimeRel'])[:n_samples]
    .drop(columns=['startTime', 'completeTime'])
    .dropna()
)

df_test = (
    pd.read_csv(testing_data_path)
    .set_index('event_index')
    .drop(columns='Unnamed: 0')
    .drop(columns=['nextEventTime', 'nextEventTimeRel'])[:n_samples]
    .drop(columns=['startTime', 'completeTime'])
    .dropna()
)

# Display the prepared training frame as the cell output
df_train


Unnamed: 0_level_0,case,event,AMOUNT_REQ,REG_DATE,org:resource,nextEvent,startTimeRel,indexInCase
event_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
48289,183459,O_SENT_BACK,40000,2011/11/09 14:15:46.029,10789,W_Valideren aanvraag,1375482,15
101208,195392,A_SUBMITTED,5000,2011/12/23 17:09:57.692,112,A_PARTLYSUBMITTED,0,0
51064,184171,A_PARTLYSUBMITTED,5000,2011/11/10 17:37:46.407,112,A_PARTLYSUBMITTED,0,1
77689,190543,W_Nabellen offertes,5500,2011/12/01 17:11:34.989,11003,W_Nabellen offertes,932113,12
102277,195609,A_PARTLYSUBMITTED,32500,2011/12/26 12:52:21.741,112,A_PREACCEPTED,0,1
...,...,...,...,...,...,...,...,...
20572,177797,A_PREACCEPTED,22000,2011/10/18 13:10:30.509,112,A_PREACCEPTED,66,3
68049,188401,W_Nabellen incomplete dossiers,11000,2011/11/23 12:17:42.914,11189,W_Valideren aanvraag,2344984,22
70169,188908,A_ACCEPTED,15000,2011/11/25 08:50:29.140,11169,A_FINALIZED,3169,4
26723,179194,O_SENT_BACK,5000,2011/10/23 12:53:57.257,10789,O_SENT_BACK,427505,12


# Classification

### Pre-processing and splitting the training and testing data

As far as I'm aware, this step has already been performed by Nik. For now, however, I'll attempt to reproduce the same split as the one programmed by my source of inspiration for the Decision Tree Classifier model, to check whether it works.

In [24]:
# Start the stopwatch; it is read again once the predictions have been scored
start_time = time.time()

# Feature matrices: every remaining column except the label
X_train = df_train.drop(columns='nextEvent').values
X_test = df_test.drop(columns='nextEvent').values

# Label vectors, reshaped into single-column 2-D arrays
Y_train = df_train['nextEvent'].values.reshape(-1, 1)
Y_test = df_test['nextEvent'].values.reshape(-1, 1)


### Fitting the model

In [25]:
# Build the classification tree from the configured limits, then train it
# on the prepared training arrays
classifier = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_split)
classifier.fit(X_train, Y_train)


### Prediction and analysis

In [26]:
from sklearn.metrics import accuracy_score

# Let the fitted tree label the test rows, then score those labels
# against the true next events
Y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f'Accuracy score: {accuracy}')

# Stop the stopwatch that was started before the feature/label split
end_time = time.time()
print(f'\r\nThe execution of Decision Tree Classifier took {round(end_time - start_time)} seconds')


Accuracy score: 0.5195227765726681

The execution of Decision Tree Classifier took 15 seconds


# Regression

### Pre-processing and splitting the training and testing data

In [27]:
# Rebuild both frames from disk for the regression task. Each pipeline reads
# the CSV, indexes it by event, strips the leftover 'Unnamed: 0' column,
# removes the next-event columns that are not the regression target, keeps the
# first n_samples rows, removes the absolute timestamps, and discards rows
# with missing values. 'nextEventTimeRel' stays in: it is the value to predict.
df_train = (
    pd.read_csv(training_data_path)
    .set_index('event_index')
    .drop(columns='Unnamed: 0')
    .drop(columns=['nextEvent', 'nextEventTime'])[:n_samples]
    .drop(columns=['startTime', 'completeTime'])
    .dropna()
)

df_test = (
    pd.read_csv(testing_data_path)
    .set_index('event_index')
    .drop(columns='Unnamed: 0')
    .drop(columns=['nextEvent', 'nextEventTime'])[:n_samples]
    .drop(columns=['startTime', 'completeTime'])
    .dropna()
)

# Display the prepared training frame as the cell output
df_train


Unnamed: 0_level_0,case,event,AMOUNT_REQ,REG_DATE,org:resource,nextEventTimeRel,startTimeRel,indexInCase
event_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
48289,183459,O_SENT_BACK,40000,2011/11/09 14:15:46.029,10789,347534.0,1375482,15
101208,195392,A_SUBMITTED,5000,2011/12/23 17:09:57.692,112,8205.0,0,0
51064,184171,A_PARTLYSUBMITTED,5000,2011/11/10 17:37:46.407,112,0.0,0,1
77689,190543,W_Nabellen offertes,5500,2011/12/01 17:11:34.989,11003,6659.0,932113,12
102277,195609,A_PARTLYSUBMITTED,32500,2011/12/26 12:52:21.741,112,35.0,0,1
...,...,...,...,...,...,...,...,...
20572,177797,A_PREACCEPTED,22000,2011/10/18 13:10:30.509,112,1275.0,66,3
68049,188401,W_Nabellen incomplete dossiers,11000,2011/11/23 12:17:42.914,11189,3096.0,2344984,22
70169,188908,A_ACCEPTED,15000,2011/11/25 08:50:29.140,11169,329.0,3169,4
26723,179194,O_SENT_BACK,5000,2011/10/23 12:53:57.257,10789,347600.0,427505,12


In [28]:
# Start the stopwatch; it is read again once the predictions have been scored
start_time = time.time()

# Feature matrices: every remaining column except the regression target
X_train = df_train.drop(columns='nextEventTimeRel').values
X_test = df_test.drop(columns='nextEventTimeRel').values

# Target vectors, reshaped into single-column 2-D arrays
Y_train = df_train['nextEventTimeRel'].values.reshape(-1, 1)
Y_test = df_test['nextEventTimeRel'].values.reshape(-1, 1)

In [29]:
# Build the regression tree from the configured limits, then train it
# on the prepared training arrays
regressor = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_split)
regressor.fit(X_train, Y_train)
# Debug aid kept from the original (disabled): regressor.print_tree()


In [30]:
from sklearn.metrics import mean_squared_error

# Predicting the target values of our test dataset
Y_pred = regressor.predict(X_test)

# The square root of the mean squared error is the ROOT mean squared error
# (RMSE), expressed in the same units as the 'nextEventTimeRel' target.
# The value was previously stored and printed under the name "MSE", which
# mislabelled it; the computation itself is unchanged.
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

# Reporting the error of the model (lower is better; this is not an accuracy)
print(f'RMSE score: {rmse}')

# Ending time (the stopwatch was started before the feature/target split)
end_time = time.time()
print(f'\r\nThe execution of Decision Tree Regressor took {round(end_time - start_time)} seconds')


MSE score: 401171.6965908199

The execution of Random Forest took 9 seconds
