# Import tools

In [57]:
import numpy as np
import pandas as pd
from decision_tree import DecisionTreeClassifier, DecisionTreeRegressor

import datetime
import dateutil.parser

# Config variables

In [58]:
# Number of samples to take from the dataset
n_samples = 1_000

# A node is only split when it holds at least this many samples
min_split = 24

# Upper bound on the depth of any decision tree
max_depth = 24

# Load the data

In [59]:
# Timings noted by the original author (labelled "unloaded") for different
# sample counts; the percentage is presumably the accuracy that was reached
# -- TODO confirm what exactly was measured.
#    1,000 samples :  4 seconds (30%)
#    1,500 samples : 10 seconds (34%)
#    2,000 samples : 13 seconds (39%)
#    3,000 samples : 30 seconds (40%)
#   10,000 samples :  6 minutes (38%)

# The id column is addressed as 'eventID ' -- the trailing space is intentional.
event_columns = [
    'eventID ',
    'case REG_DATE',
    'event lifecycle:transition',
    'event time:timestamp',
    'event concept:name',
]

# Parse only the rows and columns that are used, instead of loading the whole
# file and slicing afterwards. `usecols` keeps the file's column order, so the
# explicit .loc re-applies the order the rest of the notebook relies on (the
# target, 'event concept:name', must stay the last column).
df_train = (
    pd.read_csv('../bpi_2012_train.csv', usecols=event_columns, nrows=n_samples)
    .loc[:, event_columns]
    .set_index('eventID ')
)

df_train.head()

Unnamed: 0_level_0,case REG_DATE,event lifecycle:transition,event time:timestamp,event concept:name
eventID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2011-10-01T01:38:44.546+03:00,COMPLETE,2011-01-10 01:38:44.546,A_SUBMITTED
1,2011-10-01T01:38:44.546+03:00,COMPLETE,2011-01-10 01:38:44.880,A_PARTLYSUBMITTED
2,2011-10-01T01:38:44.546+03:00,COMPLETE,2011-01-10 01:39:37.906,A_PREACCEPTED
3,2011-10-01T01:38:44.546+03:00,SCHEDULE,2011-01-10 01:39:38.875,W_Completeren aanvraag
4294967296,2011-10-01T09:08:58.256+03:00,COMPLETE,2011-01-10 09:08:58.256,A_SUBMITTED


# Classification

### Pre-processing and splitting the training and testing data

As far as I'm aware, this step has already been performed by Nik. For now, however, I'll reproduce the split used by the source that inspired my Decision Tree Classifier model, to check whether the model works.

In [60]:
from sklearn.model_selection import train_test_split

# The last column is the target; every column before it is a feature.
feature_frame = df_train.iloc[:, :-1]
target_frame = df_train.iloc[:, [-1]]  # list indexer keeps it 2-D: one column per row

X = feature_frame.values
Y = target_frame.values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.2, random_state=41
)

# Only the final expression of a cell is rendered, so the feature preview is
# evaluated but not displayed; the target preview is what shows up below.
pd.DataFrame(X).head(20)
pd.DataFrame(Y).head(10)

Unnamed: 0,0
0,A_SUBMITTED
1,A_PARTLYSUBMITTED
2,A_PREACCEPTED
3,W_Completeren aanvraag
4,A_SUBMITTED
5,A_PARTLYSUBMITTED
6,A_PREACCEPTED
7,W_Completeren aanvraag
8,A_SUBMITTED
9,A_PARTLYSUBMITTED


### Fitting the model

In [61]:
# Build the tree with the limits from the config cell, then train it on the
# training split.
tree_limits = dict(min_samples_split=min_split, max_depth=max_depth)
classifier = DecisionTreeClassifier(**tree_limits)
classifier.fit(X_train, Y_train)
# classifier.print_tree()  # uncomment to dump the fitted tree

### Prediction and analysis

In [62]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows and report the share of exact label matches.
Y_pred = classifier.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.325

# Classification

### Pre-processing and splitting the training and testing data

In [65]:
from sklearn.model_selection import train_test_split

# Parse both time columns as timezone-aware UTC datetimes.
# NOTE(review): in the preview of the loaded data, 'case REG_DATE' carries a
# +03:00 offset while 'event time:timestamp' has none (and shows 2011-01-10
# where the registration date shows 2011-10-01). utc=True therefore shifts the
# first column but takes the second at face value -- confirm that the two
# columns are actually comparable.
df_train['case REG_DATE'] = pd.to_datetime(df_train['case REG_DATE'], utc=True)
df_train['event time:timestamp'] = pd.to_datetime(df_train['event time:timestamp'], utc=True)

# Whole seconds since the Unix epoch. Subtracting the epoch and floor-dividing
# by one second is independent of the datetime resolution pandas picked while
# parsing, whereas `astype(np.int64) // 10 ** 9` silently assumes nanoseconds.
unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
one_second = pd.Timedelta(seconds=1)
df_train['case_reg_date_ts'] = (df_train['case REG_DATE'] - unix_epoch) // one_second
df_train['event_timestamp_ts'] = (df_train['event time:timestamp'] - unix_epoch) // one_second

# Numeric features: the two epoch columns. Target: the event name, kept as a
# single-column 2-D array.
feature_columns = ['case_reg_date_ts', 'event_timestamp_ts']
target_column = 'event concept:name'

X = df_train[feature_columns].values
Y = df_train[[target_column]].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

# Only the last expression is rendered, so just the target preview is shown.
pd.DataFrame(X).head(20)
pd.DataFrame(Y).head(10)

# TODO (open question from the author): what are we doing here? Are we
# predicting a time based on the time itself? Because all feature columns have
# to be numerical, only the time data is usable as input. Note that the target
# selected above is the event name, not a time.

Unnamed: 0,0
0,A_SUBMITTED
1,A_PARTLYSUBMITTED
2,A_PREACCEPTED
3,W_Completeren aanvraag
4,A_SUBMITTED
5,A_PARTLYSUBMITTED
6,A_PREACCEPTED
7,W_Completeren aanvraag
8,A_SUBMITTED
9,A_PARTLYSUBMITTED


In [None]:
# Fit the regression tree with the limits from the config cell and print it.
# NOTE(review): Y_train holds event names (strings) at this point, and the
# variable is still called `classifier` -- confirm a regressor is intended.
tree_limits = dict(min_samples_split=min_split, max_depth=max_depth)
classifier = DecisionTreeRegressor(**tree_limits)
classifier.fit(X_train, Y_train)
classifier.print_tree()

In [None]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows and compare them with the true targets.
# NOTE(review): accuracy_score is a classification metric (share of exact
# matches); if the model above is meant to be a regressor, a regression
# metric is probably what is wanted here -- confirm.
Y_pred = classifier.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)