In [1]:
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
data_train = pd.read_csv('.\data\BPI2012Training.csv')
data_test = pd.read_csv('.\data\BPI2012Test.csv')

In [3]:
def parseData(dataSet):
    # Parse time zone if there are any
    def convertToUnix(x):
        # If there is a timezone in the timestamp
        if 'T' in x:
            # Remove the T
            without_timezone = x[:10] + ' ' + x[11:-6]

            # Parse milliseconds if contained
            if '.' in x:
                wholesomeTime = dt.datetime.timestamp(
                    dt.datetime.strptime(without_timezone, "%Y-%m-%d %H:%M:%S.%f"))
            else:
                wholesomeTime = dt.datetime.timestamp(
                    dt.datetime.strptime(without_timezone, "%Y-%m-%d %H:%M:%S"))

        else:
            if '.' in x:
                wholesomeTime = dt.datetime.timestamp(
                    dt.datetime.strptime(x, "%d-%m-%Y %H:%M:%S.%f"))
            else:
                wholesomeTime = dt.datetime.timestamp(
                    dt.datetime.strptime(x, "%d-%m-%Y %H:%M:%S"))

        return wholesomeTime

    # Convert absolute event and reg timestamp into unix time
    dataSet['unix_abs_event_time'] = dataSet['event time:timestamp'].apply(
        lambda x: convertToUnix(x))
    dataSet['unix_reg_time'] = dataSet['case REG_DATE'].apply(
        lambda x: convertToUnix(x))

    # Time it takes for an event to occur from registeration
    dataSet['unix_rel_event_time'] = dataSet['unix_abs_event_time'] - \
        dataSet['unix_reg_time']

    # Group data set by case ID
    dataSet_grouped_by_case = dataSet.groupby(by=['case concept:name'])

    # Return data frame consisting out of the last event per case with column that indicates the number of events the case underwent appended
    dataSet_last_event_per_case = dataSet_grouped_by_case.nth([-1])
    dataSet_last_event_per_case['num_events'] = dataSet_grouped_by_case.count(
    ).iloc[:, 0]

    return (dataSet, dataSet_last_event_per_case)

In [4]:
def oneHotEncoding(dataSet, attr):
    one_hot = pd.get_dummies(dataSet[attr])
    df = dataSet.join(one_hot)
    
    return (df)

In [5]:
import numpy as np
from sklearn import tree



def dummy_variables(df):
    df_dummy = df

    for event_type in df['event concept:name'].unique()[1:]:
        df_dummy[event_type] = 0
        for event in df_dummy.index:
            if df_dummy['event concept:name'][event] == event_type:
                df_dummy[event_type][event] = 1

    return df_dummy

def dummy_trainers(dummy_data):
    x1 = dummy_data['A_PARTLYSUBMITTED'][:-1]
    x2 = dummy_data['A_PREACCEPTED'][:-1]
    x3 = dummy_data['W_Completeren aanvraag'][:-1]
    x4 = dummy_data['A_DECLINED'][:-1]
    x5 = dummy_data['W_Afhandelen leads'][:-1]
    x6 = dummy_data['A_ACCEPTED'][:-1]
    x7 = dummy_data['O_SELECTED'][:-1]
    x8 = dummy_data['A_FINALIZED'][:-1]
    x9 = dummy_data['O_CREATED'][:-1]
    x10 = dummy_data['O_SENT'][:-1]
    x11 = dummy_data['W_Nabellen offertes'][:-1]
    x12 = dummy_data['O_CANCELLED'][:-1]
    x13 = dummy_data['A_CANCELLED'][:-1]
    x14 = dummy_data['W_Beoordelen fraude'][:-1]
    x15 = dummy_data['O_SENT_BACK'][:-1]
    x16 = dummy_data['W_Valideren aanvraag'][:-1]
    x17 = dummy_data['W_Nabellen incomplete dossiers'][:-1]
    x18 = dummy_data['O_ACCEPTED'][:-1]
    x19 = dummy_data['A_APPROVED'][:-1]
    x20 = dummy_data['A_ACTIVATED'][:-1]
    x21 = dummy_data['A_REGISTERED'][:-1]
    x22 = dummy_data['O_DECLINED'][:-1]
    #x23 = dummy_data['W_Wijzigen contractgegevens'][:-1]
    x23 = dummy_data['A_SUBMITTED'][:-1]
    x_time = dummy_data['unix_rel_event_time'][1:]
    y_train = dummy_data['event concept:name'][1:]

    zipped = zip(x_time, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21,
                 x22, x23)

    X_train =[list(a) for a in zipped]

    return X_train, y_train

def x_prediction(test_data):
    x1 = test_data['A_PARTLYSUBMITTED'][:-1]
    x2 = test_data['A_PREACCEPTED'][:-1]
    x3 = test_data['W_Completeren aanvraag'][:-1]
    x4 = test_data['A_DECLINED'][:-1]
    x5 = test_data['W_Afhandelen leads'][:-1]
    x6 = test_data['A_ACCEPTED'][:-1]
    x7 = test_data['O_SELECTED'][:-1]
    x8 = test_data['A_FINALIZED'][:-1]
    x9 = test_data['O_CREATED'][:-1]
    x10 = test_data['O_SENT'][:-1]
    x11 = test_data['W_Nabellen offertes'][:-1]
    x12 = test_data['O_CANCELLED'][:-1]
    x13 = test_data['A_CANCELLED'][:-1]
    x14 = test_data['W_Beoordelen fraude'][:-1]
    x15 = test_data['O_SENT_BACK'][:-1]
    x16 = test_data['W_Valideren aanvraag'][:-1]
    x17 = test_data['W_Nabellen incomplete dossiers'][:-1]
    x18 = test_data['O_ACCEPTED'][:-1]
    x19 = test_data['A_APPROVED'][:-1]
    x20 = test_data['A_ACTIVATED'][:-1]
    x21 = test_data['A_REGISTERED'][:-1]
    x22 = test_data['O_DECLINED'][:-1]
    #x23 = test_data['W_Wijzigen contractgegevens'][:-1]
    x23 = test_data['A_SUBMITTED'][:-1]
    x_time = test_data['unix_rel_event_time'][1:]
    y_train = test_data['event concept:name'][1:]

    zipped = zip(x_time, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19, x20, x21,
                 x22, x23)

    X_test = [list(a) for a in zipped]
    return X_test


def fit_tree(X, y):
    boom = tree.DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=14,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

    boom.fit(X, y)

    return boom

#boom = fit_tree(x_train, y_train)

def tree_predict(X_test, data, boom):
    new_df = data.copy()

    predictions = boom.predict(X_test)
    predictions1 = np.insert(predictions, 0, 0)
    new_df['predictedNextEvent'] = predictions1

    return new_df

def quick_dummy(dataSet, attr):
    one_hot = pd.get_dummies(dataSet[:][attr])
    df = dataSet.join(one_hot)

    return (df)



In [6]:
def fit_forest(X_train, y_train):
    bos = RandomForestClassifier(n_estimators = 100)
    bos.fit(X_train, y_train)
    
    return bos

In [7]:
train_parsed = parseData(data_train)
test_parsed = parseData(data_test)

In [8]:
df_train = quick_dummy(data_train, 'event concept:name')
df_test = quick_dummy(data_test, 'event concept:name')

In [9]:
X_train, y_train = dummy_trainers(df_train) #current df_training doenst contain dummy variables yet
#X_validation = x_prediction(df_validation)
X_test = x_prediction(df_test)
decision_tree = fit_tree(X_train, y_train)
df_Predictions = tree_predict(X_test, df_test, decision_tree)

TypeError: __init__() got an unexpected keyword argument 'presort'

In [None]:
forest = fit_forest(X_train, y_train)
df_Predictions_forest = tree_predict(X_test, df_test, forest)

In [None]:
def accuracy(df_pred):
    acc=0
    
    for i in df_pred.index:
        if df_pred['event concept:name'][i] == df_pred['predictedNextEvent'][i]:
            acc += 1
            
    accuracy = acc/len(df_pred)
    return accuracy
    

In [None]:
accuracy(df_Predictions)

In [None]:
accuracy(df_Predictions_forest)

In [None]:
#DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
#            max_features=None, max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
#            splitter='best')


## finding random forest parameters


In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 64, stop = 1640, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 95, num = 9)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in np.linspace(0, 40, num = 8)]
min_samples_split.remove(0)
min_samples_split.insert(0, 1)
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(0, 40, num = 8)]
min_samples_split.remove(0)
min_samples_split.insert(0, 1)
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_