In [46]:
import pandas as pd
from training.predictionAlgo import naiveNextEventPredictor
from preprocessing.dataParsing import parseData
from preprocessing.dataSplitting import dataSplitter

# Convert csv into dataframe
df_training_raw = pd.read_csv('.\data\BPI2012Training.csv')
df_test_raw = pd.read_csv('.\data\BPI2012Test.csv')

# Parsing data
(df_training, df_2012_last_event_per_case_train) = parseData(df_training_raw)
(df_test, df_2012_last_event_per_case_test) = parseData(df_test_raw)

# Clean and split the data into train, validation & test data
(df_training, df_validation, df_test) = dataSplitter(df_training, df_test)

In [47]:
from sklearn.preprocessing import OneHotEncoder

unique_training_events = df_training['event concept:name'].unique().reshape(-1, 1)

# Define One-hot encoder
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot_encoder = onehot_encoder.fit(unique_training_events)

In [48]:
from math import ceil

number_events_mean = df_training.groupby('case concept:name').count()['event concept:name'].mean()
number_events_mean = ceil(number_events_mean)
number_events_mean

21

In [82]:
print(x_train[0])

None


In [101]:
from sklearn.preprocessing import MinMaxScaler

# Determine actual next event
(df_training, df_validation) = naiveNextEventPredictor(df_training, df_validation)

# df with only relevant training data, i.e. loan amount, current event, next event and time elapsed since registeration.
df_training_relevant = df_training[['case concept:name', 'event concept:name', 'actual_next_event', 'case AMOUNT_REQ', 'unix_reg_time']].copy()

# One-hot encode current and next event
training_current_event = df_training_relevant['event concept:name'].to_numpy().reshape(-1, 1)
df_training_relevant['event concept:name'] = onehot_encoder.transform(training_current_event).tolist()

training_next_event = df_training_relevant['actual_next_event'].to_numpy().reshape(-1, 1)
df_training_relevant['actual_next_event'] = onehot_encoder.transform(training_next_event).tolist()

# Normalise loan amount
loan_scaler = MinMaxScaler(feature_range=(0,1))
case_amount = df_training_relevant['case AMOUNT_REQ'].to_numpy().reshape(-1, 1)
df_training_relevant['case AMOUNT_REQ'] = np.around(loan_scaler.fit_transform(case_amount), decimals = 3)

# Normalise time in seconds from case registeration to current event
time_scaler = MinMaxScaler(feature_range=(0,1))
reg_time = df_training_relevant['unix_reg_time'].to_numpy().reshape(-1, 1)
df_training_relevant['unix_reg_time'] = np.around(loan_scaler.fit_transform(reg_time), decimals = 3)

# Prepare input and output in form of [samples, features]
x_train = []
y_train = []

# Get groupby object df by case id
df_training_groupby_case_id = df_training_relevant.groupby('case concept:name')

# Unique case ids
unique_case_ids = df_training_relevant['case concept:name'].unique().tolist()

# Find input and output vector in form of [samples, features]
for unique_id in unique_case_ids:
    xy_train_unique_id = df_training_groupby_case_id.get_group(unique_id)[['event concept:name', 'actual_next_event', 'case AMOUNT_REQ', 'unix_reg_time']].values.tolist()
    
    base_case = xy_train_unique_id[0][0:2].copy()
    x_train_first_sample_per_case = base_case[0].copy()
    x_train_first_sample_per_case.extend([xy_train_unique_id[0][2], xy_train_unique_id[0][3]])
    x_train.append(x_train_first_sample_per_case)
    y_train.append(base_case[1].copy())

    # event[0] = current event, event[1] = next event, event[2] = loan amount, event[3] = time elapsed since registeration of case
    for event in xy_train_unique_id[1:]:
        base_case[0] = [prev_xs + current_x for prev_xs, current_x in zip(base_case[0], event[0])]
        x_train_sample = base_case[0].copy()
        x_train_sample.extend([event[2], event[3]])
        x_train.append(x_train_sample)
        y_train.append(event[1])

In [103]:
print(y_train[0:10])

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.

In [104]:
import numpy as np
# Convert y_train to format [samples, features]
y_train = np.reshape(y_train, (-1, len(y_train[0])))

(193447, 24)

## finding random forest parameters


In [114]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 64, stop = 264, num = 6)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 100, num = 5)]
# Minimum number of samples required to split a node
min_samples_split = [float(round(x)) * 0.01 for x in np.linspace(5, 30, num = 6)]
min_samples_split.insert(0, 0.02)
# Minimum number of samples required at each leaf node
min_samples_leaf = [float(round(x)) * 0.01 for x in np.linspace(5, 30, num = 6)]
min_samples_leaf.insert(0, 0.02)
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [64, 104, 144, 184, 224, 264], 'max_features': ['sqrt'], 'max_depth': [5, 28, 52, 76, 100], 'min_samples_split': [0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3], 'min_samples_leaf': [0.02, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3], 'bootstrap': [True]}


In [115]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1, cv = 3, verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=1, n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [5, 28, 52, 76, 100],
                                        'max_features': ['sqrt'],
                                        'min_samples_leaf': [0.02, 0.05, 0.1,
                                                             0.15, 0.2, 0.25,
                                                             0.3],
                                        'min_samples_split': [0.02, 0.05, 0.1,
                                                              0.15, 0.2, 0.25,
                                                              0.3],
                                        'n_estimators': [64, 104, 144, 184, 224,
                                                         264]},
                   verbose=2)

In [117]:
best_para_dict = rf_random.best_params_
best_para_dict

{'n_estimators': 64,
 'min_samples_split': 0.05,
 'min_samples_leaf': 0.15,
 'max_features': 'sqrt',
 'max_depth': 76,
 'bootstrap': True}

In [None]:
import json

with open('./BestParaRF/para{}.json'.format(1), 'w') as fp:
    json.dump(best_para_dict, fp)

In [None]:
with open('data.json', 'r') as fp:
    data = json.load(fp)