# Import libraries

In [20]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder

# Read data

In [12]:
df_train = pd.read_csv('BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('BPI_Challenge_2012-test.csv')

df_train.drop('eventID ', axis=1, inplace=True)
df_test.drop('eventID ', axis=1, inplace=True)

# Data preprocessing

In [29]:
#Apply label encoding to event lifecycle:transistion column in train data. 0: schedule, 1: start, 2: complete
df_train_LE = df_train.copy()
df_train_LE = df_train_LE.replace({'event lifecycle:transition': {'SCHEDULE': 0, 'START': 1, 'COMPLETE': 2}})
#Note: df_train_LE stands for 'dataframe train label encoded'


#One hot encoding for event concept:name column in train data
df_train_OHE = pd.get_dummies(df_train_LE, prefix=['type'], columns = ['event concept:name'])
#uncomment to show the dataframe
# df_train_OHE 
#Note: df_train_OHE stands for 'dataframe train one hot encoded'

case AMOUNT_REQ           0
event org:resource    17838
dtype: int64

In [35]:
#To show NaN values are all for the same cases. Therefore these rows can dropped from the dataframe
df_train_OHE[df_train_OHE['event org:resource'].isna()].head()

Unnamed: 0,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event lifecycle:transition,event time:timestamp,type_A_ACCEPTED,type_A_ACTIVATED,type_A_APPROVED,type_A_CANCELLED,...,type_O_SELECTED,type_O_SENT,type_O_SENT_BACK,type_W_Afhandelen leads,type_W_Beoordelen fraude,type_W_Completeren aanvraag,type_W_Nabellen incomplete dossiers,type_W_Nabellen offertes,type_W_Valideren aanvraag,type_W_Wijzigen contractgegevens
50,173718,01-10-2011 10:37:39.362,15000,,1,01-10-2011 10:38:48.454,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
56,173718,01-10-2011 10:37:39.362,15000,,0,01-10-2011 10:44:42.811,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
57,173718,01-10-2011 10:37:39.362,15000,,2,01-10-2011 10:44:43.954,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
58,173718,01-10-2011 10:37:39.362,15000,,1,01-10-2011 10:44:54.041,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
59,173718,01-10-2011 10:37:39.362,15000,,2,01-10-2011 10:44:58.974,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [57]:
#Normalize case AMOUNT_REQ and event org:resource columns in train data

#drop NaN values
df_train_OHE = df_train_OHE.dropna()

#to_norm are the to-normalize columns, norm are the normalized columns
to_norm = df_train_OHE[['case AMOUNT_REQ', 'event org:resource']].copy()
norm = pd.DataFrame(Normalizer().fit_transform(to_norm), columns=to_norm.columns)
#norm is converted to a dataframe and the columns are renamed to the original column names

#append/replace the normalized columns to the dataframe
df_train_OHE_norm = df_train_OHE.copy()
df_train_OHE_norm['case AMOUNT_REQ'] = norm['case AMOUNT_REQ'].values
df_train_OHE_norm['event org:resource'] = norm['event org:resource'].values
#Note: df_train_OHE_norm stands for 'dataframe train one hot encoded normalized'

df_train_OHE_norm


Unnamed: 0,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event lifecycle:transition,event time:timestamp,type_A_ACCEPTED,type_A_ACTIVATED,type_A_APPROVED,type_A_CANCELLED,...,type_O_SELECTED,type_O_SENT,type_O_SENT_BACK,type_W_Afhandelen leads,type_W_Beoordelen fraude,type_W_Completeren aanvraag,type_W_Nabellen incomplete dossiers,type_W_Nabellen offertes,type_W_Valideren aanvraag,type_W_Wijzigen contractgegevens
0,173688,01-10-2011 00:38:44.546,0.999984,0.005600,2,01-10-2011 00:38:44.546,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,173688,01-10-2011 00:38:44.546,0.999984,0.005600,2,01-10-2011 00:38:44.880,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,173688,01-10-2011 00:38:44.546,0.999984,0.005600,2,01-10-2011 00:39:37.906,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,173688,01-10-2011 00:38:44.546,0.999984,0.005600,0,01-10-2011 00:39:38.875,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,173691,01-10-2011 08:08:58.256,0.999749,0.022394,2,01-10-2011 08:08:58.256,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239781,212154,23-02-2012 14:00:21.169,0.259406,0.965768,2,06-03-2012 19:05:51.706,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
239782,203362,23-01-2012 20:02:26.492,0.532664,0.846327,1,06-03-2012 19:10:07.405,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
239783,203362,23-01-2012 20:02:26.492,0.532664,0.846327,2,06-03-2012 19:11:20.054,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
239784,212830,25-02-2012 19:30:25.147,0.616242,0.787557,2,06-03-2012 19:11:59.483,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Support vector regression