In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
train = pd.read_csv('preprocessed_train.csv')
test = pd.read_csv('preprocessed_test.csv')

train['next position'] = train.groupby('case concept:name')['position'].shift(-1) #Creates new column with NaN values for the last even in a case
test['next position'] = test.groupby('case concept:name')['position'].shift(-1)

train = train.dropna(subset=['next position']) #Drop all last events per case
test = test.dropna(subset=['next position'])

train_LE = train.copy()
train_LE = train_LE.replace({'event lifecycle:transition': {'SCHEDULE': 0, 'START': 1, 'COMPLETE': 2}})

train_OHE = pd.get_dummies(train_LE, prefix=['type'], columns = ['event concept:name'])
test_OHE = pd.get_dummies(train_LE, prefix=['type'], columns = ['event concept:name'])

def make_val_set(dataframe):
    """make a validation set from the dataframe"""
    
    #set seed for reproducibility
    np.random.seed(69)
    
    #extract all unique case IDs
    unique_ids = dataframe['case concept:name'].unique()
    
    #select 10% of the unique IDs and use them to create a validation set
    samples = np.random.choice(unique_ids, size=int(len(unique_ids)*0.1), replace=False)
    val_set = dataframe[dataframe['case concept:name'].isin(samples)]
    
    train = dataframe[~dataframe['case concept:name'].isin(samples)]
    
    return val_set, train

val_OHE, train_OHE = make_val_set(train_OHE)

In [7]:
x_train_time = train_OHE[['case AMOUNT_REQ','timestamp_finish', 'day_week', 'time_of_day','seconds_prev', 'type_A_ACCEPTED', 'type_A_ACTIVATED', 'type_A_APPROVED','type_A_CANCELLED', 'type_A_DECLINED', 'type_A_FINALIZED','type_A_PARTLYSUBMITTED', 'type_A_PREACCEPTED', 'type_A_REGISTERED','type_A_SUBMITTED', 'type_O_ACCEPTED', 'type_O_CANCELLED','type_O_CREATED', 'type_O_DECLINED', 'type_O_SELECTED', 'type_O_SENT','type_O_SENT_BACK', 'type_W_Afhandelen leads','type_W_Beoordelen fraude', 'type_W_Completeren aanvraag','type_W_Nabellen incomplete dossiers', 'type_W_Nabellen offertes','type_W_Valideren aanvraag', 'type_W_Wijzigen contractgegevens']]

y_train_time = train_OHE['seconds_next']

x_val_time = val_OHE[['case AMOUNT_REQ','timestamp_finish', 'day_week',
 'time_of_day','seconds_prev', 'type_A_ACCEPTED', 'type_A_ACTIVATED', 'type_A_APPROVED',
 'type_A_CANCELLED', 'type_A_DECLINED', 'type_A_FINALIZED','type_A_PARTLYSUBMITTED', 'type_A_PREACCEPTED', 
 'type_A_REGISTERED','type_A_SUBMITTED', 'type_O_ACCEPTED', 'type_O_CANCELLED','type_O_CREATED', 'type_O_DECLINED', 
 'type_O_SELECTED', 'type_O_SENT','type_O_SENT_BACK', 'type_W_Afhandelen leads','type_W_Beoordelen fraude', 
 'type_W_Completeren aanvraag','type_W_Nabellen incomplete dossiers', 'type_W_Nabellen offertes','type_W_Valideren aanvraag',
 'type_W_Wijzigen contractgegevens']]

y_val_time = val_OHE['seconds_next']

x_test_time = test_OHE[['case AMOUNT_REQ','timestamp_finish', 'day_week',
 'time_of_day','seconds_prev', 'type_A_ACCEPTED', 'type_A_ACTIVATED', 'type_A_APPROVED',
 'type_A_CANCELLED', 'type_A_DECLINED', 'type_A_FINALIZED','type_A_PARTLYSUBMITTED', 'type_A_PREACCEPTED', 
 'type_A_REGISTERED','type_A_SUBMITTED', 'type_O_ACCEPTED', 'type_O_CANCELLED','type_O_CREATED', 'type_O_DECLINED', 
 'type_O_SELECTED', 'type_O_SENT','type_O_SENT_BACK', 'type_W_Afhandelen leads','type_W_Beoordelen fraude', 
 'type_W_Completeren aanvraag','type_W_Nabellen incomplete dossiers', 'type_W_Nabellen offertes','type_W_Valideren aanvraag',
 'type_W_Wijzigen contractgegevens']]
 
y_test_time = test_OHE['seconds_next']

y_test_time



0             0.334
1            53.026
2             0.969
3         39427.562
4           356.871
            ...    
172684      874.248
172685     1320.139
172686        4.322
172688        0.158
172689       39.197
Name: seconds_next, Length: 163254, dtype: float64

Alpha in sklearn is lambda


In [11]:
parameters = {"l1_ratio": [.1, .3 ,.5,.85,.95, .99, 1],
              'alpha':[0.1,0.3,0.5,0.7,0.9,1],
              'max_iter': [4000,5000]}


In [9]:
regr = ElasticNetCV(cv=5,random_state=2,l1_ratio = l1,alphas = alphas,max_iter=5000 ) 
regr.fit(x_train_time, y_train_time)

NameError: name 'l1' is not defined

In [None]:
y_pred_time = regr.predict(x_test_time)

In [14]:
def time_evaluation(y_test, y_pred, model: str):
 
    print(f"Error metrics (measured in hours) for the {model} when predicting the time until next event")
    print('\n')
    print('Mean Absolute Error:', round(mean_absolute_error(y_test, y_pred)/3600,3))
    print('Root Mean Squared Error:', round(np.sqrt(mean_squared_error(y_test, y_pred)/3600),3))
    print('R2 score:', round(r2_score(y_test, y_pred),3))

In [None]:
time_evaluation(y_test_time, y_pred_time, 'Elastic net')

Error metrics (measured in hours) for the Elastic net when predicting the time until next event


Mean Absolute Error: 15.315
Root Mean Squared Error: 2294.288
R2 score: 0.103


In [12]:
eNet = ElasticNet()
grid = GridSearchCV(eNet, param_grid = parameters,scoring='r2',cv=5,verbose=1)
grid.fit(x_train_time,y_train_time)
y_pred_grid = grid.predict(x_test_time)

Fitting 5 folds for each of 84 candidates, totalling 420 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [16]:
time_evaluation(y_test_time,y_pred_grid,'Elastic net grid search')

Error metrics (measured in hours) for the Elastic net grid search when predicting the time until next event


Mean Absolute Error: 15.328
Root Mean Squared Error: 2294.278
R2 score: 0.103
