In [5]:
import EncoderFactory
from DatasetManager import DatasetManager
import BucketFactory

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler


import time
import os
import sys
from sys import argv
import pickle
from collections import defaultdict

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import catboost

from tensorflow.keras.backend import print_tensor
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Flatten, Input
from keras.layers import LSTM
from keras.optimizers import Nadam, RMSprop
from keras.layers.normalization import BatchNormalization


from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
import hyperopt
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample

In [6]:
def create_and_evaluate_model(args):
    global trial_nr
    trial_nr += 1
    
    start = time.time()
    score = 0
    for cv_iter in range(n_splits):
        
        dt_test_prefixes = dt_prefixes[cv_iter]
        dt_train_prefixes = pd.DataFrame()
        for cv_train_iter in range(n_splits): 
            if cv_train_iter != cv_iter:
                dt_train_prefixes = pd.concat([dt_train_prefixes, dt_prefixes[cv_train_iter]], axis=0, sort=False)
        
        # Bucketing prefixes based on control flow
        bucketer_args = {'encoding_method':bucket_encoding, 
                         'case_id_col':dataset_manager.case_id_col, 
                         'cat_cols':[dataset_manager.activity_col], 
                         'num_cols':[], 
                         'random_state':random_state}
        if bucket_method == "cluster":
            bucketer_args["n_clusters"] = args["n_clusters"]
        bucketer = BucketFactory.get_bucketer(bucket_method, **bucketer_args)
        bucket_assignments_train = bucketer.fit_predict(dt_train_prefixes)
        bucket_assignments_test = bucketer.predict(dt_test_prefixes)
        
        preds_all = []
        test_y_all = []
        if "prefix" in method_name:
            scores = defaultdict(int)
        for bucket in set(bucket_assignments_test):
            
            if cls_method == "lstm":
                data = dataset_manager.read_dataset()
                train, test = dataset_manager.split_data_strict(data, train_ratio, split='temporal')
                train, val = dataset_manager.split_val(train, 0.8)

                if "traffic_fines" in dataset_name:
                    max_len = 10
                elif "bpic2017" in dataset_name:
                    max_len = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
                else:
                    max_len = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

                dt_train = dataset_manager.encode_data_for_lstm(train)
                dt_test = dataset_manager.encode_data_for_lstm(test)
                dt_val = dataset_manager.encode_data_for_lstm(val)

                data_dim = dt_train.shape[1]-3

                dt_train_bucket, train_y = dataset_manager.generate_3d_data(dt_train, max_len)
                dt_test_bucket, y_test= dataset_manager.generate_3d_data(dt_test, max_len)
                dt_val_bucket, val_y = dataset_manager.generate_3d_data(dt_val, max_len)
                
                #remove instances that won't fit with batch size
#                 val_excess = dt_val_bucket.shape[0]%args['batch_size']
#                 to_keep = dt_val_bucket.shape[0] - val_excess
#                 dt_val_bucket = dt_val_bucket[:to_keep,:,:]
#                 val_y = val_y[:to_keep,:]
                                
#                 train_excess = dt_train_bucket.shape[0]%args['batch_size']
#                 to_keep = dt_train_bucket.shape[0] - train_excess
#                 dt_train_bucket = dt_train_bucket[:to_keep,:,:]
#                 train_y = train_y[:to_keep,:]
                
#                 print(dt_train_bucket.shape)
#                 print(dt_val_bucket.shape)
                                
                print(args)
                
                early_stop = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 5)
            
                #create model
#                 main_input = Input(shape=(max_len, data_dim), batch_size = args['batch_size'], name='main_input')

#                 if args["lstm_layers"]["layers"] == "one":
#                     l2_3 = LSTM(args['lstm1_nodes'], batch_input_shape=(args['batch_size'], max_len, data_dim), 
#                                 implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, 
#                                 recurrent_dropout=args['lstm1_dropouts'], stateful = True)(main_input)
#                     b2_3 = BatchNormalization()(l2_3)

#                 if args["lstm_layers"]["layers"] == "two":
#                     l1 = LSTM(args['lstm1_nodes'], batch_input_shape=(args['batch_size'], max_len, data_dim), 
#                               implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, 
#                               recurrent_dropout=args['lstm1_dropouts'], stateful = True)(main_input)
#                     b1 = BatchNormalization()(l1)
#                     l2_3 = LSTM(args["lstm_layers"]["lstm2_nodes"], activation="sigmoid", 
#                                 implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, 
#                                 recurrent_dropout=args["lstm_layers"]["lstm2_dropouts"], stateful = True)(b1)
#                     b2_3 = BatchNormalization()(l2_3)

#                 if args["lstm_layers"]["layers"] == "three":
#                     l1 = LSTM(args['lstm1_nodes'], batch_input_shape=(args['batch_size'], max_len, data_dim), 
#                               implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, 
#                               recurrent_dropout=args['lstm1_dropouts'], stateful = True)(main_input)
#                     b1 = BatchNormalization()(l1)
#                     l2 = LSTM(args["lstm_layers"]["lstm2_nodes"], activation="sigmoid", 
#                                 implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, 
#                                 recurrent_dropout=args["lstm_layers"]["lstm2_dropouts"], stateful = True)(b1)
#                     b2 = BatchNormalization()(l2)
#                     l2_3 = LSTM(args["lstm_layers"]["lstm3_nodes"], activation="sigmoid", 
#                                 implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, 
#                                 recurrent_dropout=args["lstm_layers"]["lstm3_dropouts"], stateful = True)(b2)
#                     b2_3 = BatchNormalization()(l2_3)


#                 if args['dense_layers']['layers'] == "two":
#                     d1 = Dense(args['dense_layers']['dense2_nodes'], activation = "relu")(b2_3)
#                     outcome_output = Dense(2, activation='sigmoid', kernel_initializer='glorot_uniform', name='outcome_output')(d1)

#                 else:
#                     outcome_output = Dense(2, activation='sigmoid', kernel_initializer='glorot_uniform', name='outcome_output')(b2_3)

#                 cls = Model(inputs=[main_input], outputs=[outcome_output])

#                 if args['optimizer'] == "adam":
#                     opt = Nadam(lr=args['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)
#                 elif args['optimizer'] == "rmsprop":
#                     opt = RMSprop(lr=args['learning_rate'], rho=0.9, epsilon=1e-08, decay=0.0)

#                 #cls = load_model("%s/%s_%s/cls/backup_0.5_with_all.h5" % (dataset_ref, cls_method, method_name))

#                 cls.compile(loss='binary_crossentropy', optimizer=opt)

#                 history = cls.fit(dt_train_bucket, train_y, validation_data = (dt_val_bucket,val_y), verbose = 2, 
#                                   epochs = args['epochs'], batch_size = args['batch_size'],
#                                   callbacks=[early_stop], steps_per_epoch = dt_train_bucket.shape[0]//args['batch_size'])
                
#                 weights = cls.get_weights()

                main_input = Input(shape=(max_len, data_dim), batch_size = 1, name='main_input')

                if args["lstm_layers"]["layers"] == "one":
                    l2_3 = LSTM(args['lstm1_nodes'], batch_input_shape=(1, max_len, data_dim), 
                                implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, 
                                recurrent_dropout=args['lstm1_dropouts'], stateful = True)(main_input)
                    b2_3 = BatchNormalization()(l2_3)

                if args["lstm_layers"]["layers"] == "two":
                    l1 = LSTM(args['lstm1_nodes'], batch_input_shape=(1, max_len, data_dim), 
                              implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, 
                              recurrent_dropout=args['lstm1_dropouts'], stateful = True)(main_input)
                    b1 = BatchNormalization()(l1)
                    l2_3 = LSTM(args["lstm_layers"]["lstm2_nodes"], activation="sigmoid", 
                                implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, 
                                recurrent_dropout=args["lstm_layers"]["lstm2_dropouts"], stateful = True)(b1)
                    b2_3 = BatchNormalization()(l2_3)

                if args["lstm_layers"]["layers"] == "three":
                    l1 = LSTM(args['lstm1_nodes'], batch_input_shape=(1, max_len, data_dim), 
                              implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, 
                              recurrent_dropout=args['lstm1_dropouts'], stateful = True)(main_input)
                    b1 = BatchNormalization()(l1)
                    l2 = LSTM(args["lstm_layers"]["lstm2_nodes"], activation="sigmoid", 
                                implementation=2, kernel_initializer='glorot_uniform', return_sequences=True, 
                                recurrent_dropout=args["lstm_layers"]["lstm2_dropouts"], stateful = True)(b1)
                    b2 = BatchNormalization()(l2)
                    l2_3 = LSTM(args["lstm_layers"]["lstm3_nodes"], activation="sigmoid", 
                                implementation=2, kernel_initializer='glorot_uniform', return_sequences=False, 
                                recurrent_dropout=args["lstm_layers"]["lstm3_dropouts"], stateful = True)(b2)
                    b2_3 = BatchNormalization()(l2_3)


                if args['dense_layers']['layers'] == "two":
                    d1 = Dense(args['dense_layers']['dense2_nodes'], activation = "relu")(b2_3)
                    outcome_output = Dense(2, activation='sigmoid', kernel_initializer='glorot_uniform', name='outcome_output')(d1)

                else:
                    outcome_output = Dense(2, activation='sigmoid', kernel_initializer='glorot_uniform', name='outcome_output')(b2_3)

                pred_cls = Model(inputs=[main_input], outputs=[outcome_output])
                #pred_cls.set_weights(weights)
                pred_cls.compile(loss='binary_crossentropy', optimizer=opt)
                
                history = pred_cls.fit(dt_train_bucket, train_y, validation_data = (dt_val_bucket,val_y), verbose = 2, 
                                  epochs = args['epochs'], batch_size = 1,
                                  callbacks=[early_stop])
                
                
                probas = []
                for each in dt_test_bucket:
                    proba = pred_cls.predict(np.array([each,]))#[:,preds_pos_label_idx]
                    probas.append(proba[0])
                    
                preds = []
                for each in probas:
                    pred = each.argmax(axis=-1)
                    preds.append(pred)
                
                test_y = [np.where(act == 1)[0][0] for act in y_test]


            else:
                relevant_train_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
                relevant_test_cases_bucket = dataset_manager.get_indexes(dt_test_prefixes)[bucket_assignments_test == bucket]
                dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(dt_test_prefixes, relevant_test_cases_bucket)
                test_y = dataset_manager.get_label_numeric(dt_test_bucket)
                if len(relevant_train_cases_bucket) == 0:
                    preds = [class_ratios[cv_iter]] * len(relevant_test_cases_bucket)
                else:
                    dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_train_cases_bucket) # one row per event
                    train_y = dataset_manager.get_label_numeric(dt_train_bucket)

                    if len(set(train_y)) < 2:
                        preds = [train_y[0]] * len(relevant_test_cases_bucket)
                    else:
                        feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])

                        if cls_method == "rf":
                            cls = RandomForestClassifier(n_estimators=500,
                                                         max_features=args['max_features'],
                                                         random_state=random_state)

                        elif cls_method == "xgboost":
                            cls = xgb.XGBClassifier(objective='binary:logistic',
                                                    n_estimators=500,
                                                    learning_rate= args['learning_rate'],
                                                    subsample=args['subsample'],
                                                    max_depth=int(args['max_depth']),
                                                    colsample_bytree=args['colsample_bytree'],
                                                    min_child_weight=int(args['min_child_weight']),
                                                    seed=random_state)
                        elif cls_method == "cb":
                            cls =  catboost.CatBoostClassifier(learning_rate= args['learning_rate'],
                                                               subsample=args['subsample'], depth=int(args['depth']))

                        elif cls_method == "logit":
                            cls = LogisticRegression(C=2**args['C'],
                                                     random_state=random_state)
                        elif cls_method == "svm":
                            cls = SVC(C=2**args['C'],
                                      gamma=2**args['gamma'],
                                      random_state=random_state)

                        if cls_method == "svm" or cls_method == "logit":
                            pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
                        else:
                            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
                        pipeline.fit(dt_train_bucket, train_y)

                        if cls_method == "svm":
                            preds = pipeline.decision_function(dt_test_bucket)
                        else:
                            preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
                            preds = pipeline.predict_proba(dt_test_bucket)[:,preds_pos_label_idx]
            
            if "prefix" in method_name:
                auc = 0.5
                if len(set(test_y)) == 2: 
                    auc = roc_auc_score(test_y, preds)
                scores[bucket] += auc
            preds_all.extend(preds)
            test_y_all.extend(test_y)

        score += roc_auc_score(test_y_all, preds_all)
        #acc = accuracy_score(test_y_all, preds_all)
        auc = roc_auc_score(test_y_all, preds_all)
        
        #print('Accuracy:', acc, "\tROCAUC:", auc)
        print("ROCAUC:", auc)
    
    if "prefix" in method_name:
        for k, v in args.items():
            for bucket, bucket_score in scores.items():
                fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name, bucket, k, v, bucket_score / n_splits))   
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name, 0, "processing_time", time.time() - start, 0))  
    else:
        for k, v in args.items():
            fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name, k, v, score / n_splits))   
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % (trial_nr, dataset_name, cls_method, method_name, "processing_time", time.time() - start, 0))   
    fout_all.flush()
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}

In [7]:
# dataset_ref = argv[1]
# params_dir = argv[2]
# n_iter = int(argv[3])
# bucket_method = argv[4]
# cls_encoding = argv[5]
# cls_method = argv[6]

dataset_ref = "bpic2012"
params_dir = "params"
n_iter = 5
bucket_method = "single"
cls_encoding = "3d"
cls_method = "lstm"

if bucket_method == "state":
    bucket_encoding = "last"
else:
    bucket_encoding = "agg"

method_name = "%s_%s"%(bucket_method, cls_encoding)

dataset_ref_to_datasets = {
    "bpic2011": ["bpic2011_f%s"%formula for formula in range(1,5)],
    "bpic2015": ["bpic2015_%s_f2"%(municipality) for municipality in range(5,6)],
    "insurance": ["insurance_activity", "insurance_followup"],
    "bpic2012" : ["bpic2012_accepted"],
    "sepsis_cases": ["sepsis_cases_1"]#, "sepsis_cases_2", "sepsis_cases_4"]
}

encoding_dict = {
    "laststate": ["static", "last"],
    "agg": ["static", "agg"],
    "index": ["static", "index"],
    "combined": ["static", "last", "agg"],
    "3d": []
}

datasets = [dataset_ref] if dataset_ref not in dataset_ref_to_datasets else dataset_ref_to_datasets[dataset_ref]
methods = encoding_dict[cls_encoding]
print(datasets)
    
train_ratio = 0.8
n_splits = 3
random_state = 22

# create results directory
if not os.path.exists(os.path.join(params_dir)):
    os.makedirs(os.path.join(params_dir))
    
for dataset_name in datasets:
    
    # read the data
    dataset_manager = DatasetManager(dataset_name)
    data = dataset_manager.read_dataset()
    cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                        'static_cat_cols': dataset_manager.static_cat_cols,
                        'static_num_cols': dataset_manager.static_num_cols, 
                        'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                        'dynamic_num_cols': dataset_manager.dynamic_num_cols, 
                        'fillna': True}

    # determine min and max (truncated) prefix lengths
    min_prefix_length = 1
    if "traffic_fines" in dataset_name:
        max_prefix_length = 10
    elif "bpic2017" in dataset_name:
        max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
    else:
        max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

    # split into training and test
    print("splitting data")
    train, _ = dataset_manager.split_data_strict(data, train_ratio, split="temporal")
    
    # prepare chunks for CV
    dt_prefixes = []
    class_ratios = []
    for train_chunk, test_chunk in dataset_manager.get_stratified_split_generator(train, n_splits=n_splits):
        class_ratios.append(dataset_manager.get_class_ratio(train_chunk))
        # generate data where each prefix is a separate instance
        dt_prefixes.append(dataset_manager.generate_prefix_data(test_chunk, min_prefix_length, max_prefix_length))
    del train
        
    # set up search space
    if cls_method == "rf":
        space = {'max_features': hp.uniform('max_features', 0, 1)}
    elif cls_method == "xgboost":
        space = {'learning_rate': hp.uniform("learning_rate", 0, 1),
                 'subsample': hp.uniform("subsample", 0.5, 1),
                 'max_depth': scope.int(hp.quniform('max_depth', 4, 30, 1)),
                 'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
                 'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 6, 1))}
    elif cls_method == "logit":
        space = {'C': hp.uniform('C', -15, 15)}
    elif cls_method == "svm":
        space = {'C': hp.uniform('C', -15, 15),
                 'gamma': hp.uniform('gamma', -15, 15)}
    elif cls_method == "cb":
        space = {'learning_rate': hp.uniform("learning_rate", 0, 1),
                 'depth': scope.int(hp.quniform('max_depth', 4, 16, 1)),
                 'subsample': hp.uniform("subsample", 0.5, 1)}
    elif cls_method == "lstm":
        space = {'lstm1_nodes':hp.choice('units_lsmt1', [10,20,30,40,50]),#,100,150,200]),
                 'lstm1_dropouts':hp.loguniform('dos_lstm1',np.log(0.001),np.log(0.5)), 
                 'lstm_layers': hp.choice('num_layers_lstm',[{'layers':'one'},
                                {'layers':'two','lstm2_nodes':hp.choice('units_lstm_2', [10,20,30,40,50]),#,100,150,200]),
                                'lstm2_dropouts':hp.loguniform('dos_lstm_2',np.log(0.001),np.log(0.5))},
                                {'layers':'three','lstm2_nodes':hp.choice('units_lstm2', [10,20,30,40,50]),#,100,150,200]),
                                'lstm2_dropouts':hp.loguniform('dos_lstm2',np.log(0.001),np.log(0.5)),
                                'lstm3_nodes':hp.choice('units_lstm3', [10,20,30,40,50]),#,100,150,200]),
                                'lstm3_dropouts':hp.loguniform('dos_lstm3',np.log(0.001),np.log(0.5))}]),
                 'dense_layers': hp.choice('num_layers_dense',[{'layers':'one'},
                                {'layers':'two','dense2_nodes':hp.choice('units_dense', [8,16,24,32])}]),
                 "optimizer": hp.choice('optmz',["adam", "rmsprop"]), 
                 'epochs':hp.choice('ep', [20,50,100,200,300,500]),
                 "batch_size":hp.choice('bs',[8, 16, 32, 64, 128, 256]),
                 "learning_rate":hp.loguniform('lr', np.log(0.0001), np.log(0.01))}
        
    if bucket_method == "cluster":
        space['n_clusters'] = scope.int(hp.quniform('n_clusters', 2, 50, 1))

    # optimize parameters
    trial_nr = 1
    trials = Trials()
    fout_all = open(os.path.join(params_dir, "param_optim_all_trials_%s_%s_%s.csv" % (cls_method, dataset_name, method_name)), "w")
    if "prefix" in method_name:
        fout_all.write("%s;%s;%s;%s;%s;%s;%s;%s\n" % ("iter", "dataset", "cls", "method", "nr_events", "param", "value", "score"))   
    else:
        fout_all.write("%s;%s;%s;%s;%s;%s;%s\n" % ("iter", "dataset", "cls", "method", "param", "value", "score"))   
    best = fmin(create_and_evaluate_model, space, algo=tpe.suggest, max_evals=n_iter, trials=trials, verbose=True)
    fout_all.close()

    # write the best parameters
    best_params = hyperopt.space_eval(space, best)
    outfile = os.path.join(params_dir, "optimal_params_%s_%s_%s.pickle" % (cls_method, dataset_name, method_name))
    # write to file
    with open(outfile, "wb") as fout:
        pickle.dump(best_params, fout)


['bpic2012_accepted']
splitting data
single                                                                                                                 
{'batch_size': 64, 'dense_layers': {'dense2_nodes': 8, 'layers': 'two'}, 'epochs': 50, 'learning_rate': 0.002925394800322271, 'lstm1_dropouts': 0.0020351889244454215, 'lstm1_nodes': 40, 'lstm_layers': {'layers': 'two', 'lstm2_dropouts': 0.0870148903792336, 'lstm2_nodes': 10}, 'optimizer': 'rmsprop'}
  0%|                                                                            | 0/5 [00:15<?, ?trial/s, best loss=?]

job exception: name 'weights' is not defined



  0%|                                                                            | 0/5 [00:15<?, ?trial/s, best loss=?]


NameError: name 'weights' is not defined

In [None]:
params