# Characteristics of Datasets Used

In [1]:
import os
# Step up to the parent of the notebook's directory and remember that
# location; later cells build every data path relative to PATH.
# NOTE(review): re-running this cell climbs one more level each time.
os.chdir(os.pardir)
PATH = os.getcwd()

In [2]:
import pandas as pd
import numpy as np

import sys

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import statistics
import math

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, mean_absolute_percentage_error, mean_absolute_error, f1_score
import sklearn

from DatasetManager import DatasetManager

import json

C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\ProgramData\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll


## Tabular Data

In [3]:
# Folder names of the tabular classification datasets summarised below.
classification_datasets = [
    "breast_cancer",
    "compas",
    "diabetes",
    "income",
    "iris",
    "mushroom",
    "nursery",
]

# Independent copy, so later additions to `datasets` leave the list above untouched.
datasets = list(classification_datasets)

# Human-readable title printed for each dataset folder name.
data_names = {
    "breast_cancer": "Breast Cancer",
    "compas": "COMPAS",
    "diabetes": "Diabetes",
    "income": "Adult Income",
    "iris": "Iris",
    "mushroom": "Mushroom",
    "nursery": "Nursery",
}

# Directory against which the dataset paths are resolved.
PATH = os.getcwd()

In [4]:
# Print a short profile of every tabular classification dataset: number of
# variables, number of training instances, share of discrete variables and
# the class distribution of the training labels.
for data in classification_datasets:
    print(data_names[data])

    # All files for one dataset live under <PATH>/<dataset>/datasets/.
    dataset_dir = os.path.join(PATH, data, "datasets")
    X_train = pd.read_csv(os.path.join(dataset_dir, data + "_Xtrain.csv"), index_col=False, sep=";")
    y_train = pd.read_csv(os.path.join(dataset_dir, data + "_ytrain.csv"), index_col=False, sep=";")

    # col_dict.json holds the "discrete" and "continuous" entries; either one
    # may be None (JSON null) when the dataset has no columns of that kind.
    # Presumably the non-null entries are lists of column names -- TODO confirm.
    with open(os.path.join(dataset_dir, "col_dict.json")) as f:
        col_dict = json.load(f)

    num_instances, num_variables = X_train.shape

    if col_dict["continuous"] is None:
        # No continuous columns: every variable is counted as discrete.
        disc_prop = 1.0
        disc_num = len(col_dict["discrete"])
    elif col_dict["discrete"] is None:
        # No discrete columns at all.
        disc_prop = 0.0
        disc_num = 0
    else:
        disc_num = len(col_dict["discrete"])
        total = num_variables
        disc_prop = disc_num / total

    print("Num variables:\t", num_variables)
    print("Num training instances:\t", num_instances)
    # NOTE(review): despite the label, the value printed is instances divided
    # by variables (instances per variable). The label is left unchanged so
    # the recorded outputs still match.
    print("Ratio of variables to instances:\t", num_instances / num_variables)
    # NOTE(review): this is a fraction in [0, 1], not a percentage.
    print("Percentage discrete variables:\t", round(disc_prop, 4))
    print("Num discrete variables:\t", disc_num)

    # Class distribution of the training labels, in percent.
    print(y_train.value_counts(normalize=True) * 100)
    print("----------------------------------------------------------------------------")

Breast Cancer
Num variables:	 30
Num training instances:	 296
Ratio of variables to instances:	 9.866666666666667
Percentage discrete variables:	 0.0
Num discrete variables:	 0
diagnosis
1            51.013514
0            48.986486
dtype: float64
----------------------------------------------------------------------------
COMPAS
Num variables:	 20
Num training instances:	 2793
Ratio of variables to instances:	 139.65
Percentage discrete variables:	 0.8
Num discrete variables:	 16
high_risk
0            50.125313
1            49.874687
dtype: float64
----------------------------------------------------------------------------
Diabetes
Num variables:	 8
Num training instances:	 375
Ratio of variables to instances:	 46.875
Percentage discrete variables:	 0.0
Num discrete variables:	 0
Outcome
1          50.4
0          49.6
dtype: float64
----------------------------------------------------------------------------
Adult Income
Num variables:	 104
Num training instances:	 10977
Ratio of v

## Event Logs

In [5]:
# Event logs to profile, and the bucketing/encoding combinations applied to each.
el_datasets = ["bpic2012", "production", "sepsis_cases"]
preprocs = ["single_agg", "prefix_agg", "prefix_index"]

# Event-log folder name -> dataset identifier passed to DatasetManager.
dataset_ref = dict(
    bpic2012="bpic2012_accepted",
    sepsis_cases="sepsis_cases_1",
    production="production",
)

# Preprocessing key -> display label.
preproc_ref = dict(
    single_agg="Single & Aggregate",
    prefix_agg="Prefix & Aggregate",
    prefix_index="Prefix & Index-Based",
)


In [6]:
# For every event log and every bucketing/encoding combination, print the
# minimum and maximum across buckets of: number of variables, number of
# training instances, instances per variable, and the class-balance figure.
for data in el_datasets:
    dataset = dataset_ref[data]
    # NOTE(review): the manager is not used again in this cell; it is kept in
    # case constructing it has side effects -- confirm and remove if not.
    dataset_manager = DatasetManager(dataset)

    for preproc in preprocs:
        print(data, preproc)
        variables = []
        training_instances = []
        variables_to_instances = []
        class_balance = []

        folder_loc = os.path.join(PATH, data, "xgboost", preproc)
        train_dir = os.path.join(folder_loc, "train_data")
        # The number of entries in pipelines/ is taken as the number of
        # buckets; bucket files are numbered from 1.
        num_buckets = len(os.listdir(os.path.join(folder_loc, "pipelines")))

        for bucket in range(1, num_buckets + 1):
            X_train = pd.read_csv(os.path.join(train_dir, "train_data_bucket_%s.csv" % bucket), sep=",")
            y_train = pd.read_csv(os.path.join(train_dir, "y_train_bucket_%s.csv" % bucket), sep=",")

            num_instances, num_variables = X_train.shape
            variables.append(num_variables)
            training_instances.append(num_instances)
            # Despite the list name, this is instances divided by variables.
            variables_to_instances.append(num_instances / num_variables)

            # Scale to percent first and round afterwards, so the stored value
            # carries no floating-point residue from a multiplication after
            # rounding.
            # NOTE(review): `[0]` is taken to select the share of label 0 --
            # confirm against the label encoding and pandas version in use.
            class_share = y_train.value_counts(normalize=True)[0]
            class_balance.append(round(class_share * 100, 2))

        print("Num Variables:\t", min(variables), "\t", max(variables))
        print("Num training instances:\t", min(training_instances), "\t", max(training_instances))
        print("Ratio Variables to Instances:\t", min(variables_to_instances), "\t", max(variables_to_instances))
        print("Class Balance:\t", min(class_balance), "\t", max(class_balance))

        print("----------------------------------------------------------------------------")

bpic2012 single_agg
Num Variables:	 120 	 120
Num training instances:	 52122 	 52122
Ratio Variables to Instances:	 434.35 	 434.35
Class Balance:	 53.04 	 53.04
----------------------------------------------------------------------------
bpic2012 prefix_agg
Num Variables:	 43 	 120
Num training instances:	 3642 	 3748
Ratio Variables to Instances:	 30.35 	 87.16279069767442
Class Balance:	 52.99 	 53.1
----------------------------------------------------------------------------
bpic2012 prefix_index
Num Variables:	 11 	 770
Num training instances:	 3642 	 3748
Ratio Variables to Instances:	 4.72987012987013 	 340.72727272727275
Class Balance:	 52.99 	 53.1
----------------------------------------------------------------------------
production single_agg
Num Variables:	 166 	 166
Num training instances:	 1530 	 1530
Ratio Variables to Instances:	 9.216867469879517 	 9.216867469879517
Class Balance:	 42.16 	 42.16
-------------------------------------------------------------------------