# Initial EDA

In this notebook we explore the preprocessed data tables and fit baseline classifiers on features derived from the fine-creation events.

In [143]:
# Load the "autoreload" extension
%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
#%matplotlib notebook
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.style.use('ggplot')

# Make the project's 'src' directory (one level above the current working
# directory) importable, so the self-written packages can be loaded.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# self written packages
from data import data_utils
from data.preprocessing import preprocess

from copy import deepcopy


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read files from directory of processed data tables

In [144]:
# Load the tables of the "processed" data directory through the project helper.
# NOTE(review): presumably a positionally indexable collection of DataFrames
# (it is indexed with integers in later cells) — confirm in data_utils.
dfs = data_utils.read_csv_from_dir(dir_name="processed")
n_tables = len(dfs)
print("We have {} tables".format(n_tables))

We have 11 tables


## Preview one of the 11 tables (index 1):

In [145]:
# Show the first five rows of the table at position 1.
dfs[1].head(n=5)

Unnamed: 0,index,T:concept:name,E:concept:name,E:dismissal,E:lifecycle:transition,E:time:timestamp
0,17,A10001,Send Appeal to Prefecture,#,complete,9/24/07 12:00 AM
1,263,A10102,Send Appeal to Prefecture,#,complete,10/25/07 12:00 AM
2,276,A10107,Send Appeal to Prefecture,#,complete,9/24/07 12:00 AM
3,320,A10125,Send Appeal to Prefecture,NIL,complete,9/24/07 12:00 AM
4,574,A10236,Send Appeal to Prefecture,#,complete,10/2/07 12:00 AM


In [146]:
# Create-fine events: take an independent deep copy of table 3.
create_fine_activities_RAW = deepcopy(dfs[3])

# Distinct values of the "T:concept:name" column in that table.
create_fine_uid = set(create_fine_activities_RAW.loc[:, "T:concept:name"])

# Number of distinct identifiers.
len(create_fine_uid)

150370

In [147]:
# Payment events: take an independent deep copy of table 7.
payment_activities_RAW = deepcopy(dfs[7])
payment_uid = set(payment_activities_RAW.loc[:, "T:concept:name"])

# Intersection (not union): identifiers present in BOTH the payment table
# and the create-fine table.
create_to_payment_uid = payment_uid.intersection(create_fine_uid)
len(create_to_payment_uid)

69715

In [148]:
# Keep only the identifiers that occur in no table other than create-fine
# (position 3) and payment (position 7): subtract the identifiers of every
# remaining table from the create-fine/payment intersection.
result_ids = create_to_payment_uid
for i in range(len(dfs)):
    if i in (3, 7):
        continue
    # Reading a column does not modify the table, so the per-table deep copy
    # the cell used to make is unnecessary and is dropped.
    helper = set(dfs[i]["T:concept:name"])
    result_ids = result_ids - helper
print("number of results which are only in create_fine & payment:")
len(result_ids)

number of results which are only in create_fine & payment:


46383

In [151]:
# NOTE(review): `relevant_events_RAW` is not assigned in any cell visible in
# this notebook — it appears to come from leftover kernel state. Define it
# before this cell so the notebook survives Restart & Run All.
helper = relevant_events_RAW.set_index("T:concept:name")
# Select the rows whose identifier is in result_ids. `.loc` is given a list
# because recent pandas versions no longer accept a set as an indexer.
helper = helper.loc[list(result_ids), :]
# Turn the identifier back into a regular column.
payment_feature = helper.reset_index()
payment_feature.head()

Unnamed: 0,T:concept:name,level_0,index,E:amount,E:article,E:concept:name,E:dismissal,E:lifecycle:transition,E:org:resource,E:points,E:time:timestamp,E:vehicleClass
0,N47426,45785,174207,33.6,157.0,Create Fine,NIL,complete,536.0,0.0,2/13/03 12:00 AM,A
1,S155716,102067,376330,38.0,7.0,Create Fine,NIL,complete,41.0,0.0,7/23/10 12:00 AM,A
2,A4781,21071,72379,21.0,7.0,Create Fine,NIL,complete,559.0,0.0,10/18/06 12:00 AM,A
3,N59171,51153,194962,33.6,157.0,Create Fine,NIL,complete,552.0,0.0,7/24/04 12:00 AM,A
4,N88746,66626,250811,35.0,157.0,Create Fine,NIL,complete,538.0,0.0,1/15/06 12:00 AM,A


In [150]:
# Number of rows in relevant_events_RAW.
# NOTE(review): this name is not assigned in any visible cell — confirm where
# it is defined before relying on this count.
len(relevant_events_RAW)

46383

In [235]:
# Build the modelling table from the create-fine events, keyed by identifier.
data = create_fine_activities_RAW.set_index("T:concept:name")

# Target: 1 for the identifiers listed in payment_feature, 0 for all others.
data["label"] = 0
payment_indices = list(payment_feature["T:concept:name"])
data.loc[payment_indices, "label"] = 1

# According to the paper which used this data set:
#   "Dismissal contains a character that encodes the diverse reasons for a
#   possible dismissal of the fine. A value of NIL encodes that the fine is
#   not dismissed (i.e. has to be paid); any other value encodes different
#   motivations."
#
# Redesign the feature so that 1 encodes "has to be paid" (NIL) and 0 encodes
# the opposite. The comparison is evaluated row by row, so rows that happen to
# share an index label are each encoded from their own E:dismissal value
# (label-based assignment through an index list would set all of them to 1).
data["dismissal"] = (data["E:dismissal"] == 'NIL').astype("int64")

# Drop the raw column and restore the identifier as a regular column.
data = data.drop(columns='E:dismissal').reset_index()

In [236]:
# Show a bounded preview instead of rendering the whole frame.
data.head(10)

Unnamed: 0,T:concept:name,index,E:amount,E:article,E:concept:name,E:lifecycle:transition,E:org:resource,E:points,E:time:timestamp,E:vehicleClass,label,dismissal
0,A1,0,35.0,157.0,Create Fine,complete,561.0,0.0,7/24/06 12:00 AM,A,0,1
1,A100,2,35.0,157.0,Create Fine,complete,561.0,0.0,8/2/06 12:00 AM,A,0,1
2,A10000,7,36.0,157.0,Create Fine,complete,561.0,0.0,3/9/07 12:00 AM,A,0,1
3,A10001,12,36.0,157.0,Create Fine,complete,537.0,0.0,3/19/07 12:00 AM,A,0,1
4,A10004,18,36.0,157.0,Create Fine,complete,537.0,0.0,3/20/07 12:00 AM,A,0,1
5,A10005,23,36.0,157.0,Create Fine,complete,537.0,0.0,3/20/07 12:00 AM,A,1,1
6,A10007,25,36.0,157.0,Create Fine,complete,537.0,0.0,3/20/07 12:00 AM,A,1,1
7,A10008,27,36.0,157.0,Create Fine,complete,537.0,0.0,3/20/07 12:00 AM,A,0,1
8,A10009,32,22.0,7.0,Create Fine,complete,537.0,0.0,3/20/07 12:00 AM,A,0,1
9,A1001,38,21.0,7.0,Create Fine,complete,550.0,0.0,8/2/06 12:00 AM,A,0,1


In [237]:
# Target variable: the binary "label" column.
target_column = data["label"]

# Columns excluded from the feature matrix (the target itself plus the
# columns that are not used as features).
not_used_columns = [
    'label',
    'index',
    'E:org:resource',
    'T:concept:name',
    'E:concept:name',
    'E:lifecycle:transition',
    'E:time:timestamp',
]
cleaned_features = data.drop(columns=not_used_columns)

# NOTE(review): new_names is not applied in any visible cell, and it lists
# 'org_resource' although 'E:org:resource' is dropped above — confirm whether
# a rename was intended.
new_names = ['amount', 'article', 'dismissal', 'org_resource', 'points', 'vehicle_class']


In [238]:
# First five rows of the feature table.
cleaned_features.head(n=5)

Unnamed: 0,E:amount,E:article,E:points,E:vehicleClass,dismissal
0,35.0,157.0,0.0,A,1
1,35.0,157.0,0.0,A,1
2,36.0,157.0,0.0,A,1
3,36.0,157.0,0.0,A,1
4,36.0,157.0,0.0,A,1


In [199]:
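# NOTE: 'E:org:resource' is among the columns dropped when cleaned_features is
# built, so this check cannot run against the current cleaned_features; the
# count displayed for this cell stems from an earlier execution.
#len(set(cleaned_features['E:org:resource']))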

147

In [239]:
# One-hot encode the categorical feature columns listed here.
dummies = ['E:vehicleClass']

for dummy in dummies:
    # Indicator columns, named "<column>_<category>".
    indicator_cols = pd.get_dummies(cleaned_features[dummy]).add_prefix(dummy + '_')
    # Replace the categorical column by its indicator columns, which are
    # appended after the remaining features.
    cleaned_features = pd.concat(
        [cleaned_features.drop(columns=[dummy]), indicator_cols],
        axis=1,
    )

In [240]:
# Number of feature columns after one-hot encoding.
cleaned_features.shape[1]

8

In [319]:
# First five rows of the encoded feature table.
cleaned_features.head(n=5)

Unnamed: 0,E:amount,E:article,E:points,dismissal,E:vehicleClass_A,E:vehicleClass_C,E:vehicleClass_M,E:vehicleClass_R
0,35.0,157.0,0.0,1,1,0,0,0
1,35.0,157.0,0.0,1,1,0,0,0
2,36.0,157.0,0.0,1,1,0,0,0
3,36.0,157.0,0.0,1,1,0,0,0
4,36.0,157.0,0.0,1,1,0,0,0


In [377]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Seed shared by the randomised steps below (split and forests).
seed_num = 42

# Hyper-parameter grid for the random-forest search: tree counts
# 10, 15, 20, 25 combined with both split criteria.
parameters = {
    'n_estimators': np.arange(10, 30, 5),
    'criterion': ('gini', 'entropy'),
}

In [378]:
# Split features and target into train and test parts; 40% of the rows go to
# the test part and the split is seeded with seed_num.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_features,
    target_column,
    test_size=0.4,
    random_state=seed_num,
)

In [383]:
# Seeded random forest whose hyper-parameters are tuned by a grid search over
# `parameters`, evaluated with 10-fold cross-validation.
rf = RandomForestClassifier(random_state=seed_num)
clf = GridSearchCV(estimator=rf, param_grid=parameters, cv=10)

In [384]:
# Run the grid search on the training part.
clf.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ('gini', 'entropy'), 'n_estimators': array([10, 15, 20, 25])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [388]:
# Parameter combination selected by the grid search; read by key
# ('n_estimators', 'criterion') when the final forest is built.
bp = clf.best_params_

In [392]:
# Fresh forest configured with the parameters chosen by the grid search and
# the shared seed.
rf_with_best_params = RandomForestClassifier(
    criterion=bp['criterion'],
    n_estimators=bp['n_estimators'],
    random_state=seed_num,
)

In [393]:
# Fit the tuned forest on the training part.
rf_with_best_params.fit(X_train, y_train)
# Predict labels with the model that was just fitted. The cell previously
# called the grid-search object `clf` here, so the fitted forest was never
# actually used.
rf_with_best_params_pred = rf_with_best_params.predict(X_test)
# Predict class probabilities with the same model.
rf_with_best_params_probs = rf_with_best_params.predict_proba(X_test)

In [395]:
# model accuracy
accuracy_score(y_test, rf_with_best_params_pred)

0.69506883021879362

In [396]:
# Support vector classifier with a linear kernel, as a comparison model.
# WARNING: DO NOT RUN, TAKES > 1 HOUR !!!!!
from sklearn import svm

svm_clf = svm.SVC(C=1.0, kernel='linear')
svm_clf.fit(X=X_train, y=y_train)


AttributeError: 'SVC' object has no attribute 'pred'

In [398]:
# Labels predicted by the SVM for the test part.
res = svm_clf.predict(X=X_test)

In [399]:
# Accuracy of the SVM predictions on the test part.
accuracy_score(y_true=y_test, y_pred=res)

0.69262485868191792