In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from helper import get_performance

# Notebook-wide configuration constants.
# NOTE(review): EPOCHS, BATCH_SIZE, ACTIVATION and LEARNING_RATE are not
# referenced by any cell visible in this notebook; presumably leftovers from
# a neural-network experiment -- confirm before relying on or removing them.
EPOCHS = 700
BATCH_SIZE = 2048
ACTIVATION = 'swish'
LEARNING_RATE = 0.0007
# Number of cross-validation folds; passed as `cv` to cross_val_score below.
FOLDS = 5

In [14]:
# Load the Bank Marketing dataset, integer-encode its categorical columns and
# produce the train/test split used by every model below.
data_dir = "dataset/Bank_Marketing/"
data = pd.read_csv(data_dir + "bank-additional-full.csv", sep=';')
# NOTE(review): replace=True samples WITH replacement, so duplicated rows can
# end up on both sides of the train/test split -- confirm this is intended.
data = data.sample(frac=0.5, replace=True, random_state=1).reset_index(drop=True)

# Hand-assigned integer codes per categorical column. Note the target coding:
# 'yes' -> 0 and 'no' -> 1, i.e. class 1 is the majority "no" answer.
category_codes = {
    'y': {'yes': 0, 'no': 1},
    'job': {'housemaid': 1, 'services': 2, 'admin.': 3, 'blue-collar': 4,
            'technician': 5, 'retired': 6, 'management': 7, 'unemployed': 8,
            'self-employed': 9, 'unknown': 10, 'entrepreneur': 11, 'student': 12},
    'education': {'basic.4y': 1, 'high.school': 2, 'basic.6y': 3, 'basic.9y': 4,
                  'professional.course': 5, 'unknown': 6, 'university.degree': 7,
                  'illiterate': 8},
    'marital': {'married': 1, 'single': 2, 'divorced': 3, 'unknown': 4},
    'default': {'yes': 1, 'no': 2, 'unknown': 3},
    'housing': {'yes': 1, 'no': 2, 'unknown': 3},
    'loan': {'yes': 1, 'no': 2, 'unknown': 3},
    'poutcome': {'nonexistent': 1, 'failure': 2, 'success': 3},
}
for column, codes in category_codes.items():
    # Assign the encoded column back instead of `data[col].replace(..., inplace=True)`:
    # the chained in-place form mutates a temporary and is a no-op under pandas
    # Copy-on-Write. `.map` also yields a plain integer column directly.
    encoded = data[column].map(codes)
    unmapped = data.loc[encoded.isna() & data[column].notna(), column].unique()
    if len(unmapped):
        # Fail early with a clear message rather than leaving raw strings for
        # the estimators to choke on later.
        raise ValueError(f"Unexpected categories in '{column}': {list(unmapped)}")
    data[column] = encoded

# Remaining categorical columns get codes in sorted-label order; the encoder is
# re-fitted for each column, so afterwards it only remembers 'day_of_week'.
labelencoder_X = LabelEncoder()
for column in ('contact', 'month', 'day_of_week'):
    data[column] = labelencoder_X.fit_transform(data[column])

# Replace dots in column names with underscores.
data = data.rename(columns={'emp.var.rate': 'emp_var_rate',
                            'cons.price.idx': 'cons_price_idx',
                            'cons.conf.idx': 'cons_conf_idx',
                            'nr.employed': 'nr_employed'})

# Separate the target from the features; `data` holds features only from here on.
y = data['y']
data = data.drop(columns=['y'])

# Seed the split (same seed as the sampling above) so that Restart & Run All
# reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.33, random_state=1
)

In [15]:
# Class distribution of the training labels (target was encoded 'yes' -> 0, 'no' -> 1).
y_train.value_counts()

1    12277
0     1520
Name: y, dtype: int64

In [16]:
# Class distribution of the held-out test labels (target was encoded 'yes' -> 0, 'no' -> 1).
y_test.value_counts()

1    6039
0     758
Name: y, dtype: int64

In [17]:
# Sanity check: shapes of the feature frames and label series for both splits.
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(13797, 20) (13797,) (6797, 20) (6797,)


# Default Model: Support Vector Classifier (default hyperparameters)

In [18]:
# Baseline: support vector classifier with library-default hyperparameters.
# NOTE(review): the features are passed to SVC unscaled even though
# StandardScaler / make_pipeline are imported above -- confirm this is intended.
model_default = svm.SVC()
# FOLDS-fold cross-validation scores on the training split.
# NOTE(review): `scores_default` is not read by any cell visible here.
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
# Fit on the full training split and predict the held-out test split.
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
# Project helper (helper.py); its return value is displayed as the cell output.
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.8716904071237983
Recall: 0.8952479034868325
F1 Score: 0.8708174375564097


(0.8716904071237983, 0.8952479034868325, 0.8708174375564097, None)

In [19]:
# Count how many test rows the model assigned to each predicted class.
pd.DataFrame(y_pred_default).value_counts()

1    6549
0     248
dtype: int64

In [20]:
import time 
import sys
sys.path.insert(1, './mmd')
from mmd import diagnoser
from scipy import stats as st
import numpy
#notebook's library
%matplotlib inline
from helper import get_top_f1_rules, get_relevent_attributs_target, get_MMD_results, get_biased_features, get_BGMD_results
from helper import generateTrain_data_Weights

In [21]:
# Put the held-out features and their true labels side by side (aligned on the
# shared index) and append the model's predictions as the `pred` column.
default_result = (
    pd.concat([X_test, y_test], axis=1, join='inner')
    .assign(pred=y_pred_default)
)

In [22]:
def mispredict_label(row):
    """Return True when the row's prediction differs from its true label.

    `row` is any mapping-like object (e.g. a DataFrame row) exposing the
    keys 'y' (true label) and 'pred' (predicted label).
    """
    labels_match = row['y'] == row['pred']
    return not labels_match

In [23]:
# Work on copies so the diagnosis step never mutates the evaluation frames.
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
# Flag rows whose prediction differs from the true label. A vectorised
# comparison yields the same boolean Series as applying mispredict_label
# row by row, without the per-row Python call, and is well-defined on an
# empty frame.
X_test_copy['mispredict'] = default_result_copy['y'] != default_result_copy['pred']

In [24]:
# Run the misprediction diagnosis (MMD and BGMD rule mining) on the SVM results.
# `settings` is bound to diagnoser.Settings itself (it is not called), so the
# attribute below is set on that shared object.
# NOTE(review): `settings` is not passed to any helper here; presumably the
# helpers read it through the mmd module -- confirm.
settings = diagnoser.Settings
settings.all_rules = True
# Get relevant attributes and target
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and corresponding information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

# Get the biased attributes, then mine rules again restricted to them
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

# Report the timing values returned by the two helpers, then display both rule lists.
print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	6797
# Cols:	21
% Target in dataset 10.48%
[1mSubgroup: duration>844[0m
% of subgroup in population (Full Dataset):	3.5% (238 rows)
Precision: P(mispredict=True | duration>844) = 59.66%
Recall: P(duration>844 | mispredict=True) = 19.94%
[1mSubgroup: duration>383 & euribor3m<=0.729[0m
% of subgroup in population (Full Dataset):	0.82% (56 rows)
Precision: P(mispredict=True | duration>383 & euribor3m<=0.729) = 67.86%
Recall: P(duration>383 & euribor3m<=0.729 | mispredict=True) = 5.34%
[1mSubgroup: duration>582[0m
% of subgroup in population (Full Dataset):	8.78% (597 rows)
Precision: P(mispredict=True | duration>582) = 45.9%
Recall: P(duration>582 | mispredict=True) = 38.48%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	6797
# Cols:	21
% Target in dataset 10.48%
[1mSubgro

([[0.4186401833460657,
   0.45896147403685095,
   0.3848314606741573,
   'duration>582'],
  [0.29894736842105263, 0.5966386554621849, 0.199438202247191, 'duration>844'],
  [0.09895833333333334,
   0.6785714285714286,
   0.05337078651685393,
   'duration>383 & euribor3m<=0.729']],
 [[0.4186401833460657,
   0.45896147403685095,
   0.3848314606741573,
   'duration>582'],
  [0.29894736842105263, 0.5966386554621849, 0.199438202247191, 'duration>844'],
  [0.09895833333333334,
   0.6785714285714286,
   0.05337078651685393,
   'duration>383 & euribor3m<=0.729']])

# Decision Tree

In [25]:
# Decision tree with default hyperparameters. Note that this cell rebinds
# `model_default`, `scores_default` and `y_pred_default`, replacing the SVM
# objects created earlier in the notebook.
# random_state is fixed (same seed as the data sampling) because the tree
# learner is randomised; without it results differ between runs.
model_default = DecisionTreeClassifier(random_state=1)
# FOLDS-fold cross-validation scores on the training split.
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv=FOLDS)
# Fit on the full training split and predict the held-out test split.
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
# Project helper (helper.py); its return value is displayed as the cell output.
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.9202327632004054
Recall: 0.9184934529939679
F1 Score: 0.9193169274809054


(0.9202327632004054, 0.9184934529939679, 0.9193169274809054, None)

In [26]:
# Repeat the misprediction diagnosis for the decision-tree predictions.
# Join held-out features with their true labels and attach the predictions.
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:, "pred"] = y_pred_default
# Work on copies so the diagnosis step never mutates the evaluation frames.
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
# Flag rows whose prediction differs from the true label. A vectorised
# comparison yields the same boolean Series as a row-wise apply, without the
# per-row Python call, and is well-defined on an empty frame.
X_test_copy['mispredict'] = default_result_copy['y'] != default_result_copy['pred']
# `settings` is bound to diagnoser.Settings itself (it is not called), so the
# attribute below is set on that shared object.
settings = diagnoser.Settings
settings.all_rules = True
# Get relevant attributes and target
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and corresponding information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

# Get the biased attributes, then mine rules again restricted to them
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

# Report the timing values returned by the two helpers, then display both rule lists.
print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	6797
# Cols:	21
% Target in dataset 8.15%
[1mSubgroup: duration>582[0m
% of subgroup in population (Full Dataset):	8.78% (597 rows)
Precision: P(mispredict=True | duration>582) = 33.33%
Recall: P(duration>582 | mispredict=True) = 35.92%
[1mSubgroup: duration>844[0m
% of subgroup in population (Full Dataset):	3.5% (238 rows)
Precision: P(mispredict=True | duration>844) = 35.71%
Recall: P(duration>844 | mispredict=True) = 15.34%
[1mSubgroup: duration>844 & day_of_week>3[0m
% of subgroup in population (Full Dataset):	0.91% (62 rows)
Precision: P(mispredict=True | duration>844 & day_of_week>3) = 46.77%
Recall: P(duration>844 & day_of_week>3 | mispredict=True) = 5.23%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	6797
# Cols:	21
% Target in dataset 8.15%
[1mSubgroup: durati

([[0.34578627280625546,
   0.3333333333333333,
   0.3592057761732852,
   'duration>582'],
  [0.21464646464646464,
   0.35714285714285715,
   0.15342960288808663,
   'duration>844'],
  [0.09415584415584415,
   0.46774193548387094,
   0.052346570397111915,
   'duration>844 & day_of_week>3']],
 [[0.34578627280625546,
   0.3333333333333333,
   0.3592057761732852,
   'duration>582'],
  [0.21464646464646464,
   0.35714285714285715,
   0.15342960288808663,
   'duration>844'],
  [0.09415584415584415,
   0.46774193548387094,
   0.052346570397111915,
   'duration>844 & day_of_week>3']])