In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from helper import get_performance

# Tunable notebook constants. Of these, only FOLDS is read by the cells visible
# here (as the `cv` argument of cross_val_score); the remaining four are
# presumably meant for a neural-network model defined elsewhere -- TODO confirm.
FOLDS = 5
EPOCHS = 700
BATCH_SIZE = 2_048
LEARNING_RATE = 7e-4
ACTIVATION = 'swish'

In [22]:
# Load the labelled table and split it 50/50 into a train half and a hold-out half.
# The label column is still inside both halves at this point; `y_train` returned
# by the split is not referenced again in the visible cells.
raw_train = pd.read_csv("dataset/tabular/train.csv")
target = raw_train.target
X_train, X_test, y_train, y_test = train_test_split(
    raw_train, target, test_size=0.5, random_state=29
)

# Keep a 3% sample of the train half. Rows are drawn WITH replacement, so the
# sample may contain repeated rows. The labels are re-read from the sampled frame
# so that they stay aligned with the sampled features.
X_train = X_train.sample(frac=0.03, replace=True, random_state=1).reset_index(drop=True)
target = X_train['target']
X_train = X_train.drop(columns='target')
train = X_train.drop(columns='id')

# Same treatment for the hold-out half.
X_test = X_test.sample(frac=0.03, replace=True, random_state=1).reset_index(drop=True)
y_test = X_test['target']
X_test = X_test.drop(columns='target')
test = X_test.drop(columns='id')

print("Train data: ", train.shape)
print("Test data: ", test.shape)

Train data:  (9000, 100)
Test data:  (9000, 100)


In [23]:
# Rebind X_test to `test`, i.e. the sampled hold-out frame with both 'target' and
# 'id' dropped, so the models below predict on the same columns as `train`,
# which is what they are fitted on.
X_test = test

# Default Model

In [24]:
# Baseline classifier: an SVC left at its library-default hyper-parameters.
model_default = svm.SVC()

# FOLDS-fold cross-validation scores on the training sample (stored, not displayed).
scores_default = cross_val_score(model_default, train, target, cv=FOLDS)

# Fit on the whole training sample, then predict the hold-out sample.
y_pred_default = model_default.fit(train, target).predict(X_test)

# Last expression of the cell: its return value is rendered as the cell output.
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.525581147833746
Recall: 0.5231111111111111
F1 Score: 0.4855188098273521


(0.525581147833746, 0.5231111111111111, 0.4855188098273521, None)

# MAPS

In [25]:
import time 
import sys
sys.path.insert(1, './mmd')
from mmd import diagnoser
from scipy import stats as st
import numpy
#notebook's library
%matplotlib inline
from helper import get_top_f1_rules, get_relevent_attributs_target, get_MMD_results, get_biased_features, get_BGMD_results
from helper import generateTrain_data_Weights

In [26]:
# Hold-out features next to the true label (`y_test`) and the model's prediction,
# the latter stored in a new 'pred' column.
default_result = pd.concat([X_test, y_test], axis=1, join='inner').assign(pred=y_pred_default)

In [27]:
def mispredict_label(row):
    """Tell whether a row's prediction disagrees with its true label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'target' and 'pred', e.g. one row handed
        over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        False when ``row['target'] == row['pred']`` is truthy, True otherwise.
    """
    return not (row['target'] == row['pred'])

In [28]:
# Work on copies so that adding the 'mispredict' column below does not touch the
# frames produced by the model cells.
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()

# Flag every row whose prediction differs from its label. Comparing the two
# columns directly yields the same boolean Series as calling `mispredict_label`
# on each row through DataFrame.apply, without one Python-level call per row.
X_test_copy['mispredict'] = default_result_copy['target'].ne(default_result_copy['pred'])

In [29]:
# NOTE(review): `diagnoser.Settings` is bound here without being called, so the
# assignment on the next line sets `all_rules` on that object itself (if Settings
# is a class, this is a class-level attribute, not a per-instance one) -- confirm
# this is how the mmd library expects to be configured.
settings = diagnoser.Settings
settings.all_rules = True
# Get the relevant attributes and the target from the frame carrying 'mispredict'
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate the MMD rules and the corresponding information (elapsed time, features)
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

# Get the biased attributes this time; they replace `relevant_attributes` in the BGMD run
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
# Last expression of the cell: both rule lists are shown as the cell output.
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m2[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	9000
# Cols:	101
% Target in dataset 47.69%
[1mSubgroup: f43<=4.07133[0m
% of subgroup in population (Full Dataset):	79.99% (7199 rows)
Precision: P(mispredict=True | f43<=4.07133) = 49.24%
Recall: P(f43<=4.07133 | mispredict=True) = 82.6%
[1mSubgroup: f43<=3.07003[0m
% of subgroup in population (Full Dataset):	59.82% (5384 rows)
Precision: P(mispredict=True | f43<=3.07003) = 50.65%
Recall: P(f43<=3.07003 | mispredict=True) = 63.54%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m2[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	9000
# Cols:	101
% Target in dataset 47.69%
[1mSubgroup: f43<=4.07133[0m
% of subgroup in population (Full Dataset):	79.99% (7199 rows)
Precision: P(mispredict=True | f43<=4.07133) = 49.24%
Recall: P(f43<=4.07133 | mispredict=True) = 82.6%
[1mSubgroup: f43<=3.07003[0m
% of subgroup in populatio

([[0.6170046123052824, 0.49242950409779135, 0.825955265610438, 'f43<=4.07133'],
  [0.5636626705250103,
   0.5065007429420505,
   0.6353681267474371,
   'f43<=3.07003']],
 [[0.6170046123052824, 0.49242950409779135, 0.825955265610438, 'f43<=4.07133'],
  [0.5636626705250103,
   0.5065007429420505,
   0.6353681267474371,
   'f43<=3.07003']])

# Decision Tree (DT)

In [30]:
# Decision-tree baseline. random_state is pinned because scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree (and every
# metric or rule derived from its predictions) can change from one run to the
# next. 29 is the same seed the train/test split uses.
# NOTE: this cell rebinds model_default / scores_default / y_pred_default, which
# previously held the SVC baseline's model, scores and predictions.
model_default = DecisionTreeClassifier(random_state=29)

# FOLDS-fold cross-validation scores on the training sample (stored, not displayed).
scores_default = cross_val_score(model_default, X=train, y=target, cv=FOLDS)

# Fit on the whole training sample, then predict the hold-out sample.
model_default.fit(train, target)
y_pred_default = model_default.predict(X_test)

# Last expression of the cell: its return value is rendered as the cell output.
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.5482927892407136
Recall: 0.5481111111111111
F1 Score: 0.5481483526891013


(0.5482927892407136, 0.5481111111111111, 0.5481483526891013, None)

In [31]:
# Rebuild the misprediction table from the current contents of `y_pred_default`:
# hold-out features, true label and a 'pred' column.
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()

# Flag every row whose prediction differs from its label. Comparing the two
# columns directly yields the same boolean Series as a row-by-row
# DataFrame.apply, without one Python-level call per row.
X_test_copy['mispredict'] = default_result_copy['target'].ne(default_result_copy['pred'])

# NOTE(review): `diagnoser.Settings` is bound without being called, so `all_rules`
# is set on that object itself -- confirm this is the intended way to configure it.
settings = diagnoser.Settings
settings.all_rules = True

# Get the relevant attributes and the target from the frame carrying 'mispredict'
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate the MMD rules and the corresponding information (elapsed time, features)
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

# Get the biased attributes this time; they replace `relevant_attributes` in the BGMD run
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
# Last expression of the cell: both rule lists are shown as the cell output.
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	9000
# Cols:	101
% Target in dataset 45.19%
[1mSubgroup: f2<=172.454[0m
% of subgroup in population (Full Dataset):	60.03% (5403 rows)
Precision: P(mispredict=True | f2<=172.454) = 46.38%
Recall: P(f2<=172.454 | mispredict=True) = 61.62%
[1mSubgroup: f16<=0.0250106[0m
% of subgroup in population (Full Dataset):	39.99% (3599 rows)
Precision: P(mispredict=True | f16<=0.0250106) = 46.93%
Recall: P(f16<=0.0250106 | mispredict=True) = 41.53%
[1mSubgroup: f46>0.0726664[0m
% of subgroup in population (Full Dataset):	40.28% (3625 rows)
Precision: P(mispredict=True | f46>0.0726664) = 46.76%
Recall: P(f46>0.0726664 | mispredict=True) = 41.68%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	9000
# Cols:	101
% Target in dataset 45.19%
[1mSubgroup: f34<=2.98954[0m
% of subgroup in p

([[0.5292502639915523, 0.46381639829724225, 0.6161790017211703, 'f2<=172.454'],
  [0.4407176287051482,
   0.4675862068965517,
   0.4167691172854684,
   'f46>0.0726664'],
  [0.4406470127837203,
   0.4692970269519311,
   0.4152938283747234,
   'f16<=0.0250106']],
 [[0.5270384452893958, 0.4619514904647288, 0.6134743053848045, 'f34<=2.98954'],
  [0.4383561643835616,
   0.4669260700389105,
   0.41308089500860584,
   'f34<=1.90474'],
  [0.0884476534296029,
   0.536986301369863,
   0.04819277108433735,
   'f58<=0.00394169 & f57>4.08037']])