In [83]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from helper import get_performance

EPOCHS = 700
BATCH_SIZE = 2048
ACTIVATION = 'swish'
LEARNING_RATE = 0.0007
FOLDS = 5

In [84]:
data_dir = "dataset/Water_Quality/"
data = pd.read_csv(data_dir + "water_potability.csv")
data = data.sample(frac=1, replace=True, random_state=1).reset_index(drop=True)
data = data.dropna()
label = data.columns[-1]
features = data.columns[:-1]
# Separate the data
X, y = data[features], data[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [85]:
y_train.value_counts()

0    819
1    503
Name: Potability, dtype: int64

In [86]:
y_test.value_counts()

0    398
1    254
Name: Potability, dtype: int64

In [87]:
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(1322, 9) (1322,) (652, 9) (652,)


# Default Model

In [88]:
model_default = svm.SVC(kernel='sigmoid')
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.5180195164024661
Recall: 0.5322085889570553
F1 Score: 0.5231311049971578


(0.5180195164024661, 0.5322085889570553, 0.5231311049971578, None)

In [89]:
pd.DataFrame(y_pred_default).value_counts()

0    439
1    213
dtype: int64

In [90]:
import time 
import sys
sys.path.insert(1, './mmd')
from mmd import diagnoser
from scipy import stats as st
import numpy
#notebook's library
%matplotlib inline
from helper import get_top_f1_rules, get_relevent_attributs_target, get_MMD_results, get_biased_features, get_BGMD_results
from helper import generateTrain_data_Weights

In [91]:
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default

In [92]:
def mispredict_label(row):
    if row['Potability'] == row['pred']:
        return False
    return True

In [93]:
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
X_test_copy['mispredict'] = default_result_copy.apply(lambda row: mispredict_label(row), axis=1)

In [94]:
settings = diagnoser.Settings
settings.all_rules = True
# Get relevent attributes and target 
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and correspodning information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

#Get biased attributes this time 
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	652
# Cols:	10
% Target in dataset 46.78%
[1mSubgroup: Chloramines<=5.83068529423083[0m
% of subgroup in population (Full Dataset):	19.79% (129 rows)
Precision: P(mispredict=True | Chloramines<=5.83068529423083) = 58.91%
Recall: P(Chloramines<=5.83068529423083 | mispredict=True) = 24.92%
[1mSubgroup: Solids<=19635.58825474065[0m
% of subgroup in population (Full Dataset):	40.34% (263 rows)
Precision: P(mispredict=True | Solids<=19635.58825474065) = 55.51%
Recall: P(Solids<=19635.58825474065 | mispredict=True) = 47.87%
[1mSubgroup: Solids<=19635.58825474065 & Organic_carbon<=11.48268699578233[0m
% of subgroup in population (Full Dataset):	7.21% (47 rows)
Precision: P(mispredict=True | Solids<=19635.58825474065 & Organic_carbon<=11.48268699578233) = 74.47%
Recall: P(Solids<=19635.58825474065 & Organic_carbon<=11.48268699578233 | mispredict=True) = 11.48%

#############

([[0.5140845070422536,
   0.5551330798479087,
   0.4786885245901639,
   'Solids<=19635.58825474065'],
  [0.3502304147465438,
   0.5891472868217055,
   0.24918032786885247,
   'Chloramines<=5.83068529423083'],
  [0.19886363636363635,
   0.7446808510638298,
   0.11475409836065574,
   'Solids<=19635.58825474065 & Organic_carbon<=11.48268699578233']],
 [[0.6183574879227053,
   0.4894837476099426,
   0.839344262295082,
   'Organic_carbon<=17.128023270048516'],
  [0.5718432510885341,
   0.5130208333333334,
   0.6459016393442623,
   'Hardness<=203.4193306887763'],
  [0.3287037037037037,
   0.5590551181102362,
   0.23278688524590163,
   'Organic_carbon<=11.48268699578233']])

# Decision Tree

In [95]:
model_default = DecisionTreeClassifier()
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.8246461095207238
Recall: 0.8236196319018405
F1 Score: 0.8240359110139183


(0.8246461095207238, 0.8236196319018405, 0.8240359110139183, None)

In [96]:
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
X_test_copy['mispredict'] = default_result_copy.apply(lambda row: mispredict_label(row), axis=1)
settings = diagnoser.Settings
settings.all_rules = True
# Get relevent attributes and target 
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and correspodning information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

#Get biased attributes this time 
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	652
# Cols:	10
% Target in dataset 17.64%
[1mSubgroup: ph<=8.205550719357213[0m
% of subgroup in population (Full Dataset):	79.45% (518 rows)
Precision: P(mispredict=True | ph<=8.205550719357213) = 19.5%
Recall: P(ph<=8.205550719357213 | mispredict=True) = 87.83%
[1mSubgroup: Hardness<=203.4193306887763[0m
% of subgroup in population (Full Dataset):	58.9% (384 rows)
Precision: P(mispredict=True | Hardness<=203.4193306887763) = 20.57%
Recall: P(Hardness<=203.4193306887763 | mispredict=True) = 68.7%
[1mSubgroup: Conductivity<=446.9924646420934[0m
% of subgroup in population (Full Dataset):	59.66% (389 rows)
Precision: P(mispredict=True | Conductivity<=446.9924646420934) = 20.31%
Recall: P(Conductivity<=446.9924646420934 | mispredict=True) = 68.7%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mis

([[0.31911532385466035,
   0.19498069498069498,
   0.8782608695652174,
   'ph<=8.205550719357213'],
  [0.3166332665330661,
   0.20572916666666666,
   0.6869565217391305,
   'Hardness<=203.4193306887763'],
  [0.3134920634920635,
   0.20308483290488433,
   0.6869565217391305,
   'Conductivity<=446.9924646420934']],
 [[0.31351351351351353,
   0.22745098039215686,
   0.5043478260869565,
   'Sulfate>341.53708898075774'],
  [0.30672926447574334,
   0.18702290076335878,
   0.8521739130434782,
   'Sulfate>301.4297472651064'],
  [0.2924901185770751,
   0.18925831202046037,
   0.6434782608695652,
   'Sulfate>324.05559223457215']])