In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from helper import get_performance

EPOCHS = 700
BATCH_SIZE = 2048
ACTIVATION = 'swish'
LEARNING_RATE = 0.0007
FOLDS = 5

In [13]:
# Reading the dataset
data = pd.read_csv("dataset/Job_Change/aug_train.csv")
aug_train = data.sample(frac=1, replace=True, random_state=1).reset_index(drop=True)

# Seperate aug_train into target and features 
y = aug_train['target']
X_aug_train = aug_train.drop('target',axis = 'columns')
# save the index for X_aug_train 
X_aug_train_index = X_aug_train.index.to_list()

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                # convert float NaN --> string NaN
                output[col] = output[col].fillna('NaN')
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

# store the catagorical features names as a list      
cat_features = X_aug_train.select_dtypes(['object']).columns.to_list()

# use MultiColumnLabelEncoder to apply LabelEncoding on cat_features 
# uses NaN as a value , no imputation will be used for missing data
X = MultiColumnLabelEncoder(columns = cat_features).fit_transform(X_aug_train)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

print("Train data: ", X_train.shape)
print("Test data: ", X_test.shape)

Train data:  (13410, 13)
Test data:  (5748, 13)


In [14]:
y_train.value_counts()

0.0    9945
1.0    3465
Name: target, dtype: int64

In [15]:
y_test.value_counts()

0.0    4338
1.0    1410
Name: target, dtype: int64

# Default Model

In [16]:
model_default = svm.SVC(kernel='sigmoid')
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.6276959352355325
Recall: 0.6240431454418929
F1 Score: 0.6258444553933447


(0.6276959352355325, 0.6240431454418929, 0.6258444553933447, None)

In [17]:
pd.DataFrame(y_pred_default).value_counts()

0.0    4297
1.0    1451
dtype: int64

In [18]:
import time 
import sys
sys.path.insert(1, './mmd')
from mmd import diagnoser
from scipy import stats as st
import numpy
#notebook's library
%matplotlib inline
from helper import get_top_f1_rules, get_relevent_attributs_target, get_MMD_results, get_biased_features, get_BGMD_results
from helper import generateTrain_data_Weights

In [19]:
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default

In [20]:
def mispredict_label(row):
    if row['target'] == row['pred']:
        return False
    return True

In [21]:
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
X_test_copy['mispredict'] = default_result_copy.apply(lambda row: mispredict_label(row), axis=1)

In [22]:
settings = diagnoser.Settings
settings.all_rules = True
# Get relevent attributes and target 
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and correspodning information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

#Get biased attributes this time 
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	5748
# Cols:	14
% Target in dataset 37.6%
[1mSubgroup: enrollee_id<=19841[0m
% of subgroup in population (Full Dataset):	60.04% (3451 rows)
Precision: P(mispredict=True | enrollee_id<=19841) = 44.91%
Recall: P(enrollee_id<=19841 | mispredict=True) = 71.73%
[1mSubgroup: enrollee_id>6616[0m
% of subgroup in population (Full Dataset):	79.78% (4586 rows)
Precision: P(mispredict=True | enrollee_id>6616) = 42.19%
Recall: P(enrollee_id>6616 | mispredict=True) = 89.54%
[1mSubgroup: enrollee_id>6616 & enrollee_id<=13389[0m
% of subgroup in population (Full Dataset):	20.06% (1153 rows)
Precision: P(mispredict=True | enrollee_id>6616 & enrollee_id<=13389) = 62.71%
Recall: P(enrollee_id>6616 & enrollee_id<=13389 | mispredict=True) = 33.46%

###############################

BGMD Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Ro

([[0.5735882614495331,
   0.4219363279546446,
   0.8954187875983342,
   'enrollee_id>6616'],
  [0.5523877405559515,
   0.4491451753115039,
   0.7172605275335493,
   'enrollee_id<=19841'],
  [0.43633071816535907,
   0.627059843885516,
   0.3345673299398427,
   'enrollee_id>6616 & enrollee_id<=13389']],
 [[0.5735882614495331,
   0.4219363279546446,
   0.8954187875983342,
   'enrollee_id>6616'],
  [0.5523877405559515,
   0.4491451753115039,
   0.7172605275335493,
   'enrollee_id<=19841'],
  [0.43633071816535907,
   0.627059843885516,
   0.3345673299398427,
   'enrollee_id>6616 & enrollee_id<=13389']])

# Decision Tree

In [23]:
model_default = DecisionTreeClassifier()
scores_default = cross_val_score(model_default, X=X_train, y=y_train, cv = FOLDS)
model_default.fit(X_train, y_train)
y_pred_default = model_default.predict(X_test)
get_performance(X_test, y_test, y_pred_default)

Performance on all data
Precision: 0.8620284825303114
Recall: 0.8587334725121781
F1 Score: 0.8601471613991762


(0.8620284825303114, 0.8587334725121781, 0.8601471613991762, None)

In [24]:
default_result = pd.concat([X_test, y_test], axis=1, join='inner')
default_result.loc[:,"pred"] = y_pred_default
default_result_copy = default_result.copy()
X_test_copy = X_test.copy()
X_test_copy['mispredict'] = default_result_copy.apply(lambda row: mispredict_label(row), axis=1)
settings = diagnoser.Settings
settings.all_rules = True
# Get relevent attributes and target 
relevant_attributes, Target = get_relevent_attributs_target(X_test_copy)
# Generate MMD rules and correspodning information
MMD_rules, MMD_time, MMD_Features = get_MMD_results(X_test_copy, relevant_attributes, Target)

#Get biased attributes this time 
biased_attributes = get_biased_features(X_test_copy, relevant_attributes)

BGMD_rules, BGMD_time, BGMD_Features = get_BGMD_results(X_test_copy, biased_attributes, Target)

print('MMD Spent:', MMD_time, 'BGMD Spent:', BGMD_time)
MMD_rules, BGMD_rules

Original Rule
Subgroup Discovery Result

Found [1m3[0m subgroups
[1mDataset[0m
Target: mispredict=True
# Rows:	5748
# Cols:	14
% Target in dataset 14.13%
[1mSubgroup: city_development_index<=0.7959999999999999[0m
% of subgroup in population (Full Dataset):	30.79% (1770 rows)
Precision: P(mispredict=True | city_development_index<=0.7959999999999999) = 20.28%
Recall: P(city_development_index<=0.7959999999999999 | mispredict=True) = 44.21%
[1mSubgroup: city_development_index<=0.742[0m
% of subgroup in population (Full Dataset):	24.81% (1426 rows)
Precision: P(mispredict=True | city_development_index<=0.742) = 21.18%
Recall: P(city_development_index<=0.742 | mispredict=True) = 37.19%
[1mSubgroup: city_development_index<=0.884[0m
% of subgroup in population (Full Dataset):	41.25% (2371 rows)
Precision: P(mispredict=True | city_development_index<=0.884) = 18.3%
Recall: P(city_development_index<=0.884 | mispredict=True) = 53.45%

###############################

BGMD Rule
Subgroup D

([[0.27807900852052675,
   0.20282485875706216,
   0.4421182266009852,
   'city_development_index<=0.7959999999999999'],
  [0.27269871190700595,
   0.1830451286377056,
   0.5344827586206896,
   'city_development_index<=0.884'],
  [0.26988382484361034,
   0.211781206171108,
   0.37192118226600984,
   'city_development_index<=0.742']],
 [[0.27807900852052675,
   0.20282485875706216,
   0.4421182266009852,
   'city_development_index<=0.7959999999999999'],
  [0.27269871190700595,
   0.1830451286377056,
   0.5344827586206896,
   'city_development_index<=0.884'],
  [0.26988382484361034,
   0.211781206171108,
   0.37192118226600984,
   'city_development_index<=0.742']])