# This notebook contains the experiments on the Adult Census dataset with LionForests

In [None]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder
from sklearn import preprocessing
from LionForests import LionForests

First, we load the dataset and set the feature and class names.

In [None]:
# Column names of the raw UCI Adult files; the last one ('salary') is the label column.
feature_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','salary']
class_names=['<=50K','>50K'] #0: <=50K and 1: >50K
# Fields are separated by ", ". A multi-character delimiter is only handled by
# pandas' python parser, so it is requested explicitly instead of relying on
# the automatic fallback (which emits a ParserWarning).
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names=feature_names, delimiter=', ', engine='python')
# The first line of adult.test is not a data record. It is skipped while
# parsing (rather than parsed and dropped afterwards) so that it cannot end up
# in the 'age' column and keep pandas from inferring that column as numeric.
data_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', names=feature_names, delimiter=', ', engine='python', skiprows=1)

We apply the following preprocessing, which is adapted from a GitHub notebook.

In [None]:
# Missing values are encoded as '?' in the raw files; keep only complete rows.
data = data[(data != '?').all(axis=1)]
# .copy() makes data_test an independent frame, so the column assignment below
# is a write on a real DataFrame and not on a filtered slice
# (avoids SettingWithCopyWarning).
data_test = data_test[(data_test != '?').all(axis=1)].copy()
# Labels in the test file carry a trailing '.'; map them onto the training labels.
data_test['salary'] = data_test['salary'].map({'<=50K.': '<=50K', '>50K.': '>50K'})
frames = [data, data_test]
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it the train and test rows share index labels, and any later
# label-based operation (e.g. DataFrame.drop with index labels) would hit
# rows from both splits at once.
data = pd.concat(frames, ignore_index=True)
data.head(15)

The feature engineering below is taken from:
https://github.com/pooja2512/Adult-Census-Income/blob/master/Adult%20Census%20Income.ipynb. So just run the next code block and move on.

In [None]:

# --- Merge fine-grained category levels into coarser ones -------------------
# Each list holds the raw levels that collapse into one new level.
hs_grad = ['HS-grad','11th','10th','9th','12th']
elementary = ['1st-4th','5th-6th','7th-8th']

married= ['Married-spouse-absent','Married-civ-spouse','Married-AF-spouse']
separated = ['Separated','Divorced']

self_employed = ['Self-emp-not-inc','Self-emp-inc']
govt_employees = ['Local-gov','State-gov','Federal-gov']

# The replacement is assigned back to the column instead of calling
# data[col].replace(..., inplace=True): an in-place call on data[col] is a
# chained assignment, which pandas deprecates and which no longer updates
# `data` once Copy-on-Write is active. The lists contain complete level names,
# so a whole-value dict replace is used rather than a regex substitution.
education_map = {**dict.fromkeys(hs_grad, 'HS-grad'),
                 **dict.fromkeys(elementary, 'elementary-school')}
data['education'] = data['education'].replace(education_map)

marital_map = {**dict.fromkeys(married, 'Married'),
               **dict.fromkeys(separated, 'Separated')}
data['marital-status'] = data['marital-status'].replace(marital_map)

workclass_map = {**dict.fromkeys(self_employed, 'Self_employed'),
                 **dict.fromkeys(govt_employees, 'Govt_employees')}
data['workclass'] = data['workclass'].replace(workclass_map)

# --- Remove unused columns and the age == 90 rows ---------------------------
del_cols = ['relationship','education-num']
data = data.drop(columns=del_cols)

# A boolean mask removes exactly the matching rows. Dropping by index label
# would also remove every other row that happens to share one of those labels
# when the index is not unique.
data = data[data['age'] != 90]

# --- Column groups used by the numeric and categorical pipelines ------------
num_col_new = ['age','capital-gain', 'capital-loss',
       'hours-per-week','fnlwgt']
cat_col_new = ['workclass', 'education', 'marital-status', 'occupation',
               'race', 'sex','salary','native-country']#add native-country label
scaler = MinMaxScaler()
class DataFrameSelector(TransformerMixin):
    """Pipeline step that narrows its input down to a fixed set of columns.

    The step is stateless: ``fit`` learns nothing and ``transform`` indexes
    the input with the labels given at construction time.
    """

    def __init__(self, attribute_names):
        # Label(s) used to index X in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Do nothing and return this object, as the pipeline protocol expects."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured attribute names."""
        selected = X[self.attribute_names]
        return selected
class num_trans(TransformerMixin):
    """Pipeline step that wraps its input in a labelled DataFrame.

    The column labels come from the module-level ``num_col_new`` list, so the
    input must have exactly that many columns.
    """

    def fit(self, X, y=None):
        """Do nothing and return this object; the step has no state to learn."""
        return self

    def transform(self, X):
        """Build a DataFrame from ``X`` and label its columns with ``num_col_new``."""
        labelled = pd.DataFrame(X)
        labelled.columns = num_col_new
        return labelled
# Numeric branch: pick the numeric columns, min-max scale them, and wrap the
# scaled array back into a DataFrame labelled with num_col_new.
pipeline = Pipeline([
    ('selector', DataFrameSelector(num_col_new)),
    ('scaler', MinMaxScaler()),
    ('transform', num_trans()),
])
num_df = pipeline.fit_transform(data)

# Dummy columns that are removed again after one-hot encoding: one reference
# level per listed feature, plus 'salary_>50K' so that 'salary_<=50K' is the
# only label column left.
# The workclass entry was misspelled ('workclass_Govt_employess'), so it never
# matched the generated 'workclass_Govt_employees' column and that dummy was
# silently kept.
cols = ['workclass_Govt_employees','education_Some-college',
        'marital-status_Never-married','occupation_Other-service',
        'race_Black','sex_Male','salary_>50K']
class dummies(TransformerMixin):
    """Pipeline step that one-hot encodes a DataFrame and removes chosen dummies.

    Parameters
    ----------
    cols : list-like of str
        Names of the generated dummy columns to leave out of the result.
        Names that do not occur in the encoded frame are ignored.
    """

    def __init__(self,cols):
        self.cols = cols

    def fit(self,X,y = None):
        """Do nothing and return this object; the step has no state to learn."""
        return self

    def transform(self,X):
        """Return ``pd.get_dummies(X)`` without the columns listed in ``self.cols``."""
        df = pd.get_dummies(X)
        # Use the list given to this instance. The previous code read the
        # module-level `cols` variable, so the constructor argument was ignored.
        # Index.difference also returns the remaining labels in sorted order.
        df_new = df[df.columns.difference(self.cols)]
        return df_new
# Categorical branch: pick the categorical columns and one-hot encode them.
pipeline_cat = Pipeline([
    ('selector', DataFrameSelector(cat_col_new)),
    ('dummies', dummies(cols)),
])
cat_df = pipeline_cat.fit_transform(data)

# Give both frames a positional row id and join on it.
# The id must be positional: cat_df still carries the row labels of `data`,
# while num_df was rebuilt from a plain array and is labelled 0..n-1.
# Assigning pd.Series(range(n)) aligns on those labels rather than on position,
# which yields NaN or wrong ids in cat_df and pairs categorical rows with the
# numeric values of other rows. A plain array is assigned position by position.
cat_df = cat_df.assign(id=np.arange(cat_df.shape[0]))
num_df = num_df.assign(id=np.arange(num_df.shape[0]))
final_df = pd.merge(cat_df, num_df, how='inner', on='id')
print(f"Number of observations in final dataset: {final_df.shape}")

We extract the training data and the target from the dataframe.

In [None]:
# The remaining label column 'salary_<=50K' is 1 for the low-income class,
# whereas class_names = ['<=50K', '>50K'] encodes 0: <=50K and 1: >50K.
# Flip the indicator so that the label values line up with class_names.
y = 1 - final_df['salary_<=50K'].astype(int).values
final_df.drop(labels = ['id','salary_<=50K'],axis = 1,inplace = True)
# Export as a float matrix: the frame mixes dummy columns with float columns,
# and an explicit dtype keeps the result numeric regardless of how the
# installed pandas version types the dummy columns.
X = final_df.to_numpy(dtype=float)

We also need the names of the new one-hot encoded features.

In [None]:
# Column labels of the encoded frame, in the same order as the columns of X.
feature_names = final_df.columns.tolist()
# Original (pre-encoding) names of the features that were one-hot encoded.
categorical_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'race',
    'sex',
    'native-country',
]

We declare the names of the categorical features so that LionForests can extract more compact explanations.

In [None]:
# Scaler object handed to LionForests.train as its third argument.
my_scaler = MinMaxScaler(feature_range=(0,1))
# Single-point random-forest parameter grid.
# NOTE(review): `parameters` is not passed to lf.train below (the fifth
# argument is None) — presumably LionForests then falls back to its own
# search grid; confirm whether this grid was meant to be used.
parameters = [{
    'max_depth': [10],
    'max_features': ['sqrt'],
    'bootstrap': [False],
    'min_samples_leaf' : [1],
    'n_estimators': [100]
}]
lf = LionForests(class_names=class_names)
# Positional arguments: data, labels, scaler, feature names, None, categorical feature names.
lf.train(X, y, my_scaler, feature_names, None, categorical_features) #Please do not scale data before training

Once again, we use the built-in GridSearch of LionForests to find the best classifier for this dataset.

In [None]:
# Report the trained model: accuracy, ensemble size and the estimator itself.
# NOTE(review): this local is assigned but not used below; the print reads
# lf.number_of_estimators instead.
number_of_estimators = lf.model.n_estimators
print("Accuracy:",lf.accuracy,", Number of estimators:",lf.number_of_estimators)
print(lf.model)

Now we are ready to extract explanations about an instance. We choose the eleventh instance:

In [None]:
# Ask LionForests for a rule explaining the row at position 10 (the eleventh row of X).
# NOTE(review): the meaning of the positional flags (False, True, False) and of
# `complexity` is defined by LionForests.following_breadcrumbs — confirm there.
lf_rule = lf.following_breadcrumbs(X[10], False, True, False, complexity=2)
print(lf_rule)

The original rule would have been this one:

In [None]:
# Features passed to LionForests as discrete for the check below.
discrete_features = ['age']
# Run LionForests' prediction-change check for the same row and the rule extracted above.
# NOTE(review): what the call returns/prints is defined by LionForests — confirm there.
lf.check_changes_in_prediction (X[10],lf_rule, discrete_features)

In this example, some categorical alternative values were reduced. Currently, LionForests does not extract them automatically, but this will be implemented soon. You can notice that the fnlwgt, age, hours-per-week, capital-gain and capital-loss values were not inverse transformed by LionForests. This happens because we preprocessed this data externally, so we have to inverse transform them manually. If you want, check instance 882 too.