## Overview

**Dataset**

HMDA mortgage application data for 2018. Get the dataset from [here](https://ffiec.cfpb.gov/data-publication/snapshot-national-loan-level-dataset/2018)

The 2018 CSV file is used for the demo. Either download the dataset inside the notebook or upload it to the Watson Studio project data assets.

Note: You have to update the dataset path in the notebook accordingly.


**Problem**
Posed as a binary classification problem: predict whether a mortgage application is approved or not.


**Notebook**
This notebook attempts to demonstrate the entire AI governance cycle, from the time policies for using AI models in applications are made, through the development, deployment and monitoring of the model.
    
**Models**
Various models are considered and evaluated on multiple trust dimensions - predictive performance, fairness, explainability, adversarial robustness and robustness to dataset shift. Models considered are:
1. Logistic Regression - scikit-learn
2. Random Forest Classifier - scikit-learn
3. Gradient Boosting Classifier - scikit-learn
4. Multi-layer Perceptron - Keras

# Libraries

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals
from numpy.random import seed
seed(42)
from tensorflow import set_random_seed
set_random_seed(42)

import os
import warnings

warnings.filterwarnings("ignore")

# visualization, data wrangling, matrix operations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
from tqdm import tqdm
from time import time



# model training, evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from tensorflow.keras.initializers import Constant
from tensorflow.keras.metrics import AUC, Precision, Recall,TruePositives,TrueNegatives,FalsePositives,FalseNegatives


from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, balanced_accuracy_score, roc_auc_score

#Scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


#save models
import pickle

#Datasets
# from aif360.datasets.meps_dataset_longitudinal import MEPSDatasetLongitudinal, merge

# fairness
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset

#Bias Mitigation Techniques
from aif360.algorithms.preprocessing.reweighing import Reweighing


# adversarial robustness
from art.attacks.evasion.hop_skip_jump import HopSkipJump
from art.classifiers.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnRandomForestClassifier, ScikitlearnDecisionTreeClassifier
from art.estimators.classification import KerasClassifier
from art.classifiers.scikitlearn import ScikitlearnGradientBoostingClassifier
from art.metrics import empirical_robustness
from art.metrics.metrics import get_crafter

# explainability
import lime 
from lime.lime_tabular import LimeTabularExplainer
from aix360.metrics import faithfulness_metric, monotonicity_metric

pd.set_option("display.max_columns",200)
pd.set_option("display.max_rows",200)



## Define Useful Methods

In [None]:
import json
# Fact registry: maps a phase name to the list of facts recorded under it
# (entries are appended by add_fact and serialised by save_facts/print_facts).
FACTS = {}
# Name of the phase under which add_fact files newly recorded facts.
CURRENT_PHASE = 'Feature Engineering'

def add_fact(fact):
    """Record ``fact`` under the phase currently named by ``CURRENT_PHASE``.

    The per-phase list inside the module-level ``FACTS`` registry is created
    the first time a fact is added for that phase.
    """
    # Neither global is rebound here, so no ``global`` declaration is needed.
    FACTS.setdefault(CURRENT_PHASE, []).append(fact)
    
def save_facts(fname):
    """Serialise the whole ``FACTS`` registry as JSON into the file ``fname``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(fname, "w") as sink:
        json.dump(FACTS, fp=sink)
        
def read_facts(fname):
    """Load and return the JSON document stored in the file ``fname``.

    The parsed object is only returned; the module-level ``FACTS`` registry
    is not modified.
    """
    with open(fname, 'r') as source:
        return json.load(source)
    
def print_facts():
    """Print the ``FACTS`` registry as JSON indented by four spaces."""
    print(json.dumps(FACTS, indent=4))

In [None]:
def print_result(model,X,y, scaler = None, data='train', model_name=None):
    """
    Print classification metrics of ``model`` on ``X`` / ``y``.

    model      - fitted estimator exposing ``predict`` and ``predict_proba``
    X, y       - features and true labels
    scaler     - optional fitted transformer applied to ``X`` before predicting
    data       - label inserted into every printed line (e.g. 'train', 'test')
    model_name - when equal to 'MLP', ``predict`` output is treated as class
                 probabilities and reduced with argmax
    """
    features = X.copy()
    if scaler is not None:
        features = scaler.transform(X)

    y_true = y.copy()
    y_pred = model.predict(features)

    # For the MLP, predict returns probabilities rather than class labels.
    if model_name == 'MLP':
        y_pred = np.argmax(y_pred, axis=1)

    # Score of the positive class (second column) as an (n, 1) column vector.
    positive = model.predict_proba(features)[:, 1]
    positive = positive.reshape(positive.shape[0], 1)

    # (text before label, text after label, metric, prediction to score, line suffix)
    report = (
        ('Accuracy for ', ' data: ', accuracy_score, y_pred, ''),
        ('Confusion Matrix for ', ' data: \n', confusion_matrix, y_pred, ''),
        ('Precision for ', ' data: ', precision_score, y_pred, ''),
        ('Recall for ', ' data: ', recall_score, y_pred, ''),
        ('F1 score ', ' data: ', f1_score, y_pred, ''),
        ('Balanced accuracy for ', ' data: ', balanced_accuracy_score, y_pred, ''),
        ('AUC for ', ' data: ', roc_auc_score, positive, '\n'),
    )
    # Each metric is evaluated right before its line is printed.
    for head, tail, metric, estimate, suffix in report:
        print(head + data + tail + str(metric(y_true, estimate)) + suffix)

In [None]:
def accuracy_metrics(model,X,y, scaler = None, model_name=None):
    """
    Compute predictive-performance metrics of ``model`` on ``X`` / ``y``.

    Returns a list in this order: accuracy, precision, recall, F1, balanced
    accuracy, ROC AUC, followed by the four cells of the confusion matrix
    read row by row ([0][0], [0][1], [1][0], [1][1]) as plain ints.

    scaler     - optional fitted transformer applied to ``X`` before predicting
    model_name - when equal to 'MLP', ``predict`` output is treated as class
                 probabilities and reduced with argmax
    """
    features = X.copy()
    if scaler is not None:
        features = scaler.transform(X)

    y_true = y.copy()
    y_pred = model.predict(features)

    # For the MLP, predict returns probabilities rather than class labels.
    if model_name == 'MLP':
        y_pred = np.argmax(y_pred, axis=1)

    # Score of the positive class (second column) as an (n, 1) column vector.
    positive = model.predict_proba(features)[:, 1]
    positive = positive.reshape(positive.shape[0], 1)

    label_metrics = (accuracy_score, precision_score, recall_score,
                     f1_score, balanced_accuracy_score)
    scores = [metric(y_true, y_pred) for metric in label_metrics]
    scores.append(roc_auc_score(y_true, positive))

    cm = confusion_matrix(y_true, y_pred)
    scores.extend(int(cm[row][col]) for row in (0, 1) for col in (0, 1))

    return scores
        


In [None]:
def fairness_metrics(model, X, y, prot_attrs, prot_attrs_names, scaler = None, model_name=None):
    """
    Compute group-fairness metrics on the *predictions* of ``model`` for ``X``.

    Only the first name in ``prot_attrs_names`` is evaluated; value 1.0 of
    that column is the privileged group and 0.0 the unprivileged group.

    X                - feature DataFrame
    y                - accepted for signature parity with the other metric
                       helpers; not used here
    prot_attrs       - DataFrame holding the protected columns when they are
                       not part of ``X``; None when ``X`` already has them
    prot_attrs_names - names of the protected columns
    scaler           - optional fitted transformer applied before predicting
    model_name       - 'MLP' triggers argmax over predicted probabilities

    Returns [disparate impact, statistical parity difference].
    """
    labelled = X.copy()  # stays a DataFrame; the scaled features may be an ndarray

    features = X.copy()
    if scaler is not None:
        features = scaler.transform(X)

    y_pred = model.predict(features)

    # For the MLP, predict returns probabilities rather than class labels.
    if model_name == 'MLP':
        y_pred = np.argmax(y_pred, axis=1)

    labelled['target'] = y_pred

    prot_attr = prot_attrs_names[0]

    # Attach the protected columns when they were kept out of the feature frame.
    if prot_attrs is None:
        frame = labelled
    else:
        frame = pd.concat([labelled, prot_attrs.copy()], axis=1)

    dataset = BinaryLabelDataset(df=frame, label_names=['target'], protected_attribute_names=[prot_attr])
    metric = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=[{prot_attr: 0.0}],
        privileged_groups=[{prot_attr: 1.0}],
    )

    return [metric.disparate_impact(), metric.statistical_parity_difference()]


In [None]:
def adversarial_metrics(model, X, y, sample_size, scaler = None, model_name=None):
    """
    Estimate the empirical robustness of ``model`` with the 'hsj' attack.

    model       - classifier wrapper handed to ``empirical_robustness``
    X           - DataFrame to be subsampled
    y           - unused; kept for signature parity with the other metric helpers
    sample_size - size of the subsample, i.e. the number of samples attacked
    scaler      - optional fitted transformer applied to the subsample
    model_name  - unused; kept for signature parity

    Returns a one-element list holding the robustness score, or the string
    'NA' when the evaluation raised an exception.
    """
    st = time()

    # Fixed seed so every model is attacked on the same subsample.
    samples = X.copy().sample(sample_size, random_state=42)

    if scaler is not None:
        samples = scaler.transform(samples)
    else:
        samples = samples.values

    try:
        score = empirical_robustness(model, samples, attack_name='hsj')
    except Exception as exc:
        # Best effort: a failing attack must not abort the whole evaluation.
        # Only Exception is caught so Ctrl-C / SystemExit still stop a long run.
        print(type(exc))
        print("{} failed!!!".format(type(model)))
        score = 'NA'
    else:
        tm = round((time() - st) / 60, 2)
        print("{} success!!! Time to evaluate {} samples: {} min.".format(type(model), sample_size, tm))

    return [score]


In [None]:
def monotonicity(model, x, coefs, base):
    """
    Check whether an explanation is monotonic for a single instance.

    Starting from ``base``, features of ``x`` are switched in one at a time in
    increasing order of ``coefs``; after each step the probability of the class
    predicted for ``x`` is recorded. Returns True when those probabilities never
    decrease along that order.

    model - object exposing ``predict_proba``
    x     - 1-D feature vector of the instance
    coefs - attribution value per feature
    base  - 1-D baseline feature vector
    """
    # Class the model predicts for the untouched instance.
    target = np.argmax(model.predict_proba(x.reshape(1, -1)), axis=1)[0]

    probe = base.copy()
    order = np.argsort(coefs)  # least important feature first
    class_probs = np.zeros(x.shape[0])
    for feature in order:
        probe[feature] = x[feature]
        class_probs[feature] = model.predict_proba(probe.reshape(1, -1))[0][target]

    steps = np.diff(class_probs[order])
    return np.all(steps >= 0)

def faithfulness(model, x, coefs, base):
    """
    Faithfulness of an explanation for a single instance.

    Each feature of ``x`` is replaced, one at a time, by its ``base`` value and
    the probability of the originally predicted class is recorded. The result
    is the negated Pearson correlation between ``coefs`` and those
    probabilities.

    model - object exposing ``predict_proba``
    x     - 1-D feature vector of the instance
    coefs - attribution value per feature
    base  - 1-D baseline feature vector
    """
    # Class the model predicts for the untouched instance.
    target = np.argmax(model.predict_proba(x.reshape(1, -1)), axis=1)[0]

    class_probs = np.zeros(x.shape[0])
    # argsort of the negated array visits features from largest to smallest coef.
    for feature in np.argsort(-coefs):
        ablated = x.copy()
        ablated[feature] = base[feature]
        class_probs[feature] = model.predict_proba(ablated.reshape(1, -1))[0][target]

    correlation = np.corrcoef(coefs, class_probs)[0, 1]
    return -correlation



In [None]:
###CHECK THIS CODE AGAINST AIX NOTEBOOK
def lime_explainability(explainer, model, X, y, ncases, scaler = None, model_name=None, random_state=42):
    """
    Score LIME explanations of ``model`` on a random subsample of ``X``.

    explainer    - object exposing ``explain_instance`` (a LIME tabular explainer)
    model        - estimator exposing ``predict`` and ``predict_proba``
    X            - DataFrame to be subsampled
    y            - unused; kept for signature parity with the other metric helpers
    ncases       - number of instances to explain
    scaler       - optional fitted transformer applied to the subsample
    model_name   - 'MLP' triggers argmax over the output of ``predict``
    random_state - seed of the subsample

    Returns [mean monotonicity, mean faithfulness, std of faithfulness].
    Monotonicity is currently not evaluated, so the first entry is always 0.
    NaN faithfulness values are excluded from the mean and the std.
    """
    st = time()

    samples = X.copy().sample(ncases, random_state=random_state)
    if scaler is not None:
        samples = scaler.transform(samples)
    else:
        samples = samples.values

    mon = np.zeros(ncases)
    fait = np.zeros(ncases)
    base = np.zeros(samples.shape[1])  # all-zero baseline instance

    for i in range(ncases):
        x = samples[i]
        y_pred = model.predict(x.reshape(1, -1))

        # For the MLP, predict returns probabilities rather than class labels.
        if model_name == 'MLP':
            y_pred = np.argmax(y_pred, axis=1)

        predicted_class = y_pred[0] * 1.0

        exp = explainer.explain_instance(x, model.predict_proba, num_features=65, top_labels=2)
        try:
            le = exp.local_exp[predicted_class]
        except (KeyError, TypeError):
            # No explanation stored under the predicted label (or the label is
            # not usable as a key): fall back to the explanation of label 0.
            print('exception thrown in explainability')
            le = exp.local_exp[0]

        # Scatter the (feature index, weight) pairs into a dense vector.
        coefs = np.zeros(x.shape[0])
        for feature_index, weight in le:
            coefs[feature_index] = weight

        mon[i] = 0  # monotonicity(model, x, coefs, base) is disabled
        fait[i] = faithfulness(model, x, coefs, base)

    valid_fait = fait[~np.isnan(fait)]
    score_temp = [np.mean(mon), np.mean(valid_fait), np.std(valid_fait)]

    tm = round((time() - st) / 60, 2)
    print('Faithfulness time: ', tm)

    return score_temp


# Dataset loading and processing

In [None]:
def _encode_protected_attributes(data):
    """Recode the two protected columns of ``data`` in place (privileged value -> 1.0, the other -> 0.0)."""
    gender_map = {'Male': 1.0, 'Female': 0.0}
    data.replace({'gender': gender_map}, inplace=True)

    race_map = {'Non-Hispanic White': 1.0, 'Non-Hispanic Black': 0.0}
    data.replace({'derived_race_ethnicity_combination': race_map}, inplace=True)


def _split_train_test_shift(X, y, prot_attrs):
    """Hold out the north-eastern states as the shift set and split the remaining rows 70/30 into train/test."""
    north_east = ['ME','VT','NH','MA','RI','CT','NY','NJ','PA','MD','DE']
    shift_logic = X['state_code'].isin(north_east)

    # hold-out used to measure robustness to distribution shift
    X_shift = X[shift_logic]
    y_shift = y[shift_logic]

    # everything else is split randomly (test_size=.3)
    _X = X[~shift_logic]
    _y = y[~shift_logic]

    if prot_attrs is not None:
        prot_attrs_shift = prot_attrs[shift_logic]
        X_train, X_test, y_train, y_test, prot_attrs_train, prot_attrs_test = train_test_split(
            _X, _y, prot_attrs[~shift_logic], test_size=.3, random_state=42)
    else:
        prot_attrs_shift = None
        prot_attrs_train = None
        prot_attrs_test = None
        X_train, X_test, y_train, y_test = train_test_split(_X, _y, test_size=.3, random_state=42)

    # 'state_code' only drives the split and is not a model feature. Reassign
    # instead of dropping in place, because these frames are slices of X.
    X_train = X_train.drop(columns=['state_code'])
    X_test = X_test.drop(columns=['state_code'])
    X_shift = X_shift.drop(columns=['state_code'])

    return (X_train, y_train, prot_attrs_train,
            X_test, y_test, prot_attrs_test,
            X_shift, y_shift, prot_attrs_shift,
            north_east)


def load_data(dataset_name):
    """
    Load one of the processed HMDA mortgage-approval datasets.

    Supported ``dataset_name`` values:
      'HMDA-MORTGAGE-APPROVAL-TRAINING-WITHOUT-PROTECTED-ATTRIBUTES'
      'HMDA-MORTGAGE-APPROVAL-TRAINING-WITH-PROTECTED-ATTRIBUTES'
      'HMDA-MORTGAGE-APPROVAL-VALIDATION-WITHOUT-PROTECTED-ATTRIBUTES'
      'HMDA-MORTGAGE-APPROVAL-VALIDATION-WITH-PROTECTED-ATTRIBUTES'
      'HMDA-MORTGAGE-SIMPLE-MODEL'

    Training variants (and the simple model) read the TRAIN csv, hold out the
    north-eastern states as a shift set and split the rest into train/test.
    Validation variants read the VALIDATION csv and return it unsplit.
    'WITHOUT' variants and the simple model keep the protected columns out of
    the features, return them separately and record facts via ``add_fact``.
    'WITH' variants put the protected columns into the features, return None
    for the separate protected frames and record no facts.

    Returns the tuple
      (X_train, y_train, prot_attrs_train,
       X_validate, y_validate, prot_attrs_validate,
       X_test, y_test, prot_attrs_test,
       X_shift, y_shift, prot_attrs_shift,
       prot_attrs_names, instance_weights)
    where entries that do not apply to the requested dataset are None and
    ``instance_weights`` is always None.

    Raises ValueError when ``dataset_name`` is not one of the names above.
    """
    instance_weights = None

    train_file = '2018_hmda_mortgage_approval_processed_TRAIN.csv'
    validation_file = '2018_hmda_mortgage_approval_processed_VALIDATION.csv'

    cat_features = ['conforming_loan_limit','preapproval','loan_term',
                    'aus_1', 'applicant_age', 'applicant_credit_score_type']
    num_features = ['combined_loan_to_value_ratio', 'property_value', 'income', 'modified_debt_to_income_ratio',
                    'loan_amount']
    prot_attrs_names = ['derived_race_ethnicity_combination', 'gender']

    # training: read the TRAIN csv and split by 'state_code'.
    # separate: protected columns stay out of X and facts are recorded.
    if dataset_name == 'HMDA-MORTGAGE-APPROVAL-TRAINING-WITHOUT-PROTECTED-ATTRIBUTES':
        training, separate = True, True
        cat = cat_features
        num = num_features + ['state_code']
    elif dataset_name == 'HMDA-MORTGAGE-APPROVAL-TRAINING-WITH-PROTECTED-ATTRIBUTES':
        training, separate = True, False
        cat = cat_features
        num = num_features + ['state_code'] + prot_attrs_names
    elif dataset_name == 'HMDA-MORTGAGE-APPROVAL-VALIDATION-WITHOUT-PROTECTED-ATTRIBUTES':
        training, separate = False, True
        cat = cat_features
        num = num_features
    elif dataset_name == 'HMDA-MORTGAGE-APPROVAL-VALIDATION-WITH-PROTECTED-ATTRIBUTES':
        training, separate = False, False
        cat = cat_features
        num = num_features + prot_attrs_names
    elif dataset_name == 'HMDA-MORTGAGE-SIMPLE-MODEL':
        training, separate = True, True
        cat = []
        num = ['combined_loan_to_value_ratio', 'modified_debt_to_income_ratio', 'state_code']
    else:
        # Fail early with a clear message instead of an UnboundLocalError at the return.
        raise ValueError('Unknown dataset_name: ' + repr(dataset_name))

    csv_file = train_file if training else validation_file
    data = pd.read_csv(csv_file)

    if separate:
        if training:
            add_fact('Data for building model taken from file: ' + csv_file)
        else:
            add_fact('Data for validating model taken from file: ' + csv_file)
        add_fact('Protected features: ' + str(prot_attrs_names))
        add_fact('Privileged value for \'derived_race_ethnicity_combination\': Non-Hispanic White')
        add_fact('Privileged value for \'gender\': Male')
        if training:
            add_fact('Protected features NOT being used for model creation')
            add_fact('Features used for model creation: ' + str(set(cat + num) - set(['state_code'])))
        else:
            add_fact('Protected features NOT being used by model')
            add_fact('Features used by model: ' + str(set(cat + num)))

    _encode_protected_attributes(data)
    prot_attrs = data[prot_attrs_names].copy() if separate else None

    if cat:
        # one-hot encode the categorical columns as '<column>=<value>'
        dum = pd.get_dummies(data[cat].astype('category'), prefix_sep='=')
        X = pd.concat([dum, data[num]], axis=1)
    else:
        X = data[num]
    # element-wise comparison on a Series, then bool -> float (1.0 = approved)
    y = np.array((data['loan_approved'] == True) + .0)

    X_train = y_train = prot_attrs_train = None
    X_test = y_test = prot_attrs_test = None
    X_shift = y_shift = prot_attrs_shift = None
    X_validate = y_validate = prot_attrs_validate = None

    if training:
        (X_train, y_train, prot_attrs_train,
         X_test, y_test, prot_attrs_test,
         X_shift, y_shift, prot_attrs_shift,
         north_east) = _split_train_test_shift(X, y, prot_attrs)

        if separate:
            add_fact('Dataset for data-shift consists of \'state_code\' in :' + str(north_east))
            add_fact('Remaining \'state_code\'s used for train/test datasets')
            add_fact('Training dataset size: ' + str(X_train.shape))
            add_fact('Test dataset size: ' + str(X_test.shape))
            add_fact('Shift dataset size: ' + str(X_shift.shape))
    else:
        X_validate = X
        y_validate = y
        prot_attrs_validate = prot_attrs
        if separate:
            add_fact('Validation dataset size: ' + str(X.shape))

    return X_train, y_train, prot_attrs_train, X_validate, y_validate, prot_attrs_validate, X_test, y_test, prot_attrs_test, X_shift, y_shift, prot_attrs_shift, prot_attrs_names, instance_weights

# Model training

In [None]:
def evaluate_model(model, adv_model, model_type, X, y, prot_attrs, prot_attrs_names, explainer, scaler, num_of_adversarial_examples, num_of_cases_to_explain):
    """
    Evaluate one model on all trust dimensions and return a single results table.

    When ``X`` is not None, accuracy, fairness, adversarial-robustness and
    explainability metrics are computed in that order, and each group is also
    recorded as a fact. ``adv_model`` is the model passed to the adversarial
    evaluation; ``model`` is used for every other metric.

    The returned DataFrame has one row per metric and one column named
    ``model_type``; it has no columns when ``X`` is None.
    """
    accuracy_rows = ['Accuracy', 'Precision', 'Recall', 'F1', 'Balanced Accuracy', 'AUC', 'TN', 'FP','FN', 'TP']
    fairness_rows = ['Disparate Impact', 'Statistical Parity Difference']
    adversarial_rows = ['Empirical Robustness']
    explainability_rows = ['Monoticity %', 'Faithfulness mean', 'Faitfulness std']

    def record(group, rows, values):
        # One fact per metric group: "<row> = <value>" pairs joined by ", ".
        summary = ', '.join(rows[k] + ' = ' + str(values[k]) for k in range(len(rows)))
        add_fact(group + ' metrics for model type ' + model_type + ' :' + summary)

    accuracy_table, fairness_table = {}, {}
    adversarial_table, explainability_table = {}, {}

    if X is not None:
        accuracy_table[model_type] = accuracy_metrics(model,X,y, scaler, model_type)
        record('Accuracy', accuracy_rows, accuracy_table[model_type])

        fairness_table[model_type] = fairness_metrics(model,X,y, prot_attrs, prot_attrs_names, scaler, model_type)
        record('Fairness', fairness_rows, fairness_table[model_type])

        adversarial_table[model_type] = adversarial_metrics(adv_model,X,y, num_of_adversarial_examples, scaler, model_type)
        record('Adversarial robustness', adversarial_rows, adversarial_table[model_type])

        explainability_table[model_type] = lime_explainability(explainer, model, X, y, num_of_cases_to_explain, scaler, model_type)
        record('Explainability', explainability_rows, explainability_table[model_type])

    # Stack the four groups vertically, in the order they were computed.
    frames = [
        pd.DataFrame(accuracy_table, index=accuracy_rows),
        pd.DataFrame(fairness_table, index=fairness_rows),
        pd.DataFrame(adversarial_table, index=adversarial_rows),
        pd.DataFrame(explainability_table, index=explainability_rows),
    ]
    return pd.concat(frames)

In [None]:
def _grid_search_classifier(estimator, param_grid, scoring, Xdata, ydata, sample_weights):
    """Tune ``estimator`` by grid search and return the refit best classifier.

    The estimator is wrapped in a single-step pipeline named 'classifier' (so
    ``param_grid`` keys use the ``classifier__`` prefix), searched with 5-fold
    stratified cross-validation, and refit on the 'auc_score' entry of
    ``scoring``. The winning classifier is printed and returned.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    pipe = Pipeline([('classifier', estimator)])
    search = GridSearchCV(pipe, param_grid=param_grid, cv=StratifiedKFold(n_splits=5),
                          verbose=True, n_jobs=-1, scoring=scoring, refit='auc_score')
    search.fit(Xdata, ydata, **{'classifier__sample_weight': sample_weights})

    best = search.best_estimator_.named_steps['classifier']
    print(best)
    return best


def build_model(model_type, X, y, scaler=None, sample_weight=None, optimize=False):
    """Fit and return a classifier of the requested type.

    Parameters
    ----------
    model_type : str
        'LR' (logistic regression), 'RF' (random forest), 'DT' (decision
        tree), 'GBC' (gradient boosting) or 'MLP' (Keras multi-layer
        perceptron).
    X, y :
        Training features and binary labels. Both are copied, so the caller's
        objects are not modified.
    scaler : optional
        Already-fitted transformer; when given, ``scaler.transform(X)`` is
        used as the training matrix.
    sample_weight : optional
        Per-row weights forwarded to the estimator's ``fit``.
    optimize : bool
        When True, 'LR' and 'GBC' are tuned with a cross-validated grid
        search; it has no effect on the other model types.

    Returns
    -------
    The fitted estimator, or None when ``model_type`` is not recognised.
    """
    sample_weights = None
    if sample_weight is not None:
        sample_weights = sample_weight.copy()

    Xdata = X.copy()
    ydata = y.copy()

    if scaler is not None:
        Xdata = scaler.transform(Xdata)

    print("\n\nModel: ", model_type)

    if model_type == 'LR':
        if optimize:
            from sklearn.metrics import make_scorer

            param_grid = [
                {'classifier__penalty': ['l2'],
                 'classifier__class_weight': ['balanced', None],
                 'classifier__C': np.append(np.logspace(-4, 4, 20), [1]),
                 'classifier__max_iter': [1000],
                 'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']}]

            scorers = {
                'accuracy_score': make_scorer(accuracy_score),
                'balanced_score': make_scorer(balanced_accuracy_score),
                # Built-in scorer: ranks with decision_function/predict_proba.
                # make_scorer(roc_auc_score) would feed hard predict() labels
                # into the AUC and collapse the ROC curve to a single point.
                'auc_score': 'roc_auc',
            }
            return _grid_search_classifier(LogisticRegression(random_state=42), param_grid,
                                           scorers, Xdata, ydata, sample_weights)

        clf_logit = LogisticRegression(C=1, class_weight='balanced', penalty='l2', random_state=42)
        return clf_logit.fit(Xdata, ydata, sample_weight=sample_weights)

    elif model_type == 'RF':
        clf_rf = RandomForestClassifier(n_jobs=-1, max_depth=8,
                                        n_estimators=500, class_weight='balanced', random_state=42)
        return clf_rf.fit(Xdata, ydata, sample_weight=sample_weights)

    elif model_type == 'DT':
        clf_dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)
        return clf_dt.fit(Xdata, ydata, sample_weight=sample_weights)

    elif model_type == 'GBC':
        if optimize:
            # Learning rate and number of trees are fixed; only the leaf/split
            # size constraints are searched.
            base_gbc = GradientBoostingClassifier(random_state=42, learning_rate=0.15, n_estimators=300,
                                                  n_iter_no_change=5, validation_fraction=0.2)
            param_grid = [
                {'classifier__min_samples_leaf': range(25, 200, 25),
                 'classifier__min_samples_split': range(50, 300, 50)}]

            # See the LR branch for why the built-in 'roc_auc' scorer is used.
            scorers = {'auc_score': 'roc_auc'}
            return _grid_search_classifier(base_gbc, param_grid, scorers, Xdata, ydata, sample_weights)

        clf_gb = GradientBoostingClassifier(n_estimators=300, random_state=42)
        print(clf_gb)
        return clf_gb.fit(Xdata, ydata, sample_weight=sample_weights)

    elif model_type == 'MLP':
        from keras.utils import to_categorical

        # Class counts drive both the output-bias initialiser and the class weights.
        class_counts = np.bincount(np.asarray(ydata).ravel().astype(int))
        neg = class_counts[0]
        pos = class_counts[1]
        total = neg + pos
        initial_bias = np.log([pos/neg])

        clf_mlp = Sequential()
        clf_mlp.add(Dense(10, input_dim=Xdata.shape[1], activation='relu'))
        clf_mlp.add(BatchNormalization())
        clf_mlp.add(Dense(20, activation='relu'))
        clf_mlp.add(BatchNormalization())
        clf_mlp.add(Dense(4, activation='relu'))
        clf_mlp.add(BatchNormalization())
        # NOTE(review): the same constant appears to be applied to both softmax
        # units, which would leave the initial class probabilities unchanged -
        # confirm whether a per-class bias was intended.
        clf_mlp.add(Dense(2, activation='softmax', bias_initializer=Constant(initial_bias)))

        clf_mlp.compile(loss='categorical_crossentropy', optimizer='adam',
                        metrics=['accuracy', AUC(), Precision(), Recall(), TruePositives(),
                                 TrueNegatives(), FalsePositives(), FalseNegatives()])

        # Inverse-frequency class weights, scaled so the total weight is preserved.
        weight_for_0 = (1 / neg)*(total)/2.0
        weight_for_1 = (1 / pos)*(total)/2.0
        class_weight = {0: weight_for_0, 1: weight_for_1}

        # Labels are one-hot encoded to match the 2-unit softmax output.
        clf_mlp.fit(Xdata, y=to_categorical(ydata), epochs=130, batch_size=256,
                    class_weight=class_weight, sample_weight=sample_weights, verbose=0)
        return clf_mlp

    # Unrecognised model types yield None.
    return None


# Scores Table

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
# Stretch the notebook container to the full browser width so wide result
# tables are readable. `display` and `HTML` are imported from the public
# `IPython.display` module; `IPython.core.display` is a deprecated path for them.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Experiments

In [None]:
def run_experiments(database_list, models_list, num_of_cases_to_explain, num_of_adversarial_examples, scaling, optimize, reweighing):
    """Train and evaluate every requested model on every requested dataset.

    For each dataset the data is loaded, optionally reweighed for bias
    mitigation and scaled, then each model is built and evaluated on the
    train, validation, test and shift splits. The combined results table is
    displayed and written to an Excel file named after the dataset and the
    chosen options.

    Parameters
    ----------
    database_list : iterable of str
        Dataset names understood by ``load_data``.
    models_list : list of str
        Model types understood by ``build_model``.
    num_of_cases_to_explain : int
        Forwarded to ``evaluate_model`` for the explainability metrics.
    num_of_adversarial_examples : int
        Forwarded to ``evaluate_model`` for the adversarial metrics.
    scaling : bool
        Fit a ``StandardScaler`` on the training features and apply it.
    optimize : bool
        Forwarded to ``build_model``.
    reweighing : bool
        Replace the sample weights with those produced by ``Reweighing`` with
        respect to the first protected attribute.

    Returns
    -------
    dict
        Fitted models keyed by model type for the last dataset processed
        (empty when ``database_list`` is empty).
    """
    learned_models = {}

    for dataset_name in database_list:
        # load & process data
        (X_train, y_train, prot_attrs_train,
         X_validate, y_validate, prot_attrs_validate,
         X_test, y_test, prot_attrs_test,
         X_shift, y_shift, prot_attrs_shift,
         prot_attrs_names, instance_weights) = load_data(dataset_name)
        add_fact('Sensitive attribute(s): '+ str(prot_attrs_names))

        sample_weights = None
        if instance_weights is not None:
            sample_weights = instance_weights.copy()

        if reweighing:
            add_fact('Performing bias mitigation using Reweighing')
            data = X_train.copy()  # X is a dataframe
            data['target'] = y_train.copy()

            # bias mitigation is done w.r.t. the 1st protected attribute only
            prot_attr = prot_attrs_names[0]

            instance_weights_name = None
            if sample_weights is not None:
                instance_weights_name = 'instance_weights'
                data[instance_weights_name] = sample_weights

            # The protected attributes may be held separately from the features.
            if prot_attrs_train is not None:
                frame = pd.concat([data, prot_attrs_train.copy()], axis=1)
            else:
                frame = data
            dataset = BinaryLabelDataset(df=frame, label_names=['target'],
                                         protected_attribute_names=[prot_attr],
                                         instance_weights_name=instance_weights_name)

            RW = Reweighing(unprivileged_groups=[{prot_attr: 0}],
                            privileged_groups=[{prot_attr: 1}])
            RW.fit(dataset)
            sample_weights = RW.transform(dataset).instance_weights

        scaler = None
        if scaling:
            scaler = StandardScaler().fit(X_train)
            add_fact('Scaling features using StandardScaler()')

        # The explainer depends only on the training data (scaled the same way
        # the models see it), so it is built once per dataset, not per model.
        ## CHECK WITH AIX NOTEBOOK TO SEE IF SCALED DATA NEEDS TO BE PASSED IN
        explainer_data = scaler.transform(X_train) if scaling else X_train.values
        explainer = LimeTabularExplainer(explainer_data, feature_names=X_train.columns,
                                         class_names=['0','1'], discretize_continuous=True)

        train_table = {}
        validate_table = {}
        test_table = {}
        shift_table = {}

        learned_models = {}

        for model_type in models_list:
            model = build_model(model_type, X_train, y_train, scaler, sample_weights, optimize)
            learned_models[model_type] = model

            # Wrap the fitted model for the adversarial-robustness evaluation.
            if model_type == 'LR':
                adv_model = ScikitlearnLogisticRegression(model=model)
            elif model_type == 'RF':
                adv_model = ScikitlearnRandomForestClassifier(model=model)
            elif model_type == 'GBC':
                adv_model = ScikitlearnGradientBoostingClassifier(model=model)
            elif model_type == 'MLP':
                adv_model = KerasClassifier(model=model)
            elif model_type == 'DT':
                adv_model = ScikitlearnDecisionTreeClassifier(model=model)
            else:
                adv_model = None

            splits = (
                ('Train', X_train, y_train, prot_attrs_train, train_table),
                ('Validation', X_validate, y_validate, prot_attrs_validate, validate_table),
                ('Test', X_test, y_test, prot_attrs_test, test_table),
                ('Shift', X_shift, y_shift, prot_attrs_shift, shift_table),
            )
            for split_label, X_split, y_split, prot_attrs_split, table in splits:
                # Only the fact is conditional; evaluate_model is always called.
                if X_split is not None:
                    add_fact('Evaluating model ' + model_type + ' on ' + split_label + ' data')
                table[model_type] = evaluate_model(model, adv_model, model_type, X_split, y_split,
                                                   prot_attrs_split, prot_attrs_names, explainer, scaler,
                                                   num_of_adversarial_examples, num_of_cases_to_explain)

        # One column block per split, each holding one column per model.
        tt = pd.concat(list(train_table.values()), axis=1)
        vt = pd.concat(list(validate_table.values()), axis=1)
        te = pd.concat(list(test_table.values()), axis=1)
        st = pd.concat(list(shift_table.values()), axis=1)
        results_table = pd.concat([tt, vt, te, st],axis=1,keys=('Train','Validate','Test','Shift'))

        filename_qualifier = '_'.join(models_list)
        if scaling:
            filename_qualifier = filename_qualifier + '_scaling'
        if optimize:
            filename_qualifier = filename_qualifier + '_optimized'
        if reweighing:
            filename_qualifier = filename_qualifier + '_reweighing'
        print('Database name:', filename_qualifier, '_',dataset_name)
        display(results_table)
        results_table.to_excel(dataset_name+'_'+filename_qualifier+'_results.xls', index=True)

    # Returned after the loop so that every dataset in database_list is
    # processed; returning inside the loop stopped after the first one.
    return learned_models


In [None]:
def add_HMDA_mortgage_approval_dataset_to_catalog(fname='2018_public_lar_csv.csv'):
    """Split the raw HMDA CSV 70/30 into model-building and validation files.

    Reads ``fname``, draws a reproducible 70% random sample as the
    model-building set, uses the remaining rows as the validation set, writes
    both to CSV files in the working directory and records the file names and
    shapes as facts.

    Parameters
    ----------
    fname : str
        Path of the raw loan/application register CSV.
    """
    import pandas as pd

    train_fname = '2018_public_lar_csv_TRAIN.csv'
    validation_fname = '2018_public_lar_csv_VALIDATION.csv'

    # Read in 1M-row chunks and stitch them back together.
    chunks = pd.read_csv(fname, low_memory=False, iterator=True, chunksize=1000000)
    lar = pd.concat(chunks)

    # Record the file that was actually read, not the default name.
    add_fact("Raw data filename: " + fname)
    add_fact('Raw dataset size: ' + str(lar.shape))

    train = lar.sample(frac=0.70, random_state=42)
    validation = lar.drop(train.index)

    add_fact('Creating datasets for model building (70%) and validation (30%)')
    add_fact('Model building dataset: ' + train_fname)
    add_fact('Model building dataset size: ' + str(train.shape))
    add_fact('Model validation dataset: ' + validation_fname)
    add_fact('Model validation dataset size: ' + str(validation.shape))

    separator = '------------------------------------------------------------------------------'
    print(separator)
    train.to_csv(train_fname, index=False)
    validation.to_csv(validation_fname, index=False)
    print("Training set size: " + str(train.shape))
    print("Validation set size: " + str(validation.shape))
    print(separator)

In [None]:
def process_hmda_mortgage_approval_dataset(fname):
    """Filter and clean an HMDA loan/application register CSV for approval modelling.

    The steps, each recorded with ``add_fact``, are:

    1. restrict rows to one loan product and to originated or denied
       applications;
    2. derive ``derived_race_ethnicity_combination``, ``loan_approved``,
       ``gender`` and ``modified_debt_to_income_ratio``;
    3. convert amount/ratio/value/income columns to float and drop rows with
       'Exempt' or missing values;
    4. drop columns that are constant after filtering, duplicate the derived
       features, or were flagged in the original exploration as only known
       after the lending decision.

    Parameters
    ----------
    fname : str
        Path of the CSV file to process.

    Returns
    -------
    pandas.DataFrame
        The filtered and cleaned table.
    """
    import numpy as np
    import pandas as pd

    # Read in 1M-row chunks and stitch them back together.
    lar = pd.concat(pd.read_csv(fname, low_memory=False, iterator=True, chunksize=1000000))

    add_fact('Dataset name: ' + fname)
    add_fact('Dataset size: ' + str(lar.shape))

    lar_subset = lar.loc[((lar['loan_purpose'] == 1) &
                          (lar['derived_loan_product_type'] == "Conventional:First Lien") &
                          (lar["derived_dwelling_category"] == "Single Family (1-4 Units):Site-Built") &
                          (lar["open_end_line_of_credit"] == 2) &
                          (lar["business_or_commercial_purpose"] == 2) &
                          (lar["occupancy_type"] == 1) &
                          (lar["reverse_mortgage"] == 2) &
                          (lar["negative_amortization"] == 2) &
                          (lar["interest_only_payment"] == 2) &
                          (lar["conforming_loan_limit"] != "U") &
                          (lar["balloon_payment"] == 2)), :]

    for condition in ('loan_purpose == 1',
                      'derived_loan_product_type == "Conventional:First Lien"',
                      'derived_dwelling_category == "Single Family (1-4 Units):Site-Built"',
                      'open_end_line_of_credit == 2',
                      'business_or_commercial_purpose == 2',
                      'occupancy_type == 1',
                      'reverse_mortgage == 2',
                      'negative_amortization == 2',
                      'interest_only_payment == 2',
                      'conforming_loan_limit != "U"',
                      'balloon_payment == 2'):
        add_fact("Limiting rows to condition: " + condition)

    print('Data size after initial filtering: ' + str(lar_subset.shape))

    print("action_taken distribution")
    display(lar_subset['action_taken'].value_counts(normalize=True))

    # Release the full table; only the filtered subset is needed from here on.
    del lar

    # action_taken codes:
    #   1 - Loan originated
    #   2 - Application approved but not accepted
    #   3 - Application denied
    lar_subset = lar_subset.loc[lar_subset['action_taken'].isin([1, 3]), :]
    add_fact("Limiting rows to condition: " + 'action_taken == 1 [loan originated] or action_taken == 3 [application denied]')

    print(lar_subset.shape)
    display(lar_subset['action_taken'].value_counts(normalize=True))

    # Race - White=5, Black=3, Asian=2
    # Ethnicity - Hispanic=1, Non-hispanic=2
    display(pd.crosstab(lar_subset['applicant_race_1'],lar_subset['applicant_ethnicity_1'], normalize=False))
    display(pd.crosstab(lar_subset['applicant_race_1'],lar_subset['derived_race'], normalize=False))
    display(pd.crosstab(lar_subset['applicant_ethnicity_1'],lar_subset['derived_ethnicity'], normalize=False))

    # Combined race/ethnicity label; anything that is not a non-Hispanic White
    # or non-Hispanic Black applicant becomes 'Others'.
    non_hispanic = lar_subset['applicant_ethnicity_1'] == 2
    lar_subset['derived_race_ethnicity_combination'] = np.select(
        [non_hispanic & (lar_subset['applicant_race_1'] == 5),
         non_hispanic & (lar_subset['applicant_race_1'] == 3)],
        ['Non-Hispanic White', 'Non-Hispanic Black'],
        default='Others')

    display(lar_subset['derived_race_ethnicity_combination'].value_counts())

    add_fact("Created new feature: derived_race_ethnicity_combination")
    add_fact("Setting derived_race_ethnicity_combination to 'Non-Hispanic White' if applicant_ethnicity_1 == 2 [non Hispanic] and applicant_race_1 == 5 [White]")
    add_fact("Setting derived_race_ethnicity_combination to 'Non-Hispanic Black' if applicant_ethnicity_1 == 2 [non Hispanic] and applicant_race_1 == 3 [Black]")
    add_fact("Limiting rows to condition: " + 'derived_race_ethnicity_combination == "Non-Hispanic Black" | derived_race_ethnicity_combination == "Non-Hispanic White"')

    # restrict population to non-Hispanic White and non-Hispanic Black only
    lar_subset = lar_subset.loc[lar_subset['derived_race_ethnicity_combination'].isin(
        ['Non-Hispanic White', 'Non-Hispanic Black']), :]

    # Target label: only codes 1 and 3 remain after the filter above, so this
    # is True exactly for originated loans.
    lar_subset['loan_approved'] = lar_subset['action_taken'] <= 2

    display(pd.crosstab(lar_subset['derived_race_ethnicity_combination'],lar_subset['loan_approved'], normalize=False))

    # loan_term is compared as a string.
    lar_subset = lar_subset.loc[lar_subset["loan_term"].isin(['180', '360']), :]
    add_fact("Limiting rows to condition: " + 'loan_term == 180 or 360')

    print(lar_subset.shape)

    display(pd.crosstab(lar_subset['derived_race_ethnicity_combination'],lar_subset['loan_approved'], normalize=False))

    # applicant_sex: 1 is Male, 2 is Female
    display(pd.crosstab(lar_subset['applicant_sex'],lar_subset['loan_approved'], normalize=False))

    lar_subset['gender'] = np.select(
        [lar_subset['applicant_sex'] == 1, lar_subset['applicant_sex'] == 2],
        ['Male', 'Female'],
        default='Others')

    # restrict population to male and female only
    lar_subset = lar_subset.loc[lar_subset['gender'].isin(['Male', 'Female']), :]

    add_fact("Created new feature: gender")
    add_fact("Setting gender to 'Male' if applicant_sex == 1")
    add_fact("Setting gender to 'Female' if applicant_sex == 2")
    add_fact("Limiting rows to condition: " + 'gender == "Male" | gender == "Female"')

    display(pd.crosstab(lar_subset['gender'],lar_subset['loan_approved'], normalize=False))

    print(lar_subset.shape)

    lar_subset['loan_amount'] = lar_subset['loan_amount'].astype('float64')

    print(lar_subset.shape)

    # combined_loan_to_value_ratio is read as strings and may hold 'Exempt'
    # or NaN: drop 'Exempt', convert to float, then drop NaN.
    lar_subset = lar_subset.loc[lar_subset['combined_loan_to_value_ratio'] != 'Exempt', :]
    add_fact("Limiting rows to condition: " + 'combined_loan_to_value_ratio != "Exempt"')

    print(lar_subset.shape)

    lar_subset['combined_loan_to_value_ratio'] = lar_subset['combined_loan_to_value_ratio'].astype('float64')
    add_fact("Converted feature combined_loan_to_value_ratio from String to Float")

    lar_subset = lar_subset.loc[lar_subset['combined_loan_to_value_ratio'].notna(), :]
    add_fact("Limiting rows to condition: " + 'combined_loan_to_value_ratio is not NA')

    print(lar_subset.shape)

    # Same treatment for property_value and income: convert, then drop NaN.
    for column in ('property_value', 'income'):
        lar_subset[column] = lar_subset[column].astype('float64')
        add_fact("Converted feature " + column + " from String to Float")

        lar_subset = lar_subset.loc[lar_subset[column].notna(), :]
        add_fact("Limiting rows to condition: " + column + ' is not NA')
        print(lar_subset.shape)

    # Per the original exploration, hoepa_status is always 3 for declined
    # loans, so it is dropped with the other post-decision columns below.
    display(pd.crosstab(lar_subset['hoepa_status'],lar_subset['loan_approved'], normalize=False))

    # NOTE(review): the original notes also say applicant_credit_score_type
    # should not be 1111 (Exempt), but no such filter is applied - confirm.

    # remove rows with aus_1 == 1111 (Exempt)
    lar_subset = lar_subset.loc[lar_subset['aus_1'] != 1111, :]
    add_fact("Limiting rows to condition: " + 'aus_1 != "1111"')

    # remove rows with a missing or Exempt debt_to_income_ratio
    lar_subset = lar_subset.loc[lar_subset['debt_to_income_ratio'].notna(), :]
    add_fact("Limiting rows to condition: " + 'debt_to_income_ratio is not NA')

    lar_subset = lar_subset.loc[lar_subset['debt_to_income_ratio'] != 'Exempt', :]
    add_fact("Limiting rows to condition: " + 'debt_to_income_ratio != "Exempt"')

    # Replace the bucketed ranges with one representative number each; any
    # other value is kept as-is and converted to float.
    dti_bucket_values = {
        '20%-<30%': '25',
        '30%-<36%': '33',
        '50%-60%': '55',
        '<20%': '15',
        '>60%': '65',
    }
    lar_subset['modified_debt_to_income_ratio'] = (
        lar_subset['debt_to_income_ratio']
        .map(lambda value: dti_bucket_values.get(value, value))
        .astype('float64'))

    add_fact("Creating new feature: modified_debt_to_income_ratio")
    # The recorded fact now matches the mapping above (65 for '>60%').
    add_fact("Setting modified_debt_to_income_ratio to 25,33,55,15,65 corresponding to debt_to_income_ratio = 20%-<30%,30%-<36%,50%-60%,<20%,>60% respectively")
    add_fact("Converting modified_debt_to_income_ratio from String to Float")

    # remove rows where applicant_age is 8888 or 9999
    lar_subset = lar_subset.loc[~lar_subset['applicant_age'].isin(['8888', '9999']), :]
    add_fact("Limiting rows to condition: " + 'applicant_age != "8888" & applicant_age != "9999"')

    # Columns used for filtering or for deriving features above, plus columns
    # the original exploration found to be set only after the decision
    # (interest_rate, discount_points, rate_spread, origination_charges,
    # lender_credits, hoepa_status).
    cols = ['loan_purpose','derived_loan_product_type','derived_dwelling_category',
            'open_end_line_of_credit','business_or_commercial_purpose','occupancy_type',
            'reverse_mortgage','negative_amortization','interest_only_payment',
            'balloon_payment','action_taken','applicant_race_1','applicant_ethnicity_1',
            'derived_race','derived_ethnicity','applicant_sex','hoepa_status',
            'interest_rate','discount_points','rate_spread','origination_charges',
            'lender_credits','activity_year']
    lar_subset = lar_subset.drop(columns=cols)
    add_fact("Dropping columns: "+str(cols))

    cols = ['applicant_race_2','applicant_race_3','applicant_race_4','applicant_race_5',
            'applicant_ethnicity_2','applicant_ethnicity_3','applicant_ethnicity_4','applicant_ethnicity_5',
            'applicant_race_observed','applicant_ethnicity_observed','applicant_sex_observed',
            'aus_2','aus_3','aus_4','aus_5']
    lar_subset = lar_subset.drop(columns=cols)
    add_fact("Dropping columns: "+str(cols))

    co_applicant_columns = [c for c in lar_subset.columns if c.startswith('co_applicant')]
    lar_subset = lar_subset.drop(columns=co_applicant_columns)
    add_fact("Dropping columns: "+str(co_applicant_columns))

    cols = ['construction_method','denial_reason_1','denial_reason_2',
            'denial_reason_3','denial_reason_4','derived_sex','lien_status',
            'loan_type']
    lar_subset = lar_subset.drop(columns=cols)
    add_fact("Dropping columns: "+str(cols))

    cols = ['manufactured_home_secured_property_type','manufactured_home_land_property_interest',
            'multifamily_affordable_units','submission_of_application']
    lar_subset = lar_subset.drop(columns=cols)
    add_fact("Dropping columns: "+str(cols))

    cols = ['total_loan_costs','total_points_and_fees','prepayment_penalty_term']
    lar_subset = lar_subset.drop(columns=cols)
    add_fact("Dropping columns: "+str(cols))

    # remove rows with initially_payable_to_institution == 1111 (Exempt)
    lar_subset = lar_subset.loc[lar_subset['initially_payable_to_institution'] != 1111, :]
    add_fact("Limiting rows to condition: " + 'initially_payable_to_institution != 1111')

    print(lar_subset.shape)
    print(lar_subset.columns)

    add_fact("Size of processed dataset: " + str(lar_subset.shape))

    return lar_subset

In [None]:
# Governance step 1: the Data & AI Policy officer states the policies that
# data and AI models must satisfy. The fact store is reset first, so every
# statement below is recorded under the policy-specification phase.
FACTS = {}
CURRENT_PHASE = 'Data and AI Models Policy Specification'

for _policy_statement in (
    'Datasets must be approved and in data catalog',
    'Race, ethnicity, and gender of applicant cannot be used in models used to make mortgage related decisions',
    'Model predictive performance metrics must minimally include accuracy, balanced_accuracy and AUC score',
    'Models must be checked for bias using Disparate Impact',
    'Models must be checked for faithfulness of explanations',
    'Models must be checked for robustness to Adversarial attacks using using Empirical Robustness metric',
    'Models must be checked for robustness to dataset shift',
):
    add_fact(_policy_statement)

save_facts('policy.json')
print_facts()

In [None]:
# Workflow leading up to this step:
#   1. The business owner requests a model for predicting mortgage approval.
#   2. The data scientist requests that the data be added to the catalog.
#   3. The data steward adds the data to the catalog as two files: a training
#      file for the data scientist to build the model with, and a validation
#      file for the model validator to use afterwards.
CURRENT_PHASE = 'Data Addition to Data Catalog'

add_fact('Model purpose: Predict mortgage approval')
add_HMDA_mortgage_approval_dataset_to_catalog()

# File that receives the facts recorded up to this point.
catalog_facts_file = 'data_catalog.json'
save_facts(catalog_facts_file)
print_facts()

In [None]:
# The data scientist (a.k.a. model builder) takes the training data from the
# catalog and starts the model building process by processing the data.
CURRENT_PHASE = 'Model Building - Data Processing'

# Input (raw training split) and output (processed training split) CSV files.
raw_train_csv = '2018_public_lar_csv_TRAIN.csv'
processed_train_csv = '2018_hmda_mortgage_approval_processed_TRAIN.csv'

lar_subset = process_hmda_mortgage_approval_dataset(raw_train_csv)
lar_subset.to_csv(processed_train_csv, index=False)

save_facts('data_processing.json')
print_facts()

In [None]:
# The data scientist creates the test/train and shift datasets, then builds
# and evaluates the candidate models (no reweighing in this run).
CURRENT_PHASE = 'Model Building - Evaluation'

# Dataset keys and model keys handed to run_experiments.
# Alternative dataset keys noted by the original author:
#   'AIF360-GERMAN', 'AIF360-BANK', 'AIF360-ADULT', MEPS, 'ZINDI', 'BANK', 'ADULT'
database_list = ['HMDA-MORTGAGE-APPROVAL-TRAINING-WITHOUT-PROTECTED-ATTRIBUTES']
models_list = ['LR', 'RF', 'GBC', 'MLP']

# Sample sizes for the explanation and adversarial checks.
num_of_cases_to_explain, num_of_adversarial_examples = 20, 10

# Experiment switches; reweighing stays off for this baseline run.
scaling, optimize, reweighing = True, False, False

learned_models = run_experiments(
    database_list,
    models_list,
    num_of_cases_to_explain,
    num_of_adversarial_examples,
    scaling,
    optimize,
    reweighing,
)

save_facts('model_evaluation.json')
print_facts()

In [None]:

# The data scientist creates the test/train and shift datasets, then builds
# and evaluates the candidate models again, this time with bias mitigation
# (reweighing switched on).
CURRENT_PHASE = 'Model Building - Bias Mitigation'

# Dataset keys and model keys handed to run_experiments.
# Alternative dataset keys noted by the original author:
#   'AIF360-GERMAN', 'AIF360-BANK', 'AIF360-ADULT', MEPS, 'ZINDI', 'BANK', 'ADULT'
database_list = ['HMDA-MORTGAGE-APPROVAL-TRAINING-WITHOUT-PROTECTED-ATTRIBUTES']
models_list = ['LR', 'RF', 'GBC', 'MLP']

# Sample sizes for the explanation and adversarial checks.
num_of_cases_to_explain, num_of_adversarial_examples = 20, 10

# Experiment switches; the only difference from the baseline run is reweighing.
scaling, optimize, reweighing = True, False, True

# NOTE: this rebinds learned_models, replacing any earlier result held under
# that name.
learned_models = run_experiments(
    database_list,
    models_list,
    num_of_cases_to_explain,
    num_of_adversarial_examples,
    scaling,
    optimize,
    reweighing,
)

save_facts('bias_mitigation.json')
print_facts()

In [None]:
# The validator takes the facts from the data scientist and processes the
# validation data the same way the training data was processed.
CURRENT_PHASE = 'Model Validation - Data Processing'

# Input (raw validation split) and output (processed validation split) CSV files.
raw_validation_csv = '2018_public_lar_csv_VALIDATION.csv'
processed_validation_csv = '2018_hmda_mortgage_approval_processed_VALIDATION.csv'

lar_subset = process_hmda_mortgage_approval_dataset(raw_validation_csv)
lar_subset.to_csv(processed_validation_csv, index=False)

save_facts('validation_data_processing.json')
print_facts()

In [None]:
# The data scientist has selected the GBC model as the final model; the
# validator now tests that learnt model against the validation data.
# Note: the validator needs access to the training data as well, because the
# feature scaler is fitted on it.
CURRENT_PHASE = 'Model Validation - Evaluation'

# Four independent, empty result tables (presumably filled in by
# evaluate_model -- confirm against its definition).
accuracy_table, fairness_table, adversarial_table, explainability_table = {}, {}, {}, {}

model_type = 'GBC'
model = learned_models[model_type]

# load_data returns 14 values, in this order:
#   X_train, y_train, prot_attrs_train,
#   X_validate, y_validate, prot_attrs_validate,
#   X_test, y_test, prot_attrs_test,
#   X_shift, y_shift, prot_attrs_shift,
#   prot_attrs_names, instance_weights
# Only the training split and the protected-attribute names are kept from the
# training dataset ...
(X_train, y_train, prot_attrs_train,
 _, _, _,
 _, _, _,
 _, _, _,
 prot_attrs_names, _) = load_data('HMDA-MORTGAGE-APPROVAL-TRAINING-WITHOUT-PROTECTED-ATTRIBUTES')
# ... and only the validation split from the validation dataset.
(_, _, _,
 X_validate, y_validate, prot_attrs_validate,
 _, _, _,
 _, _, _,
 _, _) = load_data('HMDA-MORTGAGE-APPROVAL-VALIDATION-WITHOUT-PROTECTED-ATTRIBUTES')

# Fit the scaler on the training features.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the model for the adversarial checks.
adv_model = ScikitlearnGradientBoostingClassifier(model=model)

# The LIME explainer is built on the scaled training features.
explainer = LimeTabularExplainer(
    scaler.transform(X_train),
    feature_names=X_train.columns,
    class_names=['0', '1'],
    discretize_continuous=True,
)

# num_of_adversarial_examples and num_of_cases_to_explain are the values set
# by an earlier cell.
results_table = evaluate_model(
    model,
    adv_model,
    model_type,
    X_validate,
    y_validate,
    prot_attrs_validate,
    prot_attrs_names,
    explainer,
    scaler,
    num_of_adversarial_examples,
    num_of_cases_to_explain,
)

display(results_table)

save_facts('validation_evaluation.json')
print_facts()

In [None]:
# The validator also carries out a "sanity check" by creating and evaluating a
# simple decision tree model that uses only debt to income and loan to value
# as features.

# Reload the facts saved by the validation evaluation step.
FACTS = read_facts('validation_evaluation.json')

CURRENT_PHASE = 'Model Validation - Simple Model Sanity Check'

# Dataset key and model key handed to run_experiments.
database_list = ['HMDA-MORTGAGE-SIMPLE-MODEL']
models_list = ['DT']

# Sample sizes for the explanation and adversarial checks.
num_of_cases_to_explain, num_of_adversarial_examples = 20, 10

# Experiment switches.
scaling, optimize, reweighing = True, False, True

# NOTE: this rebinds learned_models, replacing any earlier result held under
# that name.
learned_models = run_experiments(
    database_list,
    models_list,
    num_of_cases_to_explain,
    num_of_adversarial_examples,
    scaling,
    optimize,
    reweighing,
)

save_facts('sanity_check.json')
print_facts()

In [None]:
# The validator sends the report to the Model Validation executive, who reviews it and approves the model for deployment.
# The model is embedded into the fuller application, tested, and deployed.

# Model KPIs are checked on a regular basis.

# A significant deviation triggers a model pull/rebuild by the Risk officer.
