## In this notebook, we show a classification example using a dataset that contains information about borrowers.

## In this example, the aim is to predict whether a credit request made by a borrower will be declined. The target variable is therefore binary: it equals one when the credit request is declined and zero otherwise.

## The original dataset includes 60 columns, from which we select "loan_type", "loan_purpose", "lien_status", "income", "gender" and "race" as the explanatory variables. The definitions of these explanatory variables are presented below.


In [1]:
import pandas as pd
import numpy as np
import warnings
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from safeaipackage import check_accuracy, check_robustness, check_explainability, check_fairness, check_privacy  
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier


##modelagnosticsafeaipackage
from modelagnosticsafeaipackage import check_accuracy as modelagnostic_accuracy
from modelagnosticsafeaipackage import check_explainability as modelagnostic_explainability
from modelagnosticsafeaipackage import check_fairness as modelagnostic_fairness
from modelagnosticsafeaipackage import check_privacy as modelagnostic_privacy
from modelagnosticsafeaipackage import check_robustness as modelagnostic_robustness

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Let pandas print wide and long frames without truncating them.
for option_name, option_value in (
    ("display.max_rows", 1000),
    ("display.max_columns", 10000),
):
    pd.set_option(option_name, option_value)

ModuleNotFoundError: No module named 'safeaipackage'

In [12]:
# Read the raw borrower dataset and report its dimensions before previewing it.
data = pd.read_csv("data.csv")
n_rows, n_columns = data.shape
print("This dataset has {} rows and {} columns".format(n_rows, n_columns))
data.head()

This dataset has 157269 rows and 60 columns


Unnamed: 0.1,Unnamed: 0,X,respondent_id,loan_type_name,loan_type,loan_purpose_name,loan_purpose,loan_amount_000s,action_taken_name,action_taken,msamd_name,msamd,county_name,county_code,census_tract_number,applicant_ethnicity_name,applicant_ethnicity,co_applicant_ethnicity_name,co_applicant_ethnicity,applicant_race_name_1,applicant_race_1,applicant_race_name_2,applicant_race_2,applicant_race_name_3,applicant_race_3,applicant_race_name_4,applicant_race_4,applicant_race_name_5,applicant_race_5,co_applicant_race_name_1,co_applicant_race_1,co_applicant_race_name_2,co_applicant_race_2,co_applicant_race_name_3,co_applicant_race_3,co_applicant_race_name_4,co_applicant_race_4,co_applicant_race_name_5,co_applicant_race_5,applicant_sex_name,co_applicant_sex_name,applicant_income_000s,purchaser_type_name,purchaser_type,denial_reason_name_1,denial_reason_1,denial_reason_name_2,denial_reason_2,denial_reason_name_3,denial_reason_3,rate_spread,lien_status_name,lien_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,response
0,1,1,7197000003,Conventional,1,Refinancing,3,100,Loan originated,1,"New York, Jersey City, White Plains - NY, NJ",35614.0,Queens County,81.0,616.01,Not Hispanic or Latino,2,No co-applicant,5,Black or African American,1,,,,,,,,,No co-applicant,8,,,,,,,,,Male,No co-applicant,84,Fannie Mae (FNMA),1,10,10,,,,,,Secured by a first lien,1,2384.0,99.540001,73700.0,150.360001,640.0,688.0,0
1,2,2,75-2921540,Conventional,1,Refinancing,3,315,Loan originated,1,"New York, Jersey City, White Plains - NY, NJ",35614.0,Queens County,81.0,616.01,Not Hispanic or Latino,2,No co-applicant,5,Black or African American,1,,,,,,,,,No co-applicant,8,,,,,,,,,Female,No co-applicant,58,Fannie Mae (FNMA),1,10,10,,,,,,Secured by a first lien,1,2384.0,99.540001,73700.0,150.360001,640.0,688.0,0
2,3,3,852218,Conventional,1,Home purchase,1,117,Loan originated,1,"Dutchess County, Putnam County - NY",20524.0,Dutchess County,27.0,400.03,Not Hispanic or Latino,2,No co-applicant,5,White,0,,,,,,,,,No co-applicant,8,,,,,,,,,Male,No co-applicant,57,Fannie Mae (FNMA),1,10,10,,,,,,Secured by a first lien,1,4031.0,33.389999,97100.0,78.470001,971.0,1530.0,0
3,4,4,7197000003,Conventional,1,Refinancing,3,312,Loan originated,1,"New York, Jersey City, White Plains - NY, NJ",35614.0,Queens County,81.0,616.01,Not Hispanic or Latino,2,No co-applicant,5,Black or African American,1,,,,,,,,,No co-applicant,8,,,,,,,,,Male,No co-applicant,82,Fannie Mae (FNMA),1,10,10,,,,,,Secured by a first lien,1,2384.0,99.540001,73700.0,150.360001,640.0,688.0,0
4,5,5,11-3290207,FHA-insured,2,Home purchase,1,599,Loan originated,1,"New York, Jersey City, White Plains - NY, NJ",35614.0,Queens County,81.0,614.0,Not Hispanic or Latino,2,Not Hispanic or Latino,2,Black or African American,1,,,,,,,,,Black or African American,3,,,,,,,,,Female,Female,67,Other type of purchaser,9,10,10,,,,,1.8,Secured by a first lien,1,1321.0,95.989998,73700.0,147.679993,310.0,390.0,0


### We use the following variables for our experimental analysis:

![Screenshot%202024-04-03%20111717.png](attachment:Screenshot%202024-04-03%20111717.png)

In [13]:
# Explanatory variables plus the binary target ("response") used in this example.
selected_columns = ["loan_type", "loan_purpose", "applicant_race_1", "applicant_income_000s",
                    "lien_status", "applicant_sex_name", "response"]

# .loc slices by label and includes both endpoints, so labels 0..1000 are kept.
# The explicit .copy() makes the subset independent of the full frame, so the
# in-place column re-encoding done afterwards cannot run into pandas' view/copy ambiguity.
data = data.loc[0:1000, selected_columns].copy()

data.head()

Unnamed: 0,loan_type,loan_purpose,applicant_race_1,applicant_income_000s,lien_status,applicant_sex_name,response
0,1,3,1,84,1,Male,0
1,1,3,1,58,1,Female,0
2,1,1,0,57,1,Male,0
3,1,3,1,82,1,Male,0
4,2,1,1,67,1,Female,0


In [14]:
# Encode the applicant's sex as a binary indicator: 1 for "Male", 0 for every other value.
# The already-encoded value 1 is accepted as well so that the cell is idempotent:
# comparing against "Male" alone turns the whole column into zeros when the cell is re-run.
data["applicant_sex_name"] = np.where(data["applicant_sex_name"].isin(["Male", 1]), 1, 0)
data.head()

Unnamed: 0,loan_type,loan_purpose,applicant_race_1,applicant_income_000s,lien_status,applicant_sex_name,response
0,1,3,1,84,1,1,0
1,1,3,1,58,1,0,0
2,1,1,0,57,1,1,0
3,1,3,1,82,1,1,0
4,2,1,1,67,1,0,0


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the selected columns.
data.describe()

Unnamed: 0,loan_type,loan_purpose,applicant_race_1,applicant_income_000s,lien_status,applicant_sex_name,response
count,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0
mean,1.24975,1.803197,0.351648,132.762238,1.016983,0.548452,0.114885
std,0.433085,0.980933,0.477724,208.42184,0.129272,0.497896,0.319043
min,1.0,1.0,0.0,8.0,1.0,0.0,0.0
25%,1.0,1.0,0.0,68.0,1.0,0.0,0.0
50%,1.0,1.0,0.0,98.0,1.0,1.0,0.0
75%,1.0,3.0,1.0,136.0,1.0,1.0,0.0
max,2.0,3.0,1.0,4342.0,2.0,1.0,1.0


In [16]:
# Number of missing values per column.
data.isna().sum()

loan_type                0
loan_purpose             0
applicant_race_1         0
applicant_income_000s    0
lien_status              0
applicant_sex_name       0
response                 0
dtype: int64

## Define X and y for the classification problem. We use 30% of the data as the test data and the remaining 70% as the train data

In [17]:
# Target vector and feature matrix (every column except the target).
y = data["response"]
X = data.drop(columns=["response"])

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

for feature_frame in (xtrain, xtest):
    print(feature_frame.shape)

(700, 6)
(301, 6)


In [18]:
# Full Random Forest model trained on all explanatory variables.
model1 = RandomForestClassifier(random_state=1)
model1.fit(xtrain, ytrain)
yhat_mod1 = model1.predict(xtest)

# ACCURACY

In [19]:
# RGA of the full model, computed from the test targets and the test-set predictions.
# NOTE(review): RGA presumably stands for Rank Graduation Accuracy — confirm against the package docs.
rga_value = modelagnostic_accuracy.rga(ytest, yhat_mod1)
rga_value

0.5160315985130112

In [21]:
# The RGA-based test needs the predictions of a reduced model (yhat_mod1_rm).
# It compares the predictive accuracy of the full Random Forest with that of a
# reduced Random Forest trained without "lien_status".

xtrain_rm = xtrain.drop(columns=["lien_status"])
xtest_rm = xtest.drop(columns=["lien_status"])

model1_rm = RandomForestClassifier(random_state=1)
model1_rm.fit(xtrain_rm, ytrain)
yhat_mod1_rm = model1_rm.predict(xtest_rm)

pvalue_rga = modelagnostic_accuracy.rga_statistic_test(ytest, yhat_mod1_rm, yhat_mod1)
pvalue_rga

1.0

# EXPLAINABILITY

In [23]:
## find the contribution of lien_status: RGE is computed from the predictions of the full model
## and those of the reduced model trained without "lien_status"

rge_value = modelagnostic_explainability.rge(yhat_mod1, yhat_mod1_rm)
rge_value

0.01907961460446228

In [24]:
## apply the RGE based statistical test for "lien_status", using the same pair of predictions
## (full model vs. reduced model without "lien_status") as the RGE value itself.

pvalue_rge = modelagnostic_explainability.rge_statistic_test(yhat_mod1, yhat_mod1_rm)

pvalue_rge

0.26940351900563964

# ROBUSTNESS

In [26]:
## The perturbation below mirrors the perturbation process done in the safeaipackage.
## To find the RGR of the model when a variable (for example lien_status) is perturbed,
## the corresponding column of the training data is perturbed with the `perturb` function.

def perturb(data, variable, perturbation_percentage=0.05):
    """Return a re-indexed copy of ``data`` with the tails of ``variable`` swapped.

    The rows are ranked by the value of ``variable`` (ties keep their original
    order). The k lowest-ranked rows exchange their values with the k
    highest-ranked rows: the smallest with the largest, the second smallest with
    the second largest, and so on. k is the smaller of the two tail sizes implied
    by ``perturbation_percentage``.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to perturb. It is not modified; the result has a fresh 0..n-1 index.
    variable : str
        Name of the column to perturb.
    perturbation_percentage : float, default 0.05
        Fraction of observations in each tail; must lie in [0, 0.5].

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` (index reset) with the perturbed column.

    Raises
    ------
    ValueError
        If ``perturbation_percentage`` is outside [0, 0.5].
    """
    if not 0 <= perturbation_percentage <= 0.5:
        raise ValueError("perturbation_percentage must be between 0 and 0.5")

    perturbed = data.reset_index(drop=True)
    column = perturbed.loc[:, variable]
    values = column.tolist()
    n_obs = len(values)

    # Row positions ordered by value; sorted() is stable, so ties keep row order.
    order = sorted(range(n_obs), key=values.__getitem__)

    # Tail sizes: values below the 5th percentile cut and above the 95th percentile cut
    # (for the default percentage); the smaller of the two decides how many swaps are made.
    lower_count = int(np.ceil(perturbation_percentage * n_obs))
    upper_count = n_obs - int(np.ceil((1 - perturbation_percentage) * n_obs))
    n_swaps = min(lower_count, upper_count)

    if n_swaps > 0:
        lower_positions = order[:n_swaps]
        # Reversed so that the j-th smallest value is paired with the j-th largest one.
        upper_positions = order[-n_swaps:][::-1]
        new_variable = column.copy()
        # The two tails never overlap (n_swaps <= n_obs / 2), so the swaps are independent.
        new_variable.iloc[lower_positions] = column.iloc[upper_positions].to_numpy()
        new_variable.iloc[upper_positions] = column.iloc[lower_positions].to_numpy()
        perturbed.loc[:, variable] = new_variable

    return perturbed


traindata_ = perturb(xtrain, "lien_status")

In [27]:

# Refit the Random Forest on the training data with the perturbed "lien_status" column
# and score the unperturbed test set with it.
model1_pert = RandomForestClassifier(random_state=1)
model1_pert.fit(traindata_, ytrain)
yhat_mod1_pert = model1_pert.predict(xtest)

# RGR is computed from the predictions of the original and of the perturbed-data model.
rgr_value = modelagnostic_robustness.rgr(yhat_mod1, yhat_mod1_pert)
rgr_value

0.9600025354969574

In [29]:
## The robustness of the Random Forest model with respect to "lien_status" is compared here
## with that of a GradientBoostingClassifier trained on the same original / perturbed data.

model2 = GradientBoostingClassifier(random_state=1)
model2.fit(xtrain, ytrain)
yhat_mod2 = model2.predict(xtest)

model2_pert = GradientBoostingClassifier(random_state=1)
model2_pert.fit(traindata_, ytrain)
yhat_mod2_pert = model2_pert.predict(xtest)

pvalue_rgr = modelagnostic_robustness.rgr_statistic_test(
    yhat_mod1, yhat_mod2, yhat_mod1_pert, yhat_mod2_pert
)
pvalue_rgr

0.47677104454989516

# FAIRNESS

In [30]:
# RGF is computed from the predictions of the full model and those of the reduced model
# fitted without "lien_status" (yhat_mod1_rm, built in the accuracy section).
# NOTE(review): a fairness check presumably targets a protected attribute such as
# "applicant_sex_name" or "applicant_race_1" rather than "lien_status" — confirm.
rgf_value = modelagnostic_fairness.rgf(yhat_mod1, yhat_mod1_rm)

rgf_value

0.9809203853955377

In [31]:
# RGF-based statistical test on the same pair of predictions used for rgf_value
# (full model vs. reduced model without "lien_status").
pvalue_rgf = modelagnostic_fairness.rgf_statistic_test(yhat_mod1, yhat_mod1_rm)

pvalue_rgf

0.26940351900563964

# PRIVACY

In [33]:
## we find rgp for the model when one training observation is removed.
## NOTE: drop(10, axis=0) removes the row whose index *label* is 10. xtrain keeps the original
## row labels of `data` after the split, so this is not necessarily the tenth row by position,
## and the call raises KeyError if label 10 is not part of the training set.

xtrain_10_rm = xtrain.drop(10, axis=0)
ytrain_10_rm = ytrain.drop(10, axis=0)
model1_10_rm = RandomForestClassifier(random_state=1).fit(xtrain_10_rm, ytrain_10_rm)
yhat_mod1_10_rm = model1_10_rm.predict(xtest)

rgp_value = modelagnostic_privacy.rgp(yhat_mod1, yhat_mod1_10_rm)

rgp_value

0.9655172413793104

In [34]:
# RGP-based statistical test comparing the full model's predictions with those of the
# model refitted after removing the training row with index label 10.
pvalue_rgp = modelagnostic_privacy.rgp_statistic_test(yhat_mod1, yhat_mod1_10_rm)

pvalue_rgp

0.15260654461146148