In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, roc_auc_score
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [2]:
pd.set_option('display.max_columns', None)
df = pd.read_csv('2021_public_lar.csv',usecols=['state_code','derived_loan_product_type','derived_dwelling_category'
                                                ,'derived_race','applicant_race_1','action_taken',
                                                'loan_purpose','business_or_commercial_purpose',
                                                'loan_amount','combined_loan_to_value_ratio',
                                                'property_value','occupancy_type','income',
                                                'debt_to_income_ratio'])

  df = pd.read_csv('2021_public_lar.csv',usecols=['state_code','derived_loan_product_type','derived_dwelling_category'


In [3]:
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].astype(str)
def average_dti_range(dti_value):
    if pd.isna(dti_value) or dti_value == 'nan' or dti_value == 'Exempt':
        return np.nan

    range_pattern = r'(\d+)%-<(\d+)%|(\d+)%-(\d+)%'
    less_than_pattern = r'<(\d+)%'
    greater_than_pattern = r'>(\d+)%'
    
    if re.match(range_pattern, dti_value):
        bounds = re.findall(range_pattern, dti_value)[0]
        bounds = [float(b) for b in bounds if b]
        lower_bound, upper_bound = bounds
        return (lower_bound + upper_bound) / 2
    elif re.match(less_than_pattern, dti_value):
        upper_bound = re.findall(less_than_pattern, dti_value)[0][0]
        upper_bound = float(upper_bound)
        return upper_bound / 2
    elif re.match(greater_than_pattern, dti_value):
        lower_bound = re.findall(greater_than_pattern, dti_value)[0][0]
        lower_bound = float(lower_bound)
        return lower_bound * 1.1
    else:
        return float(dti_value.replace('%', ''))

In [4]:
df_ny = df[(df["state_code"] == 'NY')]

df_ny = df_ny[df_ny['derived_loan_product_type'] == 'Conventional:First Lien']
df_ny = df_ny.loc[df_ny['loan_purpose'].isin([1])]
df_ny = df_ny.loc[df_ny['business_or_commercial_purpose'].isin([2])]
df_ny = df_ny[df_ny['derived_dwelling_category'] == 'Single Family (1-4 Units):Site-Built']
df_ny = df_ny.loc[df_ny['occupancy_type'].isin([1])]
df_ny = df_ny[df_ny["combined_loan_to_value_ratio"].str.contains("Exempt") == False]
df_ny['combined_loan_to_value_ratio'] = df_ny['combined_loan_to_value_ratio'].astype(str).astype(float)
#df_ny['interest_rate'] = df_ny['interest_rate'].astype(str).astype(float)
df_ny['property_value'] = df_ny['property_value'].astype(str).astype(float)
df_ny = df_ny.loc[df_ny['action_taken'].isin([1,3])]
df_ny = df_ny.loc[df_ny['applicant_race_1'].isin([1,2,3,4,5])]
df_ny['action_taken'] = df['action_taken'].replace({3: 0})
df_ny['debt_to_income_ratio'] = df_ny['debt_to_income_ratio'].apply(average_dti_range)

In [5]:
selected_features = ['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value','action_taken','applicant_race_1']
columns_to_drop = [col for col in df_ny.columns if col not in selected_features]
df_ny = df_ny.drop(columns=columns_to_drop)

In [6]:
df_ny.dropna(inplace=True)

In [7]:


# Feature scaling
scaler = StandardScaler()
df_ny[['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value', 'applicant_race_1']] = scaler.fit_transform(df_ny[['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value', 'applicant_race_1']])

# Split the dataset
X = df_ny.drop(columns=['action_taken'])
y = df_ny['action_taken']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE and Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(X_train, y_train)


# Create the XGBoost model with the appropriate hyperparameters for balancing
xgb_model = xgb.XGBClassifier(scale_pos_weight=1, random_state=42)

# Specify the hyperparameters and their possible values
params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Perform grid search with cross-validation and parallelization
grid_search = GridSearchCV(xgb_model, params, cv=5, verbose=1, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_smote_tomek, y_train_smote_tomek)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Model evaluation
y_pred = grid_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_pred))

# Demographic parity analysis for the XGBoost model
# ...


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best Hyperparameters: {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.9400665926748057
Balanced Accuracy: 0.643245447132373
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.31      0.37       364
           1       0.96      0.98      0.97      5943

    accuracy                           0.94      6307
   macro avg       0.71      0.64      0.67      6307
weighted avg       0.93      0.94      0.93      6307

AUC-ROC: 0.643245447132373


In [8]:
# Get the predicted probabilities
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

# Create a DataFrame with race, loan_approval (y_test), and prediction (y_pred)
results = pd.DataFrame({'race': X_test['applicant_race_1'], 'loan_approval': y_test, 'prediction_proba': y_pred_proba, 'prediction': y_pred})

# Group the results by race and compute the mean for loan_approval and prediction
group_results = results.groupby('race').mean()

# Print the demographic parity analysis
print("Demographic Parity Analysis (Random Forest Model):")
print(group_results)

# Calculate the difference in approval rates between the groups
max_diff = abs(group_results['prediction'].max() - group_results['prediction'].min())
print("Max difference in approval rates between groups (Random Forest Model):", max_diff)

# Set a fairness threshold (e.g., 0.05)
fairness_threshold = 0.05
is_fair = max_diff <= fairness_threshold
print("Is the random forest model fair?", is_fair)

Demographic Parity Analysis (Random Forest Model):
           loan_approval  prediction_proba  prediction
race                                                  
-2.970009       0.857143          0.952392    1.000000
-2.098390       0.931996          0.930634    0.965440
-1.226772       0.905109          0.852042    0.905109
-0.355154       0.555556          0.649794    0.666667
 0.516464       0.948408          0.933796    0.966747
Max difference in approval rates between groups (Random Forest Model): 0.33333333333333337
Is the random forest model fair? False


In [9]:
# Create a separate StandardScaler instance for 'applicant_race_1'
race_scaler = StandardScaler()

# Feature scaling
scaler = StandardScaler()
df_ny[['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value']] = scaler.fit_transform(df_ny[['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value']])
df_ny['applicant_race_1'] = race_scaler.fit_transform(df_ny[['applicant_race_1']])

# Inverse transform the 'applicant_race_1' column
inverse_transformed_race = race_scaler.inverse_transform(df_ny['applicant_race_1'].values.reshape(-1, 1))

# Convert the inverse transformed values back to a Pandas Series
inverse_transformed_race = pd.Series(inverse_transformed_race.ravel())

# Now you have the original 'applicant_race_1' values in 'inverse_transformed_race'


In [13]:
df_ny['applicant_race_1']

1           0.516464
2           0.516464
4           0.516464
9           0.516464
10          0.516464
              ...   
26124279    0.516464
26124280    0.516464
26124281    0.516464
26124283    0.516464
26124285   -2.098390
Name: applicant_race_1, Length: 31534, dtype: float64

In [12]:
inverse_transformed_race.unique()

array([ 0.51646438, -2.09839048, -1.2267722 , -2.97000877, -0.35515391])