In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, roc_auc_score
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
pd.set_option('display.max_columns', None)
df = pd.read_csv('2021_public_lar.csv',usecols=['state_code','derived_loan_product_type','derived_dwelling_category'
                                                ,'derived_race','applicant_race_1','action_taken',
                                                'loan_purpose','business_or_commercial_purpose',
                                                'loan_amount','combined_loan_to_value_ratio',
                                                'property_value','occupancy_type','income',
                                                'debt_to_income_ratio'])

  df = pd.read_csv('2021_public_lar.csv',usecols=['state_code','derived_loan_product_type','derived_dwelling_category'


In [3]:
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].astype(str)
def average_dti_range(dti_value):
    if pd.isna(dti_value) or dti_value == 'nan' or dti_value == 'Exempt':
        return np.nan

    range_pattern = r'(\d+)%-<(\d+)%|(\d+)%-(\d+)%'
    less_than_pattern = r'<(\d+)%'
    greater_than_pattern = r'>(\d+)%'
    
    if re.match(range_pattern, dti_value):
        bounds = re.findall(range_pattern, dti_value)[0]
        bounds = [float(b) for b in bounds if b]
        lower_bound, upper_bound = bounds
        return (lower_bound + upper_bound) / 2
    elif re.match(less_than_pattern, dti_value):
        upper_bound = re.findall(less_than_pattern, dti_value)[0][0]
        upper_bound = float(upper_bound)
        return upper_bound / 2
    elif re.match(greater_than_pattern, dti_value):
        lower_bound = re.findall(greater_than_pattern, dti_value)[0][0]
        lower_bound = float(lower_bound)
        return lower_bound * 1.1
    else:
        return float(dti_value.replace('%', ''))

In [4]:
df_ny = df[(df["state_code"] == 'NY')]

df_ny = df_ny[df_ny['derived_loan_product_type'] == 'Conventional:First Lien']
df_ny = df_ny.loc[df_ny['loan_purpose'].isin([1])]
df_ny = df_ny.loc[df_ny['business_or_commercial_purpose'].isin([2])]
df_ny = df_ny[df_ny['derived_dwelling_category'] == 'Single Family (1-4 Units):Site-Built']
df_ny = df_ny.loc[df_ny['occupancy_type'].isin([1])]
df_ny = df_ny[df_ny["combined_loan_to_value_ratio"].str.contains("Exempt") == False]
df_ny['combined_loan_to_value_ratio'] = df_ny['combined_loan_to_value_ratio'].astype(str).astype(float)
#df_ny['interest_rate'] = df_ny['interest_rate'].astype(str).astype(float)
df_ny['property_value'] = df_ny['property_value'].astype(str).astype(float)
df_ny = df_ny.loc[df_ny['action_taken'].isin([1,3])]
df_ny = df_ny.loc[df_ny['applicant_race_1'].isin([1,2,3,4,5])]
df_ny['action_taken'] = df['action_taken'].replace({3: 0})
df_ny['debt_to_income_ratio'] = df_ny['debt_to_income_ratio'].apply(average_dti_range)
df_ny['applicant_race_1'] = df_ny['applicant_race_1'].astype(int)

In [5]:
selected_features = ['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value','action_taken','applicant_race_1']
columns_to_drop = [col for col in df_ny.columns if col not in selected_features]
df_ny = df_ny.drop(columns=columns_to_drop)

In [6]:
df_ny['applicant_race_1'].unique()

array([5, 2, 3, 1, 4])

In [7]:
df_ny.dtypes

action_taken                      int64
loan_amount                       int64
combined_loan_to_value_ratio    float64
property_value                  float64
income                          float64
debt_to_income_ratio            float64
applicant_race_1                  int32
dtype: object

In [8]:
df_ny

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
1,1,95000,95.000,105000.0,35.0,39.0,5
2,1,95000,96.999,95000.0,24.0,38.0,5
4,1,105000,90.000,115000.0,80.0,1.0,5
9,1,255000,80.000,315000.0,68.0,36.0,5
10,1,305000,89.820,335000.0,107.0,38.0,5
...,...,...,...,...,...,...,...
26124279,1,125000,96.154,135000.0,49.0,25.0,5
26124280,1,135000,97.000,135000.0,50.0,33.0,5
26124281,1,135000,97.000,145000.0,51.0,33.0,5
26124283,1,155000,97.000,165000.0,49.0,41.0,5


In [9]:
# Approved sample

In [10]:
df_ny[df_ny['action_taken'] == 1].sample(n=10)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
11030630,1,325000,95.0,335000.0,65.0,49.0,5
24458070,1,205000,94.39,215000.0,58.0,55.0,5
16822038,1,495000,38.385,1305000.0,,,2
197855,1,285000,95.0,305000.0,135.0,25.0,5
26123851,1,185000,96.977,225000.0,47.0,33.0,5
16820754,1,535000,59.685,885000.0,,,2
16821613,1,125000,60.0,215000.0,,,2
11857882,1,65000,90.0,75000.0,114.0,25.0,5
20969077,1,395000,95.0,425000.0,158.0,38.0,3
195046,1,135000,73.333,185000.0,52.0,43.0,5


In [11]:
df_ny[(df_ny['action_taken'] == 1) & (df_ny['applicant_race_1'] == 4)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
546825,1,105000,80.0,135000.0,41.0,40.0,4
17732917,1,205000,95.0,215000.0,65.0,37.0,4


In [12]:
df_ny[(df_ny['action_taken'] == 1) & (df_ny['applicant_race_1'] == 3)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
8377617,1,335000,90.0,385000.0,137.0,44.0,3
17732835,1,595000,95.0,625000.0,105.0,44.0,3


In [13]:
df_ny[(df_ny['action_taken'] == 1) & (df_ny['applicant_race_1'] == 2)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
25937100,1,335000,80.0,415000.0,137.0,55.0,2
6367149,1,325000,95.0,345000.0,94.0,44.0,2


In [14]:
df_ny[(df_ny['action_taken'] == 1) & (df_ny['applicant_race_1'] == 1)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
25938274,1,505000,80.0,635000.0,139.0,33.0,1
3338515,1,195000,85.0,235000.0,58.0,41.0,1


In [15]:
# Rejected sample

In [16]:
df_ny[df_ny['action_taken'] == 0].sample(n=10)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
7635976,0,425000,80.0,525000.0,133.0,55.0,2
769617,0,395000,61.417,635000.0,54.0,6.6,5
759807,0,465000,101.264,455000.0,44.0,44.0,5
25768036,0,265000,90.0,295000.0,60.0,6.6,5
1605360,0,195000,64.883,295000.0,69.0,25.0,5
11584690,0,295000,95.0,315000.0,47.0,6.6,5
274258,0,145000,95.0,155000.0,62.0,37.0,5
15833595,0,1005000,102.56,975000.0,340.0,25.0,2
12049190,0,205000,57.0,355000.0,170.0,42.0,5
5941638,0,235000,95.0,255000.0,59.0,39.0,5


In [None]:
df_ny[(df_ny['action_taken'] == 0) & (df_ny['applicant_race_1'] == 5)].sample(n=2)

In [21]:
df_ny[(df_ny['action_taken'] == 0) & (df_ny['applicant_race_1'] == 4)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
11841286,0,165000,103.061,165000.0,28.0,6.6,4
3289507,0,975000,85.0,1155000.0,161.0,55.0,4


In [22]:
df_ny[(df_ny['action_taken'] == 0) & (df_ny['applicant_race_1'] == 3)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
5297457,0,815000,95.0,865000.0,125.0,55.0,3
21244791,0,285000,80.0,355000.0,77.0,33.0,3


In [23]:
df_ny[(df_ny['action_taken'] == 0) & (df_ny['applicant_race_1'] == 2)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
17648213,0,445000,80.0,555000.0,97.0,33.0,2
1614812,0,505000,81.6,625000.0,82.0,55.0,2


In [24]:
df_ny[(df_ny['action_taken'] == 0) & (df_ny['applicant_race_1'] == 1)].sample(n=2)

Unnamed: 0,action_taken,loan_amount,combined_loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_race_1
197596,0,145000,97.0,155000.0,66.0,44.0,1
15011018,0,295000,80.0,375000.0,172.0,55.0,1


5 - White   
2 - Asian   
3 - Black or African American   
1 - American Indian or Alaska Native  
4 - Native Hawaiian or Other Pacific Islander   

In [7]:

df_ny.dropna(inplace=True)

# Feature scaling
scaler = StandardScaler()
df_ny[['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value', 'applicant_race_1']] = scaler.fit_transform(df_ny[['income', 'debt_to_income_ratio', 'loan_amount', 'combined_loan_to_value_ratio', 'property_value', 'applicant_race_1']])

# Split the dataset
X = df_ny.drop(columns=['action_taken'])
y = df_ny['action_taken']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE and Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(X_train, y_train)

# Define the random forest model with class_weight='balanced'
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Specify the hyperparameters and their possible values
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

'''
# Perform randomized search with cross-validation and parallelization
random_search = RandomizedSearchCV(rf, params, n_iter=20, cv=5, verbose=1, scoring='roc_auc', random_state=42, n_jobs=-1)
random_search.fit(X_train_smote_tomek, y_train_smote_tomek) '''

# Perform grid search with cross-validation and parallelization
grid_search = GridSearchCV(rf, params, cv=5, verbose=1, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_smote_tomek, y_train_smote_tomek)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

'''
# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_) '''

# Model evaluation
#y_pred = random_search.predict(X_test)
y_pred = grid_search.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_pred))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.9185032503567465
Balanced Accuracy: 0.657593290102124
Classification Report:
               precision    recall  f1-score   support

           0       0.32      0.36      0.34       364
           1       0.96      0.95      0.96      5943

    accuracy                           0.92      6307
   macro avg       0.64      0.66      0.65      6307
weighted avg       0.92      0.92      0.92      6307

AUC-ROC: 0.657593290102124


In [8]:
# Get the predicted probabilities
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]

# Create a DataFrame with race, loan_approval (y_test), and prediction (y_pred)
results = pd.DataFrame({'race': X_test['applicant_race_1'], 'loan_approval': y_test, 'prediction_proba': y_pred_proba, 'prediction': y_pred})

# Group the results by race and compute the mean for loan_approval and prediction
group_results = results.groupby('race').mean()

# Print the demographic parity analysis
print("Demographic Parity Analysis (Random Forest Model):")
print(group_results)

# Calculate the difference in approval rates between the groups
max_diff = abs(group_results['prediction'].max() - group_results['prediction'].min())
print("Max difference in approval rates between groups (Random Forest Model):", max_diff)

# Set a fairness threshold (e.g., 0.05)
fairness_threshold = 0.05
is_fair = max_diff <= fairness_threshold
print("Is the random forest model fair?", is_fair)


Demographic Parity Analysis (Random Forest Model):
           loan_approval  prediction_proba  prediction
race                                                  
-2.970009       0.857143          0.818202    0.928571
-2.098390       0.931996          0.845507    0.928651
-1.226772       0.905109          0.730839    0.829684
-0.355154       0.555556          0.543333    0.666667
 0.516464       0.948408          0.871532    0.944579
Max difference in approval rates between groups (Random Forest Model): 0.2779121322047562
Is the random forest model fair? False


In [9]:
# Replace these example values with the actual input values
new_input = pd.DataFrame({
    'income': [35000],
    'debt_to_income_ratio': [0.75],
    'loan_amount': [125000],
    'combined_loan_to_value_ratio': [0.83],
    'property_value': [150000],
    'applicant_race_1': [5]
})


In [10]:
# Preprocess the input data (feature scaling)
new_input_scaled = scaler.transform(new_input)

# Make a prediction using the trained model
prediction = grid_search.predict(new_input_scaled)

# Print the prediction result
print("Prediction:", prediction[0])


Prediction: 1


