In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read File
file_path = 'cleaned.csv'
hmda = pd.read_csv(file_path)

In [4]:
hmda_sample = hmda.sample(frac =.10) 

In [7]:
len(hmda_sample)

2486666

In [9]:
hmda_sample.dtypes

as_of_year                          int64
action_taken                        int64
loan_type                           int64
loan_purpose                        int64
loan_amount_000s                  float64
msamd                             float64
state_code                        float64
county_code                       float64
applicant_ethnicity                 int64
co_applicant_ethnicity              int64
applicant_race_1                    int64
co_applicant_race_1                 int64
applicant_sex                       int64
co_applicant_sex                    int64
applicant_income_000s             float64
purchaser_type                      int64
rate_spread                       float64
hoepa_status                        int64
population                        float64
minority_population               float64
hud_median_family_income          float64
tract_to_msamd_income             float64
number_of_owner_occupied_units    float64
number_of_1_to_4_family_units     

In [11]:
# Dropping rate_spread for now as well
X = hmda_sample.drop(columns=['action_taken', 'rate_spread'])
y = hmda_sample['action_taken']


In [13]:
X = pd.get_dummies(X, drop_first=True)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Random Forest Model
rf_model = RandomForestClassifier(random_state=42, verbose=2)
rf_model.fit(X_train, y_train)


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100



KeyboardInterrupt



In [21]:
# Accuracy Scores and Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix

y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))


KeyboardInterrupt



In [None]:
# ROC AUC Curve
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
auc_score = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
importances = rf_model.feature_importances_
features = X.columns
plt.barh(features, importances)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance")
plt.show()

In [25]:
from sklearn.naive_bayes import GaussianNB

In [26]:
hmda_nb = GaussianNB()

hmda_nb.fit(X_train, y_train)

In [28]:
y_pred = hmda_nb.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95    413621
           1       1.00      0.53      0.69     83713

    accuracy                           0.92    497334
   macro avg       0.95      0.77      0.82    497334
weighted avg       0.93      0.92      0.91    497334

Confusion Matrix
[[413403    218]
 [ 39169  44544]]


In [31]:
y_pred = hmda_nb.predict(X_test)

from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 92.0803725464175


In [33]:
# race_mapping = {5: 'White', 3: 'Black or African American', 2: 'Asian', 1: 'American Indian or Alaska Native', 4: 'Native Hawaiian or Other Pacific Islander'}
# hmda_sample['action_taken'] = hmda_sample['action_taken'].apply(lambda x: 1 if x in [1, 2] else 0)
# hmda_sample['applicant_race_1'] = hmda_sample['applicant_race_1'].map(race_mapping)
# hmda_sample.describe()

In [35]:
hmda_sample['applicant_race_1']

20493630    5
785654      5
23721266    5
11834421    3
5852959     1
           ..
15768785    3
10510466    5
18606040    5
23705045    5
23385277    5
Name: applicant_race_1, Length: 2486666, dtype: int64

In [37]:
#Training the data on white data and then testing on all data
# 5 is white
white_data = hmda_sample[hmda_sample['applicant_race_1'] == 5]
X_white = white_data.drop(columns=['action_taken', 'rate_spread'])
y_white = white_data['action_taken']


In [39]:
white_data

Unnamed: 0,as_of_year,action_taken,loan_type,loan_purpose,loan_amount_000s,msamd,state_code,county_code,applicant_ethnicity,co_applicant_ethnicity,...,applicant_income_000s,purchaser_type,rate_spread,hoepa_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
20493630,2015,0,2,3,118.0,43900.0,45.0,83.0,1,0,...,45.0,0,-1.00,2,7920.0,16.290001,54800.0,119.900002,2317.0,3005.0
785654,2007,0,1,1,34.0,31084.0,6.0,37.0,2,2,...,371.0,0,-1.00,2,4268.0,35.259998,56500.0,154.720001,1353.0,1538.0
23721266,2017,0,2,1,209.0,38060.0,4.0,13.0,2,0,...,70.0,0,-1.00,2,7333.0,30.400000,66200.0,130.979996,1982.0,3416.0
15220120,2012,0,2,1,181.0,24020.0,36.0,113.0,2,0,...,50.0,0,-1.00,2,3731.0,5.010000,62600.0,99.370003,1068.0,1750.0
9401259,2009,0,1,3,328.0,17140.0,39.0,165.0,2,2,...,149.0,0,-1.00,2,7708.0,6.330000,69200.0,155.580002,2250.0,2500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075418,2007,0,1,3,164.0,18140.0,39.0,159.0,2,0,...,46.0,0,-1.00,2,5738.0,1.850000,64200.0,113.629997,1763.0,2131.0
10510466,2010,0,3,3,305.0,17820.0,8.0,41.0,2,2,...,98.0,0,-1.00,2,4485.0,12.910000,70600.0,154.130005,1261.0,1577.0
18606040,2014,1,2,1,132.0,41420.0,41.0,47.0,1,0,...,31.0,7,1.57,2,4834.0,52.849998,55800.0,105.480003,1012.0,1305.0
23705045,2017,0,1,3,42.0,19124.0,48.0,85.0,2,2,...,112.0,0,-1.00,2,17773.0,33.750000,73400.0,165.919998,4815.0,5128.0


In [40]:
white_data['applicant_race_1']

20493630    5
785654      5
23721266    5
15220120    5
9401259     5
           ..
1075418     5
10510466    5
18606040    5
23705045    5
23385277    5
Name: applicant_race_1, Length: 1959501, dtype: int64

In [43]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_white_scaled = scaler.fit_transform(X_white)

In [44]:
hmda_nb.fit(X_white_scaled, y_white)

In [46]:
# Set up data for machine learning
# Create function that creates stratified samples of a dataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
def stratified_sample(df, col, sample_size):
    """Creates a new dataframe that has the same proportions of specified column as the original dataframe"""
    sample = pd.DataFrame()
    total_obs = df.shape[0]
    grouped = df.groupby(col)
    for category in list(grouped.groups.keys()):
        group = grouped.get_group(category)
        group_size = group.shape[0]
        group_sample_size = round((group_size / total_obs) * sample_size)
        if sample.shape[0] > 0:
            sample = pd.concat([sample, group.sample(n=group_sample_size, random_state=313)], ignore_index=True)
        else:
            sample = grouped.get_group(category).sample(n=group_sample_size, random_state=313)
    return sample

# Function that collects all data for a specific year
def set_year(df, year):
    """Creates a new dataframe that contains all observations for a specific year"""
    year_data = df[df['as_of_year'] == year]
    return year_data

# Function that splits data into training and test sets
def split_xy(df, y_col):
    """Splits a dataframe into X and y components. The function assumes that the last column is the y variable."""
    X = df.drop(columns=[y_col])
    y = df[y_col]
    return [X, y]

# Create sample function that only draws from specified values in a column
def value_sample(df, col, value, sample_size):
    """Creates a dataframe that has a randome sample of observations that have specified value in
    specified column. For example, if col = 'applicant_race_1' and value = 5 (which corresponds to white), 
    the sample will only have observations where the applicant is white.""" 
    sample = df[df[col] == value].sample(sample_size, random_state=313)
    return sample

# Function that create training data for white model and all race model, and test data for each race
def model_samples(df, year_sample_size, test_sample_size, remove_race=True):
    """Fucntion that creates our training and test data for our ML models. If remove_race is true, the models will be 
    blind to the races of each application."""
    # First collect equal sized samples from each year
    data_by_year = []
    for year in range(2007, 2018):
        year_dataset = set_year(df, year)
        # Create training data for white and all race models
        white_train = value_sample(year_dataset, 'applicant_race_1', 5, year_sample_size)
        all_train = stratified_sample(year_dataset, 'applicant_race_1', year_sample_size)
        if remove_race:
            white_train = white_train.drop(columns=['applicant_race_1', 'co_applicant_race_1'])
            all_train = all_train.drop(columns=['applicant_race_1', 'co_applicant_race_1'])
        # Create test data for each race
        race_tests = []
        for race in range(1, 6):
            race_sample = value_sample(year_dataset, 'applicant_race_1', race, test_sample_size)
            if remove_race:
                race_sample = race_sample.drop(columns=['applicant_race_1', 'co_applicant_race_1'])
            race_tests.append(race_sample)
        data_by_year.append([white_train, all_train, race_tests])
    # Complied the data for each year
    year_data = data_by_year.pop(0) 
    compiled_white = year_data[0]
    compiled_all_race = year_data[1]
    compiled_race_tests = year_data[2]  
    while len(data_by_year) > 0:
        year_data = data_by_year.pop(0)
        compiled_white = pd.concat([compiled_white, year_data[0]], ignore_index=True)
        compiled_all_race = pd.concat([compiled_all_race, year_data[1]], ignore_index=True)
        for i in range(0, 5):
            compiled_race_tests[i] = pd.concat([compiled_race_tests[i], year_data[2][i]], ignore_index=True)
    return [compiled_white, compiled_all_race, compiled_race_tests]

def model_testing(model, sample_data, y_col, cv=5, param_grid=None):
    """Fits white model and all race based on the specified training data. The y_col is the y variable column name.
    The sample_data parameter is the output of the model_samples function and param_grid is the hyperparameter grid
    that will be used in the GridSearchCV function. This function will return the results of the each model with given 
    parameters and the testing results from each race."""
    # Create the white model dataset
    X_white, y_white = split_xy(sample_data[0], y_col)
    # Create the all race model dataset
    X_all, y_all = split_xy(sample_data[1], y_col)

    # Set up Stratified K-Fold cross-validation
    stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=313)
    # Define scoring metrics
    scoring = {'roc_auc': 'roc_auc', 'precision': 'precision', 'recall': 'recall', 'accuracy': 'accuracy'}

    # Initialize GridSearchCV for White Only Model
    white_search = GridSearchCV(model, param_grid, scoring=scoring, cv=stratified_kfold, refit='roc_auc', return_train_score=True)
    white_search.fit(X_white, y_white)
    # Initialize GridSearchCV for All Races Model
    all_search = GridSearchCV(model, param_grid, scoring=scoring, cv=stratified_kfold, refit='roc_auc', return_train_score=True)
    all_search.fit(X_all, y_all)

    # Extract results for each model
    white_results = pd.DataFrame(white_search.cv_results_)
    all_results = pd.DataFrame(all_search.cv_results_)
    # Add model type to results
    white_results['Model'] = 'White only'
    all_results['Model'] = 'All races'
    # Combine results
    combined_results = pd.concat([white_results, all_results])

    # Sort combined results by index then drop
    combined_results = combined_results.sort_index()

    # Select relevant columns and rename them for clarity
    relevant_columns = [f'param_{param}' for param in param_grid.keys()] + [
        'mean_test_roc_auc', 'mean_test_precision', 'mean_test_recall', 'mean_test_accuracy', 'Model']
    combined_results = combined_results[relevant_columns]

    # Rename columns for final output
    column_mapping = {f'param_{param}': param for param in param_grid.keys()}
    column_mapping.update({
        'mean_test_roc_auc': 'ROC-AUC Score',
        'mean_test_precision': 'Precision',
        'mean_test_recall': 'Recall',
        'mean_test_accuracy': 'Accuracy'})
    combined_results.rename(columns=column_mapping, inplace=True)

    # Order the columns of so that Model is first
    column_order = ['Model'] + [param for param in param_grid.keys()] + ['ROC-AUC Score', 'Precision', 'Recall', 'Accuracy']
    combined_results = combined_results[column_order]

    return combined_results

# Remove rate spread from the dataset
hmda_no_rate_spread = hmda_sample.drop(columns=['rate_spread'])

# Adjust sample and test sizes for models and testing 
samples = model_samples(hmda_no_rate_spread, 1000, 200)

# Set up model and hyperparameter grids
logit = LogisticRegression(random_state=313)
rforest = RandomForestClassifier(random_state=313)

model_testing(rforest, samples, 'action_taken', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 500]}, cv=5)

Unnamed: 0,Model,max_depth,bootstrap,max_samples,ROC-AUC Score,Precision,Recall,Accuracy
0,White only,3,True,100,0.864321,1.0,0.441509,0.906636
0,All races,3,True,100,0.866177,1.0,0.466737,0.916204
1,White only,3,True,500,0.873952,1.0,0.550829,0.924909
1,All races,3,True,500,0.880286,1.0,0.567399,0.932019
2,White only,5,True,100,0.865381,1.0,0.486669,0.914182
2,All races,5,True,100,0.86799,1.0,0.498561,0.921203
3,White only,5,True,500,0.88291,1.0,0.557908,0.926091
3,All races,5,True,500,0.890341,1.0,0.572601,0.932837


In [353]:
state_mapping = {
    1: 'Alabama', 2: 'Alaska', 4: 'Arizona', 5: 'Arkansas', 6: 'California', 8: 'Colorado', 
    9: 'Connecticut', 10: 'Delaware', 11: 'District of Columbia', 12: 'Florida', 13: 'Georgia', 
    15: 'Hawaii', 16: 'Idaho', 17: 'Illinois', 18: 'Indiana', 19: 'Iowa', 20: 'Kansas', 
    21: 'Kentucky', 22: 'Louisiana', 23: 'Maine', 24: 'Maryland', 25: 'Massachusetts', 
    26: 'Michigan', 27: 'Minnesota', 28: 'Mississippi', 29: 'Missouri', 30: 'Montana', 
    31: 'Nebraska', 32: 'Nevada', 33: 'New Hampshire', 34: 'New Jersey', 35: 'New Mexico', 
    36: 'New York', 37: 'North Carolina', 38: 'North Dakota', 39: 'Ohio', 40: 'Oklahoma', 
    41: 'Oregon', 42: 'Pennsylvania', 44: 'Rhode Island', 45.0: 'South Carolina', 46: 'South Dakota', 
    47: 'Tennessee', 48: 'Texas', 49: 'Utah', 50: 'Vermont', 51: 'Virginia', 53: 'Washington', 
    54: 'West Virginia', 55: 'Wisconsin', 56: 'Wyoming'
}

In [389]:
state_mapping_reversed = {
    'Alabama': 1, 'Alaska': 2, 'Arizona': 4, 'Arkansas': 5, 'California': 6, 'Colorado': 8,
    'Connecticut': 9, 'Delaware': 10, 'District of Columbia': 11, 'Florida': 12, 'Georgia': 13,
    'Hawaii': 15, 'Idaho': 16, 'Illinois': 17, 'Indiana': 18, 'Iowa': 19, 'Kansas': 20,
    'Kentucky': 21, 'Louisiana': 22, 'Maine': 23, 'Maryland': 24, 'Massachusetts': 25,
    'Michigan': 26, 'Minnesota': 27, 'Mississippi': 28, 'Missouri': 29, 'Montana': 30,
    'Nebraska': 31, 'Nevada': 32, 'New Hampshire': 33, 'New Jersey': 34, 'New Mexico': 35,
    'New York': 36, 'North Carolina': 37, 'North Dakota': 38, 'Ohio': 39, 'Oklahoma': 40,
    'Oregon': 41, 'Pennsylvania': 42, 'Rhode Island': 44, 'South Carolina': 45.0, 'South Dakota': 46,
    'Tennessee': 47, 'Texas': 48, 'Utah': 49, 'Vermont': 50, 'Virginia': 51, 'Washington': 53,
    'West Virginia': 54, 'Wisconsin': 55, 'Wyoming': 56
}

In [392]:
hmda_sample['state_code'] = hmda_sample['state_code'].map(state_mapping_reversed)

In [397]:
hmda_sample['state_code']

20493630    45.0
785654       6.0
23721266     4.0
11834421    39.0
5852959      6.0
            ... 
15768785    37.0
10510466     8.0
18606040    41.0
23705045    48.0
23385277     6.0
Name: state_code, Length: 2486666, dtype: float64

In [171]:
race_mapping = {5: 'White', 3: 'Black or African American', 2: 'Asian', 1: 'American Indian or Alaska Native', 4: 'Native Hawaiian or Other Pacific Islander'}
hmda_sample['action_taken'] = hmda_sample['action_taken'].apply(lambda x: 1 if x in [1, 2] else 0)
hmda_sample['applicant_race_1'] = hmda_sample['applicant_race_1'].map(race_mapping)
hmda_sample.describe()

Unnamed: 0,as_of_year,action_taken,loan_type,loan_purpose,loan_amount_000s,msamd,county_code,applicant_ethnicity,co_applicant_ethnicity,co_applicant_race_1,...,applicant_income_000s,purchaser_type,rate_spread,hoepa_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
count,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,...,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0
mean,2011.078,0.1681597,1.236123,2.28844,184.026,30433.49,86.4012,1.852879,0.7400962,1.825391,...,90.12557,0.5268335,-0.2136762,1.99916,5556.639,33.66403,66974.21,107.4395,1447.721,1885.659
std,3.375317,0.3740081,0.5249497,0.8884358,361.7722,11255.29,100.7911,0.3542267,0.937537,2.328069,...,151.3403,1.841884,1.947055,0.02897197,2765.717,29.09032,13741.92,39.08267,809.8524,975.4884
min,2007.0,0.0,1.0,1.0,1.0,10140.0,1.0,1.0,0.0,0.0,...,1.0,0.0,-1.0,1.0,0.0,0.0,14400.0,0.0,0.0,0.0
25%,2008.0,0.0,1.0,1.0,73.0,19740.0,29.0,2.0,0.0,0.0,...,41.0,0.0,-1.0,2.0,3787.0,9.91,59000.0,82.61,913.0,1260.0
50%,2011.0,0.0,1.0,3.0,139.0,31540.0,63.0,2.0,0.0,0.0,...,65.0,0.0,-1.0,2.0,5110.0,23.25,65000.0,101.94,1329.0,1729.0
75%,2014.0,0.0,1.0,3.0,237.0,40140.0,109.0,2.0,2.0,5.0,...,102.0,0.0,-1.0,2.0,6719.0,51.39,73300.0,125.22,1819.0,2310.0
max,2017.0,1.0,4.0,3.0,344000.0,49740.0,840.0,2.0,2.0,5.0,...,65000.0,9.0,99.99,2.0,53812.0,100.0,131500.0,507.47,19529.0,25391.0


In [173]:
hmda_sample['applicant_race_1']

20493630                               White
785654                                 White
23721266                               White
11834421           Black or African American
5852959     American Indian or Alaska Native
                          ...               
15768785           Black or African American
10510466                               White
18606040                               White
23705045                               White
23385277                               White
Name: applicant_race_1, Length: 2486666, dtype: object

In [272]:
race_state_dictionary = race_state_dict(hmda_sample, 'applicant_race_1', 'state_code')

In [429]:
Reversed_Race_Mapping = {'White': 5, 'Black or African American': 3, 'Asian': 2, 'American Indian or Alaska Native': 1, 'Native Hawaiian or Other Pacific Islander': 4}

In [431]:
hmda_sample['applicant_race_1'] = hmda_sample['applicant_race_1'].map(Reversed_Race_Mapping)

In [580]:
hmda_sample_no_rate_spread = hmda_sample.drop(columns=['rate_spread'])

In [874]:
hmda_remove_rate_spread = hmda.drop(columns=['rate_spread'])

In [1057]:
def by_state_samples_5(df, year_sample_size, remove_race=True):
    
    states = sorted(df['state_code'].unique())
    
    # Lists to store DataFrames for the "All Races" and "White" data for each state
    all_race_dfs = []  # Will store lists of DataFrames, each list corresponding to a state
    white_data_dfs = []  # Will store lists of DataFrames, each list corresponding to a state
    
    # Loop through each state
    for state in states:
        # Filter the dataframe for the current state
        state_data = df[df['state_code'] == state]

        sample_size = min(year_sample_size, len(state_data))

        # Sample the "White" race (assuming 'White' corresponds to value 5 in the ethnicity column)
        white_sample = value_sample(state_data, 'applicant_race_1', 5, sample_size)

        # Sample "All Races" (a random sample from the entire state's data, excluding specific race samples)
        all_sample = stratified_sample(state_data, 'applicant_race_1', sample_size)

        if remove_race:
            white_sample = white_sample.drop(columns=['co_applicant_race_1', 'applicant_race_1'])
            all_sample = all_sample.drop(columns=['co_applicant_race_1', 'applicant_race_1'])
        
        # # Add the state code to each sample to identify which state they belong to
        # white_sample['state_code'] = state
        # all_sample['state_code'] = state

        # Append to the lists
        all_race_dfs.append(all_sample)  
        white_data_dfs.append(white_sample) 
    
    return [all_race_dfs, white_data_dfs]

In [1059]:
state_sample5 = by_state_samples_5(hmda_remove_rate_spread, 1000)

In [1049]:
state_sample5

[[        index  as_of_year  action_taken  loan_type  loan_purpose  \
  0    24670037        2017             0          1             2   
  1     6979983        2008             0          2             1   
  2    12702138        2011             0          1             3   
  3     3617120        2007             0          1             3   
  4    17435252        2013             0          1             3   
  ..        ...         ...           ...        ...           ...   
  995  24589013        2017             0          1             3   
  996  23029000        2016             1          1             1   
  997  23175695        2016             0          2             3   
  998  12532534        2011             0          2             1   
  999   7101262        2008             0          1             1   
  
       loan_amount_000s    msamd  state_code  county_code  applicant_ethnicity  \
  0                15.0  13820.0         1.0          9.0                  

In [979]:
len(state_sample5)

2

In [1075]:
def model_testing5(model, sample_data, y_col, cv=5, param_grid = None):

    race_strings = ['American Indian', 'Asian', 'Black', 'Hawaiian/Other', 'White']
    # List to store results for each state and dataset
    all_results = []  

    # Set up Stratified K-Fold cross-validation
    stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=313)

    # Define scoring metrics
    scoring = {'roc_auc': 'roc_auc', 'precision': 'precision', 'recall': 'recall', 'accuracy': 'accuracy'}

    # Loop through each state
    models = []
    race = []
    state = []
    accuracy = []
    precision = []
    recall = []
    roc_auc = []
    confusion_mats = []
    i = 0
    for i in range(len(sample_data[1])):

        stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=313)
        
        # Split the data into features (X) and target (y) for both datasets
        X_all_race, y_all_race = split_xy(sample_data[0][i], y_col)
        X_white, y_white = split_xy(sample_data[1][i], y_col)

        # Initialize GridSearchCV for "All Races" model
        all_race_search = GridSearchCV(model, param_grid, scoring=scoring, cv=stratified_kfold, refit='roc_auc', return_train_score=True)
        all_race_search.fit(X_all_race, y_all_race)
        
        # Initialize GridSearchCV for "White" model
        white_search = GridSearchCV(model, param_grid, scoring=scoring, cv=stratified_kfold, refit='roc_auc', return_train_score=True)
        white_search.fit(X_white, y_white)

          # Predict the test data for each race

        for j in range(0, 5):
            X_test, y_test = split_xy(sample_data[2][j], y_col)
            white_preds = white_search.predict(X_test)
            white_results = classification_report(y_test, white_preds, output_dict=True)
            all_race_preds = all_race_search.predict(X_test)
            all_race_results = classification_report(y_test, all_race_preds, output_dict=True)
            # Add results to thier respective lists
            models.append('White Only')
            models.append('All Races')
            race.append(race_strings[j])
            race.append(race_strings[j])
            state.append(sample_data[0][i]['state_code'])
            state.append(sample_data[1][i]['state_code'])
            accuracy.append(white_results['accuracy'])
            accuracy.append(all_race_results['accuracy'])
            precision.append(white_results['1']['precision'])
            precision.append(all_race_results['1']['precision'])
            recall.append(white_results['1']['recall'])
            recall.append(all_race_results['1']['recall'])
            roc_auc.append(white_results['1']['f1-score'])
            roc_auc.append(all_race_results['1']['f1-score'])
            # Add confusion matrices values to list
            confusion_mats.append(confusion_matrix(y_test, white_preds))
            confusion_mats.append(confusion_matrix(y_test, all_race_preds))
        
    return pd.DataFrame({'Model': models, 'Race': race, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'ROC-AUC': roc_auc, 'Confusion Matrix': confusion_mats, 'State': state })
        
    #     # Add model type and state to results
    #     all_race_results['Model'] = 'All Races'
    #     white_results['Model'] = 'White only'
    #     all_race_results['state_code'] = state_data_all_race['state_code']
    #     white_results['state_code'] = state_data_white['state_code']
        
    #     # Combine results for both models into a single DataFrame
    #     combined_results = pd.concat([all_race_results, white_results])

    #     # Select relevant columns and rename for clarity
    #     relevant_columns = [f'param_{param}' for param in param_grid.keys()] + [
    #         'mean_test_roc_auc', 'mean_test_precision', 'mean_test_recall', 'mean_test_accuracy', 'State', 'Model']
    #     combined_results = combined_results[relevant_columns]

    #     # Rename columns for final output
    #     column_mapping = {f'param_{param}': param for param in param_grid.keys()}
    #     column_mapping.update({
    #         'mean_test_roc_auc': 'ROC-AUC Score',
    #         'mean_test_precision': 'Precision',
    #         'mean_test_recall': 'Recall',
    #         'mean_test_accuracy': 'Accuracy'})
    #     combined_results.rename(columns=column_mapping, inplace=True)

    #     # Order the columns so that Model and State come first
    #     column_order = ['State', 'Model'] + [param for param in param_grid.keys()] + ['ROC-AUC Score', 'Precision', 'Recall', 'Accuracy']
    #     combined_results = combined_results[column_order]

    #     # Append results for the current state to the all_results list
    #     all_results.append(combined_results)
    
    # # Concatenate all results into a single DataFrame
    # final_results = pd.concat(all_results, ignore_index=True)
    
    # return final_results

In [1038]:
state_sample5

[[[        index  as_of_year  action_taken  loan_type  loan_purpose  \
   0    24670037        2017             0          1             2   
   1     6979983        2008             0          2             1   
   2    12702138        2011             0          1             3   
   3     3617120        2007             0          1             3   
   4    17435252        2013             0          1             3   
   ..        ...         ...           ...        ...           ...   
   995  24589013        2017             0          1             3   
   996  23029000        2016             1          1             1   
   997  23175695        2016             0          2             3   
   998  12532534        2011             0          2             1   
   999   7101262        2008             0          1             1   
   
        loan_amount_000s    msamd  state_code  county_code  applicant_ethnicity  \
   0                15.0  13820.0         1.0          9.0   

In [1077]:
(state_sample5.append(samples[2]))
state_sample5
model_testing5(rforest, state_sample5, 'action_taken', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 200]}, cv=5)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0,Model,Race,Accuracy,Precision,Recall,ROC-AUC,Confusion Matrix,State
0,White Only,American Indian,0.950909,1.000000,0.465347,0.635135,"[[1998, 0], [108, 94]]",0 1.0 1 1.0 2 1.0 3 1.0 4 ...
1,All Races,American Indian,0.951364,1.000000,0.470297,0.639731,"[[1998, 0], [107, 95]]",9660459 1.0 23084669 1.0 7075930 1....
2,White Only,Asian,0.965000,1.000000,0.594737,0.745875,"[[2010, 0], [77, 113]]",0 1.0 1 1.0 2 1.0 3 1.0 4 ...
3,All Races,Asian,0.965455,1.000000,0.600000,0.750000,"[[2010, 0], [76, 114]]",9660459 1.0 23084669 1.0 7075930 1....
4,White Only,Black,0.953182,1.000000,0.679128,0.808905,"[[1879, 0], [103, 218]]",0 1.0 1 1.0 2 1.0 3 1.0 4 ...
...,...,...,...,...,...,...,...,...
515,All Races,Black,0.945909,0.990291,0.635514,0.774194,"[[1877, 2], [117, 204]]",11957424 72.0 5868488 72.0 21182885 ...
516,White Only,Hawaiian/Other,0.950455,0.834356,0.623853,0.713911,"[[1955, 27], [82, 136]]",0 72.0 1 72.0 2 72.0 3 72....
517,All Races,Hawaiian/Other,0.954091,0.939850,0.573394,0.712251,"[[1974, 8], [93, 125]]",11957424 72.0 5868488 72.0 21182885 ...
518,White Only,White,0.918182,0.930876,0.550409,0.691781,"[[1818, 15], [165, 202]]",0 72.0 1 72.0 2 72.0 3 72....


In [897]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

def race_tests5(model, sample_data, y_col, cv=5, param_grid=None):
    """Function that tests each race sample against the specified model.
    The y_col is the target column name.
    
    - pd.DataFrame: Contains the performance metrics for each race and model
    """
    
    # Race categories to label the race column
    race_strings = ['American Indian', 'Asian', 'Black', 'Hawaiian/Other', 'White']
    
    # Create the white model dataset (White only)
    X_white, y_white = split_xy(sample_data[1], y_col)
    
    # Create the all race model dataset (All Races combined)
    X_all, y_all = split_xy(sample_data[0], y_col)

    # Set up Stratified K-Fold cross-validation
    stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=313)

    # Initialize GridSearchCV for White Only Model
    white_search = GridSearchCV(model, param_grid=param_grid, cv=stratified_kfold)
    white_model = white_search.fit(X_white, y_white)
    
    # Initialize GridSearchCV for All Races Model
    all_search = GridSearchCV(model, param_grid=param_grid, cv=stratified_kfold)
    all_race_model = all_search.fit(X_all, y_all)

    # Store results for each race
    models = []
    race = []
    accuracy = []
    precision = []
    recall = []
    roc_auc = []
    confusion_mats = []

    # Loop through race categories (1 to 5) for testing
    for i in range(0, 5):
        # Get the current race data (from race_tests_df)
        X_test, y_test = split_xy(sample_data[2], y_col)
        
        # Get predictions for White Only Model
        white_preds = white_model.predict(X_test)
        white_results = classification_report(y_test, white_preds, output_dict=True)
        
        # Get predictions for All Races Model
        all_race_preds = all_race_model.predict(X_test)
        all_race_results = classification_report(y_test, all_race_preds, output_dict=True)
        
        # Add results to their respective lists
        models.append('White Only')
        models.append('All Races')
        
        race.append(race_strings[i])
        race.append(race_strings[i])
        
        accuracy.append(white_results['accuracy'])
        accuracy.append(all_race_results['accuracy'])
        
        precision.append(white_results['1']['precision'])
        precision.append(all_race_results['1']['precision'])
        
        recall.append(white_results['1']['recall'])
        recall.append(all_race_results['1']['recall'])
        
        roc_auc.append(white_results['1']['f1-score'])
        roc_auc.append(all_race_results['1']['f1-score'])
        
        # Add confusion matrix values to list
        confusion_mats.append(confusion_matrix(y_test, white_preds))
        confusion_mats.append(confusion_matrix(y_test, all_race_preds))

    # Create a DataFrame to summarize the results
    results_df = pd.DataFrame({
        'Model': models,
        'Race': race,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'ROC-AUC': roc_auc,
        'Confusion Matrix': confusion_mats
    })

    return results_df

In [None]:
logit = Pipeline([
    ('scaler', StandardScaler()),  
    ('logit', LogisticRegression(random_state=313))
])

model_testing5(logit, state_sample5, 'action_taken', param_grid={'logit__penalty': ['l2'],
             'logit__C': [0.1, 1, 10, 100], 'logit__max_iter':[100, 200]}, cv=5)

In [655]:
model_testing_state(rforest, state_sample5, 'action_taken', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 500]}, cv=5)

KeyError: 0

In [836]:
hmda.dtypes

as_of_year                          int64
action_taken                        int64
loan_type                           int64
loan_purpose                        int64
loan_amount_000s                  float64
msamd                             float64
state_code                        float64
county_code                       float64
applicant_ethnicity                 int64
co_applicant_ethnicity              int64
applicant_race_1                    int64
co_applicant_race_1                 int64
applicant_sex                       int64
co_applicant_sex                    int64
applicant_income_000s             float64
purchaser_type                      int64
rate_spread                       float64
hoepa_status                        int64
population                        float64
minority_population               float64
hud_median_family_income          float64
tract_to_msamd_income             float64
number_of_owner_occupied_units    float64
number_of_1_to_4_family_units     

In [None]:
logit = LogisticRegression(random_state=313)
rforest = RandomForestClassifier(random_state=313)

model_testing(rforest, state_samples, 'action_taken', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 500]}, cv=5)

In [None]:
race_tests(RandomForestClassifier(random_state=313), state_samples, 'action_taken', param_grid={'max_depth': [5], 'bootstrap': [True], 'max_samples':[500]})