In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read File
# Load the cleaned dataset; the path is relative to the notebook's working directory.
file_path = 'cleaned.csv'
hmda = pd.read_csv(file_path)

In [4]:
# Work on a 10% random sample of the full dataset. The sample is seeded so that
# every downstream cell sees the same rows after Restart Kernel -> Run All.
hmda_sample = hmda.sample(frac=.10, random_state=42)

In [7]:
# Number of rows in the 10% sample.
len(hmda_sample)

2486666

In [9]:
# Column dtypes of the sample (used below to judge which columns need encoding).
hmda_sample.dtypes

as_of_year                          int64
action_taken                        int64
loan_type                           int64
loan_purpose                        int64
loan_amount_000s                  float64
msamd                             float64
state_code                        float64
county_code                       float64
applicant_ethnicity                 int64
co_applicant_ethnicity              int64
applicant_race_1                    int64
co_applicant_race_1                 int64
applicant_sex                       int64
co_applicant_sex                    int64
applicant_income_000s             float64
purchaser_type                      int64
rate_spread                       float64
hoepa_status                        int64
population                        float64
minority_population               float64
hud_median_family_income          float64
tract_to_msamd_income             float64
number_of_owner_occupied_units    float64
number_of_1_to_4_family_units     

In [11]:
# Dropping rate_spread for now as well
# Features: every column except the target (action_taken) and rate_spread.
X = hmda_sample.drop(columns=['action_taken', 'rate_spread'])
# Target: the action_taken column.
y = hmda_sample['action_taken']


In [13]:
# One-hot encode categorical columns (first level dropped). By default get_dummies
# only expands object/string/category columns, so integer-coded fields such as
# loan_type stay as plain numbers.
# NOTE(review): the dtypes listing above shows only int64/float64 columns, so this
# call is presumably a no-op here — confirm whether the coded columns were meant
# to be one-hot encoded.
X = pd.get_dummies(X, drop_first=True)


In [14]:
# Hold out 20% of the sample for testing; random_state fixes the split.
# NOTE(review): the split is not stratified on y; the classification reports
# below show imbalanced classes, so consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Random Forest Model
# The recorded single-threaded run had to be interrupted after a few trees.
# n_jobs=-1 builds the trees in parallel on all available cores; with a fixed
# random_state the fitted forest is the same regardless of n_jobs.
rf_model = RandomForestClassifier(random_state=42, verbose=2, n_jobs=-1)
rf_model.fit(X_train, y_train)


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100



KeyboardInterrupt



In [21]:
# Accuracy Scores and Confusion Matrix
# NOTE(review): this cell needs a completed rf_model.fit(); in the recorded run both
# the fit cell and this cell ended in KeyboardInterrupt, so no scores were produced.
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out split and print per-class metrics plus the confusion matrix.
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))


KeyboardInterrupt



In [None]:
# ROC AUC Curve
from sklearn.metrics import roc_curve, auc

# Score each test row with column 1 of predict_proba (the second class).
# NOTE(review): assumes a binary target whose positive label is the second entry
# of rf_model.classes_ — confirm.
fpr, tpr, _ = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
auc_score = auc(fpr, tpr)

# ROC curve with the chance diagonal drawn as a dashed reference line.
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Horizontal bar chart of the fitted forest's feature importances, one bar per
# column of X (bars appear in column order, not sorted by importance).
importances = rf_model.feature_importances_
features = X.columns
plt.barh(features, importances)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance")
plt.show()

In [25]:
from sklearn.naive_bayes import GaussianNB

In [26]:
# Gaussian Naive Bayes with default settings, fit on the same training split
# used for the random forest.
hmda_nb = GaussianNB()

hmda_nb.fit(X_train, y_train)

In [28]:
# Evaluate the Naive Bayes model on the held-out split. confusion_matrix is
# imported in the random-forest evaluation cell above.
# NOTE(review): the recorded precision of 1.00 for class 1 is suspiciously clean;
# features such as purchaser_type may only be populated for approved loans —
# check for target leakage before trusting these scores.
y_pred = hmda_nb.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95    413621
           1       1.00      0.53      0.69     83713

    accuracy                           0.92    497334
   macro avg       0.95      0.77      0.82    497334
weighted avg       0.93      0.92      0.91    497334

Confusion Matrix
[[413403    218]
 [ 39169  44544]]


In [31]:
# Recompute the Naive Bayes predictions (same call as the previous cell) and
# report plain accuracy as a percentage.
y_pred = hmda_nb.predict(X_test)

from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 92.0803725464175


In [33]:
# race_mapping = {5: 'White', 3: 'Black or African American', 2: 'Asian', 1: 'American Indian or Alaska Native', 4: 'Native Hawaiian or Other Pacific Islander'}
# hmda_sample['action_taken'] = hmda_sample['action_taken'].apply(lambda x: 1 if x in [1, 2] else 0)
# hmda_sample['applicant_race_1'] = hmda_sample['applicant_race_1'].map(race_mapping)
# hmda_sample.describe()

In [35]:
# Inspect the race column; at this point it still holds integer codes.
hmda_sample['applicant_race_1']

20493630    5
785654      5
23721266    5
11834421    3
5852959     1
           ..
15768785    3
10510466    5
18606040    5
23705045    5
23385277    5
Name: applicant_race_1, Length: 2486666, dtype: int64

In [37]:
# Build a White-only training set (applicant_race_1 code 5 is White); the plan is
# to train on it and then test on applicants of all races.
# X_white drops the target and rate_spread; it still contains applicant_race_1,
# which is constant (5) within this subset.
white_data = hmda_sample[hmda_sample['applicant_race_1'] == 5]
X_white = white_data.drop(columns=['action_taken', 'rate_spread'])
y_white = white_data['action_taken']


In [39]:
# Preview the White-only subset.
white_data

Unnamed: 0,as_of_year,action_taken,loan_type,loan_purpose,loan_amount_000s,msamd,state_code,county_code,applicant_ethnicity,co_applicant_ethnicity,...,applicant_income_000s,purchaser_type,rate_spread,hoepa_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
20493630,2015,0,2,3,118.0,43900.0,45.0,83.0,1,0,...,45.0,0,-1.00,2,7920.0,16.290001,54800.0,119.900002,2317.0,3005.0
785654,2007,0,1,1,34.0,31084.0,6.0,37.0,2,2,...,371.0,0,-1.00,2,4268.0,35.259998,56500.0,154.720001,1353.0,1538.0
23721266,2017,0,2,1,209.0,38060.0,4.0,13.0,2,0,...,70.0,0,-1.00,2,7333.0,30.400000,66200.0,130.979996,1982.0,3416.0
15220120,2012,0,2,1,181.0,24020.0,36.0,113.0,2,0,...,50.0,0,-1.00,2,3731.0,5.010000,62600.0,99.370003,1068.0,1750.0
9401259,2009,0,1,3,328.0,17140.0,39.0,165.0,2,2,...,149.0,0,-1.00,2,7708.0,6.330000,69200.0,155.580002,2250.0,2500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075418,2007,0,1,3,164.0,18140.0,39.0,159.0,2,0,...,46.0,0,-1.00,2,5738.0,1.850000,64200.0,113.629997,1763.0,2131.0
10510466,2010,0,3,3,305.0,17820.0,8.0,41.0,2,2,...,98.0,0,-1.00,2,4485.0,12.910000,70600.0,154.130005,1261.0,1577.0
18606040,2014,1,2,1,132.0,41420.0,41.0,47.0,1,0,...,31.0,7,1.57,2,4834.0,52.849998,55800.0,105.480003,1012.0,1305.0
23705045,2017,0,1,3,42.0,19124.0,48.0,85.0,2,2,...,112.0,0,-1.00,2,17773.0,33.750000,73400.0,165.919998,4815.0,5128.0


In [40]:
# Sanity check: every row in the subset should carry race code 5.
white_data['applicant_race_1']

20493630    5
785654      5
23721266    5
15220120    5
9401259     5
           ..
1075418     5
10510466    5
18606040    5
23705045    5
23385277    5
Name: applicant_race_1, Length: 1959501, dtype: int64

In [43]:
from sklearn.preprocessing import StandardScaler
# Standardise the White-only features. The scaler is fit on X_white, so any data
# later scored by a model trained on X_white_scaled must go through
# scaler.transform first.
scaler = StandardScaler()
X_white_scaled = scaler.fit_transform(X_white)

In [44]:
# Refit the existing hmda_nb instance on the scaled White-only features; this
# replaces the earlier fit on X_train.
# NOTE(review): white_data and X_test are both drawn from hmda_sample, so White rows
# in X_test are presumably part of this training set — confirm before evaluating on X_test.
hmda_nb.fit(X_white_scaled, y_white)

In [46]:
# Set up data for machine learning
# Create function that creates stratified samples of a dataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
def stratified_sample(df, col, sample_size):
    """Draw a sample whose distribution over ``col`` mirrors that of ``df``.

    Each distinct value of ``col`` contributes
    ``round(group_rows / total_rows * sample_size)`` rows, drawn without
    replacement using a fixed seed (``random_state=313``), so repeated calls
    return the same rows.

    Args:
        df: Source DataFrame.
        col: Name of the column whose proportions are preserved.
        sample_size: Target total number of rows. Each group's share is rounded
            independently, so the result can be slightly above or below it.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index, groups in sorted order of
        ``col``. If nothing is sampled (empty ``df`` or every share rounds to
        zero) an empty frame with ``df``'s columns is returned.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when a group's share is
            larger than the group itself (``sample_size`` greater than ``len(df)``).
    """
    # Proportions are taken relative to all rows of df, including rows whose
    # `col` is missing (groupby does not form a group for those).
    total_obs = df.shape[0]
    pieces = []
    for _, group in df.groupby(col):
        group_sample_size = round((group.shape[0] / total_obs) * sample_size)
        if group_sample_size > 0:
            pieces.append(group.sample(n=group_sample_size, random_state=313))
    if not pieces:
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once (instead of growing a frame inside the loop) and always
    # reset the index so the result does not depend on how many groups contributed.
    return pd.concat(pieces, ignore_index=True)

# Function that collects all data for a specific year
def set_year(df, year):
    """Return only the rows of ``df`` whose ``as_of_year`` equals ``year``."""
    in_year = df['as_of_year'] == year
    return df.loc[in_year]

# Function that splits a dataframe into features (X) and target (y)
def split_xy(df, y_col):
    """Separate a dataframe into its feature and target parts.

    ``y_col`` names the target column; all remaining columns are the features.
    Returns a two-element list ``[X, y]`` where ``X`` is ``df`` without ``y_col``
    and ``y`` is the ``y_col`` column. ``df`` itself is left unchanged.
    """
    features = df.drop(columns=[y_col])
    target = df[y_col]
    return [features, target]

# Create sample function that only draws from specified values in a column
def value_sample(df, col, value, sample_size):
    """Randomly sample rows of ``df`` whose ``col`` equals ``value``.

    For example, with ``col='applicant_race_1'`` and ``value=5`` (White) the
    result contains only White applicants. ``sample_size`` rows are drawn without
    replacement using ``random_state=313``; ``DataFrame.sample`` raises
    ``ValueError`` if fewer matching rows exist.
    """
    matching = df.loc[df[col] == value]
    return matching.sample(n=sample_size, random_state=313)

# Function that creates training data for the white-only model and the all-race model, and test data for each race
def model_samples(df, year_sample_size, test_sample_size, remove_race=True, years=range(2007, 2018)):
    """Create the training and test data used by the ML models.

    For every year in ``years`` the function draws:

    * a White-only training sample (``applicant_race_1 == 5``) of
      ``year_sample_size`` rows,
    * an all-race training sample of about ``year_sample_size`` rows, stratified
      on ``applicant_race_1``,
    * one test sample of ``test_sample_size`` rows for each race code 1 to 5.

    The per-year pieces are then stacked so every year is equally represented.

    Args:
        df: Source data containing ``as_of_year``, ``applicant_race_1`` and
            ``co_applicant_race_1`` columns. ``applicant_race_1`` must still hold
            the integer race codes.
        year_sample_size: Training rows to draw per year for each model.
        test_sample_size: Test rows to draw per year for each race.
        remove_race: If true, ``applicant_race_1`` and ``co_applicant_race_1`` are
            dropped from every sample so the models are blind to race.
        years: Iterable of years to sample; defaults to 2007 through 2017.

    Returns:
        ``[compiled_white, compiled_all_race, compiled_race_tests]`` where the
        first two are DataFrames and the third is a list of five DataFrames, one
        per race code 1 to 5, each with a fresh ``0..n-1`` index.

    Raises:
        ValueError: If a year holds fewer matching rows than requested (raised by
            ``DataFrame.sample``) or ``years`` is empty.
    """
    race_columns = ['applicant_race_1', 'co_applicant_race_1']
    race_codes = range(1, 6)

    # Collect the per-year pieces in plain lists and concatenate once at the end
    # rather than re-copying the growing frames on every iteration.
    white_parts = []
    all_parts = []
    race_test_parts = [[] for _ in race_codes]

    for year in years:
        year_dataset = set_year(df, year)
        # Create training data for white and all race models
        white_train = value_sample(year_dataset, 'applicant_race_1', 5, year_sample_size)
        all_train = stratified_sample(year_dataset, 'applicant_race_1', year_sample_size)
        if remove_race:
            white_train = white_train.drop(columns=race_columns)
            all_train = all_train.drop(columns=race_columns)
        white_parts.append(white_train)
        all_parts.append(all_train)
        # Create test data for each race
        for parts, race in zip(race_test_parts, race_codes):
            race_sample = value_sample(year_dataset, 'applicant_race_1', race, test_sample_size)
            if remove_race:
                race_sample = race_sample.drop(columns=race_columns)
            parts.append(race_sample)

    # Compile the data across all years
    compiled_white = pd.concat(white_parts, ignore_index=True)
    compiled_all_race = pd.concat(all_parts, ignore_index=True)
    compiled_race_tests = [pd.concat(parts, ignore_index=True) for parts in race_test_parts]
    return [compiled_white, compiled_all_race, compiled_race_tests]

def model_testing(model, sample_data, y_col, cv=5, param_grid=None):
    """Grid-search ``model`` on the white-only and all-race training sets and tabulate the scores.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        sample_data: Sequence whose element 0 is the white-only training frame and
            element 1 the all-race training frame (the layout returned by
            ``model_samples``). Further elements, such as the per-race test
            frames, are not used by this function.
        y_col: Name of the target column in both training frames.
        cv: Number of stratified, shuffled folds (``random_state=313``).
        param_grid: Hyperparameter grid for ``GridSearchCV``, either a dict or a
            list of dicts. ``None`` is treated as an empty grid, i.e. a single
            candidate using the estimator's current parameters.

    Returns:
        DataFrame with one row per (model type, parameter combination) holding
        ``Model``, one column per hyperparameter, and the mean cross-validated
        ``ROC-AUC Score``, ``Precision``, ``Recall`` and ``Accuracy``. Rows are
        ordered by candidate index, so each index value appears once per model.
    """
    # The original default of None crashed both GridSearchCV and .keys() below.
    if param_grid is None:
        param_grid = {}
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    search_grid = param_grid if isinstance(param_grid, dict) else grids
    # Hyperparameter names in first-seen order (the key order for a single dict).
    param_names = list(dict.fromkeys(name for grid in grids for name in grid))

    # Create the white model dataset
    X_white, y_white = split_xy(sample_data[0], y_col)
    # Create the all race model dataset
    X_all, y_all = split_xy(sample_data[1], y_col)

    # Set up Stratified K-Fold cross-validation
    stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=313)
    # Define scoring metrics
    scoring = {'roc_auc': 'roc_auc', 'precision': 'precision', 'recall': 'recall', 'accuracy': 'accuracy'}

    def _run_search(X, y, label):
        """Fit one grid search and return its cv_results_ tagged with the model label."""
        search = GridSearchCV(model, search_grid, scoring=scoring, cv=stratified_kfold,
                              refit='roc_auc', return_train_score=True)
        search.fit(X, y)
        results = pd.DataFrame(search.cv_results_)
        results['Model'] = label
        return results

    white_results = _run_search(X_white, y_white, 'White only')
    all_results = _run_search(X_all, y_all, 'All races')

    # Interleave the two result sets by candidate index. A stable sort keeps the
    # white-only row ahead of the all-races row for every candidate.
    combined_results = pd.concat([white_results, all_results]).sort_index(kind='mergesort')

    # Select relevant columns and rename them for clarity
    score_labels = {
        'mean_test_roc_auc': 'ROC-AUC Score',
        'mean_test_precision': 'Precision',
        'mean_test_recall': 'Recall',
        'mean_test_accuracy': 'Accuracy'}
    column_mapping = {f'param_{name}': name for name in param_names}
    column_mapping.update(score_labels)
    # rename() returns a new frame; renaming a column slice in place can trigger
    # pandas' SettingWithCopyWarning.
    combined_results = combined_results[list(column_mapping) + ['Model']].rename(columns=column_mapping)

    # Order the columns so that Model is first
    column_order = ['Model'] + param_names + list(score_labels.values())
    return combined_results[column_order]

# Remove rate spread from the dataset
hmda_no_rate_spread = hmda_sample.drop(columns=['rate_spread'])

# Adjust sample and test sizes for models and testing
# (1000 training rows per year for each model, 200 test rows per year for each race).
samples = model_samples(hmda_no_rate_spread, 1000, 200)

# Set up model and hyperparameter grids
# logit is defined here but not used in this cell; only the random forest is searched.
logit = LogisticRegression(random_state=313)
rforest = RandomForestClassifier(random_state=313)

# Compare the white-only and all-race random forests with 5-fold cross-validation.
model_testing(rforest, samples, 'action_taken', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 500]}, cv=5)

Unnamed: 0,Model,max_depth,bootstrap,max_samples,ROC-AUC Score,Precision,Recall,Accuracy
0,White only,3,True,100,0.864321,1.0,0.441509,0.906636
0,All races,3,True,100,0.866177,1.0,0.466737,0.916204
1,White only,3,True,500,0.873952,1.0,0.550829,0.924909
1,All races,3,True,500,0.880286,1.0,0.567399,0.932019
2,White only,5,True,100,0.865381,1.0,0.486669,0.914182
2,All races,5,True,100,0.86799,1.0,0.498561,0.921203
3,White only,5,True,500,0.88291,1.0,0.557908,0.926091
3,All races,5,True,500,0.890341,1.0,0.572601,0.932837


In [353]:
# Numeric state code -> state name (50 states plus the District of Columbia).
# The numbering appears to follow the FIPS state codes. All keys are ints (the
# original had a stray 45.0); int keys still match the float codes stored in
# state_code because equal ints and floats hash alike.
# Codes that are not listed here become NaN when used with Series.map.
state_mapping = {
    1: 'Alabama', 2: 'Alaska', 4: 'Arizona', 5: 'Arkansas', 6: 'California', 8: 'Colorado',
    9: 'Connecticut', 10: 'Delaware', 11: 'District of Columbia', 12: 'Florida', 13: 'Georgia',
    15: 'Hawaii', 16: 'Idaho', 17: 'Illinois', 18: 'Indiana', 19: 'Iowa', 20: 'Kansas',
    21: 'Kentucky', 22: 'Louisiana', 23: 'Maine', 24: 'Maryland', 25: 'Massachusetts',
    26: 'Michigan', 27: 'Minnesota', 28: 'Mississippi', 29: 'Missouri', 30: 'Montana',
    31: 'Nebraska', 32: 'Nevada', 33: 'New Hampshire', 34: 'New Jersey', 35: 'New Mexico',
    36: 'New York', 37: 'North Carolina', 38: 'North Dakota', 39: 'Ohio', 40: 'Oklahoma',
    41: 'Oregon', 42: 'Pennsylvania', 44: 'Rhode Island', 45: 'South Carolina', 46: 'South Dakota',
    47: 'Tennessee', 48: 'Texas', 49: 'Utah', 50: 'Vermont', 51: 'Virginia', 53: 'Washington',
    54: 'West Virginia', 55: 'Wisconsin', 56: 'Wyoming'
}

In [389]:
# State name -> numeric state code (50 states plus the District of Columbia).
# All codes are ints (the original had a stray 45.0 for South Carolina).
# Names that are not listed here become NaN when used with Series.map.
state_mapping_reversed = {
    'Alabama': 1, 'Alaska': 2, 'Arizona': 4, 'Arkansas': 5, 'California': 6, 'Colorado': 8,
    'Connecticut': 9, 'Delaware': 10, 'District of Columbia': 11, 'Florida': 12, 'Georgia': 13,
    'Hawaii': 15, 'Idaho': 16, 'Illinois': 17, 'Indiana': 18, 'Iowa': 19, 'Kansas': 20,
    'Kentucky': 21, 'Louisiana': 22, 'Maine': 23, 'Maryland': 24, 'Massachusetts': 25,
    'Michigan': 26, 'Minnesota': 27, 'Mississippi': 28, 'Missouri': 29, 'Montana': 30,
    'Nebraska': 31, 'Nevada': 32, 'New Hampshire': 33, 'New Jersey': 34, 'New Mexico': 35,
    'New York': 36, 'North Carolina': 37, 'North Dakota': 38, 'Ohio': 39, 'Oklahoma': 40,
    'Oregon': 41, 'Pennsylvania': 42, 'Rhode Island': 44, 'South Carolina': 45, 'South Dakota': 46,
    'Tennessee': 47, 'Texas': 48, 'Utah': 49, 'Vermont': 50, 'Virginia': 51, 'Washington': 53,
    'West Virginia': 54, 'Wisconsin': 55, 'Wyoming': 56
}

In [392]:
# Normalise state_code to numeric codes. State names are translated through
# state_mapping_reversed; values that are already numeric are kept as they are.
# A bare .map(state_mapping_reversed) turns an already-numeric column (which is
# what a fresh top-to-bottom run has at this point) into all NaN, and does the
# same on any second run of this cell.
hmda_sample['state_code'] = (
    hmda_sample['state_code']
    .map(state_mapping_reversed)
    .fillna(pd.to_numeric(hmda_sample['state_code'], errors='coerce'))
)

In [397]:
# Inspect state_code after the mapping step; it should hold numeric codes.
hmda_sample['state_code']

20493630    45.0
785654       6.0
23721266     4.0
11834421    39.0
5852959      6.0
            ... 
15768785    37.0
10510466     8.0
18606040    41.0
23705045    48.0
23385277     6.0
Name: state_code, Length: 2486666, dtype: float64

In [256]:
# all_race_state_df = hmda_sample.groupby(['applicant_race_1', 'state_code']).apply(lambda x: x.to_dict(orient='records')).unstack(fill_value=[])

  all_race_state_df = hmda_sample.groupby(['applicant_race_1', 'state_code']).apply(lambda x: x.to_dict(orient='records')).unstack(fill_value=[])


In [171]:
# Race code -> label for applicant_race_1.
race_mapping = {5: 'White', 3: 'Black or African American', 2: 'Asian', 1: 'American Indian or Alaska Native', 4: 'Native Hawaiian or Other Pacific Islander'}

# Binary target: 1 where action_taken is 1 or 2, otherwise 0 (vectorised
# equivalent of the former row-wise apply/lambda).
hmda_sample['action_taken'] = hmda_sample['action_taken'].isin([1, 2]).astype('int64')

# Translate race codes to labels. Values that are already labels are kept, so
# re-running this cell no longer blanks the column; codes that are not in
# race_mapping still become NaN, as before.
# NOTE: after this cell applicant_race_1 holds strings, so helpers that filter on
# the integer codes (e.g. model_samples) must run before it.
hmda_sample['applicant_race_1'] = (
    hmda_sample['applicant_race_1']
    .map(race_mapping)
    .where(~hmda_sample['applicant_race_1'].isin(list(race_mapping.values())),
           hmda_sample['applicant_race_1'])
)
hmda_sample.describe()

Unnamed: 0,as_of_year,action_taken,loan_type,loan_purpose,loan_amount_000s,msamd,county_code,applicant_ethnicity,co_applicant_ethnicity,co_applicant_race_1,...,applicant_income_000s,purchaser_type,rate_spread,hoepa_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
count,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,...,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0,2486666.0
mean,2011.078,0.1681597,1.236123,2.28844,184.026,30433.49,86.4012,1.852879,0.7400962,1.825391,...,90.12557,0.5268335,-0.2136762,1.99916,5556.639,33.66403,66974.21,107.4395,1447.721,1885.659
std,3.375317,0.3740081,0.5249497,0.8884358,361.7722,11255.29,100.7911,0.3542267,0.937537,2.328069,...,151.3403,1.841884,1.947055,0.02897197,2765.717,29.09032,13741.92,39.08267,809.8524,975.4884
min,2007.0,0.0,1.0,1.0,1.0,10140.0,1.0,1.0,0.0,0.0,...,1.0,0.0,-1.0,1.0,0.0,0.0,14400.0,0.0,0.0,0.0
25%,2008.0,0.0,1.0,1.0,73.0,19740.0,29.0,2.0,0.0,0.0,...,41.0,0.0,-1.0,2.0,3787.0,9.91,59000.0,82.61,913.0,1260.0
50%,2011.0,0.0,1.0,3.0,139.0,31540.0,63.0,2.0,0.0,0.0,...,65.0,0.0,-1.0,2.0,5110.0,23.25,65000.0,101.94,1329.0,1729.0
75%,2014.0,0.0,1.0,3.0,237.0,40140.0,109.0,2.0,2.0,5.0,...,102.0,0.0,-1.0,2.0,6719.0,51.39,73300.0,125.22,1819.0,2310.0
max,2017.0,1.0,4.0,3.0,344000.0,49740.0,840.0,2.0,2.0,5.0,...,65000.0,9.0,99.99,2.0,53812.0,100.0,131500.0,507.47,19529.0,25391.0


In [173]:
# Confirm that applicant_race_1 now holds race labels instead of codes.
hmda_sample['applicant_race_1']

20493630                               White
785654                                 White
23721266                               White
11834421           Black or African American
5852959     American Indian or Alaska Native
                          ...               
15768785           Black or African American
10510466                               White
18606040                               White
23705045                               White
23385277                               White
Name: applicant_race_1, Length: 2486666, dtype: object

In [175]:
# def race_state_dict(df, race_col, state_col, value_col):
#     race_state_dict = {}

#     # Group by race
#     grouped_by_race = df.groupby(race_col)

#     for race, race_group in grouped_by_race:
#         race_state_dict[race] = {}

#         # Group by state within each race
#         grouped_by_state = race_group.groupby(state_col)

#         for state, state_group in grouped_by_state:
#             # Store the values as a list for each state under each race
#             race_state_dict[race][state] = state_group[value_col].tolist()

#     return race_state_dict

In [260]:
def race_state_dict(df, race_col, state_col):
    """Nest the rows of ``df`` by race and then by state.

    Returns a dict keyed by each value of ``race_col``; every entry is a dict
    keyed by the values of ``state_col`` seen for that race, mapping to the
    matching rows as a list of lists (one inner list per row, holding all columns
    in ``df``'s column order).
    """
    return {
        race: {
            state: state_rows.values.tolist()
            for state, state_rows in race_rows.groupby(state_col)
        }
        for race, race_rows in df.groupby(race_col)
    }

In [272]:
# Nest the whole sample by race and state. This copies every row into Python
# lists, so it is memory-hungry on a sample of this size.
race_state_dictionary = race_state_dict(hmda_sample, 'applicant_race_1', 'state_code')

In [413]:
def state_model_samples(df, year_sample_size, test_sample_size, remove_race=True):
    """Build per-state test samples, separated into White and non-White applicants.

    For every non-missing ``state_code`` in ``df`` and every distinct non-missing
    value of ``applicant_race_1``, up to ``test_sample_size`` rows are drawn
    without replacement (``random_state=313``). A race with no rows in a state
    contributes an empty ``DataFrame`` placeholder, so every per-state list has
    the same length and race order.

    Args:
        df: Source data with ``state_code``, ``applicant_race_1`` and
            ``co_applicant_race_1`` columns. ``applicant_race_1`` must already
            hold race labels, because White applicants are identified by the
            string ``'White'``.
        year_sample_size: Kept for signature compatibility; currently unused (see
            the note below).
        test_sample_size: Maximum number of rows drawn per race within a state.
        remove_race: If true, ``applicant_race_1`` and ``co_applicant_race_1`` are
            dropped from each drawn sample.

    Returns:
        ``[all_race_data, white_race_data]``. Each is a list with one entry per
        state, in order of first appearance in ``df``. An entry of
        ``all_race_data`` is a list with one DataFrame per non-White race; an
        entry of ``white_race_data`` is a list holding the White sample.

    NOTE(review): the original body also drew per-state training samples of
    ``year_sample_size`` rows but never returned them; that dead work (which could
    raise for small states) has been removed. The returned structure therefore
    holds test samples only and is not the
    ``[white_train, all_train, race_tests]`` layout that ``model_testing``
    consumes — decide on the intended return shape before wiring the two together.
    """
    race_columns = ['applicant_race_1', 'co_applicant_race_1']
    # Iterate over the distinct races and states. The original looped over every
    # row of applicant_race_1, repeating the same sampling once per row.
    races = df['applicant_race_1'].dropna().unique()
    states = df['state_code'].dropna().unique()

    all_race_data = []
    white_race_data = []

    for state in states:
        state_data = df[df['state_code'] == state]
        race_tests_all = []
        race_tests_white = []

        for race in races:
            race_data = state_data[state_data['applicant_race_1'] == race]
            # Never ask for more rows than this race has in this state.
            race_sample_size = min(len(race_data), test_sample_size)

            if race_sample_size > 0:
                race_sample = race_data.sample(n=race_sample_size, random_state=313)
                if remove_race:
                    race_sample = race_sample.drop(columns=race_columns)
            else:
                # Placeholder keeps the per-state lists aligned by race.
                race_sample = pd.DataFrame()

            if race == 'White':
                race_tests_white.append(race_sample)
            else:
                race_tests_all.append(race_sample)

        all_race_data.append(race_tests_all)
        white_race_data.append(race_tests_white)

    return [all_race_data, white_race_data]

In [None]:
# Build the per-state samples (1000 and 200 are the year/test sample sizes).
# Requires applicant_race_1 to hold race labels, since the function matches 'White'.
state_samples = state_model_samples(hmda_sample, 1000, 200)

In [None]:
# Same estimators as the per-year experiment above, re-created for the state run.
logit = LogisticRegression(random_state=313)
rforest = RandomForestClassifier(random_state=313)

# NOTE(review): state_model_samples returns two lists of per-state lists, while
# model_testing passes sample_data[0] and sample_data[1] to split_xy, which calls
# DataFrame methods on them — this call presumably fails until state_samples is
# reshaped into [white_train_df, all_train_df, ...].
model_testing(rforest, state_samples, 'action_taken', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 500]}, cv=5)

In [190]:
# import random

# def equal_sample_for_each_key_and_innerkey(data_dict, sample_size):
#     """Samples equal number of elements for each race and state in the dictionary."""
#     sampled_dict = {}
    
#     for race, states in data_dict.items():
#         sampled_dict[race] = {}
        
#         for state, values in states.items():
#             # Ensure there are enough values to sample
#             if len(values) >= sample_size:
#                 sampled_dict[race][state] = random.sample(values, sample_size)
#             else:
#                 sampled_dict[race][state] = values  # Use all data if not enough to sample
    
#     return sampled_dict

# def create_dataframe(data_dict, sample_size):
#     """Creates a DataFrame from the sampled data."""
#     # First, sample the data for each race and state
#     sampled_data = equal_sample_for_each_key_and_innerkey(data_dict, sample_size)
    
#     # Create a list of rows for DataFrame
#     rows = []
#     for race, states in sampled_data.items():
#         for state, values in states.items():
#             for value in values:
#                 # Add each value as a row with race and state as columns
#                 rows.append([race, state, value])
    
#     # Create a DataFrame
#     df = pd.DataFrame(rows, columns=['Race', 'State', 'action_taken'])
    
#     return df

In [192]:
# race_state = create_dataframe(race_state_dictionary, 500)

In [193]:
# race_state

Unnamed: 0,Race,State,action_taken
0,American Indian or Alaska Native,Alabama,0
1,American Indian or Alaska Native,Alabama,0
2,American Indian or Alaska Native,Alabama,0
3,American Indian or Alaska Native,Alabama,0
4,American Indian or Alaska Native,Alabama,0
...,...,...,...
90269,White,Wyoming,0
90270,White,Wyoming,0
90271,White,Wyoming,0
90272,White,Wyoming,0


In [224]:
# def split_train_test(data_dict, train_size=0.8):
#     """Splits the data into training and testing sets for each race and state."""
#     train_dict = {}
#     test_dict = {}

#     # Iterate over each race in the dictionary
#     for race, states in data_dict.items():
#         train_dict[race] = {}
#         test_dict[race] = {}

#         # Iterate over each state within the race
#         for state, values in states.items():
#             # Calculate the split index
#             split_idx = int(len(values) * train_size)
#             # Split the values into training and testing sets
#             train_dict[race][state] = values[:split_idx]
#             test_dict[race][state] = values[split_idx:]
    
#     return train_dict, test_dict

# def create_dataframe(data_dict, sample_size, train_size):
#     """Creates a DataFrame from the sampled and split data for each race (including white and all races)."""
#     # First, sample the data for each race and state
#     sampled_data = equal_sample_for_each_key_and_innerkey(data_dict, sample_size)
    
#     # Now, split the data into training and testing sets
#     train_data, test_data = split_train_test(sampled_data, train_size)
    
#     # Create DataFrames for each category (white and all races)
#     white_train_data = train_data.get(5, {})  # Assuming race '5' is White
#     white_test_data = test_data.get(5, {})    # Assuming race '5' is White
    
#     # For all races, combine the data
#     all_race_train_data = {race: states for race, states in train_data.items()}
#     all_race_test_data = {race: states for race, states in test_data.items()}

#     # Helper function to convert dictionary to a DataFrame
#     def dict_to_dataframe(data_dict, label):
#         rows = []
#         for race, states in data_dict.items():
#             for state, values in states.items():
#                 for value in values:
#                     rows.append([label, race, state, value])
#         return pd.DataFrame(rows, columns=['Model', 'Race', 'State', 'action_taken'])
    
#     # Create DataFrames for white and all races
#     white_train_df = dict_to_dataframe(white_train_data, 'White Train')
#     white_test_df = dict_to_dataframe(white_test_data, 'White Test')
#     all_race_train_df = dict_to_dataframe(all_race_train_data, 'All Race Train')
#     all_race_test_df = dict_to_dataframe(all_race_test_data, 'All Race Test')

#     # Combine all the DataFrames into a single DataFrame
#     final_df = pd.concat([white_train_df, white_test_df, all_race_train_df, all_race_test_df], ignore_index=True)

#     return final_df

In [226]:
# race_sample = create_dataframe(race_state_dictionary, 200, 1000)

In [245]:
# race_sample

Unnamed: 0,Model,Race,State,action_taken
0,All Race Train,American Indian or Alaska Native,Alabama,0
1,All Race Train,American Indian or Alaska Native,Alabama,0
2,All Race Train,American Indian or Alaska Native,Alabama,0
3,All Race Train,American Indian or Alaska Native,Alabama,0
4,All Race Train,American Indian or Alaska Native,Alabama,0
...,...,...,...,...
42108,All Race Train,White,Wyoming,0
42109,All Race Train,White,Wyoming,1
42110,All Race Train,White,Wyoming,0
42111,All Race Train,White,Wyoming,0


In [234]:
# def split_xy_state(data, y_col):
#     """Splits the data into X (features) and y (target) based on the target column (y_col)."""
#     X = data.drop(columns=[y_col])
#     y = data[y_col]
#     return X, y

# def model_testing_state(model, sample_data, y_col, cv=5, param_grid=None):
#     """Fits white model and all race based on the specified training data. 
#     The y_col is the y variable column name.
#     The sample_data parameter is the output of the model_samples function, 
#     and param_grid is the hyperparameter grid used in GridSearchCV.
#     This function will return the results of each model with given parameters 
#     and the testing results from each race."""
    
#     # Create the white model dataset
#     X_white, y_white = split_xy(sample_data[1], y_col)
#     # Create the all race model dataset
#     X_all, y_all = split_xy(sample_data[2], y_col)

#     # Set up Stratified K-Fold cross-validation
#     stratified_kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=313)
    
#     # Define scoring metrics
#     scoring = {'roc_auc': 'roc_auc', 'precision': 'precision', 'recall': 'recall', 'accuracy': 'accuracy'}
    
#     # Initialize GridSearchCV for White Only Model
#     white_search = GridSearchCV(model, param_grid, scoring=scoring, cv=stratified_kfold, refit='roc_auc', return_train_score=True)
#     white_search.fit(X_white, y_white)
    
#     # Initialize GridSearchCV for All Races Model
#     all_search = GridSearchCV(model, param_grid, scoring=scoring, cv=stratified_kfold, refit='roc_auc', return_train_score=True)
#     all_search.fit(X_all, y_all)

#     # Extract results for each model
#     white_results = pd.DataFrame(white_search.cv_results_)
#     all_results = pd.DataFrame(all_search.cv_results_)
    
#     # Add model type to results
#     white_results['Model'] = 'White only'
#     all_results['Model'] = 'All races'
    
#     # Combine results
#     combined_results = pd.concat([white_results, all_results])

#     # Sort combined results by index
#     combined_results = combined_results.sort_index()

#     # Select relevant columns and rename them for clarity
#     relevant_columns = [f'param_{param}' for param in param_grid.keys()] + [
#         'mean_test_roc_auc', 'mean_test_precision', 'mean_test_recall', 'mean_test_accuracy', 'Model']
#     combined_results = combined_results[relevant_columns]

#     # Rename columns for final output
#     column_mapping = {f'param_{param}': param for param in param_grid.keys()}
#     column_mapping.update({
#         'mean_test_roc_auc': 'ROC-AUC Score',
#         'mean_test_precision': 'Precision',
#         'mean_test_recall': 'Recall',
#         'mean_test_accuracy': 'Accuracy'})
    
#     combined_results.rename(columns=column_mapping, inplace=True)

#     # Order the columns of so that Model is first
#     column_order = ['Model'] + [param for param in param_grid.keys()] + ['ROC-AUC Score', 'Precision', 'Recall', 'Accuracy']
#     combined_results = combined_results[column_order]

#     return combined_results

In [238]:
# model_testing(rforest, race_sample, 'State', param_grid={'max_depth': [3, 5], 'bootstrap': [True], 'max_samples':[100, 500]}, cv=5)

KeyError: 0

In [None]:
# NOTE(review): disabled (commented-out) draft, kept for reference only.
# A second draft with a different signature,
# `sample_states(df, year_sample_size, test_sample_size, remove = True)`,
# appears further down in this notebook; if both are ever re-enabled, whichever
# cell runs last will shadow the other.
#
# Purpose: flatten a table into row lists. For every index label of `df`
# (named `race` here) it builds `[label, v_1, v_2, ...]`, where each `v_k` is
# `df.loc[label, column]` for the k-th column (named `state` here), and it
# returns only the first 10 of those lists.
# presumably `df` has races on the index and states on the columns -- TODO confirm
# against how the frame passed in is actually built.
# def sample_states(df):
    
#     all_race_state_list = []

#     for race in df.index:
    
#         race_data = [race] 
#         for state in df.columns:
       
#             race_data.append(df.loc[race, state])
    
   
#         all_race_state_list.append(race_data)


#     return all_race_state_list[:10]



In [101]:
# sample_state = sample_states(all_race_state_df)

In [95]:
# NOTE(review): disabled (commented-out) draft.
# Issues visible in the code below, to resolve before re-enabling:
#   * The race loop walks every row of `applicant_race_1` (no `.unique()`), and
#     for each row it recomputes `df['state_code'].unique()`; set-up cost therefore
#     grows with the number of rows, not the number of distinct races.
#   * The fill loop uses `df.loc[i, ...]` with `i in range(len(df))`. `.loc` is
#     label-based, so this is only correct when the index is exactly
#     0..len(df)-1. On a frame that keeps its original labels (e.g. the result
#     of `.sample()`), it will raise KeyError or read the wrong rows -- TODO
#     reset the index or iterate the columns directly.
#   * Each stored value is wrapped in its own one-element list (`[data_value]`).
# # Builds a nested dict {race: {state: [[action_taken], ...]}} covering every state
# def dict_states(df):
#     all_race_data = {
#         1: {
#         },
#         2: {},
#         5: {},
#         3: {},
#         4: {}
#     }

#     # for race in df['applicant_race_1']:
#     #     for state in df['state_code']:            
#     #         if not all_race_data[race][state]:
#     #             all_race_data[race][state] = []

#     for race in df['applicant_race_1']:  # NOTE(review): iterates every row; .unique() is not applied here
#         if race not in all_race_data:
#             all_race_data[race] = {}  # Initialize an empty dictionary for each race

#         # Iterate through each unique state in the dataset
#         for state in df['state_code'].unique():  # Use .unique() to ensure no duplicates
#             if state not in all_race_data[race]:
#                 all_race_data[race][state] = []  


#     for i in range(len(df)):
#         race = df.loc[i, "applicant_race_1"]
#         state = df.loc[i, "state_code"]
#         data_value = df.loc[i, "action_taken"] 
#         all_race_data[race][state].append([data_value])

#     return all_race_data

In [99]:
# dict_states(hmda_sample)

KeyboardInterrupt: 

In [66]:
# NOTE(review): disabled (commented-out) draft. It reuses the name
# `sample_states` from an earlier one-argument draft in this notebook.
#
# Intent, as far as the code shows: build per-state training samples for a
# "White only" set (value 5 of `applicant_race_1`) and an "all races" set, plus
# one test sample per race 1..5, and return
# `[all_race_data_list, white_race_data_list, race_tests]`.
#
# Issues visible in the code below, to resolve before re-enabling:
#   * `for state in df['state_code']` iterates every row, not each distinct
#     state, and `value_sample` / `stratified_sample` are re-run on the whole
#     frame inside every iteration before filtering to `state`.
#   * The `remove` parameter is never read (the commented-out branch refers to
#     `remove_race` instead).
#   * `year_dataset`, `data_for_df`, `White_train` and `All_train` are not
#     defined inside this function -- presumably notebook globals; TODO confirm
#     they exist or remove the lines that use them.
#   * In the test section `race_sample` is computed but not used:
#     `testing_data` is filtered from `all_race_sample` using the `state` left
#     over from the last iteration of the training loop, so every race gets the
#     same slice.
#   * `value_sample` and `stratified_sample` are helpers defined elsewhere;
#     their sampling semantics are not visible here.
# def sample_states(df, year_sample_size, test_sample_size, remove = True):

# # Creating training data for white and all race data
#      all_race_data_list = []
#      white_race_data_list = []

#     # Iterate through each state
#      for state in df['state_code']:
#         # Create lists to store the sampled data for all races and for "White" race
#         all_state_data = []
#         white_state_data = []
        
#         # Sample for "White" race using value_sample function
#         white_sample = value_sample(df, 'applicant_race_1', 5, year_sample_size)  
#         white_state_data = white_sample[white_sample['state_code'] == state]

#         # Add the sampled white race data to the white_race_data_list
#         white_race_data_list.append(white_state_data)

#         # For other races, sample using stratified sampling
#         all_race_sample = stratified_sample(df, 'applicant_race_1', year_sample_size)
#         all_state_data = all_race_sample[all_race_sample['state_code'] == state]

#         # Add the sampled all races data to the all_race_data_list
#         all_race_data_list.append(all_state_data)


#     #Testing data
#      race_tests = []
#      for race in range(1, 6):  # Iterate through races 1 to 5
#     # Get the sampled data for the specific race
#         race_sample = value_sample(year_dataset, 'applicant_race_1', race, test_sample_size)
#         testing_data = all_race_sample[all_race_sample['state_code'] == state]
#     # # If remove_race is True, drop the race columns
#     #     if remove_race:
#     #         race_sample = race_sample.drop(columns=['applicant_race_1', 'co_applicant_race_1'])
#     # Append the race sample to the race_tests list
#         race_tests.append(testing_data)
#     # Append the results to the data_by_year list
#      data_for_df.append([White_train, All_train, race_tests])

#      return [all_race_data_list, white_race_data_list, race_tests]

In [None]:
# state_samples = sample_states(all_race_state_df, 200, 1000) 

In [None]:
# NOTE(review): disabled (commented-out) draft.
# Issues visible in the code below, to resolve before re-enabling:
#   * Both set-up loops iterate full columns (no `.unique()`), so the nested
#     loop does rows x rows membership checks.
#   * The fill loop is bounded by `len(hmda_sample)` but reads from `df`, which
#     this cell does not define -- presumably it should read `hmda_sample`;
#     TODO confirm.
#   * `.loc[i, ...]` is label-based; with `i in range(len(...))` it is only
#     correct when the index is exactly 0..n-1.
#   * `pd.DataFrame(all_race_data)` on a dict of dicts uses the outer keys
#     (race) as columns and the inner keys (state) as the index.
# # Builds a nested dict {race: {state: [[action_taken], ...]}} and converts it to a DataFrame
# all_race_data = {}

# for race in hmda_sample['applicant_race_1']: 
#     if race not in all_race_data:
#         all_race_data[race] = {} 

#     # Iterate through each unique state in the dataset
#     for state in hmda_sample['state_code']:  
#         if state not in all_race_data[race]:
#             all_race_data[race][state] = []  


# for i in range(len(hmda_sample)):
#     race = df.loc[i, "applicant_race_1"]
#     state = df.loc[i, "state_code"]
#     data_value = df.loc[i, "action_taken"] 
#     all_race_data[race][state].append([data_value])

# all_race_state_df = pd.DataFrame(all_race_data)

In [None]:
# Adjust sample and test sizes for models and testing 
# state_samples = sample_states(hmda_no_rate_spread, 200, 1000) 