In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
pd.set_option('display.max_colwidth', None)

In [2]:
import warnings

# Silence only DeprecationWarning chatter coming from library internals.
# No catch-all filter (message=".*") is installed on purpose: it would suppress
# every warning in the notebook, including diagnostics about the DataFrame
# assignments and model fits performed in the cells below.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Load the cleaned bank/tract table (the file name indicates census columns
# have already been joined in).
bank_tract = pd.read_csv("../input_data_clean/bank_tract_clean_WITH_CENSUS.csv")

# Split by state code. 6 and 4 are presumably the FIPS codes for California and
# Arizona (matching the CA_/AZ_ variable names) -- confirm against the data dictionary.
# .copy() makes each subset an independent DataFrame, so the column assignments
# below and in later cells write to these frames unambiguously instead of to a
# selection of `bank_tract` (the pattern pandas flags with SettingWithCopyWarning).
CA_df = bank_tract[bank_tract["state"] == 6].copy()
AZ_df = bank_tract[bank_tract["state"] == 4].copy()

# Strip spaces from the bank labels; the regression cells look up coefficient
# names that embed the space-free label (e.g. 'BankofWest').
CA_df['which_bank'] = CA_df['which_bank'].str.replace(' ', '')
AZ_df['which_bank'] = AZ_df['which_bank'].str.replace(' ', '')

## Linear Regression - CA

In [4]:
# --- Derived covariates (California) ---------------------------------------
# Hispanic/Latino share of each tract's total population, in percent.
hisp_share = CA_df['HispanicLatinoPop'] / CA_df['Tot.Pop']
CA_df['hisp_rate'] = hisp_share * 100

# 0/1 flag: 1 when the tract's rate is strictly above the median of the
# non-missing rates (rows with a missing rate compare False and get 0).
median_rate = np.median(CA_df['hisp_rate'].dropna())
CA_df['hisp_over_med'] = (CA_df['hisp_rate'] > median_rate).astype(int)

# Natural log of the application count.
CA_df['log_num_apps'] = np.log(CA_df['num_applications'])

# --- Additive OLS fits: outcome ~ predictor + bank -------------------------
# Every outcome is paired with every predictor, in the order Model 1..4.
outcomes = ('denial_rate', 'log_num_apps')
predictors = ('hisp_rate', 'hisp_over_med')
specs = [(outcome, predictor) for outcome in outcomes for predictor in predictors]

fits = [
    smf.ols(f'{outcome} ~ {predictor} + C(which_bank)', data=CA_df).fit()
    for outcome, predictor in specs
]
model1, model2, model3, model4 = fits
coef1, coef2, coef3, coef4 = (fit.params for fit in fits)

# --- Render each fit as a one-line equation --------------------------------
# Only the BankofWest indicator is written out; any other bank-level terms in
# the fit (if present) are not shown.
bank_term = 'C(which_bank)[T.BankofWest]'
equations = [
    f"{outcome} = {round(beta['Intercept'], 5)} + {round(beta[predictor], 5)} * {predictor}"
    f" + {round(beta[bank_term], 5)} * which_bank_BankofWest"
    for (outcome, predictor), beta in zip(specs, (coef1, coef2, coef3, coef4))
]
eq1, eq2, eq3, eq4 = equations

# One row per model, labelled "Model 1" .. "Model 4".
model_labels = [f'Model {i}' for i in range(1, len(equations) + 1)]
eqs_df = pd.DataFrame({'Equation': equations}, index=model_labels)

# Show the table as plain text.
print(eqs_df)

                                                                                    Equation
Model 1      denial_rate = 0.84978 + -0.00049 * hisp_rate + -0.04656 * which_bank_BankofWest
Model 2  denial_rate = 0.84267 + -0.02131 * hisp_over_med + -0.04532 * which_bank_BankofWest
Model 3     log_num_apps = 4.28543 + -0.01247 * hisp_rate + -2.96541 * which_bank_BankofWest
Model 4    log_num_apps = 4.05679 + -0.46 * hisp_over_med + -2.92249 * which_bank_BankofWest


## Linear Regression - AZ

In [5]:
# --- Derived covariates (Arizona) ------------------------------------------
# Hispanic/Latino share of each tract's total population, in percent.
hisp_share = AZ_df['HispanicLatinoPop'] / AZ_df['Tot.Pop']
AZ_df['hisp_rate'] = hisp_share * 100

# 0/1 flag: 1 when the tract's rate is strictly above the median of the
# non-missing rates (rows with a missing rate compare False and get 0).
median_rate = np.median(AZ_df['hisp_rate'].dropna())
AZ_df['hisp_over_med'] = (AZ_df['hisp_rate'] > median_rate).astype(int)

# Natural log of the application count.
AZ_df['log_num_apps'] = np.log(AZ_df['num_applications'])

# --- Additive OLS fits: outcome ~ predictor + bank -------------------------
# Every outcome is paired with every predictor, in the order Model 1..4.
outcomes = ('denial_rate', 'log_num_apps')
predictors = ('hisp_rate', 'hisp_over_med')
specs = [(outcome, predictor) for outcome in outcomes for predictor in predictors]

fits = [
    smf.ols(f'{outcome} ~ {predictor} + C(which_bank)', data=AZ_df).fit()
    for outcome, predictor in specs
]
model1, model2, model3, model4 = fits
coef1, coef2, coef3, coef4 = (fit.params for fit in fits)

# --- Render each fit as a one-line equation --------------------------------
# Only the BankofWest indicator is written out; any other bank-level terms in
# the fit (if present) are not shown.
bank_term = 'C(which_bank)[T.BankofWest]'
equations = [
    f"{outcome} = {round(beta['Intercept'], 5)} + {round(beta[predictor], 5)} * {predictor}"
    f" + {round(beta[bank_term], 5)} * which_bank_BankofWest"
    for (outcome, predictor), beta in zip(specs, (coef1, coef2, coef3, coef4))
]
eq1, eq2, eq3, eq4 = equations

# One row per model, labelled "Model 1" .. "Model 4".
model_labels = [f'Model {i}' for i in range(1, len(equations) + 1)]
eqs_df = pd.DataFrame({'Equation': equations}, index=model_labels)

# Show the table as plain text.
print(eqs_df)

                                                                                     Equation
Model 1        denial_rate = 0.85239 + -0.00064 * hisp_rate + 0.04171 * which_bank_BankofWest
Model 2    denial_rate = 0.84733 + -0.02891 * hisp_over_med + 0.04256 * which_bank_BankofWest
Model 3      log_num_apps = 4.41751 + -0.01104 * hisp_rate + -3.69478 * which_bank_BankofWest
Model 4  log_num_apps = 4.26027 + -0.36179 * hisp_over_med + -3.66678 * which_bank_BankofWest


## Linear Regression With Interaction Terms - CA

In [6]:
# --- Interaction OLS fits (California): outcome ~ predictor * which_bank ---
# The `*` in the formula expands to both main effects plus their interaction.
# Relies on the hisp_rate / hisp_over_med / log_num_apps columns already being
# present on CA_df.
outcomes = ('denial_rate', 'log_num_apps')
predictors = ('hisp_rate', 'hisp_over_med')
specs = [(outcome, predictor) for outcome in outcomes for predictor in predictors]

fits = [
    smf.ols(f'{outcome} ~ {predictor} * which_bank', data=CA_df).fit()
    for outcome, predictor in specs
]
model1, model2, model3, model4 = fits
coef1, coef2, coef3, coef4 = (fit.params for fit in fits)

# --- Render each fit as a one-line equation --------------------------------
# Terms: intercept, predictor, BankofWest indicator, predictor x BankofWest.
bank_term = 'which_bank[T.BankofWest]'
equations = [
    f"{outcome} = {round(beta['Intercept'], 5)} + {round(beta[predictor], 5)} * {predictor}"
    f" + {round(beta[bank_term], 5)} * which_bank_BankofWest"
    f" + {round(beta[predictor + ':' + bank_term], 5)} * {predictor} * which_bank_BankofWest"
    for (outcome, predictor), beta in zip(specs, (coef1, coef2, coef3, coef4))
]
eq1, eq2, eq3, eq4 = equations

# One row per model, labelled "Model 1" .. "Model 4".
model_labels = [f'Model {i}' for i in range(1, len(equations) + 1)]
eqs_df = pd.DataFrame({'Equation': equations}, index=model_labels)

# Show the table as plain text.
print(eqs_df)

                                                                                                                                       Equation
Model 1           denial_rate = 0.87515 + -0.00115 * hisp_rate + -0.12342 * which_bank_BankofWest + 0.00235 * hisp_rate * which_bank_BankofWest
Model 2    denial_rate = 0.86009 + -0.05366 * hisp_over_med + -0.0893 * which_bank_BankofWest + 0.09464 * hisp_over_med * which_bank_BankofWest
Model 3          log_num_apps = 4.30888 + -0.01309 * hisp_rate + -3.03644 * which_bank_BankofWest + 0.00217 * hisp_rate * which_bank_BankofWest
Model 4  log_num_apps = 4.06668 + -0.47836 * hisp_over_med + -2.94744 * which_bank_BankofWest + 0.05368 * hisp_over_med * which_bank_BankofWest


## Linear Regression With Interaction Terms - AZ

In [7]:
# --- Interaction OLS fits (Arizona): outcome ~ predictor * which_bank ------
# The `*` in the formula expands to both main effects plus their interaction.
# Relies on the hisp_rate / hisp_over_med / log_num_apps columns already being
# present on AZ_df.
outcomes = ('denial_rate', 'log_num_apps')
predictors = ('hisp_rate', 'hisp_over_med')
specs = [(outcome, predictor) for outcome in outcomes for predictor in predictors]

fits = [
    smf.ols(f'{outcome} ~ {predictor} * which_bank', data=AZ_df).fit()
    for outcome, predictor in specs
]
model1, model2, model3, model4 = fits
coef1, coef2, coef3, coef4 = (fit.params for fit in fits)

# --- Render each fit as a one-line equation --------------------------------
# Terms: intercept, predictor, BankofWest indicator, predictor x BankofWest.
bank_term = 'which_bank[T.BankofWest]'
equations = [
    f"{outcome} = {round(beta['Intercept'], 5)} + {round(beta[predictor], 5)} * {predictor}"
    f" + {round(beta[bank_term], 5)} * which_bank_BankofWest"
    f" + {round(beta[predictor + ':' + bank_term], 5)} * {predictor} * which_bank_BankofWest"
    for (outcome, predictor), beta in zip(specs, (coef1, coef2, coef3, coef4))
]
eq1, eq2, eq3, eq4 = equations

# One row per model, labelled "Model 1" .. "Model 4".
model_labels = [f'Model {i}' for i in range(1, len(equations) + 1)]
eqs_df = pd.DataFrame({'Equation': equations}, index=model_labels)

# Show the table as plain text.
print(eqs_df)

                                                                                                                                       Equation
Model 1            denial_rate = 0.85588 + -0.00075 * hisp_rate + 0.02098 * which_bank_BankofWest + 0.00079 * hisp_rate * which_bank_BankofWest
Model 2    denial_rate = 0.84835 + -0.03088 * hisp_over_med + 0.03791 * which_bank_BankofWest + 0.01064 * hisp_over_med * which_bank_BankofWest
Model 3             log_num_apps = 4.46175 + -0.01245 * hisp_rate + -3.95744 * which_bank_BankofWest + 0.01 * hisp_rate * which_bank_BankofWest
Model 4  log_num_apps = 4.30203 + -0.44237 * hisp_over_med + -3.85692 * which_bank_BankofWest + 0.43483 * hisp_over_med * which_bank_BankofWest
