In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
pd.set_option('display.max_colwidth', None)
from statsmodels.iolib.summary2 import summary_col 


In [2]:
import warnings
# Ignore warnings by category
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Ignore warnings by message
warnings.filterwarnings("ignore", message=".*")

In [3]:
bank_tract = pd.read_csv("../input_data_clean/bank_tract_clean_WITH_CENSUS.csv")
CA_df = bank_tract[bank_tract["state"] == 6]
AZ_df = bank_tract[bank_tract["state"] == 4]
CA_df['which_bank'] = CA_df['which_bank'].str.replace(' ', '')
AZ_df['which_bank'] = AZ_df['which_bank'].str.replace(' ', '')

In [4]:
# Calculate the hispanic rate and whether it's above the median
CA_df['hisp_rate'] = (CA_df['HispanicLatinoPop'] / CA_df['Tot.Pop']) * 100
CA_df['hisp_over_med'] = CA_df['hisp_rate'] > np.median(CA_df['hisp_rate'].dropna())
CA_df['hisp_over_med'] = CA_df['hisp_over_med'].astype(int)

# Calculate the log of number of applications
CA_df['log_num_apps'] = np.log(CA_df['num_applications'])

## Linear Regression - CA

In [5]:
# Calculate the hispanic rate and whether it's above the median
CA_df['hisp_rate'] = (CA_df['HispanicLatinoPop'] / CA_df['Tot.Pop']) * 100
CA_df['hisp_over_med'] = CA_df['hisp_rate'] > np.median(CA_df['hisp_rate'].dropna())
CA_df['hisp_over_med'] = CA_df['hisp_over_med'].astype(int)

# Calculate the log of number of applications
CA_df['log_num_apps'] = np.log(CA_df['num_applications'])

# Build initial models
# Fit the linear regression models
model1 = smf.ols('denial_rate ~ hisp_rate + C(which_bank)', data=CA_df).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med + C(which_bank)', data=CA_df).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate + C(which_bank)', data=CA_df).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med + C(which_bank)', data=CA_df).fit()

# Create a list of model results
models = [model1, model2, model3, model4]

# Generate the table of regression results
table = summary_col(models, 
                    model_names=['Model 1', 'Model 2', 'Model 3', 'Model 4'], 
                    regressor_order=['Intercept', 'hisp_rate', 'hisp_over_med', 'C(which_bank)[T.BankofWest]'], 
                    float_format='%0.4f', 
                    stars=True)

# Display the table
print(table)



                             Model 1    Model 2    Model 3    Model 4  
-----------------------------------------------------------------------
Intercept                   0.8498***  0.8427***  4.2854***  4.0568*** 
                            (0.0028)   (0.0024)   (0.0136)   (0.0119)  
hisp_rate                   -0.0005***            -0.0125***           
                            (0.0001)              (0.0003)             
hisp_over_med                          -0.0213***            -0.4600***
                                       (0.0029)              (0.0146)  
C(which_bank)[T.BankofWest] -0.0466*** -0.0453*** -2.9654*** -2.9225***
                            (0.0031)   (0.0031)   (0.0148)   (0.0154)  
R-squared                   0.0221     0.0206     0.7705     0.7512    
R-squared Adj.              0.0220     0.0204     0.7704     0.7512    
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## Linear Regression - AZ

In [6]:
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import numpy as np
import pandas as pd

# Calculate the hispanic rate and whether it's above the median
AZ_df['hisp_rate'] = (AZ_df['HispanicLatinoPop'] / AZ_df['Tot.Pop']) * 100
AZ_df['hisp_over_med'] = AZ_df['hisp_rate'] > np.median(AZ_df['hisp_rate'].dropna())
AZ_df['hisp_over_med'] = AZ_df['hisp_over_med'].astype(int)

# Calculate the log of number of applications
AZ_df['log_num_apps'] = np.log(AZ_df['num_applications'])

# Fit the linear regression models
model1 = smf.ols('denial_rate ~ hisp_rate + C(which_bank)', data=AZ_df).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med + C(which_bank)', data=AZ_df).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate + C(which_bank)', data=AZ_df).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med + C(which_bank)', data=AZ_df).fit()

# Combine the regression results into a single table
results_table = summary_col([model1, model2, model3, model4], 
                            model_names=['Model 1', 'Model 2', 'Model 3', 'Model 4'],
                            regressor_order=['Intercept', 'hisp_rate', 'hisp_over_med', 'C(which_bank)[T.BankofWest]'],
                            float_format='%0.5f',
                            stars=True,
                            drop_omitted=True)

# Display the table
print(results_table)




                              Model 1     Model 2     Model 3     Model 4  
---------------------------------------------------------------------------
Intercept                   0.85239***  0.84733***  4.41751***  4.26027*** 
                            (0.00501)   (0.00434)   (0.03390)   (0.03007)  
hisp_rate                   -0.00064***             -0.01104***            
                            (0.00012)               (0.00084)              
hisp_over_med                           -0.02891***             -0.36179***
                                        (0.00572)               (0.03966)  
C(which_bank)[T.BankofWest] 0.04171***  0.04256***  -3.69478*** -3.66678***
                            (0.00732)   (0.00731)   (0.04955)   (0.05063)  
R-squared                   0.03539     0.03474     0.75766     0.74612    
R-squared Adj.              0.03431     0.03367     0.75739     0.74584    
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## Linear Regression With Interaction Terms - CA

In [7]:
# Build initial models
# Fit the linear regression models
model1 = smf.ols('denial_rate ~ hisp_rate * which_bank', data=CA_df).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med * which_bank', data=CA_df).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate * which_bank', data=CA_df).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med * which_bank', data=CA_df).fit()

# Create a summary table of the regression models
table = summary_col([model1, model2, model3, model4],
                    model_names=['Model 1', 'Model 2', 'Model 3', 'Model 4'],
                    float_format='%.5f',
                    stars=True,
                    info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                               'R2': lambda x: "{:.2f}".format(x.rsquared)})

# Rename the table columns
table.columns = ['Dep. Variable', 'Model 1', 'Model 2', 'Model 3', 'Model 4']

# Display the table
print(table)


                                         Model 1     Model 2     Model 3     Model 4  
--------------------------------------------------------------------------------------
Intercept                              0.87515***  0.86009***  4.30888***  4.06668*** 
                                       (0.00309)   (0.00261)   (0.01510)   (0.01321)  
R-squared                              0.04997     0.03993     0.77070     0.75130    
R-squared Adj.                         0.04974     0.03969     0.77065     0.75124    
hisp_over_med                                      -0.05366***             -0.47836***
                                                   (0.00355)               (0.01800)  
hisp_over_med:which_bank[T.BankofWest]             0.09464***              0.05368*   
                                                   (0.00607)               (0.03078)  
hisp_rate                              -0.00115***             -0.01309***            
                                       (0.

## Linear Regression with Interaction Terms - AZ

In [8]:
# Build initial models
# Fit the linear regression models
model1 = smf.ols('denial_rate ~ hisp_rate * which_bank', data=AZ_df).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med * which_bank', data=AZ_df).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate * which_bank', data=AZ_df).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med * which_bank', data=AZ_df).fit()

# Generate the summary table
table = summary_col([model1, model2, model3, model4],
                    model_names=['Model 1', 'Model 2', 'Model 3', 'Model 4'],
                    regressor_order=['Intercept', 'hisp_rate', 'hisp_over_med', 'which_bank[T.BankofWest]', 'hisp_rate:which_bank[T.BankofWest]', 'hisp_over_med:which_bank[T.BankofWest]'],
                    float_format='%.5f',
                    stars=True,
                    info_dict={'R-squared': lambda x: "{:.2f}".format(x.rsquared)}
                   )

# Display the summary table
print(table)


                                         Model 1     Model 2     Model 3     Model 4  
--------------------------------------------------------------------------------------
Intercept                              0.85588***  0.84835***  4.46175***  4.30203*** 
                                       (0.00524)   (0.00457)   (0.03538)   (0.03148)  
hisp_rate                              -0.00075***             -0.01245***            
                                       (0.00013)               (0.00090)              
hisp_over_med                                      -0.03088***             -0.44237***
                                                   (0.00634)               (0.04373)  
which_bank[T.BankofWest]               0.02098*    0.03791***  -3.95744*** -3.85692***
                                       (0.01186)   (0.00974)   (0.08003)   (0.06717)  
hisp_rate:which_bank[T.BankofWest]     0.00079**               0.01000***             
                                       (0.