In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
pd.set_option('display.max_colwidth', None)
from statsmodels.iolib.summary2 import summary_col 

In [2]:
import warnings

# Silence only the library-churn categories that are not actionable here.
# A catch-all filter (message=".*") would make the category filters redundant
# and would also hide diagnostics that point at real problems, e.g. pandas'
# SettingWithCopyWarning or numpy RuntimeWarnings from invalid arithmetic.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Read the cleaned bank/tract file and remove the spaces embedded in the
# which_bank labels in the same step.
bank_tract = pd.read_csv("../input_data_clean/bank_tract_clean_WITH_CENSUS.csv").assign(
    which_bank=lambda frame: frame['which_bank'].str.replace(' ', '')
)

In [4]:
# Row count per bank label, after the spaces were stripped from which_bank
bank_tract['which_bank'].value_counts()

AllOtherBanks    9337
BankofWest       4500
Name: which_bank, dtype: int64

In [5]:
# Hispanic share of the tract population, in percent
bank_tract['hisp_rate'] = (bank_tract['HispanicLatinoPop'] / bank_tract['Tot.Pop']) * 100

# Indicator: 1.0 when the tract's Hispanic share is above the median, else 0.0.
# A missing hisp_rate compares as False, so casting the comparison straight to
# int would silently code those tracts as "below median". Keep them missing
# instead (hence the float dtype), so they are treated like any other NaN.
hisp_median = np.median(bank_tract['hisp_rate'].dropna())
hisp_known = bank_tract['hisp_rate'].notna()
bank_tract['hisp_over_med'] = (
    (bank_tract['hisp_rate'] > hisp_median).astype(float).where(hisp_known)
)

# Log of the number of applications; the +1 keeps zero-application rows finite
bank_tract['log_num_apps'] = np.log(1 + bank_tract['num_applications'])

In [6]:
# Work on a copy so the frame loaded above is left untouched
bank_tract_new = bank_tract.copy()

# Tracts that have an AllOtherBanks row but no BankofWest row
bow_tracts = bank_tract_new.loc[bank_tract_new['which_bank'] == 'BankofWest', 'census_tract']
is_other_bank = bank_tract_new['which_bank'] == 'AllOtherBanks'
# .copy() makes this an independent frame, so the assignments below are not
# writes into a slice of bank_tract_new (SettingWithCopyWarning territory)
missing_tracts = bank_tract_new[is_other_bank & ~bank_tract_new['census_tract'].isin(bow_tracts)].copy()

# Relabel the copied rows as placeholder BankofWest rows
missing_tracts['which_bank'] = 'BankofWest'

# Set the bank-specific variables to NaN: every column from
# 'sum_approved_loans' to the end of the frame.
# Columns created earlier in this notebook are appended at the end (assuming
# they were not already in the CSV), so the positional slice would catch them
# too. hisp_rate / hisp_over_med are derived from tract population counts, not
# from bank activity, so they are excluded and keep the tract's values.
start_col = bank_tract.columns.get_loc('sum_approved_loans')
tract_level_vars = ['hisp_rate', 'hisp_over_med']
bank_specific_vars = [col for col in bank_tract.columns[start_col:] if col not in tract_level_vars]
missing_tracts[bank_specific_vars] = np.nan  # np.NaN alias was removed in NumPy 2.0
# No applications in these tracts for this bank: log(1 + 0) = 0
missing_tracts['log_num_apps'] = 0

# Append the placeholder rows to the new dataframe
bank_tract_new = pd.concat([bank_tract_new, missing_tracts], ignore_index=True)

# Row counts per bank after padding BankofWest. Tracts that only BankofWest
# covers are not handled in this step, so the two counts can still differ.
bank_tract_new["which_bank"].value_counts()

BankofWest       9340
AllOtherBanks    9337
Name: which_bank, dtype: int64

In [7]:
# Start from the frame that already contains the BankofWest placeholder rows
bank_tract_new1 = bank_tract_new.copy()

# Tracts that have a BankofWest row but no AllOtherBanks row
other_bank_tracts = bank_tract_new1.loc[bank_tract_new1['which_bank'] == 'AllOtherBanks', 'census_tract']
is_bank_of_west = bank_tract_new1['which_bank'] == 'BankofWest'
# .copy() makes this an independent frame, so the assignments below are not
# writes into a slice of bank_tract_new1 (SettingWithCopyWarning territory)
missing_tracts1 = bank_tract_new1[is_bank_of_west & ~bank_tract_new1['census_tract'].isin(other_bank_tracts)].copy()

# Relabel the copied rows as placeholder AllOtherBanks rows
missing_tracts1['which_bank'] = 'AllOtherBanks'

# Set the bank-specific variables to NaN: every column from
# 'sum_approved_loans' to the end of the frame.
# The slice uses this cell's own start_col1 rather than a variable left over
# from the previous cell. hisp_rate / hisp_over_med are derived from tract
# population counts, not from bank activity, so they are excluded from the
# blanking and keep the tract's values.
start_col1 = bank_tract.columns.get_loc('sum_approved_loans')
tract_level_vars1 = ['hisp_rate', 'hisp_over_med']
bank_specific_vars1 = [col for col in bank_tract.columns[start_col1:] if col not in tract_level_vars1]
missing_tracts1[bank_specific_vars1] = np.nan  # np.NaN alias was removed in NumPy 2.0
# No applications in these tracts for this bank: log(1 + 0) = 0
missing_tracts1['log_num_apps'] = 0

# Append the placeholder rows to the new dataframe
bank_tract_new1 = pd.concat([bank_tract_new1, missing_tracts1], ignore_index=True)

# Verify that there are now equal numbers of rows for each which_bank value
bank_tract_new1["which_bank"].value_counts()

AllOtherBanks    9340
BankofWest       9340
Name: which_bank, dtype: int64

In [8]:
def order_front(df, to_front):
    '''
    Moves columns in to_front to left of df.

    Parameters
    ----------
    df : DataFrame whose columns are reordered; it is not modified.
    to_front : column name, or any iterable of column names (list, tuple, ...),
        given in the order they should appear on the left.

    Returns
    -------
    df indexed by the reordered column list: the to_front columns first,
    followed by the remaining columns in their original relative order.

    Raises
    ------
    ValueError if a name in to_front is not a column of df (or is listed more
    often than it occurs in df).
    '''
    # A bare string is one column name, not an iterable of characters
    front = [to_front] if isinstance(to_front, str) else list(to_front)
    remaining = list(df.columns)
    for col in front:
        # list.remove raises ValueError when the name is absent
        remaining.remove(col)
    return df[front + remaining]

# Move the two key columns to the left-hand side of the frame
bank_tract = order_front(bank_tract, to_front=['which_bank', 'census_tract'])

## Linear Regression with Interaction Terms - median income and mean LTV as controls

In [9]:
# Four OLS specifications: two outcomes (denial_rate, log_num_apps) crossed with
# two Hispanic-share measures (continuous hisp_rate, above-median indicator),
# each interacted with which_bank and controlling for tract median income and
# mean LTV.
# NOTE(review): these models are fitted on `bank_tract`, not on the padded
# `bank_tract_new1` frame built earlier in the notebook - confirm that is intended.
model1 = smf.ols('denial_rate ~ hisp_rate * which_bank + median_all_income + mean_LTV', data=bank_tract).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med * which_bank + median_all_income+ mean_LTV', data=bank_tract).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate * which_bank + median_all_income+ mean_LTV', data=bank_tract).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med * which_bank + median_all_income+ mean_LTV', data=bank_tract).fit()

# Extra rows for the bottom of the table (fit statistics, not coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.4f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.4f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

# summary_col combines the fitted models into one side-by-side table
print(summary_col(results=[model1,model2,model3,model4], # list the result obj here
                  float_format='%0.4f',
                  stars = True, # stars are easy way to see if anything is statistically significant
                  # Columns are named after the dependent variable. The two
                  # log_num_apps names must be spelled identically (no stray
                  # leading space), otherwise they count as different names and
                  # both columns get the suffix "I" instead of I / II.
                  model_names=['Denial Rate','Denial Rate','log_num_apps','log_num_apps'],
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "hisp_rate", "hisp_over_med", "which_bank[T.BankofWest]", "hisp_rate:which_bank[T.BankofWest]",
                                  "hisp_over_med:which_bank[T.BankofWest]", "median_all_income", "mean_LTV"]
                  )
     )


                                       Denial Rate I Denial Rate II  log_num_apps I log_num_apps I
--------------------------------------------------------------------------------------------------
Intercept                              0.1144***     0.1263***      4.2705***       4.0785***     
                                       (0.0049)      (0.0048)       (0.0225)        (0.0228)      
hisp_rate                              0.0010***                    -0.0130***                    
                                       (0.0001)                     (0.0003)                      
hisp_over_med                                        0.0476***                      -0.4740***    
                                                     (0.0032)                       (0.0152)      
which_bank[T.BankofWest]               0.1169***     0.0907***      -2.8058***      -2.6806***    
                                       (0.0047)      (0.0039)       (0.0217)        (0.0184)      
hisp_rate