In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
pd.set_option('display.max_colwidth', None)
from statsmodels.iolib.summary2 import summary_col 

In [2]:
import warnings
# Ignore warnings by category
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Ignore warnings by message
warnings.filterwarnings("ignore", message=".*")

In [3]:
bank_tract = pd.read_csv("../input_data_clean/bank_tract_clean_WITH_CENSUS.csv")
bank_tract['which_bank'] = bank_tract['which_bank'].str.replace(' ', '')

In [4]:
bank_tract['which_bank'].value_counts()

AllOtherBanks    9337
BankofWest       4500
Name: which_bank, dtype: int64

In [5]:
# Calculate the hispanic rate and whether it's above the median
bank_tract['hisp_rate'] = (bank_tract['HispanicLatinoPop'] / bank_tract['Tot.Pop']) * 100
bank_tract['hisp_over_med'] = bank_tract['hisp_rate'] > np.median(bank_tract['hisp_rate'].dropna())
bank_tract['hisp_over_med'] = bank_tract['hisp_over_med'].astype(int)

# Adding them at the start so that they don't get set to 0
bank_tract.insert(0, 'hisp_over_med', bank_tract.pop('hisp_over_med'))
bank_tract.insert(0, 'hisp_rate', bank_tract.pop('hisp_rate'))

# Calculate the log of number of applications
bank_tract['log_num_apps'] = np.log(1+bank_tract['num_applications'])

### Adding rows (census tracts) where BoW is not active, but All Other Banks Are 

In [6]:
# Create a copy of the original dataframe
bank_tract_new = bank_tract.copy()

# Find the census tracts that have AllOtherBanks but not BankofWest
missing_tracts = bank_tract_new[(bank_tract_new['which_bank'] == 'AllOtherBanks') & ~(bank_tract_new['census_tract'].isin(bank_tract_new[bank_tract_new['which_bank'] == 'BankofWest']['census_tract']))]

# Update which_bank column to BankofWest
missing_tracts['which_bank'] = 'BankofWest'

# Set bank specific variables to 0
start_col = bank_tract.columns.get_loc('sum_approved_loans')
bank_specific_vars = bank_tract.columns[start_col:]
missing_tracts[bank_specific_vars] = np.NaN
missing_tracts['log_num_apps'] = 0

# Append the missing rows to the new dataframe
bank_tract_new = pd.concat([bank_tract_new, missing_tracts], ignore_index=True)

# Verify that there are now equal numbers of rows for each which_bank value
bank_tract_new["which_bank"].value_counts()

BankofWest       9340
AllOtherBanks    9337
Name: which_bank, dtype: int64

### Adding rows (census tracts) where All Other Banks Are are not active, but BoW is.

In [7]:
# Create a copy of the original dataframe
bank_tract_full = bank_tract_new.copy()

# Find the census tracts that have BankofWest but not AllOtherBanks
missing_tracts1 = bank_tract_full[(bank_tract_full['which_bank'] == 'BankofWest') & ~(bank_tract_full['census_tract'].isin(bank_tract_full[bank_tract_full['which_bank'] == 'AllOtherBanks']['census_tract']))]

# Update which_bank column to AllOtherBanks
missing_tracts1['which_bank'] = 'AllOtherBanks'

# Set bank specific variables to NaN
start_col1 = bank_tract.columns.get_loc('sum_approved_loans')
bank_specific_vars1 = bank_tract.columns[start_col:]
missing_tracts1[bank_specific_vars1] = np.NaN
missing_tracts1['log_num_apps'] = 0

# Append the missing rows to the new dataframe
bank_tract_full = pd.concat([bank_tract_full, missing_tracts1], ignore_index=True)

# Verify that there are now equal numbers of rows for each which_bank value
bank_tract_full["which_bank"].value_counts()

AllOtherBanks    9340
BankofWest       9340
Name: which_bank, dtype: int64

In [20]:
# Creating a denial count variabel so that in th eregression models only rows that have a non-null denial rate are included 
bank_tract_full['tract_denial_count'] = bank_tract_full.groupby('census_tract')['denial_rate'].transform(pd.Series.count)

### Investigating more control variables to include in regression

In [10]:
census_vars = pd.read_csv("../input_data/census_vars.csv")

In [11]:
possible_vars = census_vars.iloc[:88, 1]

In [13]:
pd.set_option("display.max_rows", None)

In [18]:
possible_vars1 = possible_vars.iloc[20:,]

Vars to use: Median household Income, Ratio of Income to Poverty Level, Per Capita Incoome, Median Family Income

### Cleaning census control vars to use in regression

In [34]:
# List of variables to clean
cols_to_clean = ["Med.HousehIncome", "RatioIncomeToPovertyLevel", "PerCap.Income", "Med.FamIncome"]

# Replace empty and negative values with NaN
bank_tract_full[cols_to_clean] = bank_tract_full[cols_to_clean].applymap(lambda x: np.NaN if (x == '' or x < 0) else x)

['Med.HousehIncome',
 'RatioIncomeToPovertyLevel',
 'PerCap.Income',
 'Med.FamIncome']

In [37]:
cols_to_clean.extend(['median_all_income', "mean_LTV"])
bank_tract_full[cols_to_clean].describe()

Unnamed: 0,Med.HousehIncome,RatioIncomeToPovertyLevel,PerCap.Income,Med.FamIncome,median_all_income,mean_LTV,median_all_income.1,mean_LTV.1
count,18658.0,18680.0,18674.0,18614.0,13435.0,13833.0,13435.0,13833.0
mean,79065.1684,4818.880621,37180.969262,90385.196949,153.556085,63.655351,153.556085,63.655351
std,38400.119425,2169.322939,21572.777062,45020.093275,327.767463,19.763747,327.767463,19.763747
min,11576.0,0.0,1276.0,14471.0,0.0,2.8847,0.0,2.8847
25%,50802.0,3416.0,22047.0,56482.75,84.0,55.788607,84.0,55.788607
50%,71000.0,4534.5,31681.0,80332.0,114.0,64.124411,114.0,64.124411
75%,98375.0,5848.0,45846.0,114209.5,170.0,72.3471,170.0,72.3471
max,250001.0,30844.0,176388.0,250001.0,33827.0,1815.708527,33827.0,1815.708527


## Linear Regression with Interaction Terms - median income and mean LTV as control

In [9]:
model1 = smf.ols('denial_rate ~ hisp_rate * which_bank + median_all_income + mean_LTV', data=bank_tract_full.query("tract_denial_count == 2")).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med * which_bank + median_all_income+ mean_LTV', data=bank_tract_full.query("tract_denial_count == 2")).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate * which_bank + median_all_income+ mean_LTV', data=bank_tract_full).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med * which_bank + median_all_income+ mean_LTV', data=bank_tract_full).fit()

# now I'll format an output table
# I'd like to include extra info in the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.4f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.4f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

# This summary col function combines a bunch of regressions into one nice table
print(summary_col(results=[model1,model2,model3,model4], # list the result obj here
                  float_format='%0.4f',
                  stars = True, # stars are easy way to see if anything is statistically significant
                  model_names=['Denial Rate','Denial Rate',' log_num_apps','log_num_apps'], # these are bad names, lol. Usually, just use the y variable name
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "hisp_rate", "hisp_over_med", "which_bank[T.BankofWest]", "hisp_rate:which_bank[T.BankofWest]",
                                  "hisp_over_med:which_bank[T.BankofWest]", "median_all_income", "mean_LTV"]
                  )
     )


                                       Denial Rate I Denial Rate II  log_num_apps I log_num_apps I
--------------------------------------------------------------------------------------------------
Intercept                              0.0999***     0.1120***      4.2705***       4.0785***     
                                       (0.0092)      (0.0091)       (0.0225)        (0.0228)      
hisp_rate                              0.0012***                    -0.0130***                    
                                       (0.0001)                     (0.0003)                      
hisp_over_med                                        0.0463***                      -0.4740***    
                                                     (0.0054)                       (0.0152)      
which_bank[T.BankofWest]               0.1253***     0.0978***      -2.8058***      -2.6806***    
                                       (0.0062)      (0.0050)       (0.0217)        (0.0184)      
hisp_rate