In [44]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
pd.set_option('display.max_colwidth', None)
from statsmodels.iolib.summary2 import summary_col 

In [45]:
import warnings

# Silence only deprecation-style noise from library upgrades.
# A blanket `message=".*"` filter would also hide warnings that point at real
# data problems (e.g. pandas' SettingWithCopyWarning or numpy RuntimeWarnings),
# so those are deliberately left visible.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [46]:
# Load the cleaned bank-by-tract table and remove spaces from the bank labels
# so that `which_bank` values can be compared against space-free names.
bank_tract = pd.read_csv("../input_data_clean/bank_tract_clean_WITH_CENSUS.csv").assign(
    which_bank=lambda frame: frame['which_bank'].str.replace(' ', '')
)

In [47]:
# Number of rows per bank group (sanity check on the cleaned labels).
bank_counts = bank_tract['which_bank'].value_counts()
bank_counts

AllOtherBanks    9337
BankofWest       4500
Name: which_bank, dtype: int64

In [49]:
# Build the tract-level minority-share features.
#
# For each group:
#   <prefix>_rate     = group population as a percentage of total population
#   <prefix>_over_med = 1 when that rate is strictly above the median rate over
#                       all rows, else 0 (a NaN rate compares False, so it gets 0)
#
# NOTE(review): 'Tot.BlackPop ' has a trailing space, kept exactly as in the
# original lookup — presumably that is how the CSV header is spelled; confirm.
minority_pop_cols = {
    'hisp': 'HispanicLatinoPop',
    'black': 'Tot.BlackPop ',
    'asian': 'Tot.AsianPop',
}

for prefix, pop_col in minority_pop_cols.items():
    rate_col = f'{prefix}_rate'
    flag_col = f'{prefix}_over_med'

    bank_tract[rate_col] = (bank_tract[pop_col] / bank_tract['Tot.Pop']) * 100
    # Series.median() skips NaN, matching a median over the non-null rates
    bank_tract[flag_col] = (bank_tract[rate_col] > bank_tract[rate_col].median()).astype(int)

    # Move both columns to the front of the frame: a later cell blanks every
    # column from 'sum_approved_loans' onward, and these must not be in that range.
    bank_tract.insert(0, flag_col, bank_tract.pop(flag_col))
    bank_tract.insert(0, rate_col, bank_tract.pop(rate_col))

# Largest minority group per row, computed column-wise instead of row-by-row.
# NOTE(review): ties and rows with NaN rates fall through to 'Black' (every
# strict comparison is False for them) — same as the original logic; confirm
# that this default is intended.
hisp = bank_tract['hisp_rate']
black = bank_tract['black_rate']
asian = bank_tract['asian_rate']
bank_tract['major_minority'] = np.select(
    [(hisp > black) & (hisp > asian), (asian > black) & (asian > hisp)],
    ['Hispanic', 'Asian'],
    default='Black',
)
bank_tract.insert(0, 'major_minority', bank_tract.pop('major_minority'))

# log(1 + applications), so a tract with zero applications maps to 0
bank_tract['log_num_apps'] = np.log(1 + bank_tract['num_applications'])

### Adding rows (census tracts) where BoW is not active, but All Other Banks are

In [50]:
# Work on a copy so the original `bank_tract` frame stays untouched
bank_tract_new = bank_tract.copy()

# Census tracts that have an AllOtherBanks row but no BankofWest row
is_other_banks = bank_tract_new['which_bank'] == 'AllOtherBanks'
bow_tracts = bank_tract_new.loc[bank_tract_new['which_bank'] == 'BankofWest', 'census_tract']
# .copy() makes this an independent frame; assigning into a filtered slice
# triggers SettingWithCopyWarning and is not guaranteed to behave consistently.
missing_tracts = bank_tract_new[is_other_banks & ~bank_tract_new['census_tract'].isin(bow_tracts)].copy()

# Relabel the placeholder rows as BankofWest
missing_tracts['which_bank'] = 'BankofWest'

# Blank out (NaN) every column from 'sum_approved_loans' to the end of the
# frame, which this notebook treats as the bank-specific variables.
start_col = bank_tract_new.columns.get_loc('sum_approved_loans')
bank_specific_vars = bank_tract_new.columns[start_col:]
missing_tracts[bank_specific_vars] = np.nan
# log_num_apps = log(1 + num_applications), so 0 encodes "no applications";
# set it after the blanking so it is not left as NaN.
missing_tracts['log_num_apps'] = 0

# Append the placeholder rows to the new dataframe
bank_tract_new = pd.concat([bank_tract_new, missing_tracts], ignore_index=True)

# Check the row counts per bank: every AllOtherBanks tract now has a BankofWest
# row. The counts can still differ by the tracts only BankofWest appears in,
# which the next cell fills in.
bank_tract_new["which_bank"].value_counts()

BankofWest       9340
AllOtherBanks    9337
Name: which_bank, dtype: int64

### Adding rows (census tracts) where All Other Banks are not active, but BoW is

In [51]:
# Work on a copy of the frame produced by the previous step
bank_tract_full = bank_tract_new.copy()

# Census tracts that have a BankofWest row but no AllOtherBanks row
is_bow = bank_tract_full['which_bank'] == 'BankofWest'
other_tracts = bank_tract_full.loc[bank_tract_full['which_bank'] == 'AllOtherBanks', 'census_tract']
# .copy() makes this an independent frame; assigning into a filtered slice
# triggers SettingWithCopyWarning and is not guaranteed to behave consistently.
missing_tracts1 = bank_tract_full[is_bow & ~bank_tract_full['census_tract'].isin(other_tracts)].copy()

# Relabel the placeholder rows as AllOtherBanks
missing_tracts1['which_bank'] = 'AllOtherBanks'

# Blank out (NaN) every column from 'sum_approved_loans' to the end of the
# frame. The slice uses this cell's own `start_col1`, so the cell does not
# depend on a `start_col` left over from another cell.
start_col1 = bank_tract_full.columns.get_loc('sum_approved_loans')
bank_specific_vars1 = bank_tract_full.columns[start_col1:]
missing_tracts1[bank_specific_vars1] = np.nan
# log_num_apps = log(1 + num_applications), so 0 encodes "no applications"
missing_tracts1['log_num_apps'] = 0

# Append the placeholder rows to the new dataframe
bank_tract_full = pd.concat([bank_tract_full, missing_tracts1], ignore_index=True)

# Verify that there are now equal numbers of rows for each which_bank value
bank_tract_full["which_bank"].value_counts()

AllOtherBanks    9340
BankofWest       9340
Name: which_bank, dtype: int64

In [52]:
# Count the non-null denial rates within each census tract. The denial-rate
# regressions keep only rows where this count equals 2.
denial_by_tract = bank_tract_full.groupby('census_tract')['denial_rate']
bank_tract_full['tract_denial_count'] = denial_by_tract.transform('count')

### Investigating more control variables to include in regression

In [10]:
# Table of candidate census variables, used below to pick regression controls
census_vars_path = "../input_data/census_vars.csv"
census_vars = pd.read_csv(census_vars_path)

In [11]:
# Second column of the census variable table, restricted to its first 88 rows
possible_vars = census_vars.iloc[:, 1].head(88)

In [13]:
# Show every row when displaying frames/series (no truncation)
pd.options.display.max_rows = None

In [18]:
# Skip the first 20 candidate variables and keep the rest
possible_vars1 = possible_vars.iloc[20:]

Vars to use: Median Household Income, Ratio of Income to Poverty Level, Per Capita Income, Median Family Income

### Cleaning census control vars to use in regression

In [53]:
# Census control variables to clean
cols_to_clean = ["Med.HousehIncome", "RatioIncomeToPovertyLevel", "PerCap.Income", "Med.FamIncome"]

# Replace empty and negative values with NaN, column-wise:
#   * to_numeric(errors='coerce') turns empty / non-numeric entries into NaN
#   * where(>= 0) keeps non-negative values and sets everything else to NaN
# NOTE(review): negative values are presumably missing-data sentinels in the
# census extract — confirm against the data dictionary.
census_controls = bank_tract_full[cols_to_clean].apply(pd.to_numeric, errors='coerce')
bank_tract_full[cols_to_clean] = census_controls.where(census_controls >= 0)

In [54]:
# Possible columns to include alongside the census controls.
# Append each name only if it is not already present, so re-running this cell
# does not duplicate columns in the summary.
for extra_col in ('median_all_income', "mean_LTV"):
    if extra_col not in cols_to_clean:
        cols_to_clean.append(extra_col)

# Summary statistics for all candidate control variables
bank_tract_full[cols_to_clean].describe()

Unnamed: 0,Med.HousehIncome,RatioIncomeToPovertyLevel,PerCap.Income,Med.FamIncome,median_all_income,mean_LTV
count,18658.0,18680.0,18674.0,18614.0,13435.0,13833.0
mean,79065.1684,4818.880621,37180.969262,90385.196949,153.556085,63.655351
std,38400.119425,2169.322939,21572.777062,45020.093275,327.767463,19.763747
min,11576.0,0.0,1276.0,14471.0,0.0,2.8847
25%,50802.0,3416.0,22047.0,56482.75,84.0,55.788607
50%,71000.0,4534.5,31681.0,80332.0,114.0,64.124411
75%,98375.0,5848.0,45846.0,114209.5,170.0,72.3471
max,250001.0,30844.0,176388.0,250001.0,33827.0,1815.708527


In [83]:
# Populate null minority rates with 0.
# Assign the result back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment and does not update the parent frame
# when pandas Copy-on-Write is active.
rate_cols = ["hisp_rate", "black_rate", "asian_rate"]
bank_tract_full[rate_cols] = bank_tract_full[rate_cols].fillna(0)

# Drop the dots from the income column names so they can be used as plain
# identifiers in the regression formulas below.
bank_tract_full = bank_tract_full.rename(
    columns={
        "Med.HousehIncome": "MedHousehIncome",
        "PerCap.Income": "PerCapIncome",
        "Med.FamIncome": "MedFamIncome",
    }
)

In [82]:
# Non-null counts for the columns that feed the regressions
model_cols = [
    'log_num_apps',
    'hisp_rate', 'hisp_over_med',
    "black_rate", "black_over_med",
    "asian_rate", "asian_over_med",
    'which_bank',
    'median_all_income', 'mean_LTV',
]
bank_tract_full[model_cols].count()

log_num_apps         18680
hisp_rate            18680
hisp_over_med        18680
black_rate           18680
black_over_med       18680
asian_rate           18680
asian_over_med       18680
which_bank           18680
median_all_income    13435
mean_LTV             13833
dtype: int64

### Denial Rate Regression table using minority rates variables

In [95]:
# Denial-rate regressions: one model per minority measure, each interacted
# with `which_bank`, plus tract-level and bank-level controls.
census_tract_vars = ["MedHousehIncome", "PerCapIncome", "MedFamIncome"]
bank_tract_vars = ["median_all_income", "mean_LTV"]
x_vars = ['hisp_rate','black_rate','asian_rate',"major_minority"]

controls = ' + '.join(census_tract_vars + bank_tract_vars)

# Filter once, outside the loop: keep only rows whose tract has two non-null
# denial rates.
denial_sample = bank_tract_full.query("tract_denial_count == 2")

results = [
    smf.ols(f"denial_rate ~ {v} * which_bank + {controls}", data=denial_sample).fit()
    for v in x_vars
]

# Extra rows for the bottom of the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.2f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.2f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

print('='*110)
print('                                             y = denial_rate')

# summary_col combines the fitted models into one table, one column per model.
# The former "High Minority" placeholders were dropped from regressor_order:
# no label mapping is applied, so they never matched a coefficient name.
# Regressors not listed here are appended after the listed ones.
print(summary_col(results=results,
                  float_format='%0.4f',
                  stars = True,
                  model_names=["Hispanic", "Black", "Asian", "MajorityMinority"],
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "which_bank[T.BankofWest]",
                                 "MedHousehIncome", "PerCapIncome", "MedFamIncome", "median_all_income", "mean_LTV"]
                  )
     )

                                             y = denial_rate

                                                     Hispanic    Black      Asian    MajorityMinority
-----------------------------------------------------------------------------------------------------
Intercept                                           0.1085***  0.1223***  0.1599***  0.1068***       
                                                    (0.0132)   (0.0116)   (0.0119)   (0.0124)        
which_bank[T.BankofWest]                            0.1258***  0.0795***  0.0380***  0.1257***       
                                                    (0.0062)   (0.0045)   (0.0052)   (0.0069)        
MedHousehIncome                                     -0.0000    -0.0000    -0.0000    -0.0000         
                                                    (0.0000)   (0.0000)   (0.0000)   (0.0000)        
PerCapIncome                                        -0.0000    -0.0000    -0.0000    -0.0000         
                    

### Denial Rate Regression table using minority over median variables

In [91]:
# Denial-rate regressions using the above-median indicators: one model per
# minority measure, each interacted with `which_bank`, plus controls.
census_tract_vars = ["MedHousehIncome", "PerCapIncome", "MedFamIncome"]
bank_tract_vars = ["median_all_income", "mean_LTV"]
x_vars = ['hisp_over_med','black_over_med','asian_over_med',"major_minority"]

controls = ' + '.join(census_tract_vars + bank_tract_vars)

# Filter once, outside the loop: keep only rows whose tract has two non-null
# denial rates.
denial_sample = bank_tract_full.query("tract_denial_count == 2")

results = [
    smf.ols(f"denial_rate ~ {v} * which_bank + {controls}", data=denial_sample).fit()
    for v in x_vars
]

# Extra rows for the bottom of the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.2f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.2f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

print('='*110)
print('                                             y = denial_rate')

# summary_col combines the fitted models into one table, one column per model.
# The former "High Minority" placeholders were dropped from regressor_order:
# no label mapping is applied, so they never matched a coefficient name.
# Regressors not listed here are appended after the listed ones.
print(summary_col(results=results,
                  float_format='%0.2f',
                  stars = True,
                  model_names=["Hispanic", "Black", "Asian", "MajorityMinority"],
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "which_bank[T.BankofWest]",
                                 "MedHousehIncome", "PerCapIncome", "MedFamIncome", "median_all_income", "mean_LTV"]
                  )
     )

                                             y = denial_rate

                                                    Hispanic  Black    Asian   MajorityMinority
-----------------------------------------------------------------------------------------------
Intercept                                           0.12***  0.12***  0.16***  0.11***         
                                                    (0.01)   (0.01)   (0.01)   (0.01)          
which_bank[T.BankofWest]                            0.10***  0.08***  0.03***  0.13***         
                                                    (0.00)   (0.01)   (0.01)   (0.01)          
MedHousehIncome                                     -0.00    -0.00    -0.00    -0.00           
                                                    (0.00)   (0.00)   (0.00)   (0.00)          
PerCapIncome                                        -0.00    -0.00    -0.00    -0.00           
                                                    (0.00)   (0.00)   (0.0

### Log number apps regression table using minority rate variables

In [80]:
# Application-volume regressions: log(1 + applications) on each minority
# measure interacted with `which_bank`, with tract-level controls only.
# NOTE(review): the bank-level controls (median_all_income, mean_LTV) are not
# in this formula — presumably because they are NaN on the placeholder rows
# added for banks with no activity in a tract; confirm.
census_tract_vars = ["MedHousehIncome", "PerCapIncome", "MedFamIncome"]
x_vars = ['hisp_rate','black_rate','asian_rate',"major_minority"]

controls = '+'.join(census_tract_vars)

results = [
    smf.ols(f"log_num_apps ~ {v} * which_bank + {controls}", data=bank_tract_full).fit()
    for v in x_vars
]

# Extra rows for the bottom of the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.2f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.2f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

print('='*110)
print('                                          y = log(number of applications)')

# summary_col combines the fitted models into one table, one column per model.
# regressor_order now lists only names that exist in these models; the former
# "High Minority", "median_all_income" and "mean_LTV" entries matched nothing.
# Regressors not listed here are appended after the listed ones.
print(summary_col(results=results,
                  float_format='%0.2f',
                  stars = True,
                  model_names=x_vars,
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "which_bank[T.BankofWest]"]
                  )
     )

                                          y = log(number of applications)

                                                    hisp_rate black_rate asian_rate major_minority
--------------------------------------------------------------------------------------------------
Intercept                                           3.48***   3.12***    3.17***    2.92***       
                                                    (0.02)    (0.01)     (0.01)     (0.02)        
which_bank[T.BankofWest]                            -3.40***  -3.26***   -3.37***   -3.01***      
                                                    (0.02)    (0.01)     (0.01)     (0.02)        
MedHousehIncome                                     0.00***   0.00***    0.00***    0.00***       
                                                    (0.00)    (0.00)     (0.00)     (0.00)        
asian_rate                                                               -0.01***                 
                                  

### Log number apps regression table using minority over median variables

In [78]:
# Application-volume regressions using the above-median indicators:
# log(1 + applications) on each minority measure interacted with `which_bank`,
# with tract-level controls only.
# NOTE(review): the bank-level controls (median_all_income, mean_LTV) are not
# in this formula — presumably because they are NaN on the placeholder rows
# added for banks with no activity in a tract; confirm.
census_tract_vars = ["MedHousehIncome", "PerCapIncome", "MedFamIncome"]
x_vars = ['hisp_over_med','black_over_med','asian_over_med',"major_minority"]

controls = '+'.join(census_tract_vars)

results = [
    smf.ols(f"log_num_apps ~ {v} * which_bank + {controls}", data=bank_tract_full).fit()
    for v in x_vars
]

# Extra rows for the bottom of the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.2f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.2f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

print('='*110)
print('                                          y = log(number of applications)')

# summary_col combines the fitted models into one table, one column per model.
# regressor_order now lists only names that exist in these models; the former
# "High Minority", "median_all_income" and "mean_LTV" entries matched nothing.
# Regressors not listed here are appended after the listed ones.
print(summary_col(results=results,
                  float_format='%0.2f',
                  stars = True,
                  model_names=x_vars,
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "which_bank[T.BankofWest]"]
                  )
     )

                                          y = log(number of applications)

                                                    hisp_over_med black_over_med asian_over_med major_minority
--------------------------------------------------------------------------------------------------------------
Intercept                                           3.22***       3.08***        3.17***        2.92***       
                                                    (0.02)        (0.02)         (0.01)         (0.02)        
which_bank[T.BankofWest]                            -3.30***      -3.20***       -3.36***       -3.01***      
                                                    (0.01)        (0.01)         (0.01)         (0.02)        
MedHousehIncome                                     0.00***       0.00***        0.00***        0.00***       
                                                    (0.00)        (0.00)         (0.00)         (0.00)        
asian_over_med                       

## Linear Regression with Interaction Terms - **No use**

In [9]:
# Four exploratory models with a Hispanic-share x bank interaction:
#   model1/model2: denial_rate on the continuous rate / above-median indicator
#   model3/model4: log_num_apps on the same two measures
# The denial-rate models use only rows whose tract has two non-null denial
# rates; that subset is built once and shared.
denial_sample = bank_tract_full.query("tract_denial_count == 2")

model1 = smf.ols('denial_rate ~ hisp_rate * which_bank + median_all_income + mean_LTV', data=denial_sample).fit()
model2 = smf.ols('denial_rate ~ hisp_over_med * which_bank + median_all_income+ mean_LTV', data=denial_sample).fit()
model3 = smf.ols('log_num_apps ~ hisp_rate * which_bank + median_all_income+ mean_LTV', data=bank_tract_full).fit()
model4 = smf.ols('log_num_apps ~ hisp_over_med * which_bank + median_all_income+ mean_LTV', data=bank_tract_full).fit()

# Extra rows for the bottom of the table (not just coefficients)
info_dict={'R-squared' : lambda x: f"{x.rsquared:.4f}",
           'Adj R-squared' : lambda x: f"{x.rsquared_adj:.4f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

# summary_col combines the fitted models into one table, one column per model.
# Both log_num_apps columns now use the identical name (the stray leading space
# is gone), so they are treated as repeats of one name like 'Denial Rate'.
print(summary_col(results=[model1,model2,model3,model4], # list the result obj here
                  float_format='%0.4f',
                  stars = True, # stars are an easy way to see what is statistically significant
                  model_names=['Denial Rate','Denial Rate','log_num_apps','log_num_apps'], # named after the y variable
                  info_dict=info_dict,
                  regressor_order=[ 'Intercept', "hisp_rate", "hisp_over_med", "which_bank[T.BankofWest]", "hisp_rate:which_bank[T.BankofWest]",
                                  "hisp_over_med:which_bank[T.BankofWest]", "median_all_income", "mean_LTV"]
                  )
     )


                                       Denial Rate I Denial Rate II  log_num_apps I log_num_apps I
--------------------------------------------------------------------------------------------------
Intercept                              0.0999***     0.1120***      4.2705***       4.0785***     
                                       (0.0092)      (0.0091)       (0.0225)        (0.0228)      
hisp_rate                              0.0012***                    -0.0130***                    
                                       (0.0001)                     (0.0003)                      
hisp_over_med                                        0.0463***                      -0.4740***    
                                                     (0.0054)                       (0.0152)      
which_bank[T.BankofWest]               0.1253***     0.0978***      -2.8058***      -2.6806***    
                                       (0.0062)      (0.0050)       (0.0217)        (0.0184)      
hisp_rate