# Educational Spending's Effect on Home Valuations Across the United States of America


In [34]:
# import libraries 

import pandas as pd
import numpy as np
import statsmodels.api as sm

# Display floats with two decimal places in pandas output
pd.options.display.float_format = '{:.2f}'.format

# Read datasets.
# Forward slashes are used in the paths: the previous 'data\...' literals
# relied on invalid escape sequences (\F, \Z, \e), which trigger a
# SyntaxWarning on recent Python versions and only resolve on Windows.
gdpDeflator = pd.read_csv('data/FRED_ipd.csv')

homeValuation = pd.read_csv('data/ZillowHousingValues_state-by-state.csv')

educationSpending = pd.read_csv('data/education-spending_state-by-state.csv')

#### Format gdpDeflator for use with homeValuation and educationSpending. This table will later be merged with each of them to perform the GDP-deflator (inflation) adjustments.

In [35]:
# Give the raw columns short names; 'year' is the key used for later merges
gdpDeflator = gdpDeflator.rename(columns={'DATE': 'year', 'USAGDPDEFAISMEI': 'ipd'})

# Parse the date column so it can be compared and merged as a datetime
gdpDeflator['year'] = pd.to_datetime(gdpDeflator['year'])

# Keep only rows inside the study window (2000-01-01 through 2016-01-01,
# both ends inclusive) and renumber the rows from zero afterwards
gdpDeflator = gdpDeflator[
    (gdpDeflator['year'] >= '2000-01-01') & (gdpDeflator['year'] <= '2016-01-01')
].reset_index(drop=True)

#### Adjust Base year from 2015 to 2016

In [36]:
# Date whose deflator value becomes the new reference point
base_year = '2016-01-01'

# Deflator value observed at the base year, divided by 100; dividing the
# whole series by this factor makes the base-year observation equal 100
conversion_factor = gdpDeflator.loc[gdpDeflator['year'] == base_year, 'ipd'].iloc[0] / 100

# Rescale every observation relative to the base year
gdpDeflator['ipd'] /= conversion_factor

#### Format educationSpending to be merged with gdpDeflator (for inflation calculations)

In [37]:
# Reshape the wide table (one column per year) into long form, then tidy it
# in a single pipeline. The steps run in the same order as before.
educationSpending = (
    educationSpending
    # one row per (state, year) pair, spending amount in 'USD'
    .melt(id_vars=['state'], var_name='year', value_name='USD')
    # the former column labels become real datetimes
    .assign(year=lambda frame: pd.to_datetime(frame['year']))
    # order by state, then chronologically, for easier indexing
    .sort_values(['state', 'year'])
    # keep only the study window, both ends inclusive
    .loc[lambda frame: (frame['year'] >= '2000-01-01') & (frame['year'] <= '2016-01-01')]
    # renumber rows after sorting and filtering
    .reset_index(drop=True)
    # final, descriptive name for the value column
    .rename(columns={'USD': 'EducationSpend'})
)

#### Format homeValuation Dataframe

In [38]:
# Columns removed before reshaping; only the region name and the per-date
# value columns are kept
collumnToDrop = ['RegionID', 'SizeRank', 'RegionType', 'StateName']

homeValuation = (
    homeValuation
    .drop(collumnToDrop, axis=1)
    # wide -> long: one row per (region, date) pair
    .melt(id_vars=['RegionName'], var_name='Date', value_name='HomeValue')
    # parse the date labels and roll each one back to a month start
    # NOTE(review): presumably the source labels are month-end dates, so this
    # lands on the first day of the same month - confirm against the CSV
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']) + pd.tseries.offsets.MonthBegin(-1))
    # order by state, then chronologically
    .sort_values(['RegionName', 'Date'])
    # keep only January observations (one data point per year)
    .loc[lambda frame: frame['Date'].dt.month == 1]
    # keep only the study window, both ends inclusive
    .loc[lambda frame: (frame['Date'] <= '2016-01-01') & (frame['Date'] >= '2000-01-01')]
    # use the same key names as the other tables so later joins are simple
    .rename(columns={'Date': 'year', 'RegionName': 'state'})
    # discard rows with any missing value
    .dropna()
    # renumber rows after all the filtering
    .reset_index(drop=True)
)

#### Left join educationSpending and gdpDeflator, and perform inflation adjustments

In [39]:
# Left join: attach the deflator value to every education-spending row by year
education_gdpDeflator = pd.merge(educationSpending, gdpDeflator, on='year', how='left')

# Deflator value at the base year. It is looked up by date instead of by the
# hard-coded row label 16, which only pointed at the base year when the
# deflator table held exactly one row per year starting in 2000.
BaseYear = gdpDeflator.loc[gdpDeflator['year'] == base_year, 'ipd'].iloc[0]

# Inflation adjustment: express each year's spending in base-year dollars
# (nominal value / that year's deflator * base-year deflator)
education_gdpDeflator['adjusted_EducationSpend'] = (
    education_gdpDeflator['EducationSpend'] / education_gdpDeflator['ipd']
) * BaseYear

# Final dataset keeps only the keys and the adjusted value
collumn_Drop = ['ipd', 'EducationSpend']
educationSpending_final = education_gdpDeflator.drop(collumn_Drop, axis=1)

#### Left join homeValuation and gdpDeflator, and perform inflation adjustments

In [40]:
# Left join: attach the deflator value to every home-valuation row by year
homeValuation_gdpDeflator = pd.merge(homeValuation, gdpDeflator, on='year', how='left')

# Inflation adjustment: nominal value / that year's deflator * BaseYear.
# BaseYear is the deflator value defined in the education-spending cell.
# The column is created directly; the previous zero-filled placeholder was
# overwritten immediately and has been removed.
homeValuation_gdpDeflator['adjusted_homeValuation'] = (
    homeValuation_gdpDeflator['HomeValue'] / homeValuation_gdpDeflator['ipd']
) * BaseYear

# Final dataset keeps only the keys and the adjusted value
collumn_Drop = ['ipd', 'HomeValue']
homeValuation_final = homeValuation_gdpDeflator.drop(collumn_Drop, axis=1)


#### Define a function for OLS model

In [41]:
def ols_model(y, X):
    """Fit an ordinary least squares regression and return its summary.

    Args:
        y: Dependent variable, passed to ``sm.OLS`` as the first argument.
        X: Regressors, passed to ``sm.OLS`` as the second argument. No
            constant is added here, so include an intercept column in ``X``
            if one is wanted.

    Returns:
        The object returned by calling ``summary()`` on the fitted model.
    """
    return sm.OLS(y, X).fit().summary()

#### Merge all state datasets for model use

In [42]:
# Combine adjusted spending and adjusted home values on (state, year).
# The left join keeps every spending row; rows left with any missing value
# (e.g. no matching home valuation) are then discarded.
OLS_modelmerge = educationSpending_final.merge(
    homeValuation_final, on=['state', 'year'], how='left'
).dropna()

#### Define variables and create intercept, then run OLS

In [43]:
# Constant column of ones so the regression estimates an intercept term
OLS_modelmerge['intercept'] = 1

# Dependent variable: inflation-adjusted home valuation
y = OLS_modelmerge['adjusted_homeValuation']

# Independent variables: the constant plus inflation-adjusted education spending
X = OLS_modelmerge[['intercept', 'adjusted_EducationSpend']]

# Fit the model on all states and show the regression summary
ols_model(y, X)

0,1,2,3
Dep. Variable:,adjusted_homeValuation,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,36.49
Date:,"Tue, 25 Apr 2023",Prob (F-statistic):,2.3e-09
Time:,10:35:24,Log-Likelihood:,-10751.0
No. Observations:,846,AIC:,21510.0
Df Residuals:,844,BIC:,21510.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1.766e+05,3593.155,49.147,0.000,1.7e+05,1.84e+05
adjusted_EducationSpend,0.0014,0.000,6.041,0.000,0.001,0.002

0,1,2,3
Omnibus:,180.489,Durbin-Watson:,0.142
Prob(Omnibus):,0.0,Jarque-Bera (JB):,328.602
Skew:,1.276,Prob(JB):,4.42e-72
Kurtosis:,4.676,Cond. No.,20700000.0


#### Pearson correlation on main dataset

In [44]:
# Pearson correlation between adjusted education spending and adjusted home value
OLS_modelmerge['adjusted_EducationSpend'].corr(
    OLS_modelmerge['adjusted_homeValuation'],
    method='pearson',
)

0.2035796038690924

#### Establish regionally grouped dataframes

In [45]:
# State names assigned to each of the four regions used in the regional models
northEast = [
    'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island',
    'Vermont', 'New Jersey', 'New York', 'Pennsylvania',
]

midWest = [
    'Indiana', 'Illinois', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas',
    'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota',
]

south = [
    'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Maryland',
    'North Carolina', 'South Carolina', 'Virginia', 'West Virginia', 'Alabama',
    'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana',
    'Oklahoma', 'Texas',
]

west = [
    'Arizona', 'Colorado', 'Idaho', 'New Mexico', 'Montana', 'Utah', 'Nevada',
    'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington',
]

# One subset of the merged model data per region, selected by state name
OLS_northEast = OLS_modelmerge.loc[OLS_modelmerge['state'].isin(northEast)]
OLS_midWest = OLS_modelmerge.loc[OLS_modelmerge['state'].isin(midWest)]
OLS_south = OLS_modelmerge.loc[OLS_modelmerge['state'].isin(south)]
OLS_west = OLS_modelmerge.loc[OLS_modelmerge['state'].isin(west)]


#### North East Region OLS

In [46]:
# Dependent variable: adjusted home valuation for the North East subset
y_NE = OLS_northEast['adjusted_homeValuation']

# Independent variables: intercept column plus adjusted education spending
x_NE = OLS_northEast.loc[:, ['intercept', 'adjusted_EducationSpend']]

# Fit the North East model and show its summary
ols_model(y_NE, x_NE)

0,1,2,3
Dep. Variable:,adjusted_homeValuation,R-squared:,0.003
Model:,OLS,Adj. R-squared:,-0.004
Method:,Least Squares,F-statistic:,0.4274
Date:,"Tue, 25 Apr 2023",Prob (F-statistic):,0.514
Time:,10:35:24,Log-Likelihood:,-1920.1
No. Observations:,153,AIC:,3844.0
Df Residuals:,151,BIC:,3850.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,2.305e+05,7459.426,30.897,0.000,2.16e+05,2.45e+05
adjusted_EducationSpend,0.0002,0.000,0.654,0.514,-0.000,0.001

0,1,2,3
Omnibus:,8.237,Durbin-Watson:,0.179
Prob(Omnibus):,0.016,Jarque-Bera (JB):,8.251
Skew:,0.528,Prob(JB):,0.0162
Kurtosis:,2.575,Cond. No.,27700000.0


#### Mid West Region OLS

In [47]:
# Dependent variable: adjusted home valuation for the Mid West subset
y_MW = OLS_midWest['adjusted_homeValuation']

# Independent variables: intercept column plus adjusted education spending
x_MW = OLS_midWest.loc[:, ['intercept', 'adjusted_EducationSpend']]

# Fit the Mid West model and show its summary
ols_model(y_MW, x_MW)


0,1,2,3
Dep. Variable:,adjusted_homeValuation,R-squared:,0.025
Model:,OLS,Adj. R-squared:,0.019
Method:,Least Squares,F-statistic:,4.856
Date:,"Tue, 25 Apr 2023",Prob (F-statistic):,0.0287
Time:,10:35:24,Log-Likelihood:,-2269.7
No. Observations:,195,AIC:,4543.0
Df Residuals:,193,BIC:,4550.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1.322e+05,3428.858,38.546,0.000,1.25e+05,1.39e+05
adjusted_EducationSpend,0.0006,0.000,2.204,0.029,6.82e-05,0.001

0,1,2,3
Omnibus:,28.121,Durbin-Watson:,0.164
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35.829
Skew:,0.942,Prob(JB):,1.66e-08
Kurtosis:,3.927,Cond. No.,20200000.0


#### South Region OLS

In [48]:
# Dependent variable: adjusted home valuation for the South subset
y_S = OLS_south['adjusted_homeValuation']

# Independent variables: intercept column plus adjusted education spending
x_S = OLS_south.loc[:, ['intercept', 'adjusted_EducationSpend']]

# Fit the South model and show its summary
ols_model(y_S, x_S)


0,1,2,3
Dep. Variable:,adjusted_homeValuation,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.01175
Date:,"Tue, 25 Apr 2023",Prob (F-statistic):,0.914
Time:,10:35:25,Log-Likelihood:,-3652.8
No. Observations:,289,AIC:,7310.0
Df Residuals:,287,BIC:,7317.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,1.612e+05,6460.750,24.947,0.000,1.48e+05,1.74e+05
adjusted_EducationSpend,5.279e-05,0.000,0.108,0.914,-0.001,0.001

0,1,2,3
Omnibus:,107.226,Durbin-Watson:,0.147
Prob(Omnibus):,0.0,Jarque-Bera (JB):,273.617
Skew:,1.779,Prob(JB):,3.8399999999999996e-60
Kurtosis:,6.172,Cond. No.,19500000.0


#### West Region OLS

In [49]:
# Dependent variable: adjusted home valuation for the West subset
y_W = OLS_west['adjusted_homeValuation']

# Independent variables: intercept column plus adjusted education spending
x_W = OLS_west.loc[:, ['intercept', 'adjusted_EducationSpend']]

# Fit the West model and show its summary
ols_model(y_W, x_W)

0,1,2,3
Dep. Variable:,adjusted_homeValuation,R-squared:,0.247
Model:,OLS,Adj. R-squared:,0.243
Method:,Least Squares,F-statistic:,67.74
Date:,"Tue, 25 Apr 2023",Prob (F-statistic):,2.07e-14
Time:,10:35:25,Log-Likelihood:,-2633.0
No. Observations:,209,AIC:,5270.0
Df Residuals:,207,BIC:,5277.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,2.249e+05,5690.842,39.519,0.000,2.14e+05,2.36e+05
adjusted_EducationSpend,0.0026,0.000,8.230,0.000,0.002,0.003

0,1,2,3
Omnibus:,79.186,Durbin-Watson:,0.265
Prob(Omnibus):,0.0,Jarque-Bera (JB):,198.921
Skew:,1.723,Prob(JB):,6.38e-44
Kurtosis:,6.312,Cond. No.,20800000.0
