# Educational Spending's Effect on Home Valuations Across the United States of America


In [1]:
# import libraries 

import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read datasets
# Paths are relative to the notebook's working directory. Forward slashes are
# used because they resolve on every OS (Windows included); a backslash inside
# a normal string literal starts an escape sequence, and unrecognised ones
# trigger a SyntaxWarning on recent Python versions.

gdpDeflator = pd.read_csv('data/gdp_deflator_FRED.csv')

homeValuation = pd.read_csv('data/ZillowHousingValues_state-by-state.csv')

educationSpending = pd.read_csv('data/education-spending_state-by-state.csv')

# Preview the raw (wide-form) home valuation table
homeValuation.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2022-06-30,2022-07-31,2022-08-31,2022-09-30,2022-10-31,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31
0,9,0,California,state,,186276.110907,186903.75446,187750.027242,189586.300199,191719.031375,...,770917.504409,771314.829198,766685.736923,759238.756987,752637.613746,747513.149009,742356.811334,735996.325796,730505.205856,728133.501198
1,54,1,Texas,state,,105214.359426,105270.23952,105296.592,105431.576095,105518.343516,...,296589.890038,299379.177131,300228.906412,299532.992183,298589.121983,297623.444249,296121.316278,294864.993311,294073.120181,294336.348545
2,14,2,Florida,state,,103672.977445,103897.883804,104170.164927,104723.376752,105319.556053,...,380194.359106,385446.114848,388077.606462,388365.446497,387892.617981,387242.033534,385828.827185,384293.206747,383029.022145,383063.088395
3,43,3,New York,state,,123754.505069,124195.869047,124620.230729,125532.440125,126474.471071,...,409537.856474,412732.587736,413850.46872,412716.522689,411295.15767,410125.738516,408268.08773,407584.883808,408242.022283,411304.031953
4,47,4,Pennsylvania,state,,89306.93252,89498.07941,89677.686799,90043.628933,90419.511725,...,240680.874131,241661.576521,241588.251605,240795.850242,240652.061602,240982.617796,241276.57684,241860.227074,242563.191621,243858.898093


Format gdpDeflator so it can be used with homeValuation and educationSpending. This table will later be merged with them to perform the GDP deflation calculations.

In [2]:
# Prepare the deflator table in a single pipeline:
#   1. give the source columns short names ('year', 'gdp')
#   2. parse 'year' into datetime values
#   3. keep only rows from 2000-01-01 through 2016-01-01 (both ends inclusive)
#   4. renumber the index from 0 after filtering
gdpDeflator = (
    gdpDeflator
    .rename(columns={'DATE': 'year', 'A191RI1A225NBEA': 'gdp'})
    .assign(year=lambda frame: pd.to_datetime(frame['year']))
    .loc[lambda frame: frame['year'].between('2000-01-01', '2016-01-01')]
    .reset_index(drop=True)
)
print(gdpDeflator)


         year  gdp
0  2000-01-01  2.3
1  2001-01-01  2.3
2  2002-01-01  1.6
3  2003-01-01  2.0
4  2004-01-01  2.7
5  2005-01-01  3.1
6  2006-01-01  3.1
7  2007-01-01  2.7
8  2008-01-01  1.9
9  2009-01-01  0.6
10 2010-01-01  1.2
11 2011-01-01  2.1
12 2012-01-01  1.9
13 2013-01-01  1.8
14 2014-01-01  1.9
15 2015-01-01  1.0
16 2016-01-01  1.0


Format educationSpending to be merged with gdpDeflator (for inflation calculations)

In [3]:
# Reshape and clean the education spending table in a single pipeline:
#   1. wide -> long: one row per (state, year) with the amount in 'USD'
#   2. parse 'year' into datetime values
#   3. order by state, then year, for easier indexing
#   4. keep only rows from 2000-01-01 through 2016-01-01 (both ends inclusive)
#   5. renumber the index from 0 after sorting/filtering
#   6. rename the value column to 'EducationSpend'
educationSpending = (
    educationSpending
    .melt(id_vars=['state'], var_name='year', value_name='USD')
    .assign(year=lambda frame: pd.to_datetime(frame['year']))
    .sort_values(['state', 'year'])
    .loc[lambda frame: frame['year'].between('2000-01-01', '2016-01-01')]
    .reset_index(drop=True)
    .rename(columns={'USD': 'EducationSpend'})
)

print(educationSpending)

       state       year  EducationSpend
0    Alabama 2000-01-01         4006894
1    Alabama 2001-01-01         4140053
2    Alabama 2002-01-01         4273211
3    Alabama 2003-01-01         4429789
4    Alabama 2004-01-01         4586366
..       ...        ...             ...
862  Wyoming 2012-01-01         1418805
863  Wyoming 2013-01-01         1425713
864  Wyoming 2014-01-01         1453975
865  Wyoming 2015-01-01         1495720
866  Wyoming 2016-01-01         1542763

[867 rows x 3 columns]


Format homeValuation Dataframe

In [4]:
# Columns that are not needed for the analysis
collumnToDrop = ['RegionID', 'SizeRank', 'RegionType', 'StateName']

# Reshape and clean the home valuation table in a single pipeline:
#   1. drop the unused columns
#   2. wide -> long: one row per (RegionName, Date) with the value in 'HomeValue'
#   3. parse 'Date' and roll each date back to the first day of its month
#   4. order by state, then date
#   5. keep only January observations (one row per state per year)
#   6. keep only rows from 2000-01-01 through 2016-01-01 (both ends inclusive)
#   7. rename to 'year' / 'state' so the later joins can use shared key names
#   8. renumber the index from 0
homeValuation = (
    homeValuation
    .drop(columns=collumnToDrop)
    .melt(id_vars=['RegionName'], var_name='Date', value_name='HomeValue')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']) + pd.tseries.offsets.MonthBegin(-1))
    .sort_values(['RegionName', 'Date'])
    .loc[lambda frame: frame['Date'].dt.month == 1]
    .loc[lambda frame: frame['Date'].between('2000-01-01', '2016-01-01')]
    .rename(columns={'Date': 'year', 'RegionName': 'state'})
    .reset_index(drop=True)
)

# Show every row that still has at least one missing value
homeValuation.loc[homeValuation.isna().any(axis=1)]

Unnamed: 0,state,year,HomeValue
442,Montana,2000-01-01,
443,Montana,2001-01-01,
444,Montana,2002-01-01,
445,Montana,2003-01-01,
446,Montana,2004-01-01,
447,Montana,2005-01-01,
527,New Mexico,2000-01-01,
528,New Mexico,2001-01-01,
529,New Mexico,2002-01-01,
578,North Dakota,2000-01-01,


Left-join educationSpending with gdpDeflator on year; the merged table is used in the GDP deflation calculations.

In [5]:
# Attach the deflator value for each year to every state-year spending row.
# how='left' keeps every educationSpending row even if a year has no deflator.
# validate='many_to_one' makes pandas raise a MergeError if 'year' is duplicated
# in gdpDeflator, instead of silently duplicating state-year rows.
education_gdpDeflator = pd.merge(
    educationSpending,
    gdpDeflator,
    on='year',
    how='left',
    validate='many_to_one',
)
education_gdpDeflator.head(20)

Unnamed: 0,state,year,EducationSpend,gdp
0,Alabama,2000-01-01,4006894,2.3
1,Alabama,2001-01-01,4140053,2.3
2,Alabama,2002-01-01,4273211,1.6
3,Alabama,2003-01-01,4429789,2.0
4,Alabama,2004-01-01,4586366,2.7
5,Alabama,2005-01-01,4925201,3.1
6,Alabama,2006-01-01,5478034,3.1
7,Alabama,2007-01-01,6104470,2.7
8,Alabama,2008-01-01,6671430,1.9
9,Alabama,2009-01-01,6629154,0.6


In [8]:
# Bucket the merged table by (state, year) so each pair can be visited in turn
education_gdpDeflator_group = education_gdpDeflator.groupby(['state', 'year'])

# Inspection loop: print the 'gdp' value of the first row in every
# (state, year) group.
for (state, year), rows in education_gdpDeflator_group:
    gdp = rows['gdp'].iat[0]
    print(f"Iteration {state} and {year}. the gdp percent is {gdp}")

Iteration Alabama and 2000-01-01 00:00:00. the gdp percent is 2.3
Iteration Alabama and 2001-01-01 00:00:00. the gdp percent is 2.3
Iteration Alabama and 2002-01-01 00:00:00. the gdp percent is 1.6
Iteration Alabama and 2003-01-01 00:00:00. the gdp percent is 2.0
Iteration Alabama and 2004-01-01 00:00:00. the gdp percent is 2.7
Iteration Alabama and 2005-01-01 00:00:00. the gdp percent is 3.1
Iteration Alabama and 2006-01-01 00:00:00. the gdp percent is 3.1
Iteration Alabama and 2007-01-01 00:00:00. the gdp percent is 2.7
Iteration Alabama and 2008-01-01 00:00:00. the gdp percent is 1.9
Iteration Alabama and 2009-01-01 00:00:00. the gdp percent is 0.6
Iteration Alabama and 2010-01-01 00:00:00. the gdp percent is 1.2
Iteration Alabama and 2011-01-01 00:00:00. the gdp percent is 2.1
Iteration Alabama and 2012-01-01 00:00:00. the gdp percent is 1.9
Iteration Alabama and 2013-01-01 00:00:00. the gdp percent is 1.8
Iteration Alabama and 2014-01-01 00:00:00. the gdp percent is 1.9
Iteration 