# [Zillow Data](https://www.zillow.com/research/data/)

The house value is calculated using **Zillow's Zestimate**. [1]


## 1. SFR (Single-family residences)

#### 1.1 By State (2000 to 2021)

> ##### 1.1.1 Income-tax free states
>
> ##### 1.1.2 Legalized cannabis states
>

In [28]:
# Packages
###########
import pandas as pd
import numpy as np
import datetime as dt

# Raw Zillow Export
###################

# Load the ZHVI single-family-home CSV and preview a random handful of rows
# to eyeball the column layout.
df = pd.read_csv(
    './_original_data/ZHVI_Single-Family-Home_by_State.csv'
)
df.sample(n=20)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,...,2020-12-31,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30,2021-07-31,2021-08-31,2021-09-30
48,58,48,Vermont,State,VT,130229.0,130589.0,131066.0,131928.0,132791.0,...,284285.0,286927.0,289498.0,292461.0,296375.0,300995.0,304879.0,309210.0,313800.0,320392.0
8,16,8,Georgia,State,GA,131281.0,131716.0,132150.0,133021.0,133852.0,...,225956.0,228549.0,231325.0,234211.0,237631.0,241760.0,246695.0,252120.0,257469.0,263370.0
29,19,29,Iowa,State,IA,92448.0,92739.0,92970.0,93460.0,93913.0,...,160299.0,161235.0,162500.0,163790.0,165083.0,165981.0,167109.0,168376.0,169638.0,172477.0
34,42,34,Nevada,State,NV,170661.0,170708.0,171052.0,171755.0,172506.0,...,342395.0,346649.0,350952.0,355446.0,361011.0,370396.0,382315.0,394119.0,403844.0,412091.0
43,35,43,Montana,State,MT,,,,,,...,318717.0,323373.0,328992.0,334203.0,340323.0,347346.0,355610.0,364199.0,372936.0,383197.0
2,43,2,New York,State,NY,139218.0,139906.0,140479.0,141696.0,142786.0,...,300877.0,304118.0,307538.0,310848.0,314311.0,318801.0,324179.0,330136.0,334801.0,338763.0
47,37,47,North Dakota,State,ND,,,,,,...,246282.0,247704.0,249509.0,251180.0,252683.0,253820.0,255185.0,256790.0,258550.0,261136.0
21,10,21,Colorado,State,CO,191079.0,192012.0,192891.0,194865.0,196819.0,...,451393.0,456899.0,462869.0,469760.0,477580.0,487770.0,499653.0,511834.0,521564.0,529707.0
13,26,13,Massachusetts,State,MA,215834.0,216859.0,217925.0,219930.0,221883.0,...,484795.0,490866.0,497271.0,503971.0,511214.0,519541.0,529414.0,539272.0,547527.0,553036.0
9,36,9,North Carolina,State,NC,135286.0,135614.0,135896.0,136480.0,137034.0,...,229429.0,232275.0,235444.0,238806.0,242219.0,246221.0,251072.0,256736.0,262653.0,268754.0


In [29]:
# Yearly Average Price per State
################################

# SFR Middle Tier Housing Data (re-read so this cell can run on its own)
df = pd.read_csv('./_original_data/ZHVI_Single-Family-Home_by_State.csv')

# Drop the descriptive columns, key the rows by state code, and transpose so
# the monthly date labels sit in the index for pre-processing
df = (
    df.drop(columns=['RegionID', 'RegionType', 'RegionName', 'SizeRank'])
      .set_index('StateName')
      .T
)

# Map every monthly date label to its calendar year (yearly PeriodIndex)
df.index = pd.to_datetime(df.index).to_period('Y')

# Average housing prices per year.
# Grouping on the yearly periods yields the same table as
# resample(...).mean() followed by to_period('Y'), but does not rely on the
# 'Y' offset alias that pandas >= 2.2 deprecates in favour of 'YE'.
# NOTE: a year is averaged over whatever months are present in the file, so
# a partially covered final year is not a full-year average.
df = df.groupby(level=0).mean().round(2)

# Transpose back: one row per state, one column per year
df = df.T

# Export data
# NOTE(review): the file name says "2009-21" but the columns cover every year
# found in the source file; the path is left unchanged so existing readers of
# this file keep working.
df.to_csv('./_modified_data/state_yearly_avg_price_2009-21.csv', index=True)

df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
StateName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA,216140.92,246365.08,277529.5,323904.25,394474.33,481066.42,528956.5,507838.42,414550.08,336750.33,...,315159.42,374705.33,423537.5,446674.33,477828.92,509607.75,548828.42,559653.67,590486.83,678107.22
TX,113606.83,115076.25,117724.5,121222.08,127987.5,134064.83,138680.25,143993.67,143612.33,140288.58,...,135851.5,143405.75,154461.92,166426.25,177992.17,189696.33,200902.92,210543.58,218834.67,244605.33
NY,144003.5,155104.08,168794.5,186090.17,205301.5,227558.83,246019.25,250248.0,247111.5,235081.5,...,221672.08,225465.5,231760.67,235930.33,242179.5,252599.25,265755.42,276866.58,287664.83,320388.33
FL,124320.25,134449.92,147544.75,163878.58,187217.92,231301.83,278920.25,270280.33,224832.83,179846.67,...,148047.92,165002.58,184098.0,196987.5,213137.75,229845.58,246518.0,258582.08,271536.08,307903.0
IL,152451.67,164293.08,176535.25,189486.08,204223.5,221353.5,235999.25,238423.17,224152.08,198161.58,...,159326.25,164866.58,174817.42,179898.33,186416.75,195748.83,203436.0,207673.08,212218.92,231695.78
PA,108931.25,114669.0,121673.92,131418.33,143818.0,159901.83,174223.92,180138.25,178993.17,172876.5,...,161175.83,162890.0,165805.92,168355.92,172552.42,179216.33,189113.92,197322.08,207469.33,232272.78
OH,113130.58,118169.33,122016.42,126314.17,130884.33,134962.33,137415.58,136037.17,131543.08,125009.67,...,112965.17,114751.33,117342.67,121699.83,127063.08,134715.25,143561.67,152081.0,161854.33,181095.67
MI,125070.75,133410.75,139298.83,143975.5,149420.92,153670.08,154350.75,147063.33,131923.67,111802.17,...,98010.5,108610.33,121054.42,128018.92,136199.58,147356.0,159730.58,169905.17,180018.92,202475.89
GA,135224.33,144965.25,151797.5,156373.92,161077.5,167154.75,174051.83,178071.08,171193.25,153535.42,...,125462.33,135190.25,148209.75,156961.75,166163.75,176583.75,191749.33,204496.75,215895.67,243681.11
NC,137786.92,143212.75,146279.58,149066.5,152971.33,159237.75,168850.42,177235.5,178491.17,171255.25,...,153735.75,156233.83,161168.5,166099.5,173174.5,183583.08,195693.0,207300.5,219659.58,248242.22


In [30]:
# Identify high growth locations
##################################

# Year-over-year % change of each state's yearly average price
# (rows are states, columns are years, so the change runs along the columns).
# fill_method=None compares each year only with the real previous value;
# the legacy default forward-fills missing years first, which reports a gap
# as 0% growth, and that implicit default is deprecated in recent pandas.
df_pct = df.pct_change(axis='columns', fill_method=None)

# Drop first column (always NaN: there is no earlier year to compare with)
df_pct = df_pct.iloc[:, 1:]

# Export the growth rates as fractions (0.05 == 5 %)
df_pct.to_csv('./_modified_data/state_yearly_avg_growth_rate_2009-21.csv', index=True)

df_pct.sample(5)

Unnamed: 0_level_0,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
StateName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NJ,0.104476,0.125876,0.154817,0.137652,0.145539,0.098019,-0.012404,-0.06231,-0.084365,-0.034907,...,-0.052872,0.01452,0.039188,0.007023,0.008174,0.027234,0.043485,0.02341,0.040249,0.13586
AR,0.045885,0.029792,0.04277,0.055355,0.073802,0.067699,0.025758,-0.003132,-0.032659,-0.01491,...,0.008403,0.022843,0.015223,0.023845,0.033181,0.04057,0.045959,0.041704,0.042382,0.087838
IL,0.077673,0.074514,0.073361,0.077776,0.083879,0.066165,0.010271,-0.059856,-0.11595,-0.065333,...,-0.060068,0.034773,0.060357,0.029064,0.036234,0.05006,0.039271,0.020828,0.021889,0.091777
LA,0.052752,0.044671,0.034286,0.047132,0.056546,0.095608,0.045844,0.006138,-0.008562,-0.003938,...,-0.004451,0.013414,0.022542,0.023505,0.024844,0.026092,0.024632,0.005998,0.028895,0.060083
DC,0.071586,0.12405,0.163415,0.173288,0.211516,0.124317,0.008176,-0.033909,-0.06839,-0.007483,...,0.036031,0.122214,0.127882,0.007583,0.049021,0.048448,0.047802,0.055651,0.026971,0.061128


<br><br><br>

# AAGR (Average Annual Growth Rate) per State (1-5 years)

In [33]:
# Import data (one row per state, one column per year of growth rate)
df_pct = pd.read_csv('./_modified_data/state_yearly_avg_growth_rate_2009-21.csv')

# Change column headers to str format so they match the year labels below
df_pct.columns = df_pct.columns.astype(str)

# Get years (column headers)
years = [str(year) for year in range(2001, 2022)]

# Melt to long format: one row per (StateName, Year) with its GrowthRate
df = df_pct.melt(id_vars=['StateName'], value_vars=years,
                 var_name='Year', value_name='GrowthRate')

# Calculate AAGR for 1, 2, 3, 4, and 5 years from present
#
# NOTE(review): the window labelled N spans the years (last_year - N) through
# last_year, i.e. N + 1 yearly growth rates (label 1 averages 2020 and 2021).
# This reproduces the original windows exactly -- confirm that the extra year
# is intended.
last_year = 2021
frames = []
for n_years in range(1, 6):
    window = [str(year) for year in range(last_year - n_years, last_year + 1)]
    # Select GrowthRate explicitly: averaging the whole frame would also try
    # to average the string 'Year' column, which raises a TypeError on
    # pandas >= 2.0 (numeric_only now defaults to False).
    aagr = (
        df[df['Year'].isin(window)]
        .groupby('StateName', as_index=False)['GrowthRate']
        .mean()
    )
    aagr['Years'] = n_years
    frames.append(aagr)

# Stack the five windows into one long table
df_final = pd.concat(frames).reset_index(drop=True)

# Express the growth rate as a percentage with two decimals
df_final['GrowthRate'] = (df_final['GrowthRate'] * 100).round(2)

df_final

Unnamed: 0,StateName,GrowthRate,Years
0,AK,2.33,1
1,AL,8.28,1
2,AR,6.51,1
3,AZ,15.63,1
4,CA,10.17,1
...,...,...,...
250,VT,4.11,5
251,WA,10.10,5
252,WI,6.77,5
253,WV,3.32,5


In [4]:
# State Groups for Filtering
############################

# NOTE(review): both lists are hand-maintained snapshots of two-letter state
# codes; verify membership against a current source before relying on them
# (e.g. VT does not appear in the cannabis list -- confirm that is intended).

# States without a state income tax
states_income_tax_free = [
    "AK", "TN", "WY", "FL", "NH", "SD", "TX", "WA", "NV",
]

# States with legalized cannabis
states_legalized_cannabis = [
    "WA", "CA", "OR", "MT", "CO", "NV", "AZ", "NM", "IL",
    "MI", "NY", "MA", "VA", "CT", "ME", "NJ", "AK",
]

# References
> 
> [1] Zillow's Website (Nov 2, 2021) https://www.zillow.com/z/zestimate/

<br><br><br>


# Ideas
>
> 1. Pull data from the API (https://www.zillow.com/howto/api/APIOverview.htm)