###### Imports and Settings

UPDATED 10/14/2024 THROUGH SEPTEMBER 2024

In [2]:
import pandas as pd
import numpy as np
import requests
from functools import reduce
import matplotlib.pyplot as plt
# Display settings: no limit on the columns/rows shown, and text output widened to 150 characters
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 150)
# Silence pandas PerformanceWarning messages for the rest of the notebook
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import sys
sys.path.append("../../Functions and Dictionaries/") # Adds higher directory to python modules path
# geodict is a project-local module located through the path added above
import geodict
# Lookups used later: tofullcensus renames the region columns, geotogeoid maps NAME values to GEO_ID
tofullcensus = geodict.tofullcensus
geotogeoid = geodict.geotogeoid
import sqlite3 as sq

In [3]:
#functions
def percent(x, y):
    """Return x expressed as a percentage of y.

    Returns 0 when the division raises ZeroDivisionError (e.g. y is a plain
    Python zero).
    """
    try:
        share = x / y
        return share * 100
    except ZeroDivisionError:
        return 0
def percentchange(x, y):
    """Return the percent change from the baseline y to the newer value x.

    Returns 0 when the division raises ZeroDivisionError.
    NOTE(review): pandas/NumPy operands normally produce inf/NaN rather than
    raising, so this fallback mainly covers plain Python numbers.
    """
    try:
        difference = x - y
        return difference * 100 / y
    except ZeroDivisionError:
        return 0
def realchange(x, y):
    """Return the absolute change from the baseline y to the newer value x."""
    difference = x - y
    return difference
#add percent-change and real-change columns for every variable and every time frame
def calculate_changes(df, columns, time_frames, years):
    """Append change columns to ``df`` for each variable/time-frame pair.

    Parameters
    ----------
    df : DataFrame whose columns are keyed by three-part tuples
        ``(variable, year, 'None')``; it is modified in place.
    columns : iterable of variable names (first part of the column key).
    time_frames : iterable of ``"start-end"`` strings; both parts are
        converted with ``int`` to look up the year columns.
    years : accepted for the caller's convenience but not used here.

    Returns
    -------
    The same ``df`` object, with two new columns per pair:
    ``('<variable> % Change', 'None', '<time frame>')`` and
    ``('<variable> Change', 'None', '<time frame>')``.
    """
    for variable in columns:
        for span in time_frames:
            first, last = span.split('-')
            later = df[(variable, int(last), 'None')]
            earlier = df[(variable, int(first), 'None')]
            # percent change first, then the absolute difference, so the new columns keep that order
            df[f'{variable} % Change', 'None', f'{span}'] = percentchange(later, earlier)
            df[f'{variable} Change', 'None', f'{span}'] = later - earlier

    return df
#build every "earlier-later" pairing from a list of years
def generate_time_frames(years):
    """Return ``"a-b"`` strings for every pair of entries in ``years``.

    Pairs follow the order of ``years``: each entry is combined with every
    entry that comes after it. Fewer than two entries yields an empty list.
    """
    count = len(years)
    return [
        f"{years[lo]}-{years[hi]}"
        for lo in range(count - 1)
        for hi in range(lo + 1, count)
    ]

# This notebook outlines the download and formatting process for the Zillow Observed Rent Index (ZORI) for the United States, the Nashville and Clarksville MSAs, and the counties and places in the GNRC operating region.  

Go to this page: https://www.zillow.com/research/data/  

+ Under "RENTALS", select Data Type "ZORI (Smoothed, Seasonally Adjusted): All Homes Plus Multifamily Time Series" and download this for Metro & US, County, and City (State is not currently available).   

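### Save these CSVs as downloaded in the Data Downloads folder; the next cell reads them as `Zillow_MetroUS_ZORI.csv`, `Zillow_County_ZORI.csv`, and `Zillow_City_ZORI.csv`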

In [5]:
# Read the three ZORI downloads (metro/US, county, city) in one pass
rental_metrous, rental_county, rental_place = map(
    pd.read_csv,
    (
        '../Data Downloads/Zillow_MetroUS_ZORI.csv',
        '../Data Downloads/Zillow_County_ZORI.csv',
        '../Data Downloads/Zillow_City_ZORI.csv',
    ),
)

The county codes in the `RegionID` column are not FIPS codes (they appear to be Zillow's own region identifiers). Our region contains the following:
Cheatham:2185, Davidson:2243, Dickson:1668, Houston:1784, Humphreys:2728, Macon:623, Maury:632, Montgomery:2982, Robertson:2834, Rutherford:3016, Sumner:1407, Stewart:2044, Trousdale:2856, Williamson:3080, Wilson:1496, (KY) Allen:369, (KY) Simpson:2028

In [6]:
#RegionIDs to keep from each Zillow file
#metro/US file: the US plus the Nashville and Clarksville MSAs
#NOTE(review): the NAME listing printed near the end of the notebook shows 'United States' but no MSA entries - confirm these IDs and the MSA name mapping
metrous = [102001, 394902, 394471]
#county file: Simpson Co KY is RegionID 2028, but doesn't have associated data until recent years so not including for now
counties = [2185, 2243, 1668, 1784, 2728, 623, 632, 2982, 2834, 3016, 1407, 2044, 2856, 3080, 1496, 369]
#place file
places = [41932, 30583, 10843, 30993, 49233, 45339, 11564, 32006, 46091, 25534, 42878, 39894, 19523, 6118, 26161, 54450, 7208, 27227, 29482, 41690]

#keep only the rows whose RegionID is listed above and renumber the rows from zero
rental_metrous = rental_metrous[rental_metrous['RegionID'].isin(metrous)].reset_index(drop=True)
rental_county = rental_county[rental_county['RegionID'].isin(counties)].reset_index(drop=True)
rental_place = rental_place[rental_place['RegionID'].isin(places)].reset_index(drop=True)

In [7]:
#drop the identifier/metadata columns from each table and index the rows by RegionName
#(presumably only the dated rent columns remain afterwards - confirm if Zillow adds new columns)
rental_place = (
    rental_place
    .drop(columns=['RegionID', 'SizeRank', 'RegionType', 'StateName', 'State', 'Metro', 'CountyName'])
    .set_index('RegionName')
)
rental_county = (
    rental_county
    .drop(columns=['RegionID', 'SizeRank', 'RegionType', 'StateName', 'State', 'Metro', 'StateCodeFIPS', 'MunicipalCodeFIPS'])
    .set_index('RegionName')
)
rental_metrous = (
    rental_metrous
    .drop(columns=['RegionID', 'SizeRank', 'RegionType', 'StateName'])
    .set_index('RegionName')
)

In [8]:
#stack the place, county, and metro/US rent tables into a single frame
frame = [rental_place, rental_county, rental_metrous]
#one concat call over the list: avoids re-copying the accumulated rows on every loop pass
#and does not depend on how concat treats an empty seed DataFrame
rental = pd.concat(frame)

In [9]:
#flip the table so each row is one original column label (stored in NAME) and each column is a region
data = rental.T.reset_index().rename(columns={'index': 'NAME'})

In [10]:
#pull the year out of each NAME label: split on "/" and keep the third piece
#(assumes labels shaped like M/D/YYYY - TODO confirm against the CSV headers)
year = data['NAME'].str.split(pat="/", expand=True)
#store the year in its own column and discard the original label
data = data.assign(Year=year[2]).drop(columns='NAME')

In [11]:
#relabel the region columns with the names supplied by geodict.tofullcensus
#NOTE(review): later output lists 'Franklin city, Kentucky' and an unrenamed 'Thompsons Station' - verify the mapping for those entries
data = data.rename(tofullcensus, axis='columns')

In [12]:
#average the values within each year, put regions back on the rows under NAME,
#then reshape from wide (one column per year) to long (one row per NAME/Year) with .melt()
#(a year with fewer rows is averaged over whatever rows it has)
data = (
    data.groupby(['Year']).mean()
    .transpose()
    .reset_index()
    .rename(columns={'RegionName': 'NAME'})
    .set_index('NAME')
    .melt(value_name='Average Rent', ignore_index=False)
    .reset_index()
)

In [13]:
#store Year as an integer column
data = data.astype({'Year': int})
#collect the distinct years (in order of appearance) and expand them into every
#"start-end" time frame with generate_time_frames - both lists are needed further down
years = [*data['Year'].unique().astype(int)]
time_frames = generate_time_frames(years)

In [14]:
#preview the first rows of the long-format table (NAME, Year, Average Rent)
data.head()

Unnamed: 0,NAME,Year,Average Rent
0,Nashville-Davidson metropolitan government (ba...,2015,1227.738078
1,"Murfreesboro city, Tennessee",2015,1087.638958
2,"Clarksville city, Tennessee",2015,874.486012
3,"Franklin city, Kentucky",2015,1383.71994
4,"Hendersonville city, Tennessee",2015,


In [15]:
#pivot to one row per NAME with a two-level column header: (variable, Year)
#every column other than NAME and Year is treated as a value variable
cols = [label for label in data.columns if label not in ('NAME', 'Year')]
#NOTE(review): pivot_table aggregates rows that share the same NAME and Year (mean by default),
#so two regions mapped to one NAME would be silently averaged - confirm names are unique
df_pivot = data.pivot_table(index='NAME', columns=['Year'], values=cols)
df_pivot.head(2)

Unnamed: 0_level_0,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent
Year,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
"Ashland City town, Tennessee",,,,,,,,1610.256263,1636.614782,1661.483838
"Brentwood city, Tennessee",,,,,,,,2480.429087,2550.068724,2619.201388


In [16]:
#add a third column level to accommodate the time period metrics;
#existing (variable, year) columns get the placeholder string 'None' in that level
df_pivot.columns = pd.MultiIndex.from_tuples(
    [(variable, yr, 'None') for variable, yr in df_pivot.columns]
)
df_pivot.head(3)

Unnamed: 0_level_0,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent,Average Rent
Unnamed: 0_level_1,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
Unnamed: 0_level_2,None,None,None,None,None,None,None,None,None,None
NAME,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
"Ashland City town, Tennessee",,,,,,,,1610.256263,1636.614782,1661.483838
"Brentwood city, Tennessee",,,,,,,,2480.429087,2550.068724,2619.201388
"Cheatham County, Tennessee",,,,,,,,1644.778125,1696.61708,1738.619907


In [17]:
#list the variables to loop through: the distinct labels in the first column level only
first_level = (
    df_pivot.columns
    .get_level_values(level=0)
    .unique()
    .tolist()
)
first_level

['Average Rent']

In [18]:
#run calculate_changes for every variable and time frame (arguments in order: frame, variables, time frames, years)
data = calculate_changes(df_pivot, first_level, time_frames, years)

In [19]:
#move the second and third column levels (year and time frame) into the row index.
#Both levels are named explicitly as [1, 2]; the earlier [1, 1] only reached the time-frame
#level because the remaining levels are renumbered after the first one is stacked.
data = data.stack([1, 2])
#turn the index levels back into ordinary columns, then give the two unnamed levels readable names
data = data.reset_index(drop = False)
data = data.rename(columns = {'level_1':'Year', 'level_2':'Time Frame'})

In [20]:
#attach the GEO_ID looked up from each NAME via geodict.geotogeoid, and tag every row with its source
#NOTE(review): the info() output further down reports fewer non-null GEO_ID values than rows, so some names have no match - confirm
data = data.assign(
    GEO_ID=data['NAME'].map(geotogeoid),
    Source='Zillow',
)

In [21]:
#final check: preview the first rows of the finished table before it is exported
data.head()

Unnamed: 0,NAME,Year,Time Frame,Average Rent,Average Rent % Change,Average Rent Change,GEO_ID,Source
0,"Ashland City town, Tennessee",2022.0,,1610.256263,,,1600000US4702180,Zillow
1,"Ashland City town, Tennessee",2023.0,,1636.614782,,,1600000US4702180,Zillow
2,"Ashland City town, Tennessee",2024.0,,1661.483838,,,1600000US4702180,Zillow
3,"Ashland City town, Tennessee",,2022-2023,,1.636915,26.358518,1600000US4702180,Zillow
4,"Ashland City town, Tennessee",,2022-2024,,3.181331,51.227575,1600000US4702180,Zillow


In [22]:
#list every distinct geography name present in the final table
data['NAME'].unique()

array(['Ashland City town, Tennessee', 'Brentwood city, Tennessee',
       'Cheatham County, Tennessee', 'Clarksville city, Tennessee',
       'Columbia city, Tennessee', 'Davidson County, Tennessee',
       'Dickson County, Tennessee', 'Franklin city, Kentucky',
       'Gallatin city, Tennessee', 'Hendersonville city, Tennessee',
       'La Vergne city, Tennessee', 'Lebanon city, Tennessee',
       'Maury County, Tennessee', 'Montgomery County, Tennessee',
       'Mount Juliet city, Tennessee', 'Murfreesboro city, Tennessee',
       'Nashville-Davidson metropolitan government (balance), Tennessee',
       'Nolensville town, Tennessee', 'Robertson County, Tennessee',
       'Rutherford County, Tennessee', 'Smyrna town, Tennessee',
       'Spring Hill city, Tennessee', 'Springfield city, Tennessee',
       'Sumner County, Tennessee', 'Thompsons Station', 'United States',
       'White House city, Tennessee', 'Williamson County, Tennessee',
       'Wilson County, Tennessee'], dtype=objec

In [23]:
#summarize column dtypes and non-null counts before exporting
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   NAME                   950 non-null    object 
 1   Year                   950 non-null    object 
 2   Time Frame             950 non-null    object 
 3   Average Rent           199 non-null    float64
 4   Average Rent % Change  751 non-null    float64
 5   Average Rent Change    751 non-null    float64
 6   GEO_ID                 894 non-null    object 
 7   Source                 950 non-null    object 
dtypes: float64(3), object(5)
memory usage: 59.5+ KB


In [24]:
#export to the SQLite database as Zillow annual data
conn = sq.connect('../Outputs/Zillow.db')
try:
    #replace any existing Annual_Rent table; the DataFrame index is not written
    rows_written = data.to_sql('Annual_Rent', conn, if_exists = 'replace', index = False)
finally:
    #always release the database file handle, even if the write fails
    conn.close()
#show the value returned by to_sql as the cell output
rows_written

950