# Zillow Raw Data Aggregation

### Introduction

In this notebook I will be aggregating median rental prices from Zillow for different sized properties across the United States. This data pertains to the area from March of 2010 through November of 2019.

### Aggregation

In [1]:
#Read in libraries
import pandas as pd
import glob

import os

In [2]:
#Set path to location of zillow raw data on local machine
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\01_Raw\Zillow Raw Data/'

#Change wd to path to zillow data location
os.chdir(path)

In [3]:
#Get current directory csv files
csvs = [x for x in os.listdir('.') if x.endswith('.csv')]

#stats.csv -> stats
fns = [os.path.splitext(os.path.basename(x))[0] for x in csvs]

#Create empty dictionary
d = {}

#Store csv data in d
for i in range(len(fns)):
    d[fns[i]] = pd.read_csv(csvs[i])

In [4]:
#Assign data to seperate data frames
studio = d['Zip_MedianRentalPrice_Studio']
one_br = d['Zip_MedianRentalPrice_1Bedroom']
two_br = d['Zip_MedianRentalPrice_2Bedroom']
three_br = d['Zip_MedianRentalPrice_3Bedroom']

**Preview Dataframes**

In [5]:
#Check shape of data frame
print('Shape of studio: ', studio.shape)

#Preview Data
display(studio.head())

Shape of studio:  (630, 118)


Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,2010-08,2010-09,2010-10,2010-11,...,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,,,,...,2685.5,2700.0,2622.5,2700.0,2700.0,2800.0,2700.0,2700.0,2795.0,2869.5
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,,,...,1100.0,1195.0,1195.0,1165.0,1145.0,1150.0,1171.5,1150.0,1125.0,1120.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,,...,2800.0,2780.0,2800.0,2862.5,2700.0,2700.0,2755.0,2800.0,2800.0,2850.0
3,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,4,,,,,...,1300.0,1395.0,1361.0,1350.0,1320.0,1325.0,1325.0,1350.0,1320.0,1350.0
4,79936,El Paso,TX,El Paso,El Paso County,5,,,,,...,995.0,1012.5,1000.0,1070.0,1100.0,1100.0,1100.0,1100.0,1085.0,1050.0


In [6]:
#Check shape of data frame
print('Shape of one_br: ', one_br.shape)

#Preview Data
display(one_br.head())

Shape of one_br:  (1676, 117)


Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,2010-12,...,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,2600.0,2689.0,2678.0,...,3195.0,3200.0,3200.0,3100.0,3150.0,3100.0,3100.0,3050.0,3100.0,3156.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,,,...,1425.0,1475.0,1490.0,1475.0,1495.0,1452.5,1425.0,1425.0,1410.0,1400.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,2995.0,3025.0,3000.0,...,3300.0,3300.0,3350.0,3350.0,3325.0,3297.5,3350.0,3350.0,3400.0,3391.5
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,,...,1159.0,1155.0,1152.5,1194.0,1236.5,1201.0,1206.0,1169.0,1157.0,1127.5
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,,,,,...,1650.0,1695.0,1675.0,1700.0,1675.0,1720.0,1700.0,1670.0,1625.0,1600.0


In [7]:
#Check shape of data frame
print('Shape of two_br: ', two_br.shape)

#Preview Data
display(two_br.head())

Shape of two_br:  (2540, 117)


Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,2010-12,...,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,,,,...,3500.0,3495.0,3500.0,3600.0,3450.0,3399.5,3450.0,3500.0,3849.0,3952.5
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,,,...,1901.0,1950.0,1950.0,1950.0,1958.5,1990.0,1950.0,1950.0,1900.0,1895.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,,...,4860.0,4750.0,4840.0,4900.0,4900.0,4950.0,5200.0,5183.5,5400.0,5200.0
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,,...,1665.0,1659.0,1599.0,1605.0,1619.0,1619.0,1600.0,1590.0,1590.0,1578.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,,,,,...,2350.0,2300.0,2200.0,2200.0,2155.0,2150.0,2245.0,2390.0,2300.0,2316.0


In [8]:
#Check shape of data frame
print('Shape of three_br: ', three_br.shape)

#Preview Data
display(three_br.head())

Shape of three_br:  (1594, 123)


Unnamed: 0,RegionName,City,State,Metro,CountyName,SizeRank,2010-03,2010-04,2010-05,2010-06,...,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11
0,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,,,,...,4500.0,4425.0,4350.0,4299.0,4300.0,4350.0,4300.0,4335.0,4550.0,4550.0
1,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,,,...,2350.0,2500.0,2600.0,2650.0,2660.0,2600.0,2550.0,2550.0,2495.0,2495.0
2,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,,...,6947.5,6900.0,6600.0,6947.5,6850.0,6500.0,7095.0,6900.0,7147.5,7381.0
3,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,,,,,...,1850.0,1844.0,1830.0,1849.0,1850.0,1849.5,1850.0,1849.0,1825.0,1850.0
4,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,,,,,...,2800.0,2970.0,2901.0,2950.0,2950.0,2895.0,2850.0,2850.0,2875.0,2850.0


**Add Bedroom Count**

In [9]:
#Assign beds column to differentiate data downstream
studio['Bedrooms'] = 0
one_br['Bedrooms'] = 1
two_br['Bedrooms'] = 2
three_br['Bedrooms'] = 3

**Concatenate Data**

In [10]:
#Concatenate and assign to zillow
zillow = pd.concat([studio, one_br,two_br,three_br], axis =0 )

#Preview
display(zillow.head())

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,2010-09,2010-10,2010-11,2010-12,...,2019-09,2019-10,2019-11,Bedrooms,City,CountyName,Metro,RegionName,SizeRank,State
0,,,,,,,,,,,...,2700.0,2795.0,2869.5,0,New York,New York County,New York-Newark-Jersey City,10025,1,NY
1,,,,,,,,,,,...,1150.0,1125.0,1120.0,0,Chicago,Cook County,Chicago-Naperville-Elgin,60657,2,IL
2,,,,,,,,,,3400.0,...,2800.0,2800.0,2850.0,0,New York,New York County,New York-Newark-Jersey City,10023,3,NY
3,,,,,,,,,,,...,1350.0,1320.0,1350.0,0,Chicago,Cook County,Chicago-Naperville-Elgin,60614,4,IL
4,,,,,,,,,,,...,1100.0,1085.0,1050.0,0,El Paso,El Paso County,El Paso,79936,5,TX


# Export Aggregated Data

In [11]:
#Set path to export cleaned zillow data
path = r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\01_Raw\Zillow Raw Data\Zillow Raw Data Aggregated\12_29_2019_Zillow_Raw_Aggregated.csv'

#Write zillow to csv file
zillow.to_csv(path, sep=',')