# Step 1: Data Collection

Housing market data will be collected through a Zillow API hosted on RapidAPI. Brevard County, FL is the focus of this effort, but with minor adjustments the same process could be run for any other county, city, state, etc.

There are two primary steps in this process:
1) Call the API to collect all relevant data
2) Populate a CSV file with the data for use later on

This process uses a freemium API which, at the time of this writing, had been called over 8,000 times. Specific information on this API can be found here:
https://rapidapi.com/s.mahmoud97/api/zillow56

---

In [1]:
'''
Import libraries
'''

from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

# Lift pandas' cap on the number of columns shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [113]:
'''
Load API key
'''

# The key table is read from a CSV two directories above the working directory.
df = pd.read_csv('../../API_Keys/API-KEYS.csv')

# Take the 'KEY' value of the first row whose 'API' column equals 'rapid-API'.
rapid_API_Key = df.loc[df['API'] == 'rapid-API', 'KEY'].iloc[0]

In [114]:
'''
Define API specific variables
'''

# Host shared by both endpoints and by the RapidAPI host header.
_RAPIDAPI_HOST = "zillow56.p.rapidapi.com"

# Endpoints used by the two helper functions defined further down.
url_search = f"https://{_RAPIDAPI_HOST}/search"
url_property = f"https://{_RAPIDAPI_HOST}/property"

# Headers sent with every request.
headers = {
    "X-RapidAPI-Key": rapid_API_Key,
    "X-RapidAPI-Host": _RAPIDAPI_HOST,
}

# Listing statuses accepted by zillowAPISearch.
VALID_STATUS = {"forSale", "recentlySold", "forRent"}

# Running total of API requests issued by this notebook session.
API_callCount = 0

In [2]:
'''
Define Zillow search parameters
'''

location = "Brevard County FL"
status = "forSale" # Valid options are: ['forSale','recentlySold','forRent']

# The price buckets are contiguous: each one starts one dollar above the
# previous bucket's upper bound, so only the bounds need to be listed.
_price_caps = [
    0, 250000, 300000, 350000, 400000, 450000, 500000, 750000, 1000000,
    2000000, 5000000, 50000000,
]
priceRange = [[low + 1, high] for low, high in zip(_price_caps, _price_caps[1:])]

In [138]:
'''
Build helper functions for code cleanliness
'''

# This function is the initial API call. From it, a preliminary set of home data will be obtained including: price, address, ZPID, zestimate, etc.
def zillowAPISearch(callCount, url=url_search, headers=headers, priceRange=priceRange, location="Brevard County FL", status="forSale", timeout=30):
    """Page through the search endpoint for each price bucket and stack the results.

    Only single-family homes are requested; every other property-type filter
    in the query is set to "false".

    Parameters
    ----------
    callCount : int
        Number of API requests already made before this call.
    url : str
        Search endpoint to query.
    headers : dict
        HTTP headers sent with every request.
    priceRange : sequence
        Either a sequence of [min, max] pairs or one bare [min, max] pair.
    location : str
        Value sent as the "location" query parameter.
    status : str
        Listing status; must be a member of VALID_STATUS.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    tuple of (pandas.DataFrame, int)
        The flattened 'results' entries of every page fetched (an empty
        DataFrame when nothing was returned), and the updated running call
        count. The count already includes the incoming ``callCount``, so
        callers should assign it rather than add it to their own total.

    Raises
    ------
    ValueError
        If ``status`` is not in VALID_STATUS.
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    if status not in VALID_STATUS:
        raise ValueError("Error: Status must be one of %r." %VALID_STATUS)

    # Callers sometimes pass one bare [min, max] pair instead of a list of
    # pairs; wrap it so the loop below always iterates over pairs.
    if len(priceRange) == 2 and not isinstance(priceRange[0], (list, tuple)):
        priceRange = [priceRange]

    isRecentlySold = status == "recentlySold"
    pageFrames = []
    for price in priceRange:
        page = 1
        lastPage = 1  # replaced by the API's 'totalPages' after the first response
        while page <= lastPage:
            querystring = {
                "page": page,
                "location": location,
                "status": status,
                # NOTE(review): "days" / "pricea" are sort keys defined by the API - confirm their meaning in its docs.
                "sortSelection": "days" if isRecentlySold else "pricea",
                "isSingleFamily": "true",
                "isMultiFamily": "false",
                "isApartment": "false",
                "isCondo": "false",
                "isManufactured": "false",
                "isTownhouse": "false",
                "isLotLand": "false",
                "price_min": price[0],
                "price_max": price[1],
            }
            if isRecentlySold:
                # NOTE(review): presumably limits results to the last 12 months - confirm against the API docs.
                querystring["doz"] = "12m"

            # API call here:
            response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
            callCount += 1
            time.sleep(0.5)

            # Fail loudly on quota/auth/server errors instead of trying to
            # parse an error payload as listing data.
            response.raise_for_status()
            searchData = response.json()

            if page == 1:
                # A missing or null 'totalPages' is treated as "no further pages".
                lastPage = searchData.get('totalPages') or 0
                print(f'Total number of pages to scrape for price range ${price[0]} to ${price[1]}: {lastPage}.')
                print('')

            pageFrame = pd.json_normalize(searchData.get('results') or [])
            if not pageFrame.empty:
                pageFrames.append(pageFrame)

            print(f'Scraping property data from page {page}/{lastPage} for price range ${price[0]} to ${price[1]}.')
            page += 1

    if not pageFrames:
        return pd.DataFrame(), callCount

    # One concat at the end avoids re-copying the accumulated frame per page.
    df_searchData = pd.concat(pageFrames, axis=0, ignore_index=True)
    return df_searchData, callCount


# This is the next API call function. This call will pull an exhaustive list of home data for each ZPID found from the first API call.
def zillowPropertyDetails(zpid, firstEntry, callCount, url=url_property, headers=headers, timeout=30):
    """Fetch the property record for one ZPID and flatten it into a DataFrame.

    Parameters
    ----------
    zpid
        Identifier sent as the "zpid" query parameter.
    firstEntry : bool
        Caller's "has anything been fetched yet" flag; a falsy value comes
        back as True, a truthy value comes back unchanged.
    callCount : int
        Number of API requests already made before this call.
    url : str
        Property endpoint to query.
    headers : dict
        HTTP headers sent with the request.
    timeout : float
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    tuple of (pandas.DataFrame, bool, int)
        The JSON response flattened with ``pd.json_normalize``, the updated
        ``firstEntry`` flag, and ``callCount + 1``.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    querystring = {
        "zpid": zpid
    }

    # API call here:
    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
    callCount += 1
    time.sleep(0.5)

    # Without this check an error payload would be normalised into a bogus
    # row and silently merged into the housing data by the caller.
    response.raise_for_status()

    if not firstEntry:
        firstEntry = True

    df_propertyData = pd.json_normalize(response.json())
    return df_propertyData, firstEntry, callCount

In [None]:
'''
Collect all home data for each 'forSale' home in Brevard county. Both API helper functions are called here.
'''

# CAUTION: Every request made here counts against the monthly allowance of 15,000 API calls. Run carefully!!
status = "forSale"
print(f'{status} housing data collection initiated.')
for price in priceRange:

    # zillowAPISearch iterates over a list of [min, max] pairs, so the single
    # bucket is wrapped in a list. The count it returns already includes the
    # value passed in, so it is assigned to the running total, not added.
    df_search, API_callCount = zillowAPISearch(priceRange=[price], callCount=API_callCount, status=status)
    print(f'Initial data collected for price range: ${price[0]} to ${price[1]}.')
    print('')

    # With no listings there is nothing to look up or export; skipping also
    # prevents the previous bucket's property data from being reused.
    if df_search.empty or 'zpid' not in df_search.columns:
        print(f'No {status} listings returned for price range: ${price[0]} to ${price[1]}. Skipping.')
        print('')
        continue

    firstEntry = False
    propertyFrames = []
    for i, zpid in enumerate(df_search['zpid']):
        # The helper returns the updated running count, so assign it directly.
        temp, firstEntry, API_callCount = zillowPropertyDetails(callCount=API_callCount, zpid=zpid, firstEntry=firstEntry)
        propertyFrames.append(temp)

        print(f'Adding additional data for property {i+1}/{df_search.shape[0]} for price range: ${price[0]} to ${price[1]}.')

    # Stack the per-property rows once instead of re-copying on every iteration.
    df_property = pd.concat(propertyFrames, axis=0, ignore_index=True)

    # Concatenate the two dataframes together (side by side, aligned on row index)
    df_housingData = pd.concat([df_search, df_property], axis=1, ignore_index=False)

    # Export dataframe to csv file for later use.
    df_housingData.to_csv(f'housingData_{price[0]}_{price[1]}.csv', sep=',', index=True, encoding='utf-8')
    del df_housingData

print('')
print(f'{status} housing data collection complete.')
print(f'Total API call count for month so far: {API_callCount}')

In [None]:
'''
Collect all home data for each 'recentlySold' home in Brevard county. Both API helper functions are called here.
'''

# CAUTION: Every request made here counts against the monthly allowance of 15,000 API calls. Run carefully!!
status = "recentlySold"
print(f'{status} housing data collection initiated.')
for price in priceRange:

    # zillowAPISearch iterates over a list of [min, max] pairs, so the single
    # bucket is wrapped in a list. The count it returns already includes the
    # value passed in, so it is assigned to the running total, not added.
    df_search, API_callCount = zillowAPISearch(priceRange=[price], callCount=API_callCount, status=status)
    print(f'Initial data collected for price range: ${price[0]} to ${price[1]}.')
    print('')

    # With no listings there is nothing to look up or export; skipping also
    # prevents the previous bucket's property data from being reused.
    if df_search.empty or 'zpid' not in df_search.columns:
        print(f'No {status} listings returned for price range: ${price[0]} to ${price[1]}. Skipping.')
        print('')
        continue

    firstEntry = False
    propertyFrames = []
    for i, zpid in enumerate(df_search['zpid']):
        # The helper returns the updated running count, so assign it directly.
        temp, firstEntry, API_callCount = zillowPropertyDetails(callCount=API_callCount, zpid=zpid, firstEntry=firstEntry)
        propertyFrames.append(temp)

        print(f'Adding additional data for property {i+1}/{df_search.shape[0]} for price range: ${price[0]} to ${price[1]}.')

    # Stack the per-property rows once instead of re-copying on every iteration.
    df_property = pd.concat(propertyFrames, axis=0, ignore_index=True)

    # Concatenate the two dataframes together (side by side, aligned on row index)
    df_housingData = pd.concat([df_search, df_property], axis=1, ignore_index=False)

    # Export dataframe to csv file for later use.
    df_housingData.to_csv(f'housingData_recentlySold_{price[0]}_{price[1]}.csv', sep=',', index=True, encoding='utf-8')
    del df_housingData

print('')
print(f'{status} housing data collection complete.')
print(f'Total API call count for month so far: {API_callCount}')

---
End of section.
## Next: Data Wrangling -->