Zillow API from:\
https://rapidapi.com/s.mahmoud97/api/zillow56

In [65]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option("display.max_columns", None)

In [66]:
# Load API key
# The key file is a CSV with an 'API' column naming the service and a 'KEY'
# column holding the credential; the first row labelled 'rapid-API' is used.
df = pd.read_csv('../../Data Analytics and Data Science Projects/API-KEYS.csv')
is_rapid_api_row = df['API'] == 'rapid-API'
rapid_API_Key = df.loc[is_rapid_api_row, 'KEY'].iloc[0]

In [67]:
# API specific variables.

# Endpoints used by the two request helpers.
url_search = "https://zillow56.p.rapidapi.com/search"
url_property = "https://zillow56.p.rapidapi.com/property"

# Headers sent with every request: the caller's key plus the API host name.
headers = {
    "X-RapidAPI-Key": rapid_API_Key,
    "X-RapidAPI-Host": "zillow56.p.rapidapi.com",
}

# Listing statuses accepted by zillowAPISearch's `status` argument.
VALID_STATUS = {"forSale", "recentlySold", "forRent"}

In [68]:
# Set Zillow search parameters
location = "Brevard County FL"
status = "forSale"  # Options are "forSale", "recentlySold", "forRent"

# Full list of price bands, kept for reference. The number beside each band is
# the count recorded in the original note (presumably listings found per band).
#   [1, 250000]            137
#   [250001, 300000]       241
#   [300001, 350000]       443
#   [350001, 400000]       571
#   [400001, 450000]       283
#   [450001, 500000]       173
#   [500001, 750000]       487
#   [750001, 1000000]      208
#   [1000001, 2000000]     113
#   [2000001, 5000000]      62
#   [5000001, 50000000]      4
# Limiting for trial/error. Will set to full range once confirmed working.
priceRange = [
    [5_000_001, 50_000_000],
]

In [69]:
# Build functions for cleanliness
def zillowAPISearch(callCount, url=url_search, headers=headers, priceRange=priceRange, location="Brevard County FL", status="forSale", timeout=30):
    """Page through the search endpoint for every price band and collect the listings.

    Each [min, max] pair in ``priceRange`` is queried separately (single-family
    homes only, sorted by ascending price). The first response for a band
    supplies ``totalPages``, which bounds how many pages are fetched for it.

    Parameters
    ----------
    callCount : int
        Running number of API requests made so far; incremented per request.
    url : str
        Search endpoint URL.
    headers : dict
        HTTP headers sent with every request.
    priceRange : list of [min, max] pairs
        Price bands to query.
    location : str
        Value sent as the "location" query parameter.
    status : str
        Must be a member of VALID_STATUS.
    timeout : float
        Seconds to wait on each request before giving up.

    Returns
    -------
    (pandas.DataFrame, int)
        The flattened "results" entries of every fetched page (an empty
        DataFrame when ``priceRange`` is empty) and the updated call count.

    Raises
    ------
    ValueError
        If ``status`` is not in VALID_STATUS.
    requests.HTTPError
        If a request comes back with a 4xx/5xx status. The updated call count
        is not returned in that case.
    """
    if status not in VALID_STATUS:
        raise ValueError("Error: Status must be one of %r." %VALID_STATUS)

    # Collect one frame per page and concatenate once at the end instead of
    # re-copying the accumulated frame on every page.
    pageFrames = []
    for price in priceRange:
        page = 1
        lastPage = None  # unknown until the first response for this band arrives
        while lastPage is None or page <= lastPage:
            querystring = {
                "page": page,
                "location": location,
                "status": status,
                "sortSelection": "pricea",
                "isSingleFamily": "true",
                "isMultiFamily": "false",
                "isApartment": "false",
                "isCondo": "false",
                "isManufactured": "false",
                "isTownhouse": "false",
                "isLotLand": "false",
                "price_min": price[0],
                "price_max": price[1]
            }

            response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
            callCount += 1
            # Short pause after every request (presumably to stay under the API rate limit).
            time.sleep(0.5)

            # Fail loudly on an error response rather than hitting a KeyError
            # on 'totalPages'/'results' further down.
            response.raise_for_status()
            searchData = response.json()

            if lastPage is None:
                lastPage = searchData['totalPages']
                print(f'Total number of pages to scrape for price range ${price[0]} to ${price[1]}: {lastPage}.')
                print('')

            pageFrames.append(pd.json_normalize(searchData['results']))

            print(f'Scraping property data from page {page}/{lastPage} for price range ${price[0]} to ${price[1]}.')
            page += 1

    if not pageFrames:
        # No price bands were supplied, so there is nothing to concatenate.
        return pd.DataFrame(), callCount

    df_searchData = pd.concat(pageFrames, axis=0, ignore_index=True)
    return df_searchData, callCount

def zillowPropertyDetails(zpid, firstEntry, callCount, url=url_property, headers=headers, timeout=30):
    """Fetch the detail record for one property and flatten it into a DataFrame.

    Parameters
    ----------
    zpid
        Property identifier sent as the "zpid" query parameter.
    firstEntry : bool
        Caller-side flag; a False value is returned as True, any other value
        is handed back unchanged.
    callCount : int
        Running number of API requests made so far; incremented by one.
    url : str
        Property endpoint URL.
    headers : dict
        HTTP headers sent with the request.
    timeout : float
        Seconds to wait on the request before giving up.

    Returns
    -------
    (pandas.DataFrame, bool, int)
        The flattened JSON response, the updated ``firstEntry`` flag and the
        updated call count.

    Raises
    ------
    requests.HTTPError
        If the request comes back with a 4xx/5xx status.
    """
    querystring = {
        "zpid": zpid
    }

    response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
    callCount += 1
    # Short pause after every request (presumably to stay under the API rate limit).
    time.sleep(0.5)

    # Without this check an error payload would be flattened into a row and
    # silently mixed in with real property data.
    response.raise_for_status()

    if firstEntry == False:
        firstEntry = True

    df_propertyData = pd.json_normalize(response.json())

    return df_propertyData, firstEntry, callCount

In [70]:
# API calls for web scraping
# CAUTION: These will go against the monthly allowable of 15,000. RUN SPARINGLY!!.
# Starting value of the running request tally (presumably carried over by hand
# from earlier sessions - update it before re-running).
API_callCount = 3297

# Pass the search settings explicitly: the function's own defaults are fixed at
# definition time, so edits to location/status/priceRange would otherwise be ignored.
df_search, API_callCount = zillowAPISearch(
    callCount=API_callCount,
    priceRange=priceRange,
    location=location,
    status=status,
)
print('')

# An empty search result has no 'zpid' column; fall back to an empty list so
# the loop below is simply skipped.
zpids = df_search.get('zpid', [])

firstEntry = False
propertyFrames = []
for i, zpid in enumerate(zpids):
    temp, firstEntry, API_callCount = zillowPropertyDetails(callCount=API_callCount, zpid=zpid, firstEntry=firstEntry)
    propertyFrames.append(temp)

    print(f'Adding additional data for property {i+1}/{df_search.shape[0]}.')

# Concatenate once after the loop instead of re-copying the frame per property.
if propertyFrames:
    df_property = pd.concat(propertyFrames, axis=0, ignore_index=True)
else:
    df_property = pd.DataFrame()

print('')
print(f'Total API call count so far: {API_callCount}')

Total number of pages to scrape for price range $5000001 to $50000000: 1.

Scraping property data from page 1/1 for price range $5000001 to $50000000.

Adding additional data for property 1/4.
Adding additional data for property 2/4.
Adding additional data for property 3/4.
Adding additional data for property 4/4.

Total API call count so far: 3262


In [72]:
# Concatenate the two dataframes together
# axis=1 pairs rows by index label, so the search frame and the detail frame
# must hold one row per listing; a length mismatch would be silently padded
# with NaN instead of raising.
assert len(df_search) == len(df_property), (
    f'Row count mismatch: {len(df_search)} search rows vs {len(df_property)} property rows.'
)
df_housingData = pd.concat([df_search, df_property], axis=1, ignore_index=False)

# Show (rows, columns) of the merged frame as the cell output.
df_housingData.shape

(4, 589)


In [73]:
# Export dataframe to csv file for later use.
# Written next to the notebook, comma-separated, with the row index as the first column.
df_housingData.to_csv(
    path_or_buf='housingData.csv',
    encoding='utf-8',
    index=True,
    sep=',',
)

In [None]:
# Remove the notebook-level name for the merged frame; the CSV export cell has
# already written it to 'housingData.csv' (presumably done to free kernel memory).
del df_housingData