# Final Project: House Price Prediction
## Corpus Christi Team
### Step 1 (Data Collection)

The data is obtained from _Zillow.com_

The data is requested through API calls using _RapidAPI_: https://rapidapi.com/apimaker/api/zillow-com1

**MAKE SURE TO INPUT YOUR RapidAPI KEY**

Subscribe for your key here: https://rapidapi.com/apimaker/api/zillow-com1/pricing

#### Import required libraries

In [1]:
import pandas as pd
import requests
import json
import re
import os
import glob
from datetime import datetime as dt
import locale
import warnings
from time import sleep
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Import your RapidAPI key
from config import krapid as key

#### Define search filters

In [2]:
# RapidAPI endpoints of the Zillow API (do not modify)
host = "zillow-com1.p.rapidapi.com"
url = f"https://{host}/propertyExtendedSearch"  # Search query: basic information of ~40 properties
url2 = f"https://{host}/property"  # Details query for a single property

# Search filters
city = "Houston"
state = "TX"
location = f"{city}, {state}"
homeType = "Houses"

# The overall price range ($370,000 to $500,000) is collected in two runs:
#   Range 1: $370,000 to $435,000 (20 pages * 40 houses) = 800 houses
#   Range 2: $435,001 to $500,000, another 800 houses
# which adds up to the personal target of 1,600 houses.
minPrice = 370001  # Use your assigned min price
maxPrice = 435000  # Use your assigned max price

sort = "Price_Low_High"

# Make sure the output directory is there before any page is saved
os.makedirs('./data/', exist_ok=True)

#### Get the data

In [5]:
def _price_history_summary(details):
    """Summarize the non-rental price history of one property.

    Args:
        details: Decoded JSON body of a property-details response.

    Returns:
        A ``(priceIncreased, priceDiff, days)`` tuple. All three are 0 when the
        history is missing, has at most one entry, lacks an expected key, or
        contains fewer than two non-rental entries.
    """
    no_history = (0, 0, 0)

    try:
        history = details['priceHistory'] or []
        if len(history) <= 1:
            return no_history

        sale_prices = []
        sale_dates = []
        for j, entry in enumerate(history):
            entry_price = entry['price']
            if entry_price is None:
                # Fall back to the preceding entry's price.
                # NOTE(review): for j == 0 the index j - 1 wraps around to the
                # last entry - confirm this is the intended fallback.
                entry_price = history[j - 1]['price']

            if entry['postingIsRental'] == False:  # noqa: E712 - only an explicit false counts
                sale_prices.append(entry_price)
                sale_dates.append(entry['date'])  # Format 2022-12-19
    except KeyError:
        return no_history

    if len(sale_prices) <= 1:
        return no_history

    # Assumes the API lists entries newest first, so index 0 is the most
    # recent entry and index -1 the oldest - TODO confirm.
    date_recent = dt.strptime(sale_dates[0], "%Y-%m-%d")
    date_past = dt.strptime(sale_dates[-1], "%Y-%m-%d")
    days = (date_recent - date_past).days

    try:
        price_diff = sale_prices[0] - sale_prices[-1]
    except TypeError:
        # One of the two prices is still None
        price_diff = 0

    return (1 if price_diff > 0 else 0), price_diff, days


def _property_details(details):
    """Map one property-details response onto the detail columns of a CSV row.

    Args:
        details: Decoded JSON body of a property-details response.

    Returns:
        A dict keyed by the detail column names (``zipCode`` through ``days``).
    """
    facts = details['resoFacts']

    # NOTE(review): for the three ``has*`` flags only an explicit false maps
    # to 0, so a null value is recorded as 1 - confirm this is intended.
    row = {
        'zipCode': details['zipcode'],
        'CountyId': details['countyId'],
        'taxRate': details['propertyTaxRate'],
        'hasGarage': 0 if facts['hasGarage'] == False else 1,  # noqa: E712
        'hasPool': 0 if facts['poolFeatures'] is None else 1,
        'hasCooling': 0 if facts['hasCooling'] == False else 1,  # noqa: E712
        'hasView': 0 if facts['hasView'] == False else 1,  # noqa: E712
        'yearBuilt': details['yearBuilt'],
        'stories': facts['stories'],
        'parkingSpaces': facts['parking'],
        'annualHOI': details['annualHomeownersInsurance'],  # Annual Homeowner Insurance Fee
        'annualHOA': facts['hoaFee'],  # Homeowner Association Fee
        'Zone': facts['zoningDescription'],
    }

    # School information is only used when exactly three schools are listed;
    # otherwise every school column gets the 999 placeholder.
    # Assumes the order is elementary, middle, high - TODO confirm.
    schools = details['schools']
    has_all_levels = len(schools) == 3
    for idx, level in enumerate(('Elem', 'Mid', 'High')):
        row[f'school{level}Rating'] = schools[idx]['rating'] if has_all_levels else 999
        row[f'school{level}Dist'] = schools[idx]['distance'] if has_all_levels else 999

    row['priceIncreased'], row['priceDiff'], row['days'] = _price_history_summary(details)
    return row


# Input a page number
def main(pg):
    """Fetch one page of search results with per-property details and save it as CSV.

    Sends the search query built from the module-level filters (``location``,
    ``homeType``, ``sort``, ``minPrice``, ``maxPrice``) to ``url`` and then
    queries ``url2`` once for every property returned. One row per property is
    written to ``./data/<out_name>.csv``. No file is written when the search
    fails or no rows were collected.

    Args:
        pg: Page number sent as the ``page`` parameter of the search query.

    Returns:
        None.
    """
    # Output file name
    out_name = f"data_{city}_{state}_{homeType}_p{pg}_price_{minPrice}_{maxPrice}"

    querystring = {"location": location, "page": pg, "home_type": homeType, "sort": sort, "minPrice": minPrice, "maxPrice": maxPrice}

    # The same headers are sent with the search query and every details query
    headers = {
        "X-RapidAPI-Key": key,
        "X-RapidAPI-Host": host
    }

    response = requests.request("GET", url, headers=headers, params=querystring)

    # Only decode the body of a successful (i.e., 200) response
    if response.status_code != 200:
        print(f"Page {pg}: search request failed with status {response.status_code}")
        return

    json_response = response.json()

    # Column names, in the order they appear in the CSV
    cnames = ['Page', 'Item', 'zid', 'State', 'City', 'Address', 'Lat', 'Lng', 'Price', 'Image', 'Bedrooms', 'Bathrooms', 'lotArea', 'constructedArea', 'zipCode', 'CountyId', 'taxRate', 'hasGarage', 'hasPool', 'hasCooling', 'hasView', 'yearBuilt', 'stories', 'parkingSpaces', 'annualHOI', 'annualHOA', 'Zone', 'schoolElemRating', 'schoolElemDist', 'schoolMidRating', 'schoolMidDist', 'schoolHighRating', 'schoolHighDist', 'priceIncreased', 'priceDiff', 'days']

    rows = []
    try:
        for i, prop in enumerate(json_response['props']):
            zid = prop['zpid']
            row = {
                'Page': json_response['currentPage'],
                'Item': i + 1,
                'zid': zid,
                'State': state,
                'City': city,
                'Address': prop['address'],
                'Lat': prop['latitude'],
                'Lng': prop['longitude'],
                # Listing price from the search result; the price-history
                # summary must not overwrite it.
                'Price': prop['price'],
                'Image': prop['imgSrc'],
                'Bedrooms': prop['bedrooms'],
                'Bathrooms': prop['bathrooms'],
                'lotArea': prop['lotAreaValue'],
                'constructedArea': prop['livingArea'],
            }

            ## Perform the Property Details Query
            response2 = requests.request("GET", url2, headers=headers, params={"zpid": zid})

            # Pause after every details request (presumably to stay under the
            # API rate limit - verify against the subscription plan)
            sleep(0.6)

            # If the response is successful (i.e., 200), then add the property
            # details; otherwise the detail columns stay empty for this row
            # instead of reusing values from the previous property.
            if response2.status_code == 200:
                row.update(_property_details(response2.json()))

            rows.append(row)
    finally:
        # Save whatever was collected, even if a later property raised.
        # dtype=object keeps the decoded values as they are (no int -> float
        # upcast in columns that contain missing values).
        if rows:
            pd.DataFrame(rows, columns=cnames, dtype=object).to_csv(f'./data/{out_name}.csv', index=False)


#### Run the function

In [6]:
# Collect pages 1 through 20; range() excludes its upper bound, hence 20 + 1
pbar = tqdm(range(1, 20 + 1))
for i in pbar:
    # Short pause before each page
    sleep(0.5)
    pbar.set_description(f'Retreiving data from page {i}')
    main(i)

Retreiving data from page 2: 100%|███████████████████████████████████████████████████████| 1/1 [01:17<00:00, 77.59s/it]


In [None]:
# Run the function for a single page: set `page` to the page number to
# download, then call main(), which takes that page number as its argument.
page = 1
main(page)