# Creating a web scraper for Zillow.com
This scraper will search all of Zillow's listings for both recently sold and currently available single family homes in a specific state/county.

## Helpful videos:
https://www.youtube.com/watch?v=dRcvJRmqFHQ&ab_channel=WebScrapingwithAndy

https://www.youtube.com/watch?v=a3Cuq2csLWk

## Helpful websites:
curl.trillworks.com

In [547]:
# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import numpy as np
import regex as re
import time
import json

In [548]:
# Search configuration and mutable scrape state shared by the cells below.
page = size = 0                    # current results page / cards seen on that page
county, state = 'brevard', 'fl'    # region slugs interpolated into the search URL
url = 'https://www.zillow.com/'    # landing page used for the initial connectivity check
results = list()                   # one dict per scraped listing card

In [549]:
# Browser-like request headers sent with every search-results request.
headers = {
    'authority': 'www.zillow.com',
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'referer': 'https://www.zillow.com/',
    'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

# Connectivity check against the landing page. The scrape loop below only
# starts when this response has status 200.
# requests applies no timeout by default, so pass one explicitly; otherwise a
# stalled connection would block this cell forever.
response = requests.get(url=url, headers=headers, timeout=30)
print(response.status_code)

200


In [550]:
# Scrape the county's search-result pages one after another and collect one
# row per listing card in `results`, then build `df_housingInfo` from it.
# The loop stops when a request does not return 200, when the listing deck is
# missing from the page, or when a page contributes no listing that has not
# already been collected (otherwise a site that keeps answering 200 for pages
# past the last one would keep this loop running forever).

# Class attributes of the elements that are scraped. They are generated names,
# so they are kept together here where they are easy to update.
DECK_CLASS = 'List-c11n-8-84-3__sc-1smrmqp-0 StyledSearchListWrapper-srp__sc-1ieen0c-0 doa-doM gKnRas photo-cards photo-cards_extra-attribution'
PRICE_CLASS = 'PropertyCardWrapper__StyledPriceLine-srp__sc-16e8gqd-1 iMKTKr'
DETAILS_CLASS = 'StyledPropertyCardHomeDetailsList-c11n-8-84-3__sc-1xvdaej-0 eYPFID'

# searchQueryState JSON sent as a query parameter; %s is the page number.
SEARCH_QUERY_TEMPLATE = '{"pagination":{"currentPage":%s},"isMapVisible":false,"mapBounds":{"west":-81.67492903125,"east":-79.69738996875,"south":27.568978152602924,"north":29.04158767448029},"mapZoom":9,"usersSearchTerm":"Brevard County, FL","regionSelection":[{"regionId":1556,"regionType":4}],"filterState":{"sort":{"value":"globalrelevanceex"},"ah":{"value":true}},"isListVisible":true}'

REQUEST_TIMEOUT = 30  # seconds; requests never times out unless told to
RESULT_COLUMNS = ['square_feet', 'latitude', 'longitude', 'address', 'url', 'price', 'beds', 'baths']


def _nested_get(data, *keys):
    """Return data[keys[0]][keys[1]]..., or '' when any level is missing."""
    try:
        for key in keys:
            data = data[key]
    except (KeyError, IndexError, TypeError):
        return ''
    return data


def _parse_card(card):
    """Build one result row from a listing card.

    Returns None for cards without a usable ld+json script (advertisement
    cards have no such script). Any field that cannot be read is stored as ''.
    """
    script = card.find('script', {'type':'application/ld+json'})
    if not script:
        return None

    try:
        script_json = json.loads(script.contents[0])
    except (IndexError, ValueError):
        # Empty script tag or malformed JSON: nothing reliable to extract.
        return None

    price_tag = card.find('span', {'class': PRICE_CLASS})
    price = price_tag.text if price_tag is not None else ''

    # The details text is split on 'bds', 'ba' and spaces; beds is piece 0 and
    # baths is piece 2. Look the list up and split it only once per card.
    details_tag = card.find('ul', {'class': DETAILS_CLASS})
    pieces = re.split('bds|ba| ', details_tag.text) if details_tag is not None else []
    beds = pieces[0] if len(pieces) > 0 else ''
    baths = pieces[2] if len(pieces) > 2 else ''

    return {
        'square_feet': _nested_get(script_json, 'floorSize', 'value'),
        'latitude': _nested_get(script_json, 'geo', 'latitude'),
        'longitude': _nested_get(script_json, 'geo', 'longitude'),
        'address': _nested_get(script_json, 'name'),
        'url': _nested_get(script_json, 'url'),
        'price': price,
        'beds': beds,
        'baths': baths
    }


# Listings already present in `results` (e.g. from an earlier run of this
# cell), identified by their details URL or, failing that, their address.
seen_listings = {row['url'] or row['address'] for row in results}
seen_listings.discard('')

while response.status_code == 200:
    page += 1

    url = 'https://www.zillow.com/%s-county-%s/' %(county, state)

    params = {
    'searchQueryState': SEARCH_QUERY_TEMPLATE %page
    }

    response = requests.get(url=url, params=params, headers=headers, timeout=REQUEST_TIMEOUT)
    time.sleep(0.25)
    if response.status_code != 200:
        break

    content = BeautifulSoup(response.text, 'lxml')

    deck = content.find('ul', {'class': DECK_CLASS})
    # If you inspect the "deck" variable, print(deck.prettify()), you will see a great many "timeouts". Not sure how to fix this...
    if deck is None:
        # No listing deck on this page (layout change, block page, or past
        # the last page): stop instead of failing on deck.contents.
        break

    # Direct child tags only; text nodes between the cards are skipped because
    # they do not support tag-style find().
    cards = deck.find_all(True, recursive=False)
    size = len(cards)

    new_listings = 0
    for card in cards:
        row = _parse_card(card)
        if row is None:
            continue

        listing_key = row['url'] or row['address']
        if listing_key:
            if listing_key in seen_listings:
                continue
            seen_listings.add(listing_key)
            new_listings += 1

        results.append(row)

    if new_listings == 0:
        # Nothing new on this page, so further pages would only repeat data.
        break

    time.sleep(2)

# Passing the columns explicitly keeps the `url` column (used by the
# home-details loop) present even when nothing was scraped.
df_housingInfo = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Still need to figure out the timeout issue...
# Next step: loop through all the home details URLs and add data to the dataframe (age, Zestimate, days on Zillow, views, saves, has HOA).

Run the request on the first page
Get all child elements from the cards on the first page
    Each child element contains a "homedetails" url that leads to the listing's specific data
Loop through all homedetails urls on the first page as a second set of requests
    Scrape the necessary data from each homedetails page into the data frame
Run the request on the second, third, ..., nth page

In [None]:
# Headers (and, below, cookies) needed for GET requests to Zillow's "pop up" home details pages.
# They do not appear to change between home details urls, so defining them once should cover every request.

headers = dict([
    ('authority', 'www.zillow.com'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'),
    ('accept-language', 'en-US,en;q=0.9'),
    ('cache-control', 'max-age=0'),
    ('referer', 'https://www.zillow.com/'),
    ('sec-ch-ua', '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"'),
    ('sec-ch-ua-mobile', '?0'),
    ('sec-ch-ua-platform', '"macOS"'),
    ('sec-fetch-dest', 'document'),
    ('sec-fetch-mode', 'navigate'),
    ('sec-fetch-site', 'same-origin'),
    ('sec-fetch-user', '?1'),
    ('upgrade-insecure-requests', '1'),
    ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'),
])

# Cookies sent with the home details requests.
#
# SECURITY: the cookie values used to be pasted into this cell. They include
# session and login cookies (loginmemento, userid, ZILLOW_SID, JSESSIONID, ...),
# which act as credentials for the account they were copied from and must not
# be stored in a notebook that is shared or committed. They are now read from
# a JSON file that lives outside the notebook:
#
#   1. Copy the cookies of a logged-in browser session (for example with the
#      browser dev tools "Copy as cURL" and a cURL-to-Python converter).
#   2. Save them as a flat JSON object {"cookie_name": "cookie_value", ...}.
#   3. Point the ZILLOW_COOKIES_FILE environment variable at that file, or
#      name it zillow_cookies.json next to this notebook.


def load_cookies(path):
    """Read a flat JSON object of cookie names and values from `path`.

    Returns an empty dict (and prints a notice) when the file does not exist,
    so the notebook still runs, just without cookies.
    Raises ValueError when the file does not contain a JSON object.
    """
    if not os.path.isfile(path):
        print('Cookie file %r not found; requests will be sent without cookies.' % path)
        return {}

    with open(path, encoding='utf-8') as cookie_file:
        loaded = json.load(cookie_file)

    if not isinstance(loaded, dict):
        raise ValueError('Cookie file %r must contain a JSON object of name/value pairs.' % path)

    # Cookie values are always sent as text, so normalise everything to str.
    return {str(name): str(value) for name, value in loaded.items()}


cookies = load_cookies(os.environ.get('ZILLOW_COOKIES_FILE', 'zillow_cookies.json'))

In [None]:
# Request the home details page of every scraped listing.
# The loop variable is deliberately not called `url`, so the search URL stored
# in the global `url` is not overwritten.
for details_url in df_housingInfo.url:
    if not details_url:
        # Listings whose card had no URL are stored as ''; nothing to request.
        continue

    # requests applies no timeout by default; without one a single stalled
    # connection would block the whole loop.
    response = requests.get(url=details_url, cookies=cookies, headers=headers, timeout=30)
    time.sleep(0.25)

    if response.status_code != 200:
        # Do not parse block/error pages as if they were listings.
        print('Skipping %s (HTTP %s)' % (details_url, response.status_code))
        continue

    content = BeautifulSoup(response.text, 'lxml')
    print(content.title)

    # Now start looking for individual data required to complete data frame.
    # Still need to fix timeout issue above.