### Import Libraries 

 - Request library allows you to send HTTP request in python to a specific URL. In our case we send an HTTP request to Zillow
 - Time module allows to handle time related task including formatting dates, waiting and representing time
 - The random module allows you to generate random 
 - The bs4 module allows you to pull data from HTML document after you get a response from HTTP request
 - The os modules allows ou to interact with operating systems including changing working directory
 - The selenium module allows you to automate interaction with a web browser including sending URL request and extracting HTML
   document response

In [1]:
import requests
import time
from bs4 import BeautifulSoup
from random import sample 
import pandas as pd 
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
import csv
from datetime import datetime
import re
from selenium_stealth import stealth


### Set Path
 - Identify your destination folder
 - Use os change directory to set your destination directory as the default. That is where all outputs will be exported to

In [2]:
path = "../webscraping_outputs-Z"
os.chdir(path)

### Create a file name
 - Create an outfile file name, I called mine ZillowSelium and formatted it a date time stamp
 - Note: If you are scraping multiple times in a day, then you need to format the time stamp with hours that way you don't overwrite already exported data

In [23]:
finalfile = "ZillowSelenium" + "_" + "{:%Y_%h_%d_%H-%M-%S}".format(datetime.now()) +".csv"
finalfile

'ZillowSelenium_2023_Dec_21_13-45-20.csv'

### Main Webscraping 

- Output results
- Page numbers
- URL 
- Selenium Setup


In [24]:
#Create a list that will hold the results

page_count=20
results = []

zillow_placeholders = ["This property accepts Zillow applications","Zillow last checked:"]

# selectors
titleSelector = "h1.Text-c11n-8-84-3__sc-aiai24-0"
descSelector = "div.building-description"
linkSelector = "div.StyledPropertyCardDataWrapper-c11n-8-84-3__sc-1omp4c3-0"
saleSelector = "div.Text-c11n-8-84-3__sc-aiai24-0"
sale2Selector = "p.Text-c11n-8-84-3__sc-aiai24-0"
nhoodSelector = "h4.Text-c11n-8-84-3__sc-aiai24-0"
nhood2Selector = "h2.styledComponents__BuildingCardTitle-sc-1bj2ydz-8"

driver = webdriver.Chrome()
textDriver = webdriver.Chrome()

stealth(driver,
       languages=["en-US", "en"],
       vendor="Google Inc.",
       platform="Win32",
       webgl_vendor="Intel Inc.",
       renderer="Intel Iris OpenGL Engine",
       fix_hairline=True,
       )

stealth(textDriver,
       languages=["en-US", "en"],
       vendor="Google Inc.",
       platform="Win32",
       webgl_vendor="Intel Inc.",
       renderer="Intel Iris OpenGL Engine",
       fix_hairline=True,
       )

url ="https://www.zillow.com/philadelphia-pa/rentals/"

url2 = "https://www.zillow.com/philadelphia-pa/for_sale/"

# Inspect the zillow website and figure out the number pages for rental ads use
# In the charlotte example, there are a total of 20 pages so I set the range at 21

for page in range(1,page_count+1,1):
    
    print("This is page: " + str(page))
    
    #Identify the Zillow URL of your City, it should follow this format:
    # 1. Default Zillow url : https://www.zillow.com/
    # 2. Name of your City: eg. charlotte-nc, atlanta-ga
    # 3. Pass the page number 
    # 4. Add the "_p" that is a default thing with the Zillow website 
    # 5. In a sample URL on page 15 for example will be like: https://www.zillow.com/charlotte-nc/rentals/15_p/

    page = str(page) + '_p/'
    
    # Here we are going to utilize the selenium. To automate the interaction behavior of a web browser you would
    # need a web driver. Each browser has a webdriver, in my case I am using google chrome so I download the web driver
    # from this website "https://chromedriver.storage.googleapis.com/index.html?path=98.0.4758.80/" 
    
    # After downloading and extracting the web drive(chromdriver.exe) you use the webdrive.Chrome() method to initiate
    # the chrome browser and pass the path where the driver is saved.
    
    
    # CraiglistBrowser.maximize_window()

    # After the browser has been launched use the get() to pass the url 
    print(f"Urls:\n")
    page_links = []
    for url in [url,url2]: # getListingType():
        print(f"\t\t{url+page}\n")
        browser = driver.get(url+page)
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, 'html.parser')

        for item in soup.select(linkSelector):
            l = item.select("a")[0].attrs["href"]
            if not(l.startswith("https://")):
                l = "https://www.zillow.com"+l
            page_links.append(l)

    for link in page_links:
        ovPage = textDriver.get(link)
        textSoup = BeautifulSoup(textDriver.page_source,"html.parser")

        if len(textSoup.select("div.px-captcha-container")) > 0:
            time.sleep(0.3)
            continue
        else:
        
            title = textSoup.select(titleSelector)[0].text
            nh1 = textSoup.select(nhoodSelector)
            nh2 = textSoup.select(nhood2Selector)
            nhood = None

            # get neighborhood from among header tags
            if len(nh1) > 0:
                for blurb in nh1:
                    if "neighborhood:" in blurb.text.lower():
                        nhood = blurb.text.split(":")[1][1:]
                        # print(blurb.text.split(":")[1][1:])
                        break
            elif (len(nh2) > 0 and type(nhood) == type(None)):
                for blurb in nh2:
                    if "neighborhood:" in blurb.text.lower():
                        nhood = blurb.text.split(":")[1][1:]
                        # print(blurb.text.split(":")[1][1:])
                        break

            # getting address from title
            address = None
            for w in range(len(title)):
                if title[w].isnumeric():
                    address = title[w:]
                    break

            if len(textSoup.select(descSelector))>0 and len(textSoup.select(descSelector)[0].text)>70 and (not(any(holder in textSoup.select(descSelector)[0].text for holder in zillow_placeholders))):
                text = textSoup.select(descSelector)[0].text
            elif len(textSoup.select(sale2Selector)[0]) and len(textSoup.select(sale2Selector)[0].text)>70 and (not(any(holder in textSoup.select(sale2Selector)[0].text for holder in zillow_placeholders))):
                text= textSoup.select(sale2Selector)[0].text
            elif len(textSoup.select(saleSelector)[0])>0 and len(textSoup.select(saleSelector)[0].text)>70 and (not(any(holder in textSoup.select(saleSelector)[0].text for holder in zillow_placeholders))):
                text= textSoup.select(saleSelector)[0].text
            else:
                text=""

            results.append({
            "title": title,
            "address": address,
            "neighborhood": nhood,
            "description": text,
            "url": link
            })
            print(f"title: {title}\t\tneighborhood: {nhood}\nlink: {link}\n\tdescription: {text}")

            time.sleep(0.3)


    time.sleep(0.5)

Zillowdata =  pd.DataFrame(results)
Zillowdata.to_csv(finalfile, index = False)


This is page: 1
Urls:

		https://www.zillow.com/philadelphia-pa/rentals/1_p/

		https://www.zillow.com/philadelphia-pa/for_sale/1_p/

title: LVL West		neighborhood: None
link: https://www.zillow.com/apartments/philadelphia-pa/lvl-west/98vKcn/
	description: Centrally located in Philadelphia's bustling University City neighborhood, LVL West is just steps from award winning restaurants, unique shopping destinations, and many of Philadelphia's iconic cultural landmarks. Designed to be the perfect mix of contemporary design and modern convenience, LVL West offers unique floor plans, elevated appointments, and a wide variety of layouts to suite your needs. Each apartment is fully equipped with washer and dryer, full size apartment style appliances, and thoughtfully designed storage spaces. Elevate everyday life with premium amenities at LVL West. Relax or host get-togethers in our penthouse resident lounge, get fit in our state-of-the-art fitness center, or take in the stunning Philadelphia 

IndexError: list index out of range

## Save current results to file if error occurs 

In [26]:
Zillowdata =  pd.DataFrame(results)
Zillowdata.to_csv(finalfile, index = False)

### Function Design

In [25]:
textSoup.select(titleSelector)

[]

In [14]:

# textSoup.select(nhoodSelector)
for blurb in textSoup.select("h2.styledComponents__BuildingCardTitle-sc-1bj2ydz-8"):
    if "neighborhood" in blurb.text.lower():
        # nhood = blurb.text.split(":")[1][1:]
        print(blurb.text.split(":")[1][1:])
        break


# nhoodSelector

Hawthorne


In [4]:
driver = webdriver.Chrome()

url ="https://www.zillow.com/philadelphia-pa/rentals/"
url2 = "https://www.zillow.com/philadelphia-pa/for_sale/"
page = str(1) + '_p/'
titleSelector = "h1.Text-c11n-8-84-3__sc-aiai24-0"
descSelector = "div.building-description"
linkSelector = "div.StyledPropertyCardDataWrapper-c11n-8-84-3__sc-1omp4c3-0"
saleSelector = "div.Text-c11n-8-84-3__sc-aiai24-0"
page_links = []

driver.get(url2+page)
html = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(html, 'html.parser')

for item in soup.select(linkSelector):
    l = item.select("a")[0].attrs["href"]
    if not(l.startswith("https://")):
        l = "https://www.zillow.com"+l
    page_links.append(l)

results = []
page_links

['https://www.zillow.com/homedetails/11102-Hendrix-St-Philadelphia-PA-19116/10531637_zpid/',
 'https://www.zillow.com/homedetails/706-Latona-St-UNIT-I-Philadelphia-PA-19147/2059089682_zpid/',
 'https://www.zillow.com/homedetails/6386-Sherwood-Rd-Philadelphia-PA-19151/10350074_zpid/',
 'https://www.zillow.com/homedetails/1419-S-51st-St-Philadelphia-PA-19143/10309619_zpid/',
 'https://www.zillow.com/homedetails/6123-Wayne-Ave-FLOOR-2-Philadelphia-PA-19144/2054310168_zpid/',
 'https://www.zillow.com/homedetails/3022-N-American-St-Philadelphia-PA-19133/118349697_zpid/',
 'https://www.zillow.com/homedetails/6424-City-Ave-Philadelphia-PA-19151/2054068561_zpid/',
 'https://www.zillow.com/homedetails/6506-N-Fairhill-St-Philadelphia-PA-19126/10545698_zpid/',
 'https://www.zillow.com/homedetails/118-S-21st-St-APT-1022-Philadelphia-PA-19103/2057852846_zpid/',
 'https://www.zillow.com/homedetails/1129-W-Duncannon-Ave-Philadelphia-PA-19141/10467823_zpid/',
 'https://www.zillow.com/homedetails/728-C

In [7]:
driver = webdriver.Chrome()

In [40]:
link=page_links[1]
ovPage = driver.get(link)
textSoup = BeautifulSoup(driver.page_source,"html.parser")

In [37]:
sale2Selector = "p.Text-c11n-8-84-3__sc-aiai24-0"
textSoup.select(saleSelector)[0].text

'The data relating to real estate for sale on this website appears in part through the BRIGHT Internet Data Exchange program, a voluntary cooperative exchange of property listing data between licensed real estate brokerage firms, and is provided by BRIGHT through a licensing agreement.\nListing information is from various brokers who participate in the Bright MLS IDX program and not all listings may be visible on the site. \nThe property information being provided on or through the website is for the personal, non-commercial use of consumers and such information may not be used for any purpose other than to identify prospective properties consumers may be interested in purchasing.\nSome properties which appear for sale on the website may no longer be available because they are for instance, under contract, sold or are no longer being offered for sale. \nProperty information displayed is deemed reliable but is not guaranteed. \nCopyright 2023 Bright MLS, Inc. Click here for more informa

In [43]:
results = results[:-1]

In [None]:
title = "11102 Hendrix St, Philadelphia, PA 19116"
address = None
for w in range(len(title)):
    if title[w].isnumeric():
        address = title[w:]
        break

print(address)


In [44]:
for i in range(2,len(page_links)):

    link=page_links[i]
    ovPage = driver.get(link)
    textSoup = BeautifulSoup(driver.page_source,"html.parser")
    title = textSoup.select(titleSelector)[0].text

    address = None
    for w in range(len(title)):
        if title[w].isnumeric():
            address = title[w:]
            break
    

    if len(textSoup.select(descSelector))>0 and len(textSoup.select(descSelector)[0].text)>70:
        text = textSoup.select(descSelector)[0].text
    elif len(textSoup.select(sale2Selector)[0]) and len(textSoup.select(sale2Selector)[0].text)>70:
        text= textSoup.select(sale2Selector)[0].text
    elif len(textSoup.select(saleSelector)[0])>0 and len(textSoup.select(saleSelector)[0].text)>70:
        text= textSoup.select(saleSelector)[0].text
    else:
        text=""

    results.append({
        "title": title,
        "address": address,
        "neighborhood": nhood,
        "description": text,
        "url": link
    })
    print(f"title: {title}\nlink: {link}\n\n\t{text}\n\n")
    time.sleep(3)

title: 6386 Sherwood Rd, Philadelphia, PA 19151
link: https://www.zillow.com/homedetails/6386-Sherwood-Rd-Philadelphia-PA-19151/10350074_zpid/

	Zillow last checked:  3 hours ago


title: 1419 S 51st St, Philadelphia, PA 19143
link: https://www.zillow.com/homedetails/1419-S-51st-St-Philadelphia-PA-19143/10309619_zpid/

	Zillow last checked:  3 hours ago


title: 6123 Wayne Ave FLOOR 2, Philadelphia, PA 19144
link: https://www.zillow.com/homedetails/6123-Wayne-Ave-FLOOR-2-Philadelphia-PA-19144/2054310168_zpid/

	Likely to sell faster than 83 % nearby


title: 3022 N American St, Philadelphia, PA 19133
link: https://www.zillow.com/homedetails/3022-N-American-St-Philadelphia-PA-19133/118349697_zpid/

	Zillow last checked:  3 hours ago


title: 6424 City Ave, Philadelphia, PA 19151
link: https://www.zillow.com/homedetails/6424-City-Ave-Philadelphia-PA-19151/2054068561_zpid/

	Likely to sell faster than 99 % nearby


title: 6506 N Fairhill St, Philadelphia, PA 19126
link: https://www.zillow

### Descriptions from For Sale Pages

In [32]:
results

[{'title': '11102 Hendrix St,\xa0Philadelphia, PA 19116',
  'description': 'The data relating to real estate for sale on this website appears in part through the BRIGHT Internet Data Exchange program, a voluntary cooperative exchange of property listing data between licensed real estate brokerage firms, and is provided by BRIGHT through a licensing agreement.\nListing information is from various brokers who participate in the Bright MLS IDX program and not all listings may be visible on the site. \nThe property information being provided on or through the website is for the personal, non-commercial use of consumers and such information may not be used for any purpose other than to identify prospective properties consumers may be interested in purchasing.\nSome properties which appear for sale on the website may no longer be available because they are for instance, under contract, sold or are no longer being offered for sale. \nProperty information displayed is deemed reliable but is no

In [41]:
# if len(textSoup.select("div.px-captcha-container")) > 0:
#     time.sleep(3)
#     continue
# else:
# results=[]
title = textSoup.select(titleSelector)[0].text
if len(textSoup.select(descSelector))>0:
    text = textSoup.select(descSelector)[0].text
elif len(textSoup.select(sale2Selector)[0]):
    text= textSoup.select(sale2Selector)[0].text
elif len(textSoup.select(saleSelector)[0])>0:
    text= textSoup.select(saleSelector)[0].text

results.append({
    "title": title,
    "address": address,
    "neighborhood": nhood,
    "description": text,
    "url": link
})
print(f"title: {title}\nlink: {link}\n\n\t{text}\n\n")

title: 706 Latona St UNIT I, Philadelphia, PA 19147
link: https://www.zillow.com/homedetails/706-Latona-St-UNIT-I-Philadelphia-PA-19147/2059089682_zpid/

	Welcome to Maxim Plaza, a collection of ten luxury townhomes in the Passyunk Square neighborhood of Center City, Philadelphia. The “Brynn” is a 17-foot-wide home, spanning approximately 3,700 square feet and comprises of multiple outdoor spaces, 1-car parking,4-bedrooms, 3-full bathrooms, and 2-half bathrooms.   The living room; bright and ...   read more




In [None]:
results = []
for link in page_links:
    ovPage = textDriver.get(link)
    textSoup = BeautifulSoup(textDriver.page_source,"html.parser")

    if len(textSoup.select("div.px-captcha-container")) > 0:
        time.sleep(3)
        continue
    else:
    
        title = textSoup.select(titleSelector)[0].text
        text = textSoup.select(descSelector)[0].text

        results.append({
            "title": title,
            "description": text,
            "url": link
        })
        print(f"title: {title}\nlink{link}")

    time.sleep(3)
    print(textDriver.getWebHandle())

textDriver.quit()
results

# data = pd.Dataframe(results)

### Testing for data scrape pipeline

In [151]:
chrome_options = Options()
# sel_path = os.path.join(os.getcwd(), 'selenium_cookies')
# chrome_options.add_argument("user-data-dir="+ sel_path)
# chrome_options.add_argument("user-data-dir=selenium") 
chrome_options.add_argument("--headless")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False) 

driver = webdriver.Chrome(options=chrome_options)

stealth(driver,
       languages=["en-US", "en"],
       vendor="Google Inc.",
       platform="Win32",
       webgl_vendor="Intel Inc.",
       renderer="Intel Iris OpenGL Engine",
       fix_hairline=True,
       )

textDriver = webdriver.Chrome(options=chrome_options)

stealth(textDriver,
       languages=["en-US", "en"],
       vendor="Google Inc.",
       platform="Win32",
       webgl_vendor="Intel Inc.",
       renderer="Intel Iris OpenGL Engine",
       fix_hairline=True,
       )


In [152]:
url="https://www.zillow.com/philadelphia-pa/rentals/" #+str(page) + '_p/'
url2 = "https://www.zillow.com/philadelphia-pa/for_sale/"
    

# After the browser has been launched use the get() to pass the url 
browser = driver.get(url)
html = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(html, 'html.parser')
# CraiglistBrowser.quit()
# print(url)

titleSelector = "h1.Text-c11n-8-84-3__sc-aiai24-0"
descSelector = "div.building-description"
linkSelector = "div.StyledPropertyCardDataWrapper-c11n-8-84-3__sc-1omp4c3-0"

#### Get webpage for overview

In [157]:
# len(soup.select("div.StyledPropertyCardDataWrapper-c11n-8-84-3__sc-1omp4c3-0"))
# soup.select("div.StyledPropertyCardDataWrapper-c11n-8-84-3__sc-1omp4c3-0")[0].select("a")[0].attrs["href"]
page_links = []
# soup = BeautifulSoup(driver.page_source, 'html.parser')
soup = BeautifulSoup(html, 'html.parser')

for item in soup.select(linkSelector):
    l = item.select("a")[0].attrs["href"]
    if not(l.startswith("https://")):
        l = "https://www.zillow.com"+l
    page_links.append(l)

page_links


['https://www.zillow.com/apartments/philadelphia-pa/the-ledger-residences/98vLHX/',
 'https://www.zillow.com/b/building/39.94893,-75.15103_ll/',
 'https://www.zillow.com/apartments/philadelphia-pa/the-national/B65ddJ/',
 'https://www.zillow.com/apartments/philadelphia-pa/the-legacy-at-powelton-village/98vHTJ/',
 'https://www.zillow.com/apartments/philadelphia-pa/the-grace/98wLDW/',
 'https://www.zillow.com/apartments/philadelphia-pa/the-hq/9PXKWh/',
 'https://www.zillow.com/homedetails/112-S-19th-St-A5AB73E33-Philadelphia-PA-19103/2054435564_zpid/',
 'https://www.zillow.com/apartments/philadelphia-pa/the-harper/9PWjcR/',
 'https://www.zillow.com/homedetails/112-S-19th-St-A6AD7858D-Philadelphia-PA-19103/2054435572_zpid/']