## Scraping Zillow.com to analyze housing price in New York City 

My goal here is to collect housing prices for both rental and sale in New York city. I looked at three major real estate website including Trulia, Zillow, and StreetEasy. Comparing to the other two websites, StreetEasy gives the most information on the searching results page and the format of each listing is very consistent, which is great for the purpose of web-scraping.<br\ >
<a href="http://zillow.com/">
<img "StreetEasy" src="map/streetEasy_logo.jpg" height="30px" width="150px"/></a><br\ >

Web scraping is done using the beautifulsoup package in Python. I created two functions that can loop through all the pages of searching results, and also empty strings to store results. Below are the steps I took to scrape StreetEasy:
1. Analyzing the HTML page: HTML code of a web page can be viewed by right click and selecting 'Inspect'. This helps us identifying the HTML tags of the information to be scraped
2. Making the soup!: It is important to select the correct parser for your data type. I used HTML parser.
3. Navigating the parse tree and iterate through tags: once the soup is made, we have the HTML code in Python. We can then find our desired information by searching through HTML tags.

In [1]:
def package_url_sale(page):
    """Creates a URL for sales based on the page number."""
    return f'https://www.zillow.com/new-york-ny/for_sale/{page}_p/'

In [2]:
def package_url_rent(page):
    """Creates a URL for rentals based on the page number."""
    return f'https://www.zillow.com/new-york-ny/rentals/{page}_p/'

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from typing import List
import requests
import time


#price=[]
#where=[]
#bed=[]
#bath=[]
#size=[]
#monthly=[]
#street=[]

In [None]:
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
import json


ua = UserAgent()
user_agent = ua.random


chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument(f"user-agent={user_agent}")
# Specify the path to chromedriver
#chromedriver_path = './drivers/chromedriver-mac-arm64/'
driver = webdriver.Chrome(options=chrome_options)


def make_request(session, url):
    try:
        response = session.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.content  # or response.text, depending on what you need
    except requests.Timeout:
        print("Request timed out.")
    except requests.RequestException as e:
        print(f"Request failed: {e}")
    return None

def scrape_data() -> None:
    """Scrapes data from StreetEasy and populates price, where, bed, bath, size, and street lists."""
    price: List[str] = []
    where: List[str] = []
    bed: List[str] = []
    bath: List[str] = []
    size: List[str] = []
    street: List[str] = []

    # Define the total number of pages
    total_pages = 2

    # Create a DataFrame outside the loop to avoid creating it in every iteration
    data = {'street': street, 'price': price, 'where': where, 'bed': bed, 'bath': bath, 'size': size}

    # Set up Chrome options
    chrome_options = Options()
    # If you want to run Chrome in headless mode, uncomment the next line
    # chrome_options.add_argument("--headless")

    # Create a tqdm progress bar for the loop
    progress_bar = tqdm(range(1, total_pages + 1), desc="Scraping Pages", position=0)

    # Set up the Selenium webdriver
    driver = webdriver.Chrome(options=chrome_options)

    for x in progress_bar:
        url = package_url_rent(str(x))
        print(f"Scraping URL: {url}")

        # Use Selenium to fetch the page content
        driver.get(url)

        # Wait for the page to load (adjust the timeout as needed)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'property-card-data'))
        )

    # Simulate scrolling for more interaction
        actions = ActionChains(driver)
        actions.move_by_offset(0, 500).perform()

        # Add a random delay (between 5 and 15 seconds)
        time.sleep(random.uniform(5, 15))

        # Continue with scraping using the response content
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Get the page source
        r = driver.page_source

        if r:
            print(f"Response Content: {r}")
        else:
            print("Request failed or timed out.")

        time.sleep(10)  # Implement a rate limit (e.g., 1 request per 2 seconds)

        # Continue with scraping using the response content
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Find the script tag with type="application/ld+json"
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        listings = []
        # Inside the loop where you are scraping the page content
        for listing in soup.find_all('div', {'class': 'property-card-data'}):
            result = {}

            # Extract information from the script tag
            script_tag = listing.find_previous('script', {'type': 'application/ld+json'})

            try:
                if script_tag:
                    script_content = script_tag.string
                    json_data = json.loads(script_content)

                    # Extract information from the JSON data
                    result['zpid'] = json_data.get('url', '').split('/')[-2]
                    result['property_type']=json_data.get('type','')
                    result['address'] = json_data.get('name', '')
                    result['floor_size'] = json_data.get('floorSize', {}).get('value', '')
                    result['street_address'] = json_data.get('address', {}).get('streetAddress', '')
                    result['locality'] = json_data.get('address', {}).get('addressLocality', '')
                    result['region'] = json_data.get('address', {}).get('addressRegion', '')
                    result['postal_code'] = json_data.get('address', {}).get('postalCode', '')
                    result['latitude'] = json_data.get('geo', {}).get('latitude', '')
                    result['longitude'] = json_data.get('geo', {}).get('longitude', '')
                    result['url'] = json_data.get('url', '')

                    # Add the result to the listings
                    listings.append(result)
                    continue  # Skip the property card extraction if JSON extraction succeeds

            except json.JSONDecodeError as e:
                print(f"JSON decoding failed. Using property card information. Error: {e}")
            
                # Extract information from the property card
            result['zpid'] = listing.find('address', {'data-test': 'property-card-addr'}).get_text().strip()
            result['address'] = result.get('address', '')  # Don't overwrite if JSON extraction succeeded
            result['floor_size'] = ''  # You may modify this based on the property card structure
            result['street_address'] = ''  # You may modify this based on the property card structure
            result['locality'] = ''  # You may modify this based on the property card structure
            result['region'] = ''  # You may modify this based on the property card structure
            result['postal_code'] = ''  # You may modify this based on the property card structure
            result['latitude'] = ''  # You may modify this based on the property card structure
            result['longitude'] = ''  # You may modify this based on the property card structure
            result['url'] = ''  # You may modify this based on the property card structure

            # Continue extracting information from the property card
            result['price'] = listing.find('span', {'data-test': 'property-card-price'}).get_text().strip()
            details_list = listing.find('ul', {'class': 'dmDolk'})
            details = details_list.find_all('li') if details_list else []
            result['bedrooms'] = details[0].get_text().strip() if len(details) > 0 else ''
            result['bathrooms'] = details[1].get_text().strip() if len(details) > 1 else ''
            result['sqft'] = details[2].get_text().strip() if len(details) > 2 else ''
            type_div = listing.find('div', {'class': 'gxlfal'})
            result['type'] = type_div.get_text().split("-")[1].strip() if type_div else ''

            # Add the result to the listings
            data['zpid'].append(result['zpid'])
            data['address'].append(result['address'])
            data['floor_size'].append(result['floor_size'])
            data['street_address'].append(result['street_address'])
            data['locality'].append(result['locality'])
            data['region'].append(result['region'])
            data['postal_code'].append(result['postal_code'])
            data['latitude'].append(result['latitude'])
            data['longitude'].append(result['longitude'])
            data['url'].append(result['url'])

            # Add the result to the listings
            data['price'].append(result['price'])
            data['bed'].append(result['bedrooms'])
            data['bath'].append(result['bathrooms'])
            data['type'].append(result['type'])

        # Increment the page number
        # Update the progress bar description with the current page number
        progress_bar.set_postfix(Page=f"{x}/{total_pages}")
        progress_bar.refresh()  # Manually refresh the tqdm bar

        time.sleep(12)  # Implement a rate limit (e.g., 1 request per 2 seconds)

    progress_bar.close()  # Close the progress bar after processing all pages

    # Close the Selenium webdriver
    driver.quit()

    # Create DataFrame after the loop
    df = pd.DataFrame(data)

    print("done")  # Print "done" outside the loop
    return df

# Call the scraping function
scrape_data()


In [18]:
import asyncio
import json
import random
import time
from typing import List

import aiofiles
import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from tqdm import tqdm

async def make_request(session, url):
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            return await response.read()
    except aiohttp.ClientTimeout:
        print("Request timed out.")
    except aiohttp.ClientResponseError as e:
        print(f"Request failed: {e}")
    return None

async def fetch_page_content(url, session):
    print(f"Scraping URL: {url}")
    response_content = await make_request(session, url)
    if response_content:
        print(f"Response Content: {response_content[:200]}...")  # Display part of the content
    else:
        print("Request failed or timed out.")
    return response_content

async def scrape_page(url, session, driver, data):
    response_content = await fetch_page_content(url, session)

    if response_content:
        soup = BeautifulSoup(response_content, 'html.parser')

        # Find the script tag with type="application/ld+json"
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        listings = []

        try:
            if script_tag:
                script_content = script_tag.string
                json_data = json.loads(script_content)

                # Extract information from the JSON data
                result = {}
                result['zpid'] = json_data.get('url', '').split('/')[-2]
                result['property_type'] = json_data.get('type', '')
                result['address'] = json_data.get('name', '')
                result['floor_size'] = json_data.get('floorSize', {}).get('value', '')
                result['street_address'] = json_data.get('address', {}).get('streetAddress', '')
                result['locality'] = json_data.get('address', {}).get('addressLocality', '')
                result['region'] = json_data.get('address', {}).get('addressRegion', '')
                result['postal_code'] = json_data.get('address', {}).get('postalCode', '')
                result['latitude'] = json_data.get('geo', {}).get('latitude', '')
                result['longitude'] = json_data.get('geo', {}).get('longitude', '')
                result['url'] = json_data.get('url', '')

                # Add the result to the listings
                listings.append(result)
            else:
                print("No JSON data found. Using property card information.")

            # Continue with scraping using the response content
            for listing in soup.find_all('div', {'class': 'property-card-data'}):
                result = {}

                # Extract information from the property card
                result['zpid'] = listing.find('address', {'data-test': 'property-card-addr'}).get_text().strip()
                result['address'] = result.get('address', '')  # Don't overwrite if JSON extraction succeeded
                result['floor_size'] = ''  # You may modify this based on the property card structure
                result['street_address'] = ''  # You may modify this based on the property card structure
                result['locality'] = ''  # You may modify this based on the property card structure
                result['region'] = ''  # You may modify this based on the property card structure
                result['postal_code'] = ''  # You may modify this based on the property card structure
                result['latitude'] = ''  # You may modify this based on the property card structure
                result['longitude'] = ''  # You may modify this based on the property card structure
                result['url'] = ''  # You may modify this based on the property card structure

                # Continue extracting information from the property card
                result['price'] = listing.find('span', {'data-test': 'property-card-price'}).get_text().strip()
                details_list = listing.find('ul', {'class': 'dmDolk'})
                details = details_list.find_all('li') if details_list else []
                result['bedrooms'] = details[0].get_text().strip() if len(details) > 0 else ''
                result['bathrooms'] = details[1].get_text().strip() if len(details) > 1 else ''
                result['sqft'] = details[2].get_text().strip() if len(details) > 2 else ''
                type_div = listing.find('div', {'class': 'gxlfal'})
                result['type'] = type_div.get_text().split("-")[1].strip() if type_div else ''

                # Add the result to the listings
                listings.append(result)

        except json.JSONDecodeError as e:
            print(f"JSON decoding failed. Error: {e}")

    # Increment the page number
    print("Scraping done for:", url)
    return listings

async def scrape_data_async(total_pages):
    ua = UserAgent()
    user_agent = ua.random

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument(f"user-agent={user_agent}")
    
    # Set up the Selenium webdriver
    driver = webdriver.Chrome(options=chrome_options)

    data = {
        'zpid': [],
        'address': [],
        'floor_size': [],
        'street_address': [],
        'locality': [],
        'region': [],
        'postal_code': [],
        'latitude': [],
        'longitude': [],
        'url': [],
        'price': [],
        'bed': [],
        'bath': [],
        'type': [],
    }

    async with aiohttp.ClientSession() as session:
        tasks = []
        for x in range(1, total_pages + 1):
            url = package_url_rent(str(x))
            tasks.append(scrape_page(url, session, driver, data))

        # Use asyncio.gather() to run tasks concurrently
        results = await asyncio.gather(*tasks)

    # Flatten the list of lists
    listings = [item for sublist in results for item in sublist]

    # Close the Selenium webdriver
    driver.quit()

    # Create DataFrame
    df = pd.DataFrame(listings)
    return df

# Example: Call the scraping function with 2 pages
total_pages = 2
result_df = asyncio.run(scrape_data_async(total_pages))

# Display the result DataFrame
print(result_df)


RuntimeError: asyncio.run() cannot be called from a running event loop

## Data Manipulation

For some listings the information on number of bedroom, number of bathroom, and apartment size is incomplete or mixed up. I performed data manipulation to fix the mistaken values and clean up the extra symbols such as comma and dollar sign. <br\ >
Finally, I have two data sets containing the housing information for apartments for rent and apartments for sale. My for sale data set has 8,456 rows and 8 columns, and the for rent data set has 20,988 rows and 7 columns

In [None]:
import pandas as pd
import numpy as np


#is the apartment furnished?
cond=data['bed']=='Furnished'
data.loc[cond,'furnished']=1
data.loc[cond,'bed']=''

#move from size to bath
cond=[]
for i in data['size']:
    if(i==''):
        cond.append(False)
    else:
        cond.append(i.split(" ")[1] in ('bath','baths'))
data.loc[cond,'bath']=data.loc[cond,'size'] 
data.loc[cond,'size']=''

#move from bed to bath
cond=[]
for i in data['bed']:
    if(i=='' or i=='Furnished' or i=='studio'):
        cond.append(False)
    else:
        cond.append(i.split(" ")[1] in ('bath','baths'))
data.loc[cond,'bath']=data.loc[cond,'bed'] 
data.loc[cond,'bed']=''

#move from bath to bed
cond=[]
for i in data['bath']:
    if(i==''):
        cond.append(False)
    else:
        if(len(i.split(" "))==1):
            cond.append(True)
        else:
            if(i.split(" ")[1] in ('bath','baths')):
                cond.append(False)
            else:
                cond.append(True)
data.loc[cond,'bed']=data.loc[cond,'bath'] 
data.loc[cond,'bath']=''

#move from bed to size
cond=[]
for i in data['bed']:
    if(i=='' or i=='studio'):
        cond.append(False)
    else:
        if(i.split(" ")[1] in ('bed','beds')):
            cond.append(False)
        else:
            cond.append(True)
data.loc[cond,'size']=data.loc[cond,'bed'] 
data.loc[cond,'bed']=''


#replace blank with nan
data=data.applymap(lambda x: np.nan if x=='' else x)

#data
data.to_csv('rent.csv',encoding='utf-8')

In [None]:
#size to numeric
cond=data['size'].isnull()
for i in range(0,len(cond)):
    if (not cond[i]):
        data.loc[i,'size']=int(data['size'][i].split(" ")[0].replace(',',''))
#bath to numeric
cond=data['bath'].isnull()
for i in range(0,len(cond)):
    if (not cond[i]):
        data.loc[i,'bath']=float(data['bath'][i].split(" ")[0].replace('+',''))
#bed to numeric
cond=data['bed'].isnull()
data['bed']=data['bed'].replace('studio','0 bed')
for i in range(0,len(cond)):
    if (not cond[i]):
        data.loc[i,'bed']=float(data['bed'][i].split(" ")[0].replace(',','').replace('+',''))
#remove dollar sign
data['price']=[int(i.replace('$','').replace(',','')) for i in data['price']]

data.to_csv('rent_2.csv')