## Scraping StreetEasy to analyze housing prices in New York City

My goal here is to collect housing prices for both rentals and sales in New York City. I looked at three major real estate websites: Trulia, Zillow, and StreetEasy. Compared to the other two websites, StreetEasy gives the most information on the search results page, and the format of each listing is very consistent, which is great for web scraping.<br />
<a href="https://streeteasy.com/">
<img alt="StreetEasy" src="map/streetEasy_logo.jpg" height="30px" width="150px"/></a><br />

Web scraping is done using the BeautifulSoup package in Python. I created two functions that can loop through all the pages of search results, along with empty lists to store the results. Below are the steps I took to scrape StreetEasy:
1. Analyzing the HTML page: the HTML code of a web page can be viewed by right-clicking and selecting 'Inspect'. This helps us identify the HTML tags that hold the information to be scraped.
2. Making the soup!: It is important to select the correct parser for your data type. I used the HTML parser.
3. Navigating the parse tree and iterating through tags: once the soup is made, we have the HTML code in Python. We can then find our desired information by searching through the HTML tags.

In [47]:
import os
import time
import sys
import numpy as np
import pandas as pd
import regex as re
import lxml
import numbers
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import random
from ratelimit import limits, sleep_and_retry
from datetime import datetime

In [58]:
# HTTP headers sent with every page request made by soups().
req_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # NOTE(review): 'br' advertises Brotli support. Presumably requests can
    # only decode Brotli responses when a brotli package is installed --
    # confirm, or drop 'br' from this value.
    'Accept-encoding': 'gzip, deflate, br',
    'Accept-language': 'en-US,en;q=0.8',
    'Upgrade-insecure-requests': '1',
    # Evaluated once, when this dict is built, so every request that uses
    # req_headers sends the same user-agent value.
    'User-agent': UserAgent().random,
}

# Search settings. The Zillow equivalent of the base URL is
# "https://www.zillow.com/homes/for_sale/".
base_url = "https://streeteasy.com/for-sale/"
city = 'nyc'
start_page = 2
end_page = 10

# The first results page carries no page suffix; it seeds the list.
url1 = f"{base_url}{city}/"
urls = [url1]

# Every following page is "<first page>page_<n>" (StreetEasy scheme);
# pages already in the list are not added twice.
for page_number in range(start_page, end_page + 1):
    page_url = f"{url1}page_{page_number}"
    if page_url in urls:
        continue
    urls.append(page_url)

print(urls)

# Define the rate limit: e.g., 5 calls per 60 seconds
@sleep_and_retry
@limits(calls=5, period=60)
def soups(data, timeout=30):
    """Download one results page and return it parsed with BeautifulSoup.

    The ``limits`` decorator is configured for 5 calls per 60 seconds and is
    wrapped in ``sleep_and_retry``, so the function is throttled rather than
    called freely inside the scraping loop.

    Args:
        data: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A BeautifulSoup tree built from the response body with the
        'html.parser' parser.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status (e.g. a block page), which must not be parsed as listings.
    """
    with requests.Session() as s:
        r = s.get(data, headers=req_headers, timeout=timeout)
        # Fail loudly on error pages instead of returning a soup that simply
        # contains no listing cards.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
    return soup


#for url in urls:
#    htmls=soups(url)
# Call soup function and store output in a list
#lst = []

#for url in urls:
#    htmls = soups(url)

#print(len(urls))

['https://streeteasy.com/for-sale/nyc/', 'https://streeteasy.com/for-sale/nyc/page_2', 'https://streeteasy.com/for-sale/nyc/page_3', 'https://streeteasy.com/for-sale/nyc/page_4', 'https://streeteasy.com/for-sale/nyc/page_5', 'https://streeteasy.com/for-sale/nyc/page_6', 'https://streeteasy.com/for-sale/nyc/page_7', 'https://streeteasy.com/for-sale/nyc/page_8', 'https://streeteasy.com/for-sale/nyc/page_9', 'https://streeteasy.com/for-sale/nyc/page_10']


In [39]:
import math

rate_limit = 5  # calls (pages) allowed per period
period = 60  # length of the rate-limit window, in seconds


def estimate_time(pages, rate_limit, period=60):
    """Estimate how long fetching ``pages`` pages takes under a rate limit.

    The parameters follow the order the function is actually called with
    (``estimate_time(pages, rate_limit)``); previously the first argument was
    bound to ``rate_limit`` and the page count was read from a global, which
    produced a wrong estimate.

    Args:
        pages: Number of pages to fetch.
        rate_limit: Number of calls allowed per ``period``.
        period: Length of the rate-limit window in seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints, with seconds in 0..59.
    """
    # Work in whole seconds: splitting a float number of minutes and then
    # rounding the fraction up can over-round or yield 60 seconds.
    total_seconds = math.ceil(pages * period / rate_limit)
    return divmod(total_seconds, 60)


# Example usage:
pages = 98

minutes, seconds = estimate_time(pages, rate_limit, period)
print(f"Estimated time needed: {minutes} minutes and {seconds} seconds.")


Estimated time to fetch data from 30 pages: 30.0 minutes


In [None]:
# Initialize a list to store listing information
def parse_soup(soup):
    """Extract one dict per listing card from a parsed results page.

    A field whose element is not present on a card is stored as ``None``
    (``[]`` for amenities) instead of raising, so a single incomplete card
    does not abort the whole page.

    Args:
        soup: Parsed page; every element with class "listingCard" is treated
            as one listing.

    Returns:
        A list of dicts with the keys "address",
        "building_type_neighborhood", "size (sq. ft.)", "price",
        "# bedrooms", "# bathrooms", "amenities", "description" and "url".
    """

    def find_text(card, name, class_name, strip=False):
        """Return the text of the first matching tag in ``card``, or None."""
        element = card.find(name, class_=class_name)
        if element is None:
            return None
        text = element.get_text()
        return text.strip() if strip else text

    listings = []

    for card in soup.find_all(class_="listingCard"):
        # Building type and neighborhood share one label element.
        building_type_neighborhood = find_text(
            card, None, "listingCardLabel", strip=True)

        address = find_text(
            card, 'address',
            'listingCard-addressLabel listingCard-upperShortLabel',
            strip=True)
        price = find_text(
            card, 'span', 'price listingCard-priceMargin', strip=True)

        # Free-text details are kept unstripped.
        description = find_text(card, 'div', 'description')
        bedrooms = find_text(card, 'div', 'bedrooms')
        bathrooms = find_text(card, 'div', 'bathrooms')
        size = find_text(card, 'div', 'size')

        amenities_container = card.find('ul', class_='amenities')
        if amenities_container is None:
            amenities = []
        else:
            amenities = [
                item.text for item in amenities_container.find_all('li')]

        link = card.find(
            'a', class_='listingCard-link jsCardLinkGA featured-link-to-hdp')
        url = link.get('href') if link is not None else None

        listings.append({
            "address": address,
            "building_type_neighborhood": building_type_neighborhood,
            "size (sq. ft.)": size,
            "price": price,
            "# bedrooms": bedrooms,
            "# bathrooms": bathrooms,
            "amenities": amenities,
            "description": description,
            "url": url,
        })

    return listings

        # Append the listing dictionary to the list of listings
        
            #address = data.find_all(class_= 'list-card-addr') zillow
            #price = list(data.find_all(class_='list-card-price')) zillow
            #beds = list(data.find_all("ul", class_="list-card-details")) zillow
            #last_updated = data.find_all('div', {'class': 'list-card-top'}) zillow

        

# Example usage:
# Define the URL from which you want to fetch the HTML content
#url = "your_url_here"

# Fetch the HTML content from the URL
#soup = soups(url)

# Extract the listing cards from the page held in `soup`.
# NOTE(review): `soup` is not assigned in this cell (the fetch above is
# commented out) -- presumably it is left over from an earlier cell; confirm.
listings = parse_soup(soup)

# Build a DataFrame from the list of listing dicts (one row per listing).
df = pd.DataFrame(listings)

# Show the resulting table.
print(df)

In [61]:
import pandas as pd
from collections import defaultdict
import re

# Occurrence count per listing address; unseen addresses start at 0.
# parse_soup() increments the entry of each address it extracts, so the counts
# keep accumulating across calls until this line is run again.
address_counter = defaultdict(int)

# A number written either with thousands separators ("1,200") or as a plain
# digit run ("1500"), optionally followed by a decimal part (".5").
_NUMBER_RE = re.compile(r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?')


def extract_numeric_value(text):
    """Return the first number found in ``text`` as a string, or None.

    Thousands separators are removed from the result ("1,200 ft" -> "1200").
    Plain digit runs of any length are matched in full; the previous pattern
    stopped after three digits when no comma followed ("1500" -> "150").

    Args:
        text: Text to search, or None.

    Returns:
        The number as a string without commas, or None when ``text`` is None
        or contains no digits.
    """
    if text is None:
        return None
    match = _NUMBER_RE.search(text)
    if match is None:
        return None
    return match.group(0).replace(',', '')

# Function to parse the HTML soup
def parse_soup(soup):
    """Extract one dict per listing card from a parsed results page.

    Missing elements are recorded as ``None`` (the URL keeps its
    'Property URL not found.' placeholder) instead of raising, matching how
    beds, baths and size were already treated. Each extracted address is
    also counted in the module-level ``address_counter``.

    Args:
        soup: Parsed page; every element with class "listingCard" is treated
            as one listing.

    Returns:
        A list of dicts with the keys "Address",
        "Building_type_neighborhood", "Size (sq. ft.)", "Price",
        "Bedrooms #", "Bathrooms # " and "Url".
    """

    def stripped_text(element):
        """Return the stripped text of ``element``, or None if it is absent."""
        return element.get_text().strip() if element is not None else None

    def detail_value(card, icon_class):
        """Return the number shown next to the detail icon ``icon_class``."""
        icon = card.find('span', class_=icon_class)
        if icon is None:
            return None
        # The value lives in the text span that follows the icon span.
        label = icon.find_next_sibling(
            'span', class_='listingDetailDefinitionsText')
        return extract_numeric_value(stripped_text(label))

    listings = []

    for card in soup.find_all(class_="listingCard"):
        address = stripped_text(card.find(
            'address',
            class_='listingCard-addressLabel listingCard-upperShortLabel'))
        building_type_neighborhood = stripped_text(card.find(
            'p',
            class_="listingCardLabel listingCardLabel-grey listingCard-upperShortLabel"))
        price = stripped_text(
            card.find('span', class_='price listingCard-priceMargin'))

        beds = detail_value(card, 'listingDetailDefinitionsIcon--bed')
        baths = detail_value(card, 'listingDetailDefinitionsIcon--bath')
        size = detail_value(card, 'listingDetailDefinitionsIcon--measure')

        url_element = card.find('a', class_='listingCard-link jsCardLinkGA')
        if url_element is not None:
            url = url_element.get('href')
        else:
            url = 'Property URL not found.'

        # Only real addresses are counted; a card without one has nothing
        # to de-duplicate on.
        if address is not None:
            address_counter[address] += 1

        listings.append({
            "Address": address,
            "Building_type_neighborhood": building_type_neighborhood,
            "Size (sq. ft.)": size,
            "Price": price,
            "Bedrooms #": beds,
            "Bathrooms # ": baths,
            "Url": url,
        })

    return listings

# Scrape every results page and keep one DataFrame per page.
all_listings = []

for url in urls:
    soup = soups(url)
    listings = parse_soup(soup)
    df = pd.DataFrame(listings)
    all_listings.append(df)

# Stack the per-page frames into one table with a fresh 0..n-1 index.
combined_df = pd.concat(all_listings, ignore_index=True)

# Keep exactly one row per address. A listing that shows up on several pages
# is retained once (its first occurrence); filtering on "seen exactly once"
# would drop such a listing from the export altogether.
unique_df = combined_df.drop_duplicates(subset='Address', keep='first')
print(f"Total number of unique addresses: {len(unique_df)}")

# Export to an Excel file whose name records the creation time and the
# scraped page range.
date_created = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
excel_file_name = f"unique_listings_{date_created}_pages_{start_page}_to_{end_page}.xlsx"
unique_df.to_excel(excel_file_name, index=False)
print(f"Unique DataFrame exported to {excel_file_name}")
print(unique_df)

#print("Unique DataFrame:")
#print(unique_df)

# Print the total number of unique addresses


Concatenated DataFrame:
                        Address        Building_type_neighborhood  \
0    543 West 122nd Street #28B                   New Development   
1      244 East 52nd Street #5A                   New Development   
2         66 Madison Avenue #4E                    Co-op in NoMad   
3         200 Rector Place #36A        Condo in Battery Park City   
4    50 Bridge Park Drive #17AE                   New Development   
5             89-15 96th Street                House in Woodhaven   
6           36 Remsen Street #2         Condo in Brooklyn Heights   
7            2373 Broadway #511         Condop in Upper West Side   
8           56 Mc Arthur Avenue          Multi-family in Annadale   
9     1020 Grand Concourse #11S                Co-op in Concourse   
10          138 Randolph Avenue  Multi-family in Bergen/Lafayette   
11          1021 Boulevard East         Multi-family in Weehawken   
12        21-68 35th Street #5D         Co-op in Ditmars-Steinway   
13    235 

## Data Manipulation

For some listings, the information on the number of bedrooms, the number of bathrooms, and the apartment size is incomplete or mixed up. I performed data manipulation to fix the mistaken values and clean up extra symbols such as commas and dollar signs. <br />
Finally, I have two data sets containing the housing information for apartments for rent and apartments for sale. My for-sale data set has 8,456 rows and 8 columns, and the for-rent data set has 20,988 rows and 7 columns.

In [None]:
import pandas as pd
import numpy as np


# Unit words that identify which column a scraped value really belongs to.
_BATH_UNITS = ('bath', 'baths')
_BED_UNITS = ('bed', 'beds')


def _unit(value):
    """Return the second space-separated token of ``value``, or None.

    '2 baths' -> 'baths'. Blank strings, single-token values such as
    'studio', and non-string values have no unit and give None, so the
    checks below never index past the end of a one-word value.
    """
    if not isinstance(value, str):
        return None
    tokens = value.split(" ")
    return tokens[1] if len(tokens) > 1 else None


# Is the apartment furnished? The flag arrives in the 'bed' column; move it
# to its own indicator column and blank the bed value.
cond = data['bed'] == 'Furnished'
data.loc[cond, 'furnished'] = 1
data.loc[cond, 'bed'] = ''

# The steps below run in a fixed order, because each one changes the columns
# the next one inspects.

# Move bath counts that landed in 'size' over to 'bath'.
cond = [_unit(value) in _BATH_UNITS for value in data['size']]
data.loc[cond, 'bath'] = data.loc[cond, 'size']
data.loc[cond, 'size'] = ''

# Move bath counts that landed in 'bed' over to 'bath'.
cond = [_unit(value) in _BATH_UNITS for value in data['bed']]
data.loc[cond, 'bath'] = data.loc[cond, 'bed']
data.loc[cond, 'bed'] = ''

# Move everything in 'bath' that is not a bath count over to 'bed'. This
# includes single-word values (no unit at all).
cond = [
    isinstance(value, str) and value != '' and _unit(value) not in _BATH_UNITS
    for value in data['bath']
]
data.loc[cond, 'bed'] = data.loc[cond, 'bath']
data.loc[cond, 'bath'] = ''

# Move everything in 'bed' that carries a unit other than bed/beds over to
# 'size'. Values without a unit ('', 'studio', other single words) stay put.
cond = [
    _unit(value) is not None and _unit(value) not in _BED_UNITS
    for value in data['bed']
]
data.loc[cond, 'size'] = data.loc[cond, 'bed']
data.loc[cond, 'bed'] = ''

# Replace blank strings with NaN.
data = data.replace('', np.nan)

data.to_csv('rent.csv', encoding='utf-8')

In [None]:
def _leading_number(text, cast):
    """Convert the first space-separated token of ``text`` with ``cast``.

    Commas and '+' signs are removed from the token first, so
    '1,200 sqft' -> 1200 with ``int`` and '2+ baths' -> 2.0 with ``float``.
    """
    token = text.split(" ")[0].replace(',', '').replace('+', '')
    return cast(token)


# A studio is recorded as zero beds; missing values are unaffected.
data['bed'] = data['bed'].replace('studio', '0 bed')

# Convert size to int and bath/bed to float, one cell at a time. Rows are
# addressed by their index label (not by position), and missing values are
# left as they are.
for column, cast in (('size', int), ('bath', float), ('bed', float)):
    present = data[column].notnull()
    for label in data.index[present]:
        data.loc[label, column] = _leading_number(data.loc[label, column], cast)

# Remove the dollar sign and thousands separators from the price. Values that
# are not strings (e.g. a missing price) are passed through unchanged instead
# of raising.
data['price'] = [
    int(price.replace('$', '').replace(',', '')) if isinstance(price, str) else price
    for price in data['price']
]

data.to_csv('rent_2.csv')