## Scraping Zillow.com to analyze housing price in New York City 

My goal here is to collect housing prices for both rental and sale in New York city. I looked at three major real estate website including Trulia, Zillow, and StreetEasy. Comparing to the other two websites, StreetEasy gives the most information on the searching results page and the format of each listing is very consistent, which is great for the purpose of web-scraping.<br\ >
<a href="http://zillow.com/">
<img "StreetEasy" src="map/streetEasy_logo.jpg" height="30px" width="150px"/></a><br\ >

Web scraping is done using the beautifulsoup package in Python. I created two functions that can loop through all the pages of searching results, and also empty strings to store results. Below are the steps I took to scrape StreetEasy:
1. Analyzing the HTML page: HTML code of a web page can be viewed by right click and selecting 'Inspect'. This helps us identifying the HTML tags of the information to be scraped
2. Making the soup!: It is important to select the correct parser for your data type. I used HTML parser.
3. Navigating the parse tree and iterate through tags: once the soup is made, we have the HTML code in Python. We can then find our desired information by searching through HTML tags.

In [1]:
def package_url_sale(page):
    """Creates a URL for sales based on the page number."""
    return f'https://www.zillow.com/new-york-ny/for_sale/{page}_p/'

In [2]:
def package_url_rent(page):
    """Creates a URL for rentals based on the page number."""
    return f'https://www.zillow.com/new-york-ny/rentals/{page}_p/'

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from typing import List
import requests
import time


#price=[]
#where=[]
#bed=[]
#bath=[]
#size=[]
#monthly=[]
#street=[]

In [7]:
from tqdm import tqdm

# Define session and configure it with headers and timeout
s = requests.Session()
s.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'http://www.google.com',
    'X-Location': 'NYC',
})
s.timeout = (5, 5)  # Set timeout for connect and read

def make_request(session, url):
    try:
        response = session.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses
        return response.content  # or response.text, depending on what you need
    except requests.Timeout:
        print("Request timed out.")
    except requests.RequestException as e:
        print(f"Request failed: {e}")
    return None

def scrape_data() -> None:
    """Scrapes data from StreetEasy and populates price, where, bed, bath, size, and street lists."""
    price: List[str] = []
    where: List[str] = []
    bed: List[str] = []
    bath: List[str] = []
    size: List[str] = []
    street: List[str] = []

    # Define the total number of pages
    total_pages = 20

    # Create a DataFrame outside the loop to avoid creating it in every iteration

    data={'street':street,'price':price,'where':where,'bed':bed, 'bath':bath, 'size':size,'furnished':0}
    
    # Then, you can use this function in your scraping code like this:
    # Create a tqdm progress bar for the loop
    progress_bar = tqdm(range(1, total_pages + 1), desc="Scraping Pages", position=0)

    for x in progress_bar:
        url = package_url_rent(str(x))
        print(f"Scraping URL: {url}")
        r = make_request(s, url)
        if r:
            print(f"Response Content: {r}")
        else:
            print("Request failed or timed out.")

        time.sleep(10)             # Implement a rate limit (e.g., 1 request per 2 seconds)
        # Continue with scraping using the response content
        
        soup = BeautifulSoup(r.text, 'html.parser')
        listings = []
        for listing in soup.find_all('div', {'class': 'property-card-data'}):
            result = {}
            result['address'] = listing.find('address', {'data-test': 'property-card-addr'}).get_text().strip()
            result['price'] = listing.find('span', {'data-test': 'property-card-price'}).get_text().strip()
            details_list = listing.find('ul', {'class': 'dmDolk'})
            details = details_list.find_all('li') if details_list else []
            result['bedrooms'] = details[0].get_text().strip() if len(details) > 0 else ''
            result['bathrooms'] = details[1].get_text().strip() if len(details) > 1 else ''
            result['sqft'] = details[2].get_text().strip() if len(details) > 2 else ''
            type_div = listing.find('div', {'class': 'gxlfal'})
            result['type'] =  type_div.get_text().split("-")[1].strip() if type_div else ''
            listings.append(result)
        '''if r:
            soup = BeautifulSoup(r, 'html.parser')
            lst = soup.find_all(lambda tag: tag.has_attr('data-id'))
            for i in range(len(lst)):
                #price
                if lst[i].find_all('span',{'class':'price'})==[]:
                    price.append('')
                else:
                    price.append(lst[i].find_all('span',{'class':'price'})[0].string)
                #where
                length=len(lst[i].find_all('div',{'class':'details_info'}))
                if(lst[i].find_all('div',{'class':'details_info'})[0].find_all('a',href=True)==[]):
                    if(length==1):
                        where.append('')
                    else:
                        if(lst[i].find_all('div',{'class':'details_info'})[1].find_all('a',href=True)==[]):
                            where.append('')
                        else:
                            where.append(lst[i].find_all('div',{'class':'details_info'})[1].find_all('a',href=True)[0].string)
                else:
                    where.append(lst[i].find_all('div',{'class':'details_info'})[0].find_all('a',href=True)[0].string)
                #bedroom
                if(lst[i].find_all('span',{'class':'first_detail_cell'})==[]):
                    bed.append('')
                else:
                    bed.append(lst[i].find_all('span',{'class':'first_detail_cell'})[0].string)
                #bedroom
                if(lst[i].find_all('span',{'class':'detail_cell'})==[]):
                    bath.append('')
                else:
                    bath.append(lst[i].find_all('span',{'class':'detail_cell'})[0].string)
                #size
                if(lst[i].find_all('span',{'class':'last_detail_cell'})==[]):
                    size.append('')
                else:
                    size.append(lst[i].find_all('span',{'class':'last_detail_cell'})[0].string)
                #monthly rent
                #monthly.append(lst[i].find_all('span',{'class':'monthly_payment'})[0].string)
                #street
                street.append(lst[i].find_all('div',{'class':'details-title'})[0].a.string)
                '''
        
        data['street'].extend([result['address'] for result in listings])
        data['price'].extend([result['price'] for result in listings])
        data['where'].extend([result['address'] for result in listings])
        data['bed'].extend([result['bedrooms'] for result in listings])
        data['bath'].extend([result['bathrooms'] for result in listings])
        data['size'].extend([result['sqft'] for result in listings])
        data['furnished'].extend([0] * len(listings))
                
        # Update the progress bar description with the current page number
        progress_bar.set_postfix(Page=f"{x}/{total_pages}")
        progress_bar.refresh()  # Manually refresh the tqdm bar

        '''# Add scraped data to the 'data' dictionary
         data['street'] = street
        data['price'] = price
        data['where'] = where
        data['bed'] = bed
        data['bath'] = bath
        data['size'] = size
        data['furnished'] = [0] * len(street)
        '''
        time.sleep(12)             # Implement a rate limit (e.g., 1 request per 2 seconds)

    progress_bar.close()         # Close the progress bar after processing all pages

    df=pd.DataFrame(data)        # Create DataFrame after the loop
            
    print("done")  # Print "done" outside the loop
    return df
            
 
# Call the scraping function
scrape_data()

Scraping Pages:   0%|          | 0/20 [00:00<?, ?it/s]

Scraping URL: https://www.zillow.com/new-york-ny/rentals/1_p/


Scraping Pages:   0%|          | 0/20 [00:10<?, ?it/s]


AttributeError: 'bytes' object has no attribute 'text'

## Data Manipulation

For some listings the information on number of bedroom, number of bathroom, and apartment size is incomplete or mixed up. I performed data manipulation to fix the mistaken values and clean up the extra symbols such as comma and dollar sign. <br\ >
Finally, I have two data sets containing the housing information for apartments for rent and apartments for sale. My for sale data set has 8,456 rows and 8 columns, and the for rent data set has 20,988 rows and 7 columns

In [None]:
import pandas as pd
import numpy as np


#is the apartment furnished?
cond=data['bed']=='Furnished'
data.loc[cond,'furnished']=1
data.loc[cond,'bed']=''

#move from size to bath
cond=[]
for i in data['size']:
    if(i==''):
        cond.append(False)
    else:
        cond.append(i.split(" ")[1] in ('bath','baths'))
data.loc[cond,'bath']=data.loc[cond,'size'] 
data.loc[cond,'size']=''

#move from bed to bath
cond=[]
for i in data['bed']:
    if(i=='' or i=='Furnished' or i=='studio'):
        cond.append(False)
    else:
        cond.append(i.split(" ")[1] in ('bath','baths'))
data.loc[cond,'bath']=data.loc[cond,'bed'] 
data.loc[cond,'bed']=''

#move from bath to bed
cond=[]
for i in data['bath']:
    if(i==''):
        cond.append(False)
    else:
        if(len(i.split(" "))==1):
            cond.append(True)
        else:
            if(i.split(" ")[1] in ('bath','baths')):
                cond.append(False)
            else:
                cond.append(True)
data.loc[cond,'bed']=data.loc[cond,'bath'] 
data.loc[cond,'bath']=''

#move from bed to size
cond=[]
for i in data['bed']:
    if(i=='' or i=='studio'):
        cond.append(False)
    else:
        if(i.split(" ")[1] in ('bed','beds')):
            cond.append(False)
        else:
            cond.append(True)
data.loc[cond,'size']=data.loc[cond,'bed'] 
data.loc[cond,'bed']=''


#replace blank with nan
data=data.applymap(lambda x: np.nan if x=='' else x)

#data
data.to_csv('rent.csv',encoding='utf-8')

In [None]:
#size to numeric
cond=data['size'].isnull()
for i in range(0,len(cond)):
    if (not cond[i]):
        data.loc[i,'size']=int(data['size'][i].split(" ")[0].replace(',',''))
#bath to numeric
cond=data['bath'].isnull()
for i in range(0,len(cond)):
    if (not cond[i]):
        data.loc[i,'bath']=float(data['bath'][i].split(" ")[0].replace('+',''))
#bed to numeric
cond=data['bed'].isnull()
data['bed']=data['bed'].replace('studio','0 bed')
for i in range(0,len(cond)):
    if (not cond[i]):
        data.loc[i,'bed']=float(data['bed'][i].split(" ")[0].replace(',','').replace('+',''))
#remove dollar sign
data['price']=[int(i.replace('$','').replace(',','')) for i in data['price']]

data.to_csv('rent_2.csv')