# Web Scraping Toronto Property Prices

To extract the residential property prices in different regions of Greater Toronto area, we use the website https://realtor.ca. 

We scrape data from the website using Selenium package. 

In [102]:
import os
import time
import AppKit
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait

# Minimum number of clicks to scroll through different pages.
clicks = 51

# Enter the video url below.
url = "https://www.realtor.ca/"

# Regions in the Greater Toronto area to be scraped.
areas = [
        'Ajax, ON', 'Clarington, ON', 'Brock, ON', 'Oshawa, ON', 
        'Pickering, ON', 'Scugog, ON', 'Uxbridge, ON', 'Whitby, ON', 
        'Burlington, ON', 'Halton Hills, ON', 'Milton, ON', 
        'Oakville, ON', 'Brampton, ON', 'Caledon, ON', 
        'Mississauga, ON', 'Aurora, ON', 'East Gwillimbury, ON', 
        'Georgina, ON', 'King, ON', 'Markham, ON', 'Newmarket, ON', 
        'Richmond Hill, ON', 'Vaughan, ON', 'Whitchurch-Stouffville, ON', 
        'Old Toronto, Toronto, ON', 'Hamilton, ON', 'Guelph, ON',  
        'Kitchener, ON', 'Cambridge, ON', 'Brantford, ON', 'Scarborough, ON'
    ]
 

# Loop through different regions to scrape property listing prices and other relevant details.
for area in areas:
    driver = webdriver.Chrome()
    print('Opening the Browser.')
    time.sleep(4)
    driver.get(url)
    print('Browser opened the requested url.')
    AppKit.NSBeep()
    print('Waiting for the manual captcha entering by the user.')
    
    # This step required manual entering of the Captcha.
    wait = 25
    print(f'Captcha is to be entered within', wait, 'seconds.')
    time.sleep(wait)
    driver.find_element_by_id("homeSearchTxt").send_keys(area)
    print('Region name entered in the search bar.')
    time.sleep(6)
    driver.find_element_by_xpath('//*[@id="homeSearchBtn"]').click()
    time.sleep(6)
    driver.find_element_by_xpath('//*[@id="polygonOptInBtn"]').click()
    time.sleep(10)

    button = driver.find_element_by_xpath('//*[@id="SideBarPagination"]/div/a[3]/div')
    
    price = []
    region = []
    address = []
    bedrooms = []
    bathrooms = []
    
    while clicks > 0:
        time.sleep(1)
        listings = driver.find_elements_by_class_name('cardCon')
        for index in range(0, len(listings)):
            try:
                price_iter = listings[index].find_element_by_class_name('smallListingCardPrice').text
            except:
                price_iter = 'NaN'
            price.append(price_iter)
            try:
                region_iter = driver.find_element_by_id('locationSearchFilterText').text
            except:
                region_iter = 'NaN'
            region.append(region_iter)
            try:
                address_iter = listings[index].find_element_by_class_name('smallListingCardAddress').text
            except:
                address_iter = 'NaN'
            address.append(address_iter)
            try:
                bedrooms_iter = listings[index].find_element_by_class_name('smallListingCardIconNum').text
            except:
                bedrooms_iter = 'NaN'
            bedrooms.append(bedrooms_iter)
            try:
                bathrooms_iter = listings[index].find_elements_by_class_name('smallListingCardIconNum')[1].text
            except:
                bathrooms_iter  ='NaN'
            bathrooms.append(bathrooms_iter)
        button.click()
        print('click-',{clicks})
        clicks -= 1


    data = pd.DataFrame(
        {
            'price' : price,
            'region' : region,
            'address' : address,
            'bedrooms' : bedrooms,
            'bathrooms' : bathrooms
        }
    )   

    # Drop duplicate values in the dataframe. 
    data.drop_duplicates(inplace= True)
    #data.to_excel(excel_writer = os.getcwd() + '/' + area + '.xlsx', index = False)
    print('Dataframe saved.')
    driver.quit()
    print('Driver closed.')
    print('There are', data.shape[0], 'number of rows in the dataframe.')

Opening the Browser.
Browser opened the requested url.
Waiting for the manual captcha entering by the user.
Captcha is to be entered within 25 seconds.


  driver.find_element_by_id("homeSearchTxt").send_keys(area)


Region name entered in the search bar.


  driver.find_element_by_xpath('//*[@id="homeSearchBtn"]').click()
  driver.find_element_by_xpath('//*[@id="polygonOptInBtn"]').click()
  button = driver.find_element_by_xpath('//*[@id="SideBarPagination"]/div/a[3]/div')
  listings = driver.find_elements_by_class_name('cardCon')
  region_iter = driver.find_element_by_id('locationSearchFilterText').text


click- {51}
click- {50}
click- {49}
click- {48}
click- {47}
click- {46}
click- {45}
click- {44}
click- {43}
click- {42}
click- {41}
click- {40}
click- {39}
click- {38}
click- {37}
click- {36}
click- {35}
click- {34}
click- {33}
click- {32}
click- {31}
click- {30}
click- {29}
click- {28}
click- {27}
click- {26}
click- {25}
click- {24}
click- {23}
click- {22}
click- {21}
click- {20}
click- {19}
click- {18}
click- {17}
click- {16}
click- {15}
click- {14}
click- {13}
click- {12}
click- {11}
click- {10}
click- {9}
click- {8}
click- {7}
click- {6}
click- {5}
click- {4}
click- {3}
click- {2}
click- {1}
Dataframe saved.
Driver closed.
There are 118 number of rows in the dataframe.


Load separate dataframes from different regions and combine them to make a single large dataset.

In [94]:
combined_toronto_property_data = pd.DataFrame({
                'price' : [], 
                'region' : [], 
                'address' : [], 
                'bedrooms' : [], 
                'bathrooms' : []
                })
                
for index in range(0, len(areas)):
    combined_toronto_property_data = pd.concat(
        [combined_toronto_property_data , 
        pd.read_excel(os.getcwd()+'/'+areas[index]+'.xlsx')]
    )

Unnamed: 0,price,region,address,bedrooms,bathrooms
0,"$799,000","Ajax, ON","2 ROLLO DR, Ajax, Ontario",3,3.0
1,"$989,000","Ajax, ON","717 OLD HARWOOD AVE, Ajax, Ontario",2 + 1,1.0
2,"$999,900","Ajax, ON","52 ADDLEY CRES, Ajax, Ontario",3,4.0
3,"$799,900","Ajax, ON","249 MONARCH AVE, Ajax, Ontario",3,3.0
4,"$899,999","Ajax, ON","18 MONK CRES, Ajax, Ontario",3,3.0
...,...,...,...,...,...
417,"$1,150,000","Scarborough, Toronto, ON","#902 -2799 KINGSTON RD, Toronto, Ontario",3,3.0
418,"$12,200,000","Scarborough, Toronto, ON","3475* DANFORTH AVE, Toronto, Ontario",,
419,"$550,000","Scarborough, Toronto, ON","#506 -2201 KINGSTON RD, Toronto, Ontario",1,1.0
420,"$1,820,000","Scarborough, Toronto, ON","103 SLAN AVE, Toronto, Ontario",7 + 3,4.0


Save the dataframe to an excel sheet.

In [108]:
combined_toronto_property_data.to_excel(excel_writer= os.getcwd()+'/combined_toronto_property_data.xlsx', index= False)