In [1]:
import pandas as pd
from random import randint
import time
import re
# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

In [2]:
# Entry-point search pages of the two listing sites
centris = "https://www.centris.ca/en/properties~for-sale?view=Thumbnail"
duproprio = (
    "https://duproprio.com/en/search/list"
    "?search=true&is_for_sale=1&with_builders=1&parent=1"
    "&pageNumber=1&sort=-published_at"
)

In [267]:
class Centris:
    """
    Scraper for property listings on centris.ca.

    Uses selenium and the Chrome webdriver to access web elements and
    collects the scraped header fields in a pandas DataFrame.

    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.DRIVER_PATH - path to the chromedriver executable
    self.driver - Chrome webdriver (None until start_driver() is called)
    self.old_DOM - dict of the elements found on the previously scraped
        page, keyed by field name; used to verify that a new DOM is active
    """

    def __init__(self, url="https://www.centris.ca/en/houses~for-sale~lac-simon/11851081?view=Summary&uc=3"):
        self.url = url
        self.data = pd.DataFrame()
        # Path to Chromedriver
        self.DRIVER_PATH = 'C:/webdriver/chromedriver.exe'
        self.driver = None
        # Verification for new DOM
        self.old_DOM = {
            "title": "",
            "address": "",
            "price": "",
            "lat": "",
            "long": "",
            "descriptions": "",
            "neighbourhood_indicators": "",
            "demographics_buttons": "",
        }

    def append_data(self, title, address, price, lat, long):
        """Append one listing (title, address, price, lat, long) as a new row of self.data."""
        new_row = pd.DataFrame({'title': title,
                                'address': address,
                                'price': price,
                                'lat': lat,
                                'long': long}, index=[0])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement. The empty initial frame is left out so that
        # no empty entry takes part in the concatenation.
        frames = [new_row] if self.data.empty else [self.data, new_row]
        self.data = pd.concat(frames, ignore_index=True)

    def get_data(self):
        """Return the DataFrame holding everything scraped so far."""
        return self.data

    def start_driver(self, headless=False):
        """
        Starts the Chrome webdriver and opens the page in the url attribute.
        The driver is stored in self.driver; nothing is returned.

        Arg:
        headless - when True, Chrome runs without a visible window.
            Defaults to False so the browser window stays available for
            manual interaction (e.g. narrowing the search region by hand).
        """
        options = Options()
        if headless:
            # Headless mode for fastest response
            options.add_argument("--headless")
        options.add_argument("--disable-infobars")  # disabling infobars
        options.add_argument("--disable-extensions")  # disabling extensions
        options.add_argument("--disable-gpu")  # applicable to windows os only
        options.add_argument("--disable-dev-shm-usage")  # overcome limited resource problems
        options.add_argument("--no-sandbox")  # Bypass OS security model
        options.add_argument('--start-maximized')  # open Browser in maximized mode
        options.add_argument('--incognito')

        # Start driver with url. The options object has to be handed to the
        # driver explicitly, otherwise none of the flags above take effect.
        self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH,
                                       options=options)
        self.driver.get(self.url)

    def sort_listings(self):
        """Sorts listings in webdriver from newest to oldest."""

        # Click drop down menu
        drop_down = self.driver.find_element_by_xpath(
            "//button[@id='dropdownSort']")
        drop_down.click()

        # Sort by most recent listings
        sort_by = self.driver.find_element_by_xpath("//a[@data-option-value='3']")
        sort_by.click()

    def goto_first_page(self):
        """Click the 'goFirst' pager button; print a notice if that fails."""
        try:
            first_page = self.driver.find_element_by_xpath(
                "//li[@class='goFirst']")
            first_page.click()
        except Exception:
            # Best effort: a missing button must not abort the scrape, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("goFirst button not available")

    def next_page(self):
        """
        Click the 'next' pager button.

        If the first attempt fails, waits 0.5 sec. and tries once more;
        prints a notice when both attempts fail.
        """
        for attempt in range(2):
            try:
                self.driver.find_element_by_xpath(
                    "//li[@class='next']").click()
                return
            except Exception:
                if attempt == 0:
                    # Try again after waiting 0.5 sec.
                    time.sleep(0.5)
        print("Next-page button not found!")

    def get_last_page(self):
        '''Returns page number of last page in browser.'''

        # The pager text is split on " / " and the part after the separator
        # is parsed, with thousands separators removed.
        pager_text = self.driver.find_element_by_xpath(
            "//li[@class='pager-current']").text
        return int(pager_text.split(" / ")[1].replace(",", ""))
                
#     def get_data_from_listings(self):
#         """
#         Retrieves information from link at link_index in  
#         links_to_listings list and appends to self.data
        
#         Arg:
#         area - list of areas from which data should be scraped
#         """

#         # Data from headers
#         print("Start scraping new page...")
#         title = self.driver.find_element_by_xpath("//span[@data-id='PageTitle']")
#         address = self.driver.find_element_by_xpath("//h2[@itemprop='address']")
#         price = self.driver.find_element_by_xpath("//span[@itemprop='price']")
#         lat = self.driver.find_element_by_xpath("//meta[@itemprop='latitude']")
#         long = self.driver.find_element_by_xpath("//meta[@itemprop='longitude']")
#         self.append_data(title.text,\
#                          address.text,\
#                          price.text,\
#                          lat.get_attribute("content"),\
#                          long.get_attribute("content"))

#         # Description of listing (Year build, price, Lot area, etc.)
#         descriptions = self.driver.find_element_by_xpath("//div[@class='col-lg-12 description']")
#         descriptions_list = descriptions.text.split("\n")
        
#         # Rating of indicators between 0-10 (Groceries, parks, etc.)
#         neighbourhood_indicators = self.driver.find_element_by_xpath(\
#                              "//div[@class='ll-list ps ps--active-y']")
#         neighbourhood_indicators_list = neighbourhood_indicators\
#                             .text.split("\n")
        
#         # Population summary data (density, variation etc.)
#         population_summaries =  self.driver.find_element_by_id('info')
#         population_summaries_list = population_summaries\
#                             .text.split("\n")
        
#         # Buttons to access demographics data (education, incomes, etc.)
#         demographics_buttons = self.driver.find_elements_by_xpath(\
#                             "//div[@class='centrisSocioDemobutton']")
        
#         # First entry on clickable demographics list (pre-selected)
#         demographics = [self.driver.find_element_by_xpath(\
#                              "//div[@class='socioDemoLabel']").text]
        
#         # Click buttons to access next demogrpahics elements
#         for button in demographics_buttons[1:]:
#             try: 
#                 button.click()
#                 demographic_data = self.driver.find_element_by_xpath(\
#                              "//div[@class='socioDemoLabel']")
#                 demographics.append(demographic_data.text)
#             except : 
#                 print("Demogrphics button missing!")
#                 demographics.append(None)
        
#         # Return to top of page, to access next-page button
#         body = self.driver.find_element_by_tag_name("body")
#         for i in range(7):
#             body.send_keys(Keys.PAGE_UP)
                                                  
# Instantiate class object
# NOTE(review): this rebinds the name `centris`, which earlier held the
# search-page URL string; the scraper is created with the default url of
# Centris.__init__ rather than that string - confirm this is intended.
centris = Centris()

The following functions need to be outside of the class. wait_for_xpath() determines the appropriate time to call get_data(). Initially, both functions were part of the class object. It seems that after the get_data() call, the driver does not get updated within the class. In some cases this leads to old DOMs being accessed after the browser has already switched to the next page. To circumvent this issue, elements are looked up outside the class and retried until accessible. This allows the entire new DOM to be loaded before get_data() is called.

In [253]:
 def wait_for_xpath(centris_driver, xpath: str, old_element,
                    timeout: float = 10, poll_interval: float = 0.15):
     """
     Wait until elements in new DOM are accessible.

     The xpath is looked up after a short settling delay. A look-up that
     raises or finds nothing is retried twice. Afterwards the look-up is
     repeated for as long as the result still equals `old_element`, up to
     `timeout` seconds.

     Arg:
     centris_driver - webdriver exposing find_elements_by_xpath()
     xpath - xpath expression of the elements to load
     old_element - result returned for the same xpath on the previous page
     timeout - maximum number of seconds to wait for a changed result
     poll_interval - seconds to sleep between look-ups while waiting

     Returns:
     current_element - list of the elements found in the new DOM at xpath;
         empty when nothing could be loaded
     """

     def _lookup():
         # find_elements_* yields an empty list when nothing matches; any
         # driver error is mapped to None so the caller can simply retry.
         try:
             return centris_driver.find_elements_by_xpath(xpath)
         except Exception:
             return None

     # Wait until DOM accessible
     time.sleep(0.2)
     current_element = _lookup()
     retry_plan = (
         (0.2, "Second attempt loading element!"),
         (0.15, "Third and last attempt loading element!"),
     )
     for delay, message in retry_plan:
         if current_element:
             break
         time.sleep(delay)
         print(message)
         current_element = _lookup()
     if current_element is None:
         print("TimeOutError: Unable to load element")
         # Keep the return type a list so callers can test its length
         current_element = []

     # Ensure that the NEW rather than the previous DOM is active.
     # A monotonic deadline measures real elapsed time, including the time
     # spent inside the look-ups themselves.
     deadline = time.monotonic() + timeout
     while current_element == old_element and time.monotonic() < deadline:
         print("Waiting for new DOM...")
         time.sleep(poll_interval)
         refreshed = _lookup()
         if refreshed is not None:
             current_element = refreshed
     return current_element

In [292]:
def get_data_from_centris(centris_class_object, max_refreshes=3):
    """
    Scrapes the listing currently shown in the browser.

    The header fields (title, address, price, latitude, longitude) are
    appended to the data of centris_class_object. Description,
    neighbourhood indicators and demographics are read and printed for
    logging only. The elements found are saved as old_DOM on
    centris_class_object so that the next call can tell a new page from
    the previous one.

    Arg:
    centris_class_object - Centris object whose driver has been started
    max_refreshes - how many times the page may be refreshed and re-read
        when the neighbourhood indicators list has not been loaded
    """
    driver = centris_class_object.driver
    old_DOM = centris_class_object.old_DOM

    # Data from headers
    print("Start scraping new page...")
    title = wait_for_xpath(driver, "//span[@data-id='PageTitle']", old_DOM['title'])
    address = wait_for_xpath(driver, "//h2[@itemprop='address']", old_DOM['address'])
    price = wait_for_xpath(driver, "//span[@itemprop='price']", old_DOM['price'])
    lat = wait_for_xpath(driver, "//meta[@itemprop='latitude']", old_DOM['lat'])
    long = wait_for_xpath(driver, "//meta[@itemprop='longitude']", old_DOM['long'])

    # Description of listing (Year build, price, Lot area, etc.)
    descriptions = wait_for_xpath(driver, "//div[@class='col-lg-12 description']",
                                  old_DOM['descriptions'])
    # A listing without a description block yields an empty list instead
    # of an IndexError
    descriptions_list = descriptions[0].text.split("\n") if descriptions else []

    #LOGGING------------------------
    print("DESCRIPTION:", descriptions_list)

    # Rating of indicators between 0-10 (Groceries, parks, etc.)
    neighbourhood_indicators = wait_for_xpath(driver,
                                              "//div[@class='ll-list ps ps--active-y']",
                                              old_DOM['neighbourhood_indicators'])

    # Refresh page if indicators list hasn't been correctly loaded
    if not neighbourhood_indicators:
        if max_refreshes > 0:
            driver.refresh()
            time.sleep(2)
            # Start over on the refreshed page and stop here: the elements
            # collected above belong to the DOM from before the refresh.
            return get_data_from_centris(centris_class_object, max_refreshes - 1)
        # Refresh budget used up: continue without indicators
        neighbourhood_indicators_list = []
    else:
        neighbourhood_indicators_list = neighbourhood_indicators[0].text.split("\n")

    #LOGGING------------------------
    print("NEIGHBORHOOD:", neighbourhood_indicators_list)

    # Population summary data (density, variation etc.)
    # NOTE: read but currently neither printed nor stored
    population_summaries = driver.find_element_by_id('info')
    population_summaries_list = population_summaries.text.split("\n")

    # Buttons to access demographics data (education, incomes, etc.)
    demographics_buttons = wait_for_xpath(driver,
                                          "//div[@class='centrisSocioDemobutton']",
                                          old_DOM['demographics_buttons'])

    # Click each button and collect the label text it reveals
    demographics = []
    for button in demographics_buttons:
        try:
            button.click()
            demographic_data = driver.find_element_by_xpath(
                "//div[@class='socioDemoLabel']")
            demographics.append(demographic_data.text)
        except Exception:
            # Best effort: keep one entry per button so positions line up
            print("Demogrphics button missing!")
            demographics.append(None)

    #LOGGING------------------------
    print("DEMO. DATA:", demographics)
    print("-"*50)

    # Save elements as old DOM
    centris_class_object.old_DOM = {
        "title": title,
        "address": address,
        "price": price,
        "lat": lat,
        "long": long,
        "descriptions": descriptions,
        "neighbourhood_indicators": neighbourhood_indicators,
        "demographics_buttons": demographics_buttons,
    }

    # Register data in dataframe
    centris_class_object.append_data(title[0].text,
                                     address[0].text,
                                     price[0].text,
                                     lat[0].get_attribute("content"),
                                     long[0].get_attribute("content"))

    # Return to top of page, to access next-page button
    body = driver.find_element_by_tag_name("body")
    for _ in range(7):
        body.send_keys(Keys.PAGE_UP)

## Testing

In [293]:
# Test
# Smoke test: start the Chrome driver on the scraper's url, then switch the
# listing order via sort_listings(); both steps are timed together.
start = time.time()
centris.start_driver()
centris.sort_listings()
print("Execution time:", time.time() - start)

Execution time: 10.683602094650269


Before running the next cell, search for the region(s) you want to scrape in the webdriver window.
This is not required but will substantially limit run time and narrow the results.

In [294]:
# Scrape N listing pages one after another, reporting progress and an
# estimate of the remaining runtime each time a new whole percent of all
# pages has been completed.
start = time.time()
total_pages = centris.get_last_page() # Number of total listings
N = 2 # Number of pages to scrape in this run
last_reported_percent = 0 # Highest whole percent already reported

print("Scraping initiated.")
print("Totla number of pages to scrap:", total_pages)
print("="*50)

for i in range(min(N, total_pages)):
    print("Page:", i+1)

    #Refresh every 20 pages to clear memory build-up
    if (i+1)%20 == 0:
        print("Refreshing page")
        print("-"*50)

        # Each refresh frees some memory. Four seem to work best.
        # The counter is deliberately not called i, so that the page index
        # of the outer loop stays intact.
        for _ in range(4):
            centris.driver.refresh()
            # Give time to load fresh page
            time.sleep(0.3)
        # Extra time after the last refresh
        # Ensures that DOM is fully loaded
        time.sleep(2)

    #Retrieve data
    get_data_from_centris(centris)

    # Short delay for chrome to respond to PAGE_UP command
    time.sleep(0.5)
    centris.next_page()

    # Print once after every 1% mark
    pages_done = i + 1
    percent_complete = int(100 * pages_done / total_pages)
    if percent_complete > last_reported_percent:
        last_reported_percent = percent_complete
        # Extrapolate from the average time per page of THIS run
        seconds_per_page = (time.time() - start) / pages_done
        remaining_hours = (total_pages - pages_done) * seconds_per_page / (60*60)
        print(percent_complete, "%", "completed")
        print("Estimated remaining runtime:", round(remaining_hours, 1), "hours")
        print("="*50)

execution_time = time.time() - start
print("Total runtime:", execution_time/(60*60), "hours")
centris.data

Scraping initiated.
Totla number of pages to scrap: 45886
Page: 1
Start scraping new page...
DESCRIPTION: ['Features', '?', 'Lifestyle', '9 rooms', '3 bedrooms', '1 bathroom and 1 powder room', 'Building style', 'Two or more storey, Semi-detached', 'Year built', 'Under construction, New', 'Lot area', '4,279 sqft', 'Parking (total)', 'Driveway (2)', 'Additional features', 'Basement 6 feet or +']
NEIGHBORHOOD: ['Quiet', '10', 'Car friendly', '10', 'Groceries', '6', 'Parks', '5', 'Elementary Schools', '4', 'High Schools', '4', 'Restaurants', '4', 'Shopping', '4']
DEMO. DATA: ['Less than $50,000\n38%\n\nBetween $50,000 and $80,000\n24%\n\nBetween $80,000 and $100,000\n12%\n\nBetween $100,000 and $150,000\n17%\n\nMore than $150,000\n9%', '1-person households\n35%\n\n2-person households\n38%\n\n3-person households\n12%\n\n4-person households\n11%\n\n5-person or more households\n4%', 'Couples without children at home\n49%\n\nCouples with children at home\n36%\n\nSingle-parent families\n15%', 

Unnamed: 0,title,address,price,lat,long
0,Condo for sale,"4597, Avenue Papineau, Montréal (Le Plateau-Mo...","$385,000",45.5337310000,-73.5757970000
1,Condo for sale,"945, Rue Muir, apt. 306, Montréal (Saint-Laure...","$319,000",45.5235647300,-73.6708712000
2,Triplex for sale,"2606 - 2610, Rue Sicard, Montréal (Mercier/Hoc...","$887,000",45.5593864800,-73.5469227300
3,Duplex for sale,"11554 - 11556, Avenue Désy, Montréal (Montréal...","$679,000",45.6095560000,-73.6198640000
4,House for sale,"7040, Avenue Goncourt, Montréal (Anjou)","$695,000",45.6025520000,-73.5547700000
...,...,...,...,...,...
3264,Condo for sale,"1050, Rue Drummond, apt. 3305, Montréal (Ville...","$489,000",45.4969509000,-73.5703519800
3265,House for sale,"2379Z, Rue Grand Trunk, Montréal (Le Sud-Ouest...","$2,100,000",45.4786925900,-73.5656457500
3266,Condo for sale,"1900, boulevard Angrignon, apt. 1207, Montréal...","$999,000",45.4471880000,-73.6094830000
3267,House for sale,"1300, Rue du Blé, Lévis (Desjardins), Neighbou...","$259,000",46.8110149300,-71.1403038300


In [249]:
# Overview of the scraped frame: column dtypes, non-null counts, memory usage
centris.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 5 columns):
title      527 non-null object
address    527 non-null object
price      527 non-null object
lat        527 non-null object
long       527 non-null object
dtypes: object(5)
memory usage: 20.7+ KB


In [284]:
# Summary statistics for every column (count, unique, top, freq)
centris.data.describe(include="all")

Unnamed: 0,title,address,price,lat,long
count,200,200,200,200.0,200.0
unique,9,190,144,187.0,187.0
top,Condo for sale,"2055, Rue du Fort, apt. 402, Montréal (Ville-M...","$499,000",45.495988,-73.569304
freq,133,2,10,3.0,3.0


In [234]:
# start = time.time()
# N = 200 
# total_pages = 5574 # Number of Montreal listings
# previous_demographics = "" # demographics button elements found on previous page
# current_demographics = "" # demographics button elements found on current page
# previous_neighborhood = ""
# current_neighborhood = ""

# for i in range(N):
#     percent_complete = 100*(i/total_pages)
#     print("Page:", i+1)
#     time_passed = 0 # to exit while loop after 10 seconds
    
#     #Refrege every 20 pages to prevent running out of memory
#     if (i+1)%20 == 0:
#         print("Refreshing page")
#         print("-"*50)
#         # Each refresh releases about 1% of memory
#         # 1% of additional memory is required for every third page
#         # 20 pages -> 7 page refreshes 
#         for i in range(7):
#             centris.driver.refresh()
#             # Give extra time to load fresh page
#             time.sleep(0.3)
    
#     # Verify that demographics buttons and enighbourhood data are accessible
#     time.sleep(0.4)
#     try:
#         current_demographics = centris.driver.find_element_by_xpath(\
#                                 "//div[@class='centrisSocioDemobutton']")
#         current_neighborhood = centris.driver.find_element_by_xpath(\
#                                 "//div[@class='ll-list ps ps--active-y']")
#     except:
#         time.sleep(0.15)
#         try:
#             print("Second attempt loading element!")
#             current_demographics = centris.driver.find_elements_by_xpath(\
#                             "//div[@class='centrisSocioDemobutton']")
#             current_neighborhood = centris.driver.find_element_by_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']")
#         except:
#             time.sleep(0.15)
#             try:
#                 print("Third and last attempt loading element!")
#                 current_demographics = centris.driver.find_element_by_xpath(\
#                                 "//div[@class='centrisSocioDemobutton']")
#                 current_neighborhood = centris.driver.find_element_by_xpath(\
#                                 "//div[@class='ll-list ps ps--active-y']")
#             except:
#                 print("TimeOutError: Unable to load element")
    
# #     print("Previous:\n", previous_demographics)
# #     print("="*50)
# #     print("Current:\n", current_demographics)
    
#     # Ensure that the NEW rather than the previous DOM is active
#     while (current_demographics == previous_demographics\
#            and current_neighborhood == previous_neighborhood
#            and time_passed < 10):
#         print("Waiting for new DOM...")
#         time.sleep(0.15)
#         time_passed += 0.15

#         # Repeat loading attempt and compare to previous data -> while
#         current_demographics = centris.driver.find_element_by_xpath(\
#                                 "//div[@class='centrisSocioDemobutton']")
#         current_neighborhood = centris.driver.find_element_by_xpath(\
#                                 "//div[@class='ll-list ps ps--active-y']")
    
#     # Set current demographic data as previous for next verification
#     previous_demographics = current_demographics
#     previous_neighborhood = current_neighborhood
#     #Retrieve data
#     centris.get_data_from_listings()
#     centris.next_page()
            
#     # Print message after every 1% mark
#     if percent_complete in range(1,100):
#         print(percent_complete, "%", "completed")
#         print("Estimated remaining runtime:", round(46221*((execution_time/N)/(60*60)), 1), "hours")
#         print("="*50)

# execution_time = time.time() - start
# print(execution_time)
# centris.data
# print("Total expected scrape time =", round(46221*((execution_time/N)/(60*60)), 1), "hours")

Page: 1


AttributeError: 'Centris' object has no attribute 'get_data_from_listings'