In [446]:
import pandas as pd
from random import randint
import time
import re
# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

In [2]:
# Starting URLs
centris = "https://www.centris.ca/en/properties~for-sale?view=Thumbnail"
duproprio = "https://duproprio.com/en/search/list?search=true&is_for_sale=1&with_builders=1&parent=1&pageNumber=1&sort=-published_at"

In [631]:
class Centris:
    """
    Represents a data object scraped from
    centris.ca. Uses selenium and the Chrome webdriver
    to access# Path to Chromedriver web elements.
    
    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.driver - Chrome webdriver
    self.containers - List of web-elements containing information on listings
        - eg: link to listing, price, picture, address,...
    self.links_to_listings - List of web-elements, each containing the link to a listing
    """
    # Path to Chromedriver
    DRIVER_PATH = 'C:/webdriver/chromedriver.exe'
        
    def __init__(self, url="https://www.centris.ca/en/condominium-houses"\
                             "~for-sale~saint-come/27673618?view=Summary&uc=2"): 
        self.url = url
        self.data = pd.DataFrame(columns=['title'\
                                        'address',\
                                        'price',\
                                        'lat',\
                                        'long'])
        self.driver = None
        self.containers = None
#       self.links_to_listings = None
        
    def append_data(self, title, address, price, lat, long):
        new_data = pd.DataFrame({'title': title,\
                                'address': address,\
                                'price': price,\
                                'lat': lat,\
                                'long': long}, index=[0])
        self.data = self.data.append(new_data, ignore_index=True)
             
    def get_data(self):
        return self.data
        
    def start_driver(self):
        """
        Starts and returns Crome webdriver. 
        The page link in the url attribute 
        is opened in headless mode.
        """
        
        # Activate headless mode for fastest response
        options = Options()
        options.add_argument("--headless")

        # Start driver with url
        self.driver = webdriver.Chrome(executable_path=DRIVER_PATH)
        self.driver.get(self.url)
    
#     def set_containers(self):
#         try:
#             containers = self.driver.find_elements_by_class_name(\
#                                             "description")
#             # Last element is empty string
#             self.containers = containers[:-1]
#         except:
#             print("Chrome driver is not running!")
#             print("Use: \"start_driver()\"")
            
    def goto_first_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='goFirst']")
            next_page.click()
        except:
            print("goFirst button not available")
    
    def next_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='next']")
            next_page.click()
        except:
            print("Next page button not available")
            
#     def set_links_to_listings(self):
#         """Retrieves list of elements from each container
#         with link to listing site"""
#         try:
#             self.links_to_listings = [container.\
#                             find_element_by_class_name("a-more-detail")\
#                                       for container in self.containers]
#         except:
#             if self.driver == None:
#                 print("Start driver first: start_driver()")
#             elif self.containers == None:
#                 print("Set container attribute: set_containers()" )
#             else: print("No class 'a-more-detail' found on current page")


                
    def get_data_from_link(self):
        """
        Retrieves information from link at link_index in  
        links_to_listings list and appends to self.data
        """
        
#         # Open link in new tab
#         actions = ActionChains(self.driver)\
#             .key_down(Keys.CONTROL)\
#             .click(element)\
#             .key_up(Keys.CONTROL)\
#             .perform()
#         # Move to new tab
#         self.driver.switch_to.window(self.driver.window_handles[-1])
        
        # Scrape data
        print("A new page is being scraped...")
        
        # Allow for time to load elements
        time.sleep(1)
        # Data from headers
        title = self.driver.find_element_by_xpath("//span[@data-id='PageTitle']").text
        address = self.driver.find_element_by_xpath("//h2[@itemprop='address']").text
        price = self.driver.find_element_by_xpath("//span[@itemprop='price']").text
        lat = self.driver.find_element_by_xpath("//meta[@itemprop='latitude']").get_attribute("content")
        long = self.driver.find_element_by_xpath("//meta[@itemprop='longitude']").get_attribute("content")
        print(title, address, price, lat, long)
        self.append_data(title, address, price, lat, long)
        
        # Scroll down 3x to load additional page elements
        body = self.driver.find_element_by_tag_name("body")
        for i in range(3):
            body.send_keys(Keys.PAGE_DOWN)
        # Allow for time to load elements
        time.sleep(1)
        
        # Description of listing (Year build, price, Lot area, etc.)
        descriptions = self.driver.find_element_by_xpath(\
                            "//div[@class='col-lg-12 description']")
        descriptions_list = descriptions.text.split("\n")
        
        # Rating of indicators between 0-10 (Groceries, parks, etc.)
        neighbourhood_indicators = self.driver.find_element_by_xpath(\
                             "//div[@class='ll-list ps ps--active-y']")
        neighbourhood_indicators_list = neighbourhood_indicators\
                            .text.split("\n")
        
        # Population summary data (density, variation etc.)
        population_summaries =  self.driver.find_element_by_id('info')
        population_summaries_list = population_summaries\
                            .text.split("\n")
        
        # Buttons to access demographics data (education, incomes, etc.)
        demographics_buttons = self.driver.find_elements_by_xpath(\
                            "//div[@class='centrisSocioDemobutton']")
        
        # First entry on clickable demographics list (pre-selected)
        demographics = [self.driver.find_element_by_xpath(\
                             "//div[@class='socioDemoLabel']").text]
        
        # Click buttons to access next demogrpahics elements
        for button in demographics_buttons[1:]:
            button.click()
            demographic_data = self.driver.find_element_by_xpath(\
                             "//div[@class='socioDemoLabel']")
            # Append to list
            demographics.append(demographic_data.text)
            
        # Return to top of page, to access next-page button
        for i in range(7):
            body.send_keys(Keys.PAGE_UP)
        
        # Logging
#         print(descriptions_list)
#         print(neighbourhood_indicators_list)
#         print(population_summaries_list)
#         print([demo for demo in demographics])
                                                  
# Instantiate class object
centris = Centris()

## Testing

In [632]:
# Test
start = time.time()
centris.start_driver()
print("Execution time:", time.time() - start)

Execution time: 6.429761171340942


In [634]:
for i in range(20):
    centris.get_data_from_link()
    centris.next_page()

centris.data

A new page is being scraped...
Condominium house for sale 235, Rue de l'Auberge, Saint-Côme $147,900 46.2725780000 -73.8838280000
A new page is being scraped...
Condo for sale 8635, Rue Lajeunesse, apt. 617, Montréal (Villeray/Saint-Michel/Parc-Extension), Neighbourhood Villeray $439,000 45.5461014500 -73.6364320000
A new page is being scraped...
Condo for sale 1117, Avenue De Bourlamaque, Québec (La Cité-Limoilou), Neighbourhood Montcalm $268,900 46.8020410000 -71.2269650000
A new page is being scraped...
Condo for sale 201, Rue du Val-des-Neiges, apt. 404, Beaupré $247,500 47.0615171900 -70.9083216400
A new page is being scraped...
Condo for sale 201, Rue du Val-des-Neiges, apt. 404, Beaupré $247,500 47.0615171900 -70.9083216400
A new page is being scraped...
Condo for sale 11445B, boulevard de la Colline, apt. 101, Québec (La Haute-Saint-Charles), Neighbourhood Loretteville $300,000 46.8594270000 -71.3444990000
A new page is being scraped...
House for sale 165, Rue Champêtre, Saint-

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@class='ll-list ps ps--active-y']"}
  (Session info: chrome=84.0.4147.89)


In [470]:
browser = webdriver.Chrome('C:/webdriver/chromedriver.exe')
browser.get("https://www.centris.ca/en/lots~for-sale~terrebonne-lachenaie/12479005?view=Summary&uc=1")
browser.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)

In [473]:
browser.find_element_by_xpath(\
        "//div[@class='ll-list ps ps--active-y']")

<selenium.webdriver.remote.webelement.WebElement (session="7d04f02d53d5f761449e6ce684bac0e3", element="a425e126-6a30-4109-89a3-560e2a4a5b79")>

In [572]:
centris.driver.find_element_by_xpath(\
                              "//div[@class='socioDemoLabel']").text

'University\n19%\n\nCollege\n15%\n\nSecondary (high) school\n26%\n\nApprentice or trade school diploma\n19%\n\nNo diploma\n20%'

In [576]:
body = centris.driver.find_element_by_tag_name('body')
body.send_keys(Keys.PAGE_DOWN)

In [580]:
body.send_keys(Keys.PAGE_DOWN)

In [630]:
centris.data

Unnamed: 0,address,lat,long,price,title,titleaddress
0,"235, Rue de l'Auberge, Saint-Côme",46.272578,-73.883828,"$147,900",Condominium house for sale,
1,"8635, Rue Lajeunesse, apt. 617, Montréal (Vill...",45.54610145,-73.636432,"$439,000",Condo for sale,
