In [446]:
import pandas as pd
from random import randint
import time
import re
# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

In [2]:
# Starting URLs
centris = "https://www.centris.ca/en/properties~for-sale?view=Thumbnail"
duproprio = "https://duproprio.com/en/search/list?search=true&is_for_sale=1&with_builders=1&parent=1&pageNumber=1&sort=-published_at"

In [537]:
class Centris:
    """
    Represents a data object scraped from
    centris.ca. Uses selenium and the Chrome webdriver
    to access# Path to Chromedriver web elements.
    
    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.driver - Chrome webdriver
    self.containers - List of web-elements containing information on listings
        - eg: link to listing, price, picture, address,...
    self.links_to_listings - List of web-elements, each containing the link to a listing
    """
    # Path to Chromedriver
    DRIVER_PATH = 'C:/webdriver/chromedriver.exe'
        
    def __init__(self, url="https://www.centris.ca/en/condominium-houses"\
                             "~for-sale~saint-come/27673618?view=Summary&uc=2"): 
        self.url = url
        self.data = pd.DataFrame(columns=['title',\
                                          'price(CAD)',\
                                          'address'])
        self.driver = None
        self.containers = None
#       self.links_to_listings = None
        
    def add_data(self, title, price, address):
        new_data = pd.DataFrame({'title': title,
                                'price(CAD)': price,
                                 'address': address})
        self.data = self.data.append(new_data)
             
    def get_data(self):
        return self.data
        
    def start_driver(self):
        """
        Starts and returns Crome webdriver. 
        The page link in the url attribute 
        is opened in headless mode.
        """
        
        # Activate headless mode for fastest response
        options = Options()
        options.add_argument("--headless")

        # Start driver with url
        self.driver = webdriver.Chrome(executable_path=DRIVER_PATH)
        self.driver.get(self.url)
    
#     def set_containers(self):
#         try:
#             containers = self.driver.find_elements_by_class_name(\
#                                             "description")
#             # Last element is empty string
#             self.containers = containers[:-1]
#         except:
#             print("Chrome driver is not running!")
#             print("Use: \"start_driver()\"")
            
    def goto_first_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='goFirst']")
            next_page.click()
        except:
            print("goFirst button not available")
    
    def next_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='next']")
            next_page.click()
        except:
            print("Next page button not available")
            
#     def set_links_to_listings(self):
#         """Retrieves list of elements from each container
#         with link to listing site"""
#         try:
#             self.links_to_listings = [container.\
#                             find_element_by_class_name("a-more-detail")\
#                                       for container in self.containers]
#         except:
#             if self.driver == None:
#                 print("Start driver first: start_driver()")
#             elif self.containers == None:
#                 print("Set container attribute: set_containers()" )
#             else: print("No class 'a-more-detail' found on current page")
                
    def get_data_from_link(self):
        """
        Retrieves information from link at link_index in  
        links_to_listings list and appends to self.data
        """
        
#         # Open link in new tab
#         actions = ActionChains(self.driver)\
#             .key_down(Keys.CONTROL)\
#             .click(element)\
#             .key_up(Keys.CONTROL)\
#             .perform()
#         # Move to new tab
#         self.driver.switch_to.window(self.driver.window_handles[-1])
        
        # Scrape data
        print("Starting scrape process.")
        # Scroll down 3x to load entire page
        for i in range(3):
            self.driver.find_element_by_tag_name("body")\
                .send_keys(Keys.PAGE_DOWN)
        
        # Description of listing (Year build, price, Lot area, etc.)
        descriptions = self.driver.find_element_by_xpath(\
                            "//div[@class='col-lg-12 description']")
        descriptions_list = descriptions.text.split("\n")
        
        # Rating of indicators between 0-10 (Groceries, parks, etc.)
        neighbourhood_indicators = self.driver.find_element_by_xpath(\
                             "//div[@class='ll-list ps ps--active-y']")
        neighbourhood_indicators_list = neighbourhood_indicators\
                            .text.split("\n")
        
        # Population summary data (density, variation etc.)
        population_summaries =  self.driver.find_element_by_id('info')
        population_summaries_list = population_summaries\
                            .text.split("\n")
        
        # Buttons to access demographics data (education, incomes, etc.)
        demographics_buttons = self.driver.find_elements_by_xpath(\
                            "//div[@class='centrisSocioDemobutton']")
        demographics = []
        # Click all buttons to access all information
        for button in demographics_buttons:
            button.click()
            time.sleep(3)
            demographic_data = self.driver.find_elements_by_class_name(\
                             "socioDemoLabel")
            print(demographic_data)
            demographics.append(demographic_data[0])
            
        # Scroll down 3x to make next page button accessible
        for i in range(3):
            self.driver.find_element_by_tag_name("body")\
                .send_keys(Keys.PAGE_DOWN)
        
        # Logging
        print(descriptions_list)
        print(neighbourhood_indicators_list)
        print(population_summaries_list)
        print("Numeber of buttons:", len(demographics_buttons))
        print([demo.text for demo in demographics])
                                                  
# Instantiate class object
centris = Centris()

In [540]:
# Test
start = time.time()
centris.start_driver()
print("Execution time:", time.time() - start)

Execution time: 6.110896587371826


In [541]:
centris.get_data_from_link()
centris.next_page()
centris.get_data_from_link()

Starting scrape process.
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f77c6b47-30b0-4129-96c5-cf6f5338f01c")>]
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f77c6b47-30b0-4129-96c5-cf6f5338f01c")>]
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f77c6b47-30b0-4129-96c5-cf6f5338f01c")>]
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f77c6b47-30b0-4129-96c5-cf6f5338f01c")>]
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f77c6b47-30b0-4129-96c5-cf6f5338f01c")>]
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f77c6b47-30b0-4129-96c5-cf6f5338f01c")>]
[<selenium.webdriver.remote.webelement.WebElement (session="8006c9fe12c138a0128e27295a872e20", element="f

In [470]:
browser = webdriver.Chrome('C:/webdriver/chromedriver.exe')
browser.get("https://www.centris.ca/en/lots~for-sale~terrebonne-lachenaie/12479005?view=Summary&uc=1")
browser.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)

In [473]:
browser.find_element_by_xpath(\
        "//div[@class='ll-list ps ps--active-y']")

<selenium.webdriver.remote.webelement.WebElement (session="7d04f02d53d5f761449e6ce684bac0e3", element="a425e126-6a30-4109-89a3-560e2a4a5b79")>

In [472]:
browser.find_element_by_tag_name("body").send_keys(Keys.PAGE_DOWN)