In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from random import randint
import time
import re
# Scraping through chrome driver 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Starting URLs for the two listing sites.
# NOTE: the name `centris` is rebound to a Centris instance in a later cell
# (`centris = Centris()`); after that cell runs this URL string is no longer
# reachable under this name.
centris = "https://www.centris.ca/en/properties~for-sale?view=Thumbnail"
duproprio = "https://duproprio.com/en/search/list?search=true&is_for_sale=1&with_builders=1&parent=1&pageNumber=1&sort=-published_at"
# Path to the chromedriver executable (read by Centris.start_driver).
# This is an absolute, machine-specific Windows path; adjust it per environment.
DRIVER_PATH = 'C:/webdriver/chromedriver.exe'

In [188]:
class Centris:
    """
    Scraper for real estate listings on centris.ca. Uses selenium and the
    Chrome webdriver to access web elements, and accumulates extracted rows
    in a pandas DataFrame.

    Attr:
    self.url - starting url for scraping process
    self.data - DataFrame with columns 'title', 'price(CAD)', 'address'
    self.driver - Chrome webdriver; None until start_driver() is called
    self.containers - elements with class "description" found on the current
                      page; None until set_containers() succeeds
    self.links_to_listings - one "a-more-detail" element per container;
                             None until set_links_to_listings() succeeds
    """

    def __init__(self, url="https://www.centris.ca/en/properties~for-sale?view=Thumbnail"):
        # Starting url
        self.url = url
        # Extracted data
        self.data = pd.DataFrame(columns=['title', 'price(CAD)', 'address'])
        # Google Chrome driver
        self.driver = None
        # List of real estate descriptions found on current page
        self.containers = None
        # List of container elements with links to listings (<a class="a-more-detail"...>)
        self.links_to_listings = None

    def add_data(self, title, price, address):
        """
        Appends one or more listings to 'self.data'.

        Each argument may be a single value (one listing) or a list-like of
        values (several listings, one entry per listing). Rows are appended
        with a fresh running index.
        """
        fields = {'title': title,
                  'price(CAD)': price,
                  'address': address}
        if all(pd.api.types.is_scalar(value) for value in fields.values()):
            # A dict of plain scalars cannot be turned into a DataFrame
            # directly (pandas requires an index), so wrap it as one record.
            new_data = pd.DataFrame([fields])
        else:
            new_data = pd.DataFrame(fields)

        # DataFrame.append was removed in pandas 2.0; use concat instead.
        # Empty frames are left out so they do not influence the result dtypes.
        frames = [frame for frame in (self.data, new_data) if not frame.empty]
        if frames:
            self.data = pd.concat(frames, ignore_index=True)

    def get_data(self):
        """Returns the DataFrame of listings collected so far."""
        return self.data

    def start_driver(self):
        """
        Starts the Chrome webdriver and stores it in 'self.driver'.
        The specified 'self.url' page is opened in headless mode.
        """

        # Activate headless mode for fastest response
        options = Options()
        options.add_argument("--headless")

        # Start driver with url; the options must be handed to Chrome,
        # otherwise the headless flag is never applied.
        self.driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
        self.driver.get(self.url)

    def stop_driver(self):
        """Quits the Chrome webdriver if one was started and clears 'self.driver'."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def set_containers(self):
        """
        Collects the elements with class "description" from the current page
        into 'self.containers'. Prints a hint instead of raising when the
        lookup fails (e.g. the driver was never started).
        """
        try:
            containers = self.driver.find_elements_by_class_name("description")
            # The trailing two matches are dropped. The original note says the
            # last element is an empty string.
            # NOTE(review): the slice removes two elements, not one - confirm
            # against the live page before changing it.
            self.containers = containers[:-2]
        except Exception:
            print("Chrome driver is not running!")
            print("Use: \"start_driver()\"")

    def next_page(self):
        """Clicks the 'next' pagination item; prints a message when that fails."""
        try:
            next_page = self.driver.find_element_by_xpath("//li[@class='next']")
            next_page.click()
        except Exception:
            print("Next page button not available")

    def set_links_to_listings(self):
        """Retrieves list of elements from each container with link to listing site"""
        try:
            self.links_to_listings = [container.find_element_by_class_name("a-more-detail")
                                      for container in self.containers]
        except Exception:
            # Report the most likely cause, checked in order of the setup steps
            if self.driver is None:
                print("Start driver first: start_driver()")
            elif self.containers is None:
                print("Set container attribute: set_containers()" )
            else:
                print("No class 'a-more-detail' found on current page")

# Instantiate class object with the default starting url.
# NOTE: this rebinds `centris`, which an earlier cell assigned the centris.ca
# URL string; from here on `centris` refers to the Centris scraper instance.
centris = Centris()

In [193]:
# Smoke test: open the start page, collect the listing containers,
# then the link element inside each container
centris.start_driver()
centris.set_containers()
centris.set_links_to_listings()

# Show the target url of every listing link found on the page
[anchor.get_attribute('href') for anchor in centris.links_to_listings]

['https://www.centris.ca/en/lots~for-sale~l-ange-gardien-outaouais/10602831?view=Summary',
 'https://www.centris.ca/en/houses~for-sale~donnacona/15635416?view=Summary',
 'https://www.centris.ca/en/5plex~for-sale~longueuil-saint-hubert/22568604?view=Summary',
 'https://www.centris.ca/en/condos~for-sale~quebec-la-cite-limoilou/18796294?view=Summary',
 'https://www.centris.ca/en/hobby-farms~for-sale~saint-edouard-de-maskinonge/26266459?view=Summary',
 'https://www.centris.ca/en/condos~for-sale~montreal-ahuntsic-cartierville/9938131?view=Summary',
 'https://www.centris.ca/en/duplexes~for-sale~montreal-cote-des-neiges-notre-dame-de-grace/25165775?view=Summary',
 'https://www.centris.ca/en/houses~for-sale~montreal-saint-leonard/25120106?view=Summary',
 'https://www.centris.ca/en/lots~for-sale~baie-saint-paul/13817190?view=Summary',
 'https://www.centris.ca/en/condos~for-sale~montreal-le-sud-ouest/27915966?view=Summary',
 'https://www.centris.ca/en/lots~for-sale~rouyn-noranda/23007347?view=Su

In [4]:
# def get_container_descriptions(url:str):
#     driver = start_driver(url)
#     container_descriptions = driver.find_elements_by_class_name("description")
#     return container_descriptions

In [10]:
# def get_next_page(driver):
#     try:
#         next_page = driver.find_element_by_xpath("//li[@class='next']")
#         next_page.click()
#     except:
#         print("Next page button not available")

In [11]:
# Click Next button
# driver = start_driver(centris)
# get_next_page(driver)