In [869]:
import pandas as pd
import numpy as np
from random import randint
import time
import re
# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
# Starting URLs
centris = "https://www.centris.ca/en/properties~for-sale?view=Thumbnail"
duproprio = "https://duproprio.com/en/search/list?search=true&is_for_sale=1&with_builders=1&parent=1&pageNumber=1&sort=-published_at"

In [906]:
class Centris:
    """
    Accessing 
    
    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.driver - Chrome webdriver
    self.containers - List of web-elements containing information on listings
        - eg: link to listing, price, picture, address,...
    self.links_to_listings - List of web-elements, each containing the link to a listing
    """
     
    def __init__(self, url="https://www.centris.ca/en/houses~for-sale~lac-simon/11851081?view=Summary&uc=3"): 
        self.url = url
        self.data = pd.DataFrame(\
#                     {\
#                         'title': None,\
#                         'address': None,\
#                         'price': None,\
#                         'lat': None,\
#                         'long': None,\
#                         'descriptions': None,\
#                         'neighbourhood_indicators': None,\
#                         'demographics': None
#                     }, index=[0]\
                                )
        # Path to Chromedriver
        self.DRIVER_PATH = 'C:/webdriver/chromedriver.exe'
        self.driver = None
        # Verification for new DOM
        self.old_DOM = {\
                        "title" : [],\
                        "address": [],\
                        "price": [],\
                        "lat": [],\
                        "long": [],\
                        "descriptions": [],\
                        'neighbourhood_top': [],\
                        'neighbourhood_buttom': [],\
                        "demographics_buttons": [],\
                    }

    def append_data(self, title, address, price,\
            lat, long, descriptions, neighbourhood_indicators,\
            population, demographics):
        """Appends new data to existing data frame."""
        new_data = pd.DataFrame({\
                        'title': title,\
                        'address': address,\
                        'price': price,\
                        'lat': lat,\
                        'long': long\
                    }, index=[0])
        
        # DESCRIPTIONS
        description_table = pd.DataFrame()
#         headers_of_interest = [\
#                 "rooms", "bedrooms", "powder room", "Number of units", "Building style",\
#                 "Condominium type", "Year built", "Building area", "Lot area", "walk_score",\
#                 "Net area", "Parking (total)", "Main unit", "Potential gross revenue", "Pool"\
#                               ]
#         # Ensures consistency accross listings
#         for header in headers_of_interest:
#             if header in descriptions.keys():
#                 value = descriptions[header]
#             else:
#                 value = np.nan
#             description_table[header] = pd.Series(value)

        for key in descriptions.keys():
            header = key
            value = descriptions[header]
            description_table[header] = pd.Series(value)
        
        # POPULATION AND DEMOGRAPHICS
        new_data = pd.concat([new_data, neighbourhood_indicators, description_table,\
                             population, demographics], axis=1)
        # LOGGING --------------------------     
        #print(new_data)
        
        self.data = self.data.append(new_data, sort=False,\
                                     ignore_index=True)
        
    
    def get_data(self):
        return self.data
        
    def start_driver(self):
        """
        Starts and returns Crome webdriver. 
        The page link in the url attribute 
        is opened in headless mode.
        """
        
        # Activate headless mode for fastest response
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-infobars"); # disabling infobars
        options.add_argument("--disable-extensions"); # disabling extensions
        options.add_argument("--disable-gpu"); # applicable to windows os only
        options.add_argument("--disable-dev-shm-usage"); # overcome limited resource problems
        options.add_argument("--no-sandbox"); # Bypass OS security model
        options.add_argument('--start-maximized') # open Browser in maximized mode
        options.add_argument('--incognito')

        # Start driver with url
        self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH)
        self.driver.get(self.url)

    def sort_listings(self):
        """Sorts listings in webdriver from newest to oldest."""
        
        # Click drop down menu
        drop_down = self.driver.find_element_by_xpath(\
                                    "//button[@id='dropdownSort']")
        drop_down.click()
        
        # Sort by most recent listings
        sort_by = self.driver.find_element_by_xpath("//a[@data-option-value='3']")
        sort_by.click()
    
    def goto_first_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='goFirst']")
            next_page.click()
        except:
            print("goFirst button not available")
    
    def next_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='next']")
            next_page.click()
            pass
        except:
            time.sleep(0.5)
            # Try again after waiting 0.5 sec.
            try:
                next_page = self.driver.find_element_by_xpath(\
                                            "//li[@class='next']")
                next_page.click()
                pass
            except:
                print("Next-page button not found!")
                
    def get_last_page(self):
        '''Returns page number of last page in browser.'''
        
        last_page = self.driver.find_element_by_xpath(\
                                    "//li[@class='pager-current']").text
        return int(last_page.split(" / ")[1].replace(",", ""))
    
    def refresh_page(self):
        "Refreshes current webdriver page."
        self.driver.refresh()
        # Wait until page fully loaded
        time.sleep(2)
        
    def distance(origin, destination):
        """Calculates distances from latitudinal/longitudinal data using
        the haversine formula"""
        lat1, lon1 = origin
        lat2, lon2 = destination
        radius = 6371 # km
        
        #Convert from degrees to radians
        dlat = math.radians(lat2-lat1)
        dlon = math.radians(lon2-lon1)
        
        # Haversine formula
        a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
            * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
        d = radius * c

        return d
                
                                                 
# Instantiate class object
centris = Centris()

The following functions need to be outside of the Class. wait_for_xpath() determined the approptiate time to call get_data(). Initially, both fuctions were part of the class object. It seems that after the get_data() call, the driver does not get updated within the class. This leads in some cases to old DOM's being accessed after the browser has already switched to the next page. To circumvent this issue, elements are called outside the class and tried until accessible. This allows the entire new DOM to be loaded before get_data() is called.

In [681]:
 def wait_for_xpath(xpath: str, old_element):
        """
        Wait until elements in new DOM are accessible.
        
        Returns:
        current_element - the element found in the new DOM at xpath
        """
        
        # Wait until DOM accessible
        time.sleep(0.2)
        centris_driver = centris.driver
        current_element = []
        try:
            current_element = centris_driver.find_elements_by_xpath(xpath)
        except:
            time.sleep(0.2)
            try:
                print("Second attempt loading element!")
                current_element = centris_driver.find_elements_by_xpath(xpath)
            except:
                time.sleep(0.15)
                try:
                    print("Third and last attempt loading element!")
                    current_element = centris_driver.find_elements_by_xpath(xpath)
                except:
                    print("TimeOutError: Unable to load element")
        
        # LOGGING --------------------------------
#         print("Previous:\n", old_element)
#         print("Current:\n", current_element)
#         print("="*50)
        
        # Ensure that the NEW rather than the previous DOM is active
        time_passed = 0
        while (current_element == old_element\
               and time_passed < 10):
            print("Waiting for new DOM...")
            time.sleep(0.15)
            time_passed += 0.15
            current_element = centris_driver.find_elements_by_xpath(xpath)
        return current_element

In [682]:
def scrape_description(old_DOM):
    """ Requires instantiated centris object. Scrapes and returns
    description data: Year build, price, Net area, etc."""
    
    descriptions = wait_for_xpath("//div[@class='col-lg-12 description']",\
                                 old_DOM)
    #First three elements not relevant
    descriptions_list = descriptions[0].text.split("\n")[3:]
    
    #LOGGING------------------------
    #print("DESCRIPTION:", descriptions_list)
    
    # Update old_DOM dictionary with new element for next verification
    centris.old_DOM['descriptions'] = descriptions
    
    return extract_descriptions(descriptions_list)

In [827]:
def extract_descriptions(descriptions_list):
    """Takes in data from scrape_description() and returns it 
    as a dictionary"""
    
    # The data_dict found on this part of the page is inconsistent across listings
    # The first row may contain the number of rooms, bedrooms and bathrooms without headers or may be missing
    # Following rows have heathers with associated values after a line break
    # The very last element may be a walking score without header
    # Listings without first row may supply first row information in subsequent rows with headers
    # Because of these inconsistencies, two seperate extractions need to be implemented: one for
    # first row lements (if they exist) and another for subsequent rows
    
    # Transformed data
    data_dict = {}
    # Distinguish between elements from first and subsequent rows if first row exists
    first_row = True
    # Starting point for second part of transformation
    second_row_index = 0
    
    # First Part
    while first_row == True:
        for description in descriptions_list:
            numeric = re.findall("\(*[0-9]+\)*", description) # numbers
            text = re.findall("[A-Za-z]+[A-Za-z\s\-]*", description) # text after/inbetween numbers 

            # Initial elements with numeric values correspond to first row
            if (numeric != []):
                # For each value there must be one text description
                if (len(numeric) == len(text)):
                    for description,value in zip(text, numeric):
                        # Save as column in data_dict
                        description_clean = description.replace("and", "").strip()
                        data_dict[description_clean] = value
                    second_row_index += 1 
                else:
                    print("Unequal number of first row keys and values!")
                    print("Numbers:", numeric)
                    print("Text:", text)
                    break
            else:
                first_row = False # No numeric information implies header
                break
    
    # Index range of second extraction
    # Headers are found at every second index (0,2,4,...)
    # Values are one index apart from their corresponding header (1,3,5,...)
    list_length = len(descriptions_list)
    if (list_length - second_row_index)%2 == 1: # Implies presence of element without header -> Walk Score
        walk_score_listed = True
        end_point = list_length -1
    else:
        walk_score_listed = False
        end_point = list_length
    # Indices corresponding to headers
    extraction_range = range(second_row_index, end_point, 2)
    
    #LOGGING----------------------
#     print("Second row index:", second_row_index)
#     print("Extraction range:", extraction_range)
#     print("List:", descriptions_list)
    
    # Second Part
    for header_index in extraction_range:
        # Headers as column names
        header = descriptions_list[header_index]
        # Values corresponding to headers are found at subsequent indices
        information = descriptions_list[header_index + 1] 
        data_dict[header] = information
    
    if walk_score_listed:
        data_dict["walk_score"] = descriptions_list[-1]
        #LOGGING----------------------
        #print("Walk Score:", descriptions_list[-1])
        
    #LOGGING--------------------------
#     print("Descriptions:", data_dict)
        
    return data_dict

In [871]:
def scrape_neighbourhood(old_DOM_top, old_DOM_buttom):
    """ Scrapes and returns a list of ratings 
    between 0-10 for a set of neighborhood indicators
    such as groceries, parks, noise, etc.)
    """
    driver = centris.driver
    
    # Extract elements from top section of scrollable list
    neighbourhood_top = wait_for_xpath(\
                            "//div[@class='ll-list ps ps--active-y']",\
                            old_DOM_top)
    # Split into indicators and ranking values
    top = [x.text for x in neighbourhood_top][0].split("\n")
    
    # LOGGING----------------------
#     print("Top neighbourhood:", top)
    
    # Extract middle section - only one element
    # Scroll and activate scrollable bar container
    scrollable_bar = driver.find_element_by_xpath(\
                                            "//div[@class='ps__thumb-y']")
    ActionChains(driver).\
        move_to_element(scrollable_bar).\
        send_keys(Keys.PAGE_DOWN).\
        click(scrollable_bar)

    # Elements from buttom of scrollable list
    neighbourhood_mid = driver.find_element_by_class_name('ll-list')
    # Split into indicators and ranking values
    middle = neighbourhood_mid.text.split("\n")
    
    # Extract buttom section
    # Scroll and load remaining elements
#     scrollable_bar = driver.find_element_by_xpath(\
#                                             "//div[@class='ps__thumb-y']")
    scrollable_bar.send_keys(Keys.PAGE_DOWN)
    # Activate container to find elements
    scrollable_bar.click()
    # Elements from buttom of scrollable list
    neighbourhood_buttom = driver.find_element_by_class_name('ll-list')
    # Split into indicators and ranking values
    buttom = neighbourhood_buttom.text.split("\n")
    
    # LOGGING----------------------
#     print("Buttom neighbourhood:", buttom)
    
    # Unite all three sections by storing tuples of indicator names and corresponding values
    united_list = []
    list_length = len(top)
    for i in range(0, list_length, 2):
        united_list.append((top[i], top[i+1]))
        united_list.append((middle[i], middle[i+1]))
        united_list.append((buttom[i], buttom[i+1]))
    
    # Create set of unique tuples
    neighbourhood_indicators = set(united_list)
    
    # LOGGING----------------------
#     print("Number of neighborhood indicators: ", len(neighbourhood_indicators))
#     print("UNITED:", united_list)
#     print("SET:", neighbourhood_indicators)
    
    # Verify size and extract information as list
    # If size unexpected, refresh page and restart process
    if len(neighbourhood_indicators) < 8:
            centris.refresh_page()
            scrape_neighbourhood(old_DOM_top, old_DOM_buttom)

    # Update old_DOM dictionary with new elements for next verification
    centris.old_DOM['neighbourhood_top'] = neighbourhood_top
    centris.old_DOM['neighbourhood_buttom'] = neighbourhood_buttom

    return extract_neighbourhood_indicators(neighbourhood_indicators)

In [763]:
def extract_neighbourhood_indicators(indicators):
    """Takes in neighbourhood data from scrape_neighbourhood() and returns it 
    in tabular form as a DataFrame object"""
    data = pd.DataFrame()
    for indicator in indicators:
        header = indicator[0]
        value = indicator[1]
        data[header] = pd.Series(value)
    
    return data

In [876]:
def scrape_population():
    """Scrapes and returns population summary data (density, variation etc.)"""
    population_summaries =  centris.driver.find_element_by_id('info')
    population_summaries_list = population_summaries\
                        .text.split("\n")
    
    # LOGGING-----------------------
    #print("Population:", population_summaries_list)
    
    return extract_population(population_summaries_list)

In [877]:
def extract_population(population):
    """Takes in population data from scrape_population() and returns it 
    in tabular form as a DataFrame object"""
    
    data = pd.DataFrame()
    for info in population:
        # Numeric data
        numeric = re.findall("[0-9]+[0-9,]*", info)
        numeric_clean = numeric[-1].replace(",","")

        # Text data for column names
        header = re.findall("[a-zA-Z\s]+", info)
        header_clean = header[0]
        # Add numeric data to header excluding the value at index -1
        for numeric_head_data in numeric[:-1]:
            header_clean = header_clean + str(numeric_head_data) + " "

        data[header_clean] = pd.Series(numeric_clean).astype("int")
    return data

In [879]:
def scrape_demographics(old_DOM):
    """Scrapes and return demographic data found in a clickable list"""
    
    #Buttons to access demographics data (education, incomes, etc.)
    demographics_buttons = wait_for_xpath(\
                        "//div[@class='centrisSocioDemobutton']",\
                                                 old_DOM)

    # LOGGING------------------------
    # print("DEMO. BUTTONS:", demographics_buttons)

    # First entry on clickable demographics list (pre-selected)
    demographics = []

    # Click buttons to access next demogrpahics elements
    for button in demographics_buttons:
        try: 
            button.click()
        except: 
            print("Demogrphics button missing!")
            demographics.append(None)
            
        # Get and append data after button click
        demographic_data = centris.driver.find_element_by_class_name(\
                         "socioDemoLabel")
        demographics.append(demographic_data.text)
    
    # Split each demographic component into separate list
    # Example: splits "Occupation" data into -> ["Owners", "35%", "Renters", "65%"]
    demographics = [demo.split("\n") for demo in demographics]
    
    #LOGGING------------------------
#     print("DEMO. DATA:", demographics)
#     print("-"*50)
    
    # Update old_DOM dictionary with new elements for next verification
    centris.old_DOM['demographics_buttons'] = demographics_buttons
    
    return extract_demographics(demographics)

In [889]:
def extract_demographics(demographics):
    """Takes in demographic data from extract_demographics() and returns it 
    in tabular form as a DataFrame object"""
    # Remove empty stings from splitting double line breaks \n\n
    demographics_clean = []
    data = pd.DataFrame()
    for demographic in demographics:
        demographics_clean.append([x for x in demographic if x != ""])

    for demographic in demographics_clean:
        header_index = range(0, len(demographic), 2)
        for i in header_index:
            header = demographic[i] + " (%)"
            value = demographic[i+1].replace("%", "")
            data[header] = pd.Series(value).astype("int")

    return data

In [868]:
def get_data_from_centris():
        """
        Requires instantiate Centris object. Scrapes information from the
        webdriver and appends it to the Centris object.
        """
        driver = centris.driver
        old_DOM = centris.old_DOM
        
        # Data from headers
        print("Start scraping new page...")
        title = wait_for_xpath("//span[@data-id='PageTitle']", old_DOM['title'])
        address = wait_for_xpath("//h2[@itemprop='address']", old_DOM['address'])
        price = wait_for_xpath("//span[@itemprop='price']", old_DOM['price'])
        lat = wait_for_xpath("//meta[@itemprop='latitude']", old_DOM['lat'])
        long = wait_for_xpath("//meta[@itemprop='longitude']", old_DOM['long'])
        
        # Save elements as old DOM
        centris.old_DOM['title'] = title
        centris.old_DOM['address'] = address
        centris.old_DOM['lat'] = lat
        centris.old_DOM['long'] = long
        
        # Scrape remaining elements
        descriptions = scrape_description(old_DOM['descriptions'])
        neighbourhood_indicators = scrape_neighbourhood(old_DOM['neighbourhood_top'],\
                                                           old_DOM['neighbourhood_buttom'])
        population = scrape_population()
        demographics = scrape_demographics(old_DOM['demographics_buttons'])
                
        # Register data in dataframe
        centris.append_data(
            title[0].text,\
            address[0].text,\
            price[0].text,\
            lat[0].get_attribute("content"),\
            long[0].get_attribute("content"),\
            descriptions,\
            neighbourhood_indicators,\
            population,\
            demographics\
        )
        
        # LOGGING--------------------------
        #print("GET DATA: DESCRIPTIONS:", descriptions)
        
        # Return to top of page, to access next-page button
        body = driver.find_element_by_tag_name("body")
        body.send_keys(Keys.HOME)
#         for i in range(7):
#             body.send_keys(Keys.PAGE_UP)

## Testing

In [909]:
# Test
start = time.time()
centris.start_driver()
centris.sort_listings()
print("Execution time:", time.time() - start)

Execution time: 7.710015773773193


Before running the next cell, search for the region(s) you want to scrape in the webdriver window.
This is not required but will substential limit run time and narrow results.

In [910]:
start = time.time() 
total_pages = centris.get_last_page() # Number of total listings
one_to_100 = range(1,100) # Used to print message after each 1% completion

print("Scraping initiated.")
print("Total number of pages to scrape:", total_pages)
print("="*50)

for i in range(5):
    # -0.01 corrects for overshoots
    percent_complete = round(100*((i-0.01)/total_pages),1) # percent completed of scraping
    print("="*50)
    print("Page:", i+1)
    time_passed = 0 # to exit while loop after 10 seconds
    
    #Refresh every 20 pages to clear memory build-up
    if (i+1)%20 == 0:
        print("Refreshing page")
        print("-"*50)
        
        # Each refresh frees some memory. Four seem to work best.
        for i in range(4):
            centris.driver.refresh()
            # Give time to load fresh page
            time.sleep(0.3)
            # Extra time for last refresh
            # Ensures that DOM is fully loaded
            if i == 3:
                time.sleep(2)
            
    #Retrieve data    
    get_data_from_centris()
    
    # Short delay for chrome to respond to PAGE_UP command
    time.sleep(0.5)
    centris.next_page()
            
    # Print after every 1% mark
    if percent_complete in one_to_100:
        print(percent_complete, "%", "completed")
        print("Estimated remaining runtime:", round(total_pages*((execution_time/N)/(60*60)), 1), "hours")
        print("="*50)

execution_time = time.time() - start
print("Total runtime:", execution_time/(60*60), "hours")
centris.data

Scraping initiated.
Total number of pages to scrap: 44719
Page: 1
Start scraping new page...
Page: 2
Start scraping new page...
Page: 3
Start scraping new page...
Page: 4
Start scraping new page...
Waiting for new DOM...
Page: 5
Start scraping new page...
Total runtime: 0.007531742387347751 hours


Unnamed: 0,title,address,price,lat,long,Daycares,High Schools,Transit friendly,Elementary Schools,Vibrant,...,Others languages (%),in basement,Pool,Population density 207,Cycling friendly,bathroom,Population density 588,Population density 305,Zoning,Population density 67
0,House for sale,"4325, 6e Rue, Laval (Chomedey), Neighbourhood ...","$379,900",45.5336,-73.753706,8,3,6.0,9,3,...,30,,,,,,,,,
1,House for sale,"158, Rue du Cardinal, Sherbrooke (Brompton/Roc...","$197,000",45.398287,-72.016106,1,0,1.0,10,1,...,1,(2,Above-ground,2.0,,,,,,
2,House for sale,"6541, Rue de Vénus, Québec (La Haute-Saint-Cha...","$354,000",46.878505,-71.330987,8,0,4.0,5,2,...,1,,,,5.0,1.0,2.0,,,
3,Cottage for sale,"2485, Chemin Hemming, Drummondville, Neighbour...","$454,900",45.861339,-72.399308,0,0,,0,0,...,1,,,,,,,2.0,,
4,Lot for sale,"boulevard des Hêtres, Shawinigan, Neighbourhoo...","$149,900",46.556794,-72.751599,1,3,,7,2,...,0,,,,,,,,"Commercial, Residential",2.0


# Expedited troubleshooting

In [911]:
centris.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 80 columns):
title                                      5 non-null object
address                                    5 non-null object
price                                      5 non-null object
lat                                        5 non-null object
long                                       5 non-null object
Daycares                                   5 non-null object
High Schools                               5 non-null object
Transit friendly                           3 non-null object
Elementary Schools                         5 non-null object
Vibrant                                    5 non-null object
Restaurants                                5 non-null object
Nightlife                                  5 non-null object
Parks                                      4 non-null object
Groceries                                  5 non-null object
Car friendly                               5 

In [442]:
centris.data.describe(include="all")

Unnamed: 0,title,address,price,lat,long,descriptions,neighbourhood_indicators,demographics
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
top,,,,,,,,
freq,,,,,,,,


In [443]:
# # Elements from top of scrollable list
#     neighbourhood_top = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM)
    
#     # DEBUGGING-----------------------------------
#     print("FIRST:", [x.text for x in centris.driver.find_elements_by_class_name("ll-header")])
#     for button in centris.driver.find_elements_by_class_name("ll-header"):
#         time.sleep(1)
#         button.click()
#     # DEBUGGING-----------------------------------
    
#     # Scroll to buttom of list to load remaining elements
#     scrollable_bar = centris.driver.find_element_by_xpath(\
#                                             "//div[@class='ps__thumb-y']")
#     scrollable_bar.send_keys(Keys.PAGE_DOWN)
    
#     # Elements from buttom of scrollable list
#     neighbourhood_buttom = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM_buttom)
    
#     # LOGGING----------------------
#     print("Top neighbourhood:", [x.text for x in neighbourhood_top])
#     print("Buttom neighbourhood:", [x.text for x in neighbourhood_buttom])

In [524]:
# neighbourhood_buttom = centris.driver.find_elements_by_xpath(\
#                              "//div[@class='ll-list ps ps--active-y']")
neighbourhood_buttom = centris.driver.find_elements_by_xpath("//div[@class='ll-list']")
#last_button = neighbourhood_buttom[-1]
print("ONE:", [x.text for x in neighbourhood_buttom])
# Scroll to buttom of list to load remaining elements
scrollable_bar = centris.driver.find_element_by_xpath(\
                                        "//div[@class='ps__thumb-y']")
scrollable_bar.send_keys(Keys.PAGE_DOWN)
scrollable_bar.click()
neighbourhood_buttom = centris.driver.find_elements_by_class_name("ll-list")
#last_button = neighbourhood_buttom[-1]
neighbourhood_buttom[0].text

ONE: []


ElementClickInterceptedException: Message: element click intercepted: Element <div class="ps__thumb-y focus-visible" tabindex="0" style="top: 174px; height: 244px;" data-focus-visible-added=""></div> is not clickable at point (303, 40). Other element would receive the click: <nav class="navbar navbar-expand fixed-top">...</nav>
  (Session info: chrome=84.0.4147.105)


In [515]:
# Scroll to buttom of list to load remaining elements
scrollable_bar = centris.driver.find_element_by_xpath(\
                                        "//div[@class='ps__thumb-y']")
scrollable_bar.send_keys(Keys.PAGE_DOWN)
#scrollable_bar.click()

In [477]:
last_button.click()

In [504]:
scrollable_bar.click()

In [657]:
description = "1 bathroom and 1 powder room"
numeric = re.findall("\(*[0-9]+\)*", description)
text = re.findall("[A-Za-z]+\-*\s*[A-Za-z]*", description)
numeric

['1', '1']

In [733]:
headers_of_interest = ["rooms", "bedrooms", "powder room", "Building style",\
                               "Condominium type", "Year built", "Building area", "Lot area",\
                     "Net area", "Parking", "Main unit", "Potential gross revenue", "Pool"]

descriptions = {'rooms': '16', 'bedrooms ': '4', 'in basement': '(1', 'bathrooms': '2', 'powder room': '1',\
                'Building style': 'Two or more storey, Detached', 'Year built': '1999', 'Lot area': '13,424 sqft',\
                'Parking (total)': 'Driveway (6), Garage (3)', 'Fireplace/Stove': 'Wood stove',\
                'Additional features': 'Basement 6 feet or +'}
description_table = pd.DataFrame()
dat = pd.DataFrame()
# Ensures consistency accross listings
for header in headers_of_interest:
    if header in descriptions.keys():
        value = descriptions[header]
#         print(header, "found")
#         print(descriptions[header])
    else:
#         print(header, "NOT FOUND!!!")
        value = np.nan
    description_table[header] = pd.Series(value)
    print(header, ":", value)
    print(description_table)
    
# Concat description information
new_data = pd.concat([new_data, description_table], axis=1)

# Append new data to existing data    
dat = dat.append(new_data, ignore_index=True)

rooms : 16
  rooms
0    16
bedrooms : nan
  rooms  bedrooms
0    16       NaN
powder room : 1
  rooms  bedrooms powder room
0    16       NaN           1
Building style : Two or more storey, Detached
  rooms  bedrooms powder room                Building style
0    16       NaN           1  Two or more storey, Detached
Condominium type : nan
  rooms  bedrooms powder room                Building style  Condominium type
0    16       NaN           1  Two or more storey, Detached               NaN
Year built : 1999
  rooms  bedrooms powder room                Building style  Condominium type  \
0    16       NaN           1  Two or more storey, Detached               NaN   

  Year built  
0       1999  
Building area : nan
  rooms  bedrooms powder room                Building style  Condominium type  \
0    16       NaN           1  Two or more storey, Detached               NaN   

  Year built  Building area  
0       1999            NaN  
Lot area : 13,424 sqft
  rooms  bedrooms powder

In [840]:
indicators = {('Transit friendly', '9'), ('Restaurants', '7'), ('Nightlife', '4'), ('Vibrant', '5'), ('Car friendly', '6'), ('Elementary Schools', '9'), ('Quiet', '8'), ('High Schools', '7'), ('Shopping', '9'), ('Pedestrian friendly', '10'), ('Cycling friendly', '8'), ('Daycares', '10'), ('Groceries', '10'), ('Cafes', '6'), ('Greenery', '7'), ('Parks', '8')}
data = pd.DataFrame()
for indicator in indicators:
    header = indicator[0]
    value = indicator[1]
    data[header] = pd.Series(value)

data

Unnamed: 0,Transit friendly,Restaurants,Vibrant,Car friendly,Elementary Schools,Quiet,High Schools,Shopping,Pedestrian friendly,Cycling friendly,Daycares,Groceries,Cafes,Greenery,Nightlife,Parks
0,9,7,5,6,9,8,7,9,10,8,10,10,6,7,4,8


In [852]:
population = ['Population (2016) 136,024', 'Population variation between 2011 and 2016 3%',\
              'Population density 5,347 hab/km2', 'Unemployment rate (2016) 9%']
data = pd.DataFrame()
for info in population:
    # Numeric data
    numeric = re.findall("[0-9]+[0-9,]*", info)
    numeric_clean = numeric[-1].replace(",","")
    
    # Text data for column names
    header = re.findall("[a-zA-Z\s]+", info)
    header_clean = header[0]
    # Add numeric data to header excluding the value at index -1
    for numeric_head_data in numeric[:-1]:
        header_clean = header_clean + str(numeric_head_data) + " "
    
    data[header_clean] = pd.Series(numeric_clean).astype("int")
    
data

Unnamed: 0,Population 2016,Population variation between 2011 2016,"Population density 5,347",Unemployment rate 2016
0,136024,3,2,9


In [888]:
demographics = [['Less than $50,000', '46%', '', 'Between $50,000 and $80,000', '23%',\
                 '', 'Between $80,000 and $100,000', '10%', '', 'Between $100,000 and $150,000',\
                 '13%', '', 'More than $150,000', '8%'],\
                ['1-person households', '38%', '', '2-person households', '38%', '',\
                 '3-person households', '12%', '', '4-person households', '9%', '', '5-person or more households', '3%'],\
                ['Couples without children at home', '54%', '', 'Couples with children at home', '31%',\
                 '', 'Single-parent families', '15%'],\
                ['Owners', '69%', '', 'Renters', '31%'],\
                ['Before 1960', '22%', '', 'Between 1961 and 1980', '24%', '', 'Between 1981 and 1990', '16%', '',\
                 'Between 1991 and 2000', '11%', '', 'Between 2001 and 2010', '20%', '', 'Between 2011 and 2016', '6%'],\
                ['Single-family homes', '60%', '', 'Semi-detached or row houses', '9%', '', 'Buildings with less than 5 floors',\
                 '31%', '', 'Buildings with 5 or more floors', '0%', '', 'Mobile homes', '0%'],\
                ['University', '25%', '', 'College', '19%', '', 'Secondary (high) school', '22%', '',\
                 'Apprentice or trade school diploma', '17%', '', 'No diploma', '16%'],\
                ['Non-immigrant population', '94%', '', 'Immigrant population', '6%'],\
                ['French', '94%', '', 'English', '5%', '', 'Others languages', '1%']]

# Remove empty stings from splitting double line breaks \n\n
demographics_clean = []
data = pd.DataFrame()
for demographic in demographics:
    demographics_clean.append([x for x in demographic if x != ""])

for demographic in demographics_clean:
    header_index = range(0, len(demographic), 2)
    for i in header_index:
        header = demographic[i] + " (%)"
        value = demographic[i+1].replace("%", "")
        data[header] = pd.Series(value).astype("int")

data

Unnamed: 0,"Less than $50,000 (%)","Between $50,000 and $80,000 (%)","Between $80,000 and $100,000 (%)","Between $100,000 and $150,000 (%)","More than $150,000 (%)",1-person households (%),2-person households (%),3-person households (%),4-person households (%),5-person or more households (%),...,University (%),College (%),Secondary (high) school (%),Apprentice or trade school diploma (%),No diploma (%),Non-immigrant population (%),Immigrant population (%),French (%),English (%),Others languages (%)
0,46,23,10,13,8,38,38,12,9,3,...,25,19,22,17,16,94,6,94,5,1
