In [679]:
import pandas as pd
import numpy as np
from random import randint
import time
import re
# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

In [2]:
# Starting URLs
centris = "https://www.centris.ca/en/properties~for-sale?view=Thumbnail"
duproprio = "https://duproprio.com/en/search/list?search=true&is_for_sale=1&with_builders=1&parent=1&pageNumber=1&sort=-published_at"

In [826]:
class Centris:
    """
    Accessing 
    
    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.driver - Chrome webdriver
    self.containers - List of web-elements containing information on listings
        - eg: link to listing, price, picture, address,...
    self.links_to_listings - List of web-elements, each containing the link to a listing
    """
     
    def __init__(self, url="https://www.centris.ca/en/houses~for-sale~lac-simon/11851081?view=Summary&uc=3"): 
        self.url = url
        self.data = pd.DataFrame(\
#                     {\
#                         'title': None,\
#                         'address': None,\
#                         'price': None,\
#                         'lat': None,\
#                         'long': None,\
#                         'descriptions': None,\
#                         'neighbourhood_indicators': None,\
#                         'demographics': None
#                     }, index=[0]\
                                )
        # Path to Chromedriver
        self.DRIVER_PATH = 'C:/webdriver/chromedriver.exe'
        self.driver = None
        # Verification for new DOM
        self.old_DOM = {\
                        "title" : [],\
                        "address": [],\
                        "price": [],\
                        "lat": [],\
                        "long": [],\
                        "descriptions": [],\
                        'neighbourhood_top': [],\
                        'neighbourhood_buttom': [],\
                        "demographics_buttons": [],\
                    }

    def append_data(self, title, address, price,\
            lat, long, descriptions, neighbourhood_indicators,\
            demographics):
        """Appends new data to existing data frame."""
        new_data = pd.DataFrame({\
                        'title': title,\
                        'address': address,\
                        'price': price,\
                        'lat': lat,\
                        'long': long,\
#                         'descriptions': descriptions,\
#                         'neighbourhood_indicators': neighbourhood_indicators,\
#                         'demographics': demographics\
                    }, index=[0])
        
        # DESCRIPTIONS
        description_table = pd.DataFrame()
        headers_of_interest = [\
                "rooms", "bedrooms", "powder room", "Number of units", "Building style",\
                "Condominium type", "Year built", "Building area", "Lot area", "walk_score"\
                "Net area", "Parking (total)", "Main unit", "Potential gross revenue", "Pool"\
                              ]
        # Ensures consistency accross listings
        for header in headers_of_interest:
            if header in descriptions.keys():
                value = descriptions[header]
            else:
                value = np.nan
            description_table[header] = pd.Series(value)
        # Concat description information
        # new_data = pd.concat([new_data, description_table], axis=1)  
        
        # NEIGHBOURHOOD
        new_data = pd.concat([new_data, neighbourhood_indicators, description_table], axis=1)
#         cols = [
#             "Nightlife", "Elementary Schools", "High Schools", "Parks", "Cafes", "Quiet",\
#             "Transit friendly", "Vibrant", "Restaurants", "Shopping", "Cycling friendly",\
#             "Groceries", "Daycares", "Greenery", "Car friendly", "Pedestrian friendly", "Historic"\
#                            ]
        self.data = self.data.append(new_data,\
                                     ignore_index=True)
        # DEMOGRAPHICS
             
    def get_data(self):
        return self.data
        
    def start_driver(self):
        """
        Starts and returns Crome webdriver. 
        The page link in the url attribute 
        is opened in headless mode.
        """
        
        # Activate headless mode for fastest response
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-infobars"); # disabling infobars
        options.add_argument("--disable-extensions"); # disabling extensions
        options.add_argument("--disable-gpu"); # applicable to windows os only
        options.add_argument("--disable-dev-shm-usage"); # overcome limited resource problems
        options.add_argument("--no-sandbox"); # Bypass OS security model
        options.add_argument('--start-maximized') # open Browser in maximized mode
        options.add_argument('--incognito')

        # Start driver with url
        self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH)
        self.driver.get(self.url)

    def sort_listings(self):
        """Sorts listings in webdriver from newest to oldest."""
        
        # Click drop down menu
        drop_down = self.driver.find_element_by_xpath(\
                                    "//button[@id='dropdownSort']")
        drop_down.click()
        
        # Sort by most recent listings
        sort_by = self.driver.find_element_by_xpath("//a[@data-option-value='3']")
        sort_by.click()
    
    def goto_first_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='goFirst']")
            next_page.click()
        except:
            print("goFirst button not available")
    
    def next_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='next']")
            next_page.click()
            pass
        except:
            time.sleep(0.5)
            # Try again after waiting 0.5 sec.
            try:
                next_page = self.driver.find_element_by_xpath(\
                                            "//li[@class='next']")
                next_page.click()
                pass
            except:
                print("Next-page button not found!")
                
    def get_last_page(self):
        '''Returns page number of last page in browser.'''
        
        last_page = self.driver.find_element_by_xpath(\
                                    "//li[@class='pager-current']").text
        return int(last_page.split(" / ")[1].replace(",", ""))
    
    def refresh_page(self):
        "Refreshes current webdriver page."
        self.driver.refresh()
        # Wait until page fully loaded
        time.sleep(2)
                

                                                  
# Instantiate class object
centris = Centris()

The following functions need to be outside of the Class. wait_for_xpath() determined the approptiate time to call get_data(). Initially, both fuctions were part of the class object. It seems that after the get_data() call, the driver does not get updated within the class. This leads in some cases to old DOM's being accessed after the browser has already switched to the next page. To circumvent this issue, elements are called outside the class and tried until accessible. This allows the entire new DOM to be loaded before get_data() is called.

In [681]:
 def wait_for_xpath(xpath: str, old_element):
        """
        Wait until elements in new DOM are accessible.
        
        Returns:
        current_element - the element found in the new DOM at xpath
        """
        
        # Wait until DOM accessible
        time.sleep(0.2)
        centris_driver = centris.driver
        current_element = []
        try:
            current_element = centris_driver.find_elements_by_xpath(xpath)
        except:
            time.sleep(0.2)
            try:
                print("Second attempt loading element!")
                current_element = centris_driver.find_elements_by_xpath(xpath)
            except:
                time.sleep(0.15)
                try:
                    print("Third and last attempt loading element!")
                    current_element = centris_driver.find_elements_by_xpath(xpath)
                except:
                    print("TimeOutError: Unable to load element")
        
        # LOGGING --------------------------------
#         print("Previous:\n", old_element)
#         print("Current:\n", current_element)
#         print("="*50)
        
        # Ensure that the NEW rather than the previous DOM is active
        time_passed = 0
        while (current_element == old_element\
               and time_passed < 10):
            print("Waiting for new DOM...")
            time.sleep(0.15)
            time_passed += 0.15
            current_element = centris_driver.find_elements_by_xpath(xpath)
        return current_element

In [682]:
def scrape_description(old_DOM):
    """ Requires instantiated centris object. Scrapes and returns
    description data: Year build, price, Net area, etc."""
    
    descriptions = wait_for_xpath("//div[@class='col-lg-12 description']",\
                                 old_DOM)
    #First three elements not relevant
    descriptions_list = descriptions[0].text.split("\n")[3:]
    
    #LOGGING------------------------
    #print("DESCRIPTION:", descriptions_list)
    
    # Update old_DOM dictionary with new element for next verification
    centris.old_DOM['descriptions'] = descriptions
    
    return extract_descriptions(descriptions_list)

In [827]:
def extract_descriptions(descriptions_list):
    """Extract data into a dictionary"""
    
    # The data_dict found on this part of the page is inconsistent across listings
    # The first row may contain the number of rooms, bedrooms and bathrooms without headers or may be missing
    # Following rows have heathers with associated values after a line break
    # The very last element may be a walking score without header
    # Listings without first row may supply first row information in subsequent rows with headers
    # Because of these inconsistencies, two seperate extractions need to be implemented: one for
    # first row lements (if they exist) and another for subsequent rows
    
    # Transformed data
    data_dict = {}
    # Distinguish between elements from first and subsequent rows if first row exists
    first_row = True
    # Starting point for second part of transformation
    second_row_index = 0
    
    # First Part
    while first_row == True:
        for description in descriptions_list:
            numeric = re.findall("\(*[0-9]+\)*", description) # numbers
            text = re.findall("[A-Za-z]+[A-Za-z\s\-]*", description) # text after/inbetween numbers 

            # Initial elements with numeric values correspond to first row
            if (numeric != []):
                # For each value there must be one text description
                if (len(numeric) == len(text)):
                    for description,value in zip(text, numeric):
                        # Save as column in data_dict
                        description_clean = description.replace("and", "").strip()
                        data_dict[description_clean] = value
                    second_row_index += 1 
                else:
                    print("Unequal number of first row keys and values!")
                    print("Numbers:", numeric)
                    print("Text:", text)
                    break
            else:
                first_row = False # No numeric information implies header
                break
    
    # Index range of second extraction
    # Headers are found at every second index (0,2,4,...)
    # Values are one index apart from their corresponding header (1,3,5,...)
    list_length = len(descriptions_list)
    if (list_length - second_row_index)%2 == 1: # Implies presence of element without header -> Walk Score
        walk_score_listed = True
        end_point = list_length -1
    else:
        walk_score_listed = False
        end_point = list_length
    # Indices corresponding to headers
    extraction_range = range(second_row_index, end_point, 2)
    
    #LOGGING----------------------
#     print("Second row index:", second_row_index)
#     print("Extraction range:", extraction_range)
#     print("List:", descriptions_list)
    
    # Second Part
    for header_index in extraction_range:
        # Headers as column names
        header = descriptions_list[header_index]
        # Values corresponding to headers are found at subsequent indices
        information = descriptions_list[header_index + 1] 
        data_dict[header] = information
    
    if walk_score_listed:
        data_dict["walk_score"] = descriptions_list[-1]
        #LOGGING----------------------
        #print("Walk Score:", descriptions_list[-1])
        
    #LOGGING--------------------------
#     print("Descriptions:", data_dict)
        
    return data_dict

In [807]:
def scrape_neighbourhood(old_DOM_top, old_DOM_buttom):
    """ Scrapes and returns a list of ratings 
    between 0-10 for a set of neighborhood indicators
    such as groceries, parks, noise, etc.)
    """
    driver = centris.driver
    
    # Extract elements from top section of scrollable list
    neighbourhood_top = wait_for_xpath(\
                            "//div[@class='ll-list ps ps--active-y']",\
                            old_DOM_top)
    # Split into indicators and ranking values
    top = [x.text for x in neighbourhood_top][0].split("\n")
    
    # LOGGING----------------------
#     print("Top neighbourhood:", top)
    
    # Extract middle section - only one element
    # Scroll and activate scrollable bar container
    scrollable_bar = driver.find_element_by_xpath(\
                                            "//div[@class='ps__thumb-y']")
    scrollable_bar.send_keys(Keys.PAGE_DOWN)
    # Activate container to find element
    scrollable_bar.click()
    # Elements from buttom of scrollable list
    neighbourhood_mid = driver.find_element_by_class_name('ll-list')
    # Split into indicators and ranking values
    middle = neighbourhood_mid.text.split("\n")
    
    # Extract buttom section
    # Scroll and load remaining elements
#     scrollable_bar = driver.find_element_by_xpath(\
#                                             "//div[@class='ps__thumb-y']")
    scrollable_bar.send_keys(Keys.PAGE_DOWN)
    # Activate container to find elements
    scrollable_bar.click()
    # Elements from buttom of scrollable list
    neighbourhood_buttom = driver.find_element_by_class_name('ll-list')
    # Split into indicators and ranking values
    buttom = neighbourhood_buttom.text.split("\n")
    
    # LOGGING----------------------
#     print("Buttom neighbourhood:", buttom)
    
    # Unite all three sections by storing tuples of indicator names and corresponding values
    united_list = []
    list_length = len(top)
    for i in range(0, list_length, 2):
        united_list.append((top[i], top[i+1]))
        united_list.append((middle[i], middle[i+1]))
        united_list.append((buttom[i], buttom[i+1]))
    
    # Create set of unique tuples
    neighbourhood_indicators = set(united_list)
    
    # LOGGING----------------------
#     print("Number of neighborhood indicators: ", len(neighbourhood_indicators))
    print("UNITED:", united_list)
    print("SET:", neighbourhood_indicators)
    
    # Verify size and extract information as list
    # If size unexpected, refresh page and restart process
    if len(neighbourhood_indicators) < 8:
            centris.refresh_page()
            scrape_neighbourhood(old_DOM_top, old_DOM_buttom)

    # Update old_DOM dictionary with new elements for next verification
    centris.old_DOM['neighbourhood_top'] = neighbourhood_top
    centris.old_DOM['neighbourhood_buttom'] = neighbourhood_buttom

    return extract_neighbourhood_indicators(neighbourhood_indicators)

In [763]:
def extract_neighbourhood_indicators(indicators):
    data = pd.DataFrame()
    for indicator in indicators:
        header = indicator[0]
        value = indicator[1]
        data[header] = pd.Series(value)
    
    return data

In [685]:
def scrape_population():
    """Scrapes and returns population summary data (density, variation etc.)"""
    population_summaries =  centris.driver.find_element_by_id('info')
    population_summaries_list = population_summaries\
                        .text.split("\n")
    
    # LOGGING-----------------------
#     print("Population:", population_summaries_list)
    
    return population_summaries_list

In [686]:
def scrape_demographics(old_DOM):
    """Scrapes and return demographic data found in a clickable list"""
    
    #Buttons to access demographics data (education, incomes, etc.)
    demographics_buttons = wait_for_xpath(\
                        "//div[@class='centrisSocioDemobutton']",\
                                                 old_DOM)

    # LOGGING------------------------
    # print("DEMO. BUTTONS:", demographics_buttons)

    # First entry on clickable demographics list (pre-selected)
    demographics = []

    # Click buttons to access next demogrpahics elements
    for button in demographics_buttons:
        try: 
            button.click()
        except: 
            print("Demogrphics button missing!")
            demographics.append(None)
            
        # Get and append data after button click
        demographic_data = centris.driver.find_element_by_class_name(\
                         "socioDemoLabel")
        demographics.append(demographic_data.text)
    
    # Split each demographic component into separate list
    # Example: splits "Occupation" data into -> ["Owners", "35%", "Renters", "65%"]
    demographics = [demo.split("\n") for demo in demographics]
    
    #LOGGING------------------------
#     print("DEMO. DATA:", demographics)
#     print("-"*50)
    
    # Update old_DOM dictionary with new elements for next verification
    centris.old_DOM['demographics_buttons'] = demographics_buttons
    
    return demographics

In [828]:
def get_data_from_centris():
        """
        Requires instantiate Centris object. Scrapes information from the
        webdriver and appends it to the Centris object.
        """
        driver = centris.driver
        old_DOM = centris.old_DOM
        
        # Data from headers
        print("Start scraping new page...")
        title = wait_for_xpath("//span[@data-id='PageTitle']", old_DOM['title'])
        address = wait_for_xpath("//h2[@itemprop='address']", old_DOM['address'])
        price = wait_for_xpath("//span[@itemprop='price']", old_DOM['price'])
        lat = wait_for_xpath("//meta[@itemprop='latitude']", old_DOM['lat'])
        long = wait_for_xpath("//meta[@itemprop='longitude']", old_DOM['long'])
        
        # Save elements as old DOM
        centris.old_DOM['title'] = title
        centris.old_DOM['address'] = address
        centris.old_DOM['lat'] = lat
        centris.old_DOM['long'] = long
        
        descriptions = scrape_description(old_DOM['descriptions'])
        neighbourhood_indicators_list = scrape_neighbourhood(old_DOM['neighbourhood_top'],\
                                                           old_DOM['neighbourhood_buttom'])
        population_summaries_list = scrape_population()
        demographics = scrape_demographics(old_DOM['demographics_buttons'])
                
        # Register data in dataframe
        centris.append_data(
            title[0].text,\
            address[0].text,\
            price[0].text,\
            lat[0].get_attribute("content"),\
            long[0].get_attribute("content"),\
            descriptions,\
            neighbourhood_indicators_list,\
            demographics\
        )
        
        # LOGGING--------------------------
        #print("GET DATA: DESCRIPTIONS:", descriptions)
        
        # Return to top of page, to access next-page button
        body = driver.find_element_by_tag_name("body")
        for i in range(7):
            body.send_keys(Keys.PAGE_UP)

## Testing

In [829]:
# Test
start = time.time()
centris.start_driver()
centris.sort_listings()
print("Execution time:", time.time() - start)

Execution time: 8.305063009262085


Before running the next cell, search for the region(s) you want to scrape in the webdriver window.
This is not required but will substential limit run time and narrow results.

In [830]:
start = time.time() 
total_pages = centris.get_last_page() # Number of total listings
one_to_100 = range(1,100) # Used to print message after each 1% completion

print("Scraping initiated.")
print("Totla number of pages to scrap:", total_pages)
print("="*50)

for i in range(100):
    # -0.01 corrects for overshoots
    percent_complete = round(100*((i-0.01)/total_pages),1) # percent completed of scraping
    print("="*50)
    print("Page:", i+1)
    time_passed = 0 # to exit while loop after 10 seconds
    
    #Refresh every 20 pages to clear memory build-up
    if (i+1)%20 == 0:
        print("Refreshing page")
        print("-"*50)
        
        # Each refresh frees some memory. Four seem to work best.
        for i in range(4):
            centris.driver.refresh()
            # Give time to load fresh page
            time.sleep(0.3)
            # Extra time for last refresh
            # Ensures that DOM is fully loaded
            if i == 3:
                time.sleep(2)
            
    #Retrieve data    
    get_data_from_centris()
    
    # Short delay for chrome to respond to PAGE_UP command
    time.sleep(0.5)
    centris.next_page()
            
    # Print after every 1% mark
    if percent_complete in one_to_100:
        print(percent_complete, "%", "completed")
        print("Estimated remaining runtime:", round(total_pages*((execution_time/N)/(60*60)), 1), "hours")
        print("="*50)

execution_time = time.time() - start
print("Total runtime:", execution_time/(60*60), "hours")
centris.data

Scraping initiated.
Totla number of pages to scrap: 5467
Page: 1
Start scraping new page...
UNITED: [('Elementary Schools', '10'), ('Restaurants', '8'), ('Cafes', '7'), ('High Schools', '10'), ('Cafes', '7'), ('Car friendly', '6'), ('Daycares', '10'), ('Car friendly', '6'), ('Vibrant', '5'), ('Parks', '10'), ('Vibrant', '5'), ('Cycling friendly', '5'), ('Groceries', '10'), ('Cycling friendly', '5'), ('Nightlife', '5'), ('Transit friendly', '9'), ('Nightlife', '5'), ('Greenery', '4'), ('Pedestrian friendly', '9'), ('Greenery', '4'), ('Historic', '2'), ('Shopping', '9'), ('Historic', '2'), ('Quiet', '0')]
SET: {('Cafes', '7'), ('Transit friendly', '9'), ('Restaurants', '8'), ('Vibrant', '5'), ('Car friendly', '6'), ('Pedestrian friendly', '9'), ('Elementary Schools', '10'), ('Parks', '10'), ('Shopping', '9'), ('High Schools', '10'), ('Groceries', '10'), ('Daycares', '10'), ('Greenery', '4'), ('Cycling friendly', '5'), ('Historic', '2'), ('Nightlife', '5'), ('Quiet', '0')}
Population: ['P

DEMO. DATA: [['Less than $50,000', '52%', '', 'Between $50,000 and $80,000', '22%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '7%'], ['1-person households', '49%', '', '2-person households', '29%', '', '3-person households', '11%', '', '4-person households', '8%', '', '5-person or more households', '3%'], ['Couples without children at home', '43%', '', 'Couples with children at home', '36%', '', 'Single-parent families', '21%'], ['Owners', '30%', '', 'Renters', '70%'], ['Before 1960', '52%', '', 'Between 1961 and 1980', '24%', '', 'Between 1981 and 1990', '9%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '5%', '', 'Between 2011 and 2016', '4%'], ['Single-family homes', '2%', '', 'Semi-detached or row houses', '3%', '', 'Buildings with less than 5 floors', '89%', '', 'Buildings with 5 or more floors', '6%', '', 'Mobile homes', '0%'], ['University', '41%', '', 'College', '17%', '', 'Secondary (high)

Page: 8
Start scraping new page...
UNITED: [('Quiet', '10'), ('Pedestrian friendly', '5'), ('Restaurants', '5'), ('Daycares', '9'), ('Restaurants', '5'), ('Cycling friendly', '4'), ('Car friendly', '8'), ('Cycling friendly', '4'), ('Nightlife', '4'), ('Elementary Schools', '7'), ('Nightlife', '4'), ('Cafes', '4'), ('Groceries', '7'), ('Cafes', '4'), ('High Schools', '3'), ('Transit friendly', '6'), ('High Schools', '3'), ('Greenery', '2'), ('Parks', '6'), ('Greenery', '2'), ('Historic', '2'), ('Shopping', '6'), ('Historic', '2'), ('Vibrant', '2')]
SET: {('Groceries', '7'), ('Cafes', '4'), ('High Schools', '3'), ('Pedestrian friendly', '5'), ('Transit friendly', '6'), ('Restaurants', '5'), ('Shopping', '6'), ('Vibrant', '2'), ('Cycling friendly', '4'), ('Parks', '6'), ('Daycares', '9'), ('Quiet', '10'), ('Historic', '2'), ('Nightlife', '4'), ('Elementary Schools', '7'), ('Greenery', '2'), ('Car friendly', '8')}
Population: ['Population (2016) 136,024', 'Population variation between 2011

DEMO. DATA: [['Less than $50,000', '42%', '', 'Between $50,000 and $80,000', '24%', '', 'Between $80,000 and $100,000', '10%', '', 'Between $100,000 and $150,000', '13%', '', 'More than $150,000', '12%'], ['1-person households', '28%', '', '2-person households', '27%', '', '3-person households', '17%', '', '4-person households', '17%', '', '5-person or more households', '11%'], ['Couples without children at home', '29%', '', 'Couples with children at home', '53%', '', 'Single-parent families', '18%'], ['Owners', '49%', '', 'Renters', '51%'], ['Before 1960', '27%', '', 'Between 1961 and 1980', '31%', '', 'Between 1981 and 1990', '12%', '', 'Between 1991 and 2000', '9%', '', 'Between 2001 and 2010', '16%', '', 'Between 2011 and 2016', '6%'], ['Single-family homes', '11%', '', 'Semi-detached or row houses', '15%', '', 'Buildings with less than 5 floors', '45%', '', 'Buildings with 5 or more floors', '30%', '', 'Mobile homes', '0%'], ['University', '39%', '', 'College', '16%', '', 'Seconda

Page: 15
Start scraping new page...
Waiting for new DOM...
UNITED: [('Daycares', '10'), ('Parks', '7'), ('Restaurants', '7'), ('Transit friendly', '9'), ('Restaurants', '7'), ('Greenery', '6'), ('Elementary Schools', '9'), ('Greenery', '6'), ('Cafes', '6'), ('Car friendly', '9'), ('Cafes', '6'), ('Historic', '4'), ('Groceries', '9'), ('Historic', '4'), ('Vibrant', '3'), ('High Schools', '8'), ('Vibrant', '3'), ('Cycling friendly', '3'), ('Shopping', '8'), ('Cycling friendly', '3'), ('Nightlife', '3'), ('Pedestrian friendly', '7'), ('Nightlife', '3'), ('Quiet', '1')]
SET: {('Restaurants', '7'), ('Greenery', '6'), ('Transit friendly', '9'), ('Car friendly', '9'), ('High Schools', '8'), ('Vibrant', '3'), ('Parks', '7'), ('Elementary Schools', '9'), ('Shopping', '8'), ('Pedestrian friendly', '7'), ('Nightlife', '3'), ('Daycares', '10'), ('Groceries', '9'), ('Cafes', '6'), ('Cycling friendly', '3'), ('Historic', '4'), ('Quiet', '1')}
Population: ['Population (2016) 98,828', 'Population vari

DEMO. DATA: [['Less than $50,000', '50%', '', 'Between $50,000 and $80,000', '22%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '7%'], ['1-person households', '46%', '', '2-person households', '32%', '', '3-person households', '11%', '', '4-person households', '7%', '', '5-person or more households', '4%'], ['Couples without children at home', '44%', '', 'Couples with children at home', '32%', '', 'Single-parent families', '24%'], ['Owners', '35%', '', 'Renters', '65%'], ['Before 1960', '43%', '', 'Between 1961 and 1980', '18%', '', 'Between 1981 and 1990', '11%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '10%', '', 'Between 2011 and 2016', '13%'], ['Single-family homes', '2%', '', 'Semi-detached or row houses', '7%', '', 'Buildings with less than 5 floors', '71%', '', 'Buildings with 5 or more floors', '20%', '', 'Mobile homes', '0%'], ['University', '38%', '', 'College', '16%', '', 'Secondary (h

Page: 22
Start scraping new page...
UNITED: [('Quiet', '10'), ('Greenery', '6'), ('Pedestrian friendly', '6'), ('Elementary Schools', '10'), ('Pedestrian friendly', '6'), ('Cycling friendly', '6'), ('Daycares', '9'), ('Cycling friendly', '6'), ('Cafes', '5'), ('Car friendly', '9'), ('Cafes', '5'), ('Restaurants', '5'), ('High Schools', '8'), ('Restaurants', '5'), ('Shopping', '5'), ('Parks', '8'), ('Shopping', '5'), ('Nightlife', '4'), ('Groceries', '8'), ('Nightlife', '4'), ('Historic', '3'), ('Transit friendly', '6'), ('Historic', '3'), ('Vibrant', '3')]
SET: {('Car friendly', '9'), ('Greenery', '6'), ('High Schools', '8'), ('Groceries', '8'), ('Historic', '3'), ('Shopping', '5'), ('Restaurants', '5'), ('Transit friendly', '6'), ('Elementary Schools', '10'), ('Vibrant', '3'), ('Cafes', '5'), ('Daycares', '9'), ('Quiet', '10'), ('Nightlife', '4'), ('Cycling friendly', '6'), ('Pedestrian friendly', '6'), ('Parks', '8')}
Population: ['Population (2016) 76,853', 'Population variation bet

DEMO. DATA: [['Less than $50,000', '58%', '', 'Between $50,000 and $80,000', '18%', '', 'Between $80,000 and $100,000', '7%', '', 'Between $100,000 and $150,000', '8%', '', 'More than $150,000', '9%'], ['1-person households', '55%', '', '2-person households', '31%', '', '3-person households', '9%', '', '4-person households', '4%', '', '5-person or more households', '2%'], ['Couples without children at home', '57%', '', 'Couples with children at home', '26%', '', 'Single-parent families', '17%'], ['Owners', '27%', '', 'Renters', '73%'], ['Before 1960', '33%', '', 'Between 1961 and 1980', '28%', '', 'Between 1981 and 1990', '11%', '', 'Between 1991 and 2000', '8%', '', 'Between 2001 and 2010', '13%', '', 'Between 2011 and 2016', '8%'], ['Single-family homes', '1%', '', 'Semi-detached or row houses', '2%', '', 'Buildings with less than 5 floors', '48%', '', 'Buildings with 5 or more floors', '49%', '', 'Mobile homes', '0%'], ['University', '52%', '', 'College', '15%', '', 'Secondary (high

Page: 29
Start scraping new page...
UNITED: [('Car friendly', '10'), ('Transit friendly', '1'), ('Daycares', '1'), ('Parks', '10'), ('Daycares', '1'), ('Vibrant', '1'), ('Quiet', '9'), ('Vibrant', '1'), ('Cafes', '1'), ('Greenery', '6'), ('Cafes', '1'), ('Shopping', '1'), ('Elementary Schools', '4'), ('Shopping', '1'), ('High Schools', '0'), ('Cycling friendly', '4'), ('High Schools', '0'), ('Historic', '0'), ('Restaurants', '4'), ('Historic', '0'), ('Groceries', '0'), ('Pedestrian friendly', '2'), ('Groceries', '0'), ('Nightlife', '0')]
SET: {('Nightlife', '0'), ('Greenery', '6'), ('Groceries', '0'), ('Transit friendly', '1'), ('Pedestrian friendly', '2'), ('Cafes', '1'), ('High Schools', '0'), ('Parks', '10'), ('Shopping', '1'), ('Car friendly', '10'), ('Cycling friendly', '4'), ('Elementary Schools', '4'), ('Quiet', '9'), ('Vibrant', '1'), ('Historic', '0'), ('Restaurants', '4'), ('Daycares', '1')}
Population: ['Population (2016) 18,413', 'Population variation between 2011 and 2016 

DEMO. DATA: [['Less than $50,000', '58%', '', 'Between $50,000 and $80,000', '18%', '', 'Between $80,000 and $100,000', '7%', '', 'Between $100,000 and $150,000', '8%', '', 'More than $150,000', '9%'], ['1-person households', '55%', '', '2-person households', '31%', '', '3-person households', '9%', '', '4-person households', '4%', '', '5-person or more households', '2%'], ['Couples without children at home', '57%', '', 'Couples with children at home', '26%', '', 'Single-parent families', '17%'], ['Owners', '27%', '', 'Renters', '73%'], ['Before 1960', '33%', '', 'Between 1961 and 1980', '28%', '', 'Between 1981 and 1990', '11%', '', 'Between 1991 and 2000', '8%', '', 'Between 2001 and 2010', '13%', '', 'Between 2011 and 2016', '8%'], ['Single-family homes', '1%', '', 'Semi-detached or row houses', '2%', '', 'Buildings with less than 5 floors', '48%', '', 'Buildings with 5 or more floors', '49%', '', 'Mobile homes', '0%'], ['University', '52%', '', 'College', '15%', '', 'Secondary (high

Page: 36
Start scraping new page...
UNITED: [('Pedestrian friendly', '10'), ('Transit friendly', '9'), ('Historic', '9'), ('Parks', '10'), ('Historic', '9'), ('Cycling friendly', '9'), ('Groceries', '10'), ('Cycling friendly', '9'), ('Elementary Schools', '8'), ('Vibrant', '10'), ('Elementary Schools', '8'), ('Daycares', '8'), ('Nightlife', '10'), ('Daycares', '8'), ('Car friendly', '5'), ('Cafes', '10'), ('Car friendly', '5'), ('Quiet', '4'), ('Restaurants', '10'), ('Quiet', '4'), ('Greenery', '4'), ('Shopping', '10'), ('Greenery', '4'), ('High Schools', '3')]
SET: {('Transit friendly', '9'), ('Historic', '9'), ('Daycares', '8'), ('Cycling friendly', '9'), ('High Schools', '3'), ('Nightlife', '10'), ('Parks', '10'), ('Cafes', '10'), ('Vibrant', '10'), ('Elementary Schools', '8'), ('Shopping', '10'), ('Groceries', '10'), ('Restaurants', '10'), ('Greenery', '4'), ('Pedestrian friendly', '10'), ('Quiet', '4'), ('Car friendly', '5')}
Population: ['Population (2016) 87,168', 'Population va

DEMO. DATA: [['Less than $50,000', '52%', '', 'Between $50,000 and $80,000', '24%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '5%'], ['1-person households', '45%', '', '2-person households', '32%', '', '3-person households', '12%', '', '4-person households', '8%', '', '5-person or more households', '4%'], ['Couples without children at home', '42%', '', 'Couples with children at home', '35%', '', 'Single-parent families', '23%'], ['Owners', '34%', '', 'Renters', '66%'], ['Before 1960', '43%', '', 'Between 1961 and 1980', '31%', '', 'Between 1981 and 1990', '10%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '7%', '', 'Between 2011 and 2016', '4%'], ['Single-family homes', '5%', '', 'Semi-detached or row houses', '5%', '', 'Buildings with less than 5 floors', '86%', '', 'Buildings with 5 or more floors', '4%', '', 'Mobile homes', '0%'], ['University', '29%', '', 'College', '17%', '', 'Secondary (high

Page: 43
Start scraping new page...
UNITED: [('Car friendly', '10'), ('Shopping', '6'), ('Daycares', '4'), ('Quiet', '8'), ('Daycares', '4'), ('Nightlife', '4'), ('Parks', '8'), ('Nightlife', '4'), ('Cafes', '4'), ('Transit friendly', '7'), ('Cafes', '4'), ('High Schools', '3'), ('Groceries', '7'), ('High Schools', '3'), ('Vibrant', '3'), ('Pedestrian friendly', '6'), ('Vibrant', '3'), ('Greenery', '2'), ('Cycling friendly', '6'), ('Greenery', '2'), ('Elementary Schools', '1'), ('Restaurants', '6'), ('Elementary Schools', '1'), ('Historic', '0')]
SET: {('Parks', '8'), ('Groceries', '7'), ('Greenery', '2'), ('Cafes', '4'), ('High Schools', '3'), ('Vibrant', '3'), ('Quiet', '8'), ('Restaurants', '6'), ('Car friendly', '10'), ('Historic', '0'), ('Transit friendly', '7'), ('Nightlife', '4'), ('Cycling friendly', '6'), ('Shopping', '6'), ('Pedestrian friendly', '6'), ('Daycares', '4'), ('Elementary Schools', '1')}
Population: ['Population (2016) 69,229', 'Population variation between 2011 a

DEMO. DATA: [['Less than $50,000', '50%', '', 'Between $50,000 and $80,000', '22%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '7%'], ['1-person households', '46%', '', '2-person households', '32%', '', '3-person households', '11%', '', '4-person households', '7%', '', '5-person or more households', '4%'], ['Couples without children at home', '44%', '', 'Couples with children at home', '32%', '', 'Single-parent families', '24%'], ['Owners', '35%', '', 'Renters', '65%'], ['Before 1960', '43%', '', 'Between 1961 and 1980', '18%', '', 'Between 1981 and 1990', '11%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '10%', '', 'Between 2011 and 2016', '13%'], ['Single-family homes', '2%', '', 'Semi-detached or row houses', '7%', '', 'Buildings with less than 5 floors', '71%', '', 'Buildings with 5 or more floors', '20%', '', 'Mobile homes', '0%'], ['University', '38%', '', 'College', '16%', '', 'Secondary (h

Page: 50
Start scraping new page...
UNITED: [('Quiet', '9'), ('Car friendly', '6'), ('Pedestrian friendly', '6'), ('Elementary Schools', '9'), ('Pedestrian friendly', '6'), ('Shopping', '6'), ('Daycares', '9'), ('Shopping', '6'), ('Greenery', '5'), ('Parks', '9'), ('Greenery', '5'), ('Cafes', '5'), ('Groceries', '8'), ('Cafes', '5'), ('Restaurants', '5'), ('Cycling friendly', '8'), ('Restaurants', '5'), ('Nightlife', '4'), ('Transit friendly', '7'), ('Nightlife', '4'), ('Vibrant', '3'), ('High Schools', '6'), ('Vibrant', '3'), ('Historic', '0')]
SET: {('Parks', '9'), ('Nightlife', '4'), ('Groceries', '8'), ('High Schools', '6'), ('Car friendly', '6'), ('Elementary Schools', '9'), ('Restaurants', '5'), ('Vibrant', '3'), ('Cycling friendly', '8'), ('Transit friendly', '7'), ('Daycares', '9'), ('Quiet', '9'), ('Cafes', '5'), ('Shopping', '6'), ('Pedestrian friendly', '6'), ('Historic', '0'), ('Greenery', '5')}
Population: ['Population (2016) 76,853', 'Population variation between 2011 and

DEMO. DATA: [['Less than $50,000', '58%', '', 'Between $50,000 and $80,000', '18%', '', 'Between $80,000 and $100,000', '7%', '', 'Between $100,000 and $150,000', '8%', '', 'More than $150,000', '9%'], ['1-person households', '55%', '', '2-person households', '31%', '', '3-person households', '9%', '', '4-person households', '4%', '', '5-person or more households', '2%'], ['Couples without children at home', '57%', '', 'Couples with children at home', '26%', '', 'Single-parent families', '17%'], ['Owners', '27%', '', 'Renters', '73%'], ['Before 1960', '33%', '', 'Between 1961 and 1980', '28%', '', 'Between 1981 and 1990', '11%', '', 'Between 1991 and 2000', '8%', '', 'Between 2001 and 2010', '13%', '', 'Between 2011 and 2016', '8%'], ['Single-family homes', '1%', '', 'Semi-detached or row houses', '2%', '', 'Buildings with less than 5 floors', '48%', '', 'Buildings with 5 or more floors', '49%', '', 'Mobile homes', '0%'], ['University', '52%', '', 'College', '15%', '', 'Secondary (high

1.0 % completed
Estimated remaining runtime: 0.2 hours
Page: 57
Start scraping new page...
UNITED: [('Quiet', '9'), ('Pedestrian friendly', '6'), ('Transit friendly', '5'), ('Car friendly', '9'), ('Transit friendly', '5'), ('Cafes', '5'), ('Cycling friendly', '9'), ('Cafes', '5'), ('Restaurants', '5'), ('Parks', '8'), ('Restaurants', '5'), ('Shopping', '5'), ('Groceries', '8'), ('Shopping', '5'), ('Vibrant', '3'), ('Greenery', '7'), ('Vibrant', '3'), ('High Schools', '2'), ('Elementary Schools', '6'), ('High Schools', '2'), ('Nightlife', '2'), ('Daycares', '6'), ('Nightlife', '2'), ('Historic', '0')]
SET: {('Car friendly', '9'), ('Groceries', '8'), ('Cycling friendly', '9'), ('Shopping', '5'), ('Restaurants', '5'), ('Vibrant', '3'), ('High Schools', '2'), ('Elementary Schools', '6'), ('Daycares', '6'), ('Historic', '0'), ('Greenery', '7'), ('Quiet', '9'), ('Cafes', '5'), ('Transit friendly', '5'), ('Pedestrian friendly', '6'), ('Parks', '8'), ('Nightlife', '2')}
Population: ['Populatio

DEMO. DATA: [['Less than $50,000', '38%', '', 'Between $50,000 and $80,000', '25%', '', 'Between $80,000 and $100,000', '12%', '', 'Between $100,000 and $150,000', '17%', '', 'More than $150,000', '8%'], ['1-person households', '28%', '', '2-person households', '32%', '', '3-person households', '17%', '', '4-person households', '15%', '', '5-person or more households', '8%'], ['Couples without children at home', '34%', '', 'Couples with children at home', '43%', '', 'Single-parent families', '23%'], ['Owners', '64%', '', 'Renters', '36%'], ['Before 1960', '13%', '', 'Between 1961 and 1980', '28%', '', 'Between 1981 and 1990', '31%', '', 'Between 1991 and 2000', '14%', '', 'Between 2001 and 2010', '11%', '', 'Between 2011 and 2016', '3%'], ['Single-family homes', '26%', '', 'Semi-detached or row houses', '24%', '', 'Buildings with less than 5 floors', '47%', '', 'Buildings with 5 or more floors', '3%', '', 'Mobile homes', '0%'], ['University', '16%', '', 'College', '17%', '', 'Secondary

Page: 64
Start scraping new page...
UNITED: [('Transit friendly', '10'), ('Cafes', '10'), ('Restaurants', '10'), ('Elementary Schools', '10'), ('Restaurants', '10'), ('Shopping', '10'), ('Pedestrian friendly', '10'), ('Shopping', '10'), ('Quiet', '8'), ('Parks', '10'), ('Quiet', '8'), ('High Schools', '7'), ('Groceries', '10'), ('High Schools', '7'), ('Historic', '7'), ('Vibrant', '10'), ('Historic', '7'), ('Greenery', '6'), ('Cycling friendly', '10'), ('Greenery', '6'), ('Daycares', '6'), ('Nightlife', '10'), ('Daycares', '6'), ('Car friendly', '4')]
SET: {('Greenery', '6'), ('Cycling friendly', '10'), ('Quiet', '8'), ('Nightlife', '10'), ('Elementary Schools', '10'), ('High Schools', '7'), ('Parks', '10'), ('Cafes', '10'), ('Transit friendly', '10'), ('Vibrant', '10'), ('Daycares', '6'), ('Shopping', '10'), ('Restaurants', '10'), ('Groceries', '10'), ('Car friendly', '4'), ('Pedestrian friendly', '10'), ('Historic', '7')}
Population: ['Population (2016) 87,168', 'Population variation

DEMO. DATA: [['Less than $50,000', '52%', '', 'Between $50,000 and $80,000', '22%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '7%'], ['1-person households', '49%', '', '2-person households', '29%', '', '3-person households', '11%', '', '4-person households', '8%', '', '5-person or more households', '3%'], ['Couples without children at home', '43%', '', 'Couples with children at home', '36%', '', 'Single-parent families', '21%'], ['Owners', '30%', '', 'Renters', '70%'], ['Before 1960', '52%', '', 'Between 1961 and 1980', '24%', '', 'Between 1981 and 1990', '9%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '5%', '', 'Between 2011 and 2016', '4%'], ['Single-family homes', '2%', '', 'Semi-detached or row houses', '3%', '', 'Buildings with less than 5 floors', '89%', '', 'Buildings with 5 or more floors', '6%', '', 'Mobile homes', '0%'], ['University', '41%', '', 'College', '17%', '', 'Secondary (high)

Page: 71
Start scraping new page...
UNITED: [('Elementary Schools', '10'), ('Greenery', '7'), ('Car friendly', '7'), ('Daycares', '10'), ('Car friendly', '7'), ('Restaurants', '6'), ('Pedestrian friendly', '9'), ('Restaurants', '6'), ('Shopping', '6'), ('Parks', '9'), ('Shopping', '6'), ('Cycling friendly', '5'), ('Groceries', '9'), ('Cycling friendly', '5'), ('Cafes', '5'), ('Quiet', '8'), ('Cafes', '5'), ('Vibrant', '4'), ('Transit friendly', '8'), ('Vibrant', '4'), ('Nightlife', '4'), ('High Schools', '8'), ('Nightlife', '4'), ('Historic', '2')]
SET: {('Parks', '9'), ('Vibrant', '4'), ('Nightlife', '4'), ('High Schools', '8'), ('Quiet', '8'), ('Elementary Schools', '10'), ('Restaurants', '6'), ('Daycares', '10'), ('Car friendly', '7'), ('Groceries', '9'), ('Pedestrian friendly', '9'), ('Historic', '2'), ('Greenery', '7'), ('Cafes', '5'), ('Shopping', '6'), ('Cycling friendly', '5'), ('Transit friendly', '8')}
Population: ['Population (2016) 84,234', 'Population variation between 201

DEMO. DATA: [['Less than $50,000', '48%', '', 'Between $50,000 and $80,000', '25%', '', 'Between $80,000 and $100,000', '10%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '6%'], ['1-person households', '36%', '', '2-person households', '30%', '', '3-person households', '15%', '', '4-person households', '12%', '', '5-person or more households', '6%'], ['Couples without children at home', '34%', '', 'Couples with children at home', '43%', '', 'Single-parent families', '23%'], ['Owners', '40%', '', 'Renters', '60%'], ['Before 1960', '25%', '', 'Between 1961 and 1980', '50%', '', 'Between 1981 and 1990', '10%', '', 'Between 1991 and 2000', '7%', '', 'Between 2001 and 2010', '4%', '', 'Between 2011 and 2016', '3%'], ['Single-family homes', '4%', '', 'Semi-detached or row houses', '8%', '', 'Buildings with less than 5 floors', '85%', '', 'Buildings with 5 or more floors', '4%', '', 'Mobile homes', '0%'], ['University', '25%', '', 'College', '17%', '', 'Secondary (hi

Page: 78
Start scraping new page...
UNITED: [('Elementary Schools', '10'), ('Transit friendly', '9'), ('Parks', '9'), ('Daycares', '10'), ('Parks', '9'), ('Vibrant', '9'), ('Pedestrian friendly', '10'), ('Vibrant', '9'), ('Nightlife', '8'), ('Groceries', '10'), ('Nightlife', '8'), ('Quiet', '7'), ('Cycling friendly', '10'), ('Quiet', '7'), ('Greenery', '7'), ('Cafes', '10'), ('Greenery', '7'), ('High Schools', '7'), ('Restaurants', '10'), ('High Schools', '7'), ('Historic', '6'), ('Shopping', '10'), ('Historic', '6'), ('Car friendly', '3')]
SET: {('Transit friendly', '9'), ('Cycling friendly', '10'), ('Historic', '6'), ('Elementary Schools', '10'), ('High Schools', '7'), ('Car friendly', '3'), ('Cafes', '10'), ('Vibrant', '9'), ('Groceries', '10'), ('Daycares', '10'), ('Restaurants', '10'), ('Shopping', '10'), ('Nightlife', '8'), ('Pedestrian friendly', '10'), ('Parks', '9'), ('Greenery', '7'), ('Quiet', '7')}
Population: ['Population (2016) 139,590', 'Population variation between 2011

DEMO. DATA: [['Less than $50,000', '52%', '', 'Between $50,000 and $80,000', '22%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '7%'], ['1-person households', '49%', '', '2-person households', '29%', '', '3-person households', '11%', '', '4-person households', '8%', '', '5-person or more households', '3%'], ['Couples without children at home', '43%', '', 'Couples with children at home', '36%', '', 'Single-parent families', '21%'], ['Owners', '30%', '', 'Renters', '70%'], ['Before 1960', '52%', '', 'Between 1961 and 1980', '24%', '', 'Between 1981 and 1990', '9%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '5%', '', 'Between 2011 and 2016', '4%'], ['Single-family homes', '2%', '', 'Semi-detached or row houses', '3%', '', 'Buildings with less than 5 floors', '89%', '', 'Buildings with 5 or more floors', '6%', '', 'Mobile homes', '0%'], ['University', '41%', '', 'College', '17%', '', 'Secondary (high)

Page: 85
Start scraping new page...
UNITED: [('Parks', '10'), ('Pedestrian friendly', '9'), ('Groceries', '9'), ('Cycling friendly', '10'), ('Groceries', '9'), ('Vibrant', '9'), ('Nightlife', '10'), ('Vibrant', '9'), ('Cafes', '9'), ('Restaurants', '10'), ('Cafes', '9'), ('Daycares', '8'), ('Shopping', '10'), ('Daycares', '8'), ('Historic', '5'), ('Transit friendly', '9'), ('Historic', '5'), ('Greenery', '4'), ('Elementary Schools', '9'), ('Greenery', '4'), ('Car friendly', '4'), ('High Schools', '9'), ('Car friendly', '4'), ('Quiet', '0')]
SET: {('Transit friendly', '9'), ('Cycling friendly', '10'), ('Daycares', '8'), ('Elementary Schools', '9'), ('Nightlife', '10'), ('Parks', '10'), ('High Schools', '9'), ('Vibrant', '9'), ('Restaurants', '10'), ('Shopping', '10'), ('Groceries', '9'), ('Greenery', '4'), ('Pedestrian friendly', '9'), ('Car friendly', '4'), ('Historic', '5'), ('Cafes', '9'), ('Quiet', '0')}
Population: ['Population (2016) 139,590', 'Population variation between 2011 an

DEMO. DATA: [['Less than $50,000', '48%', '', 'Between $50,000 and $80,000', '27%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '10%', '', 'More than $150,000', '5%'], ['1-person households', '30%', '', '2-person households', '29%', '', '3-person households', '16%', '', '4-person households', '15%', '', '5-person or more households', '9%'], ['Couples without children at home', '30%', '', 'Couples with children at home', '49%', '', 'Single-parent families', '21%'], ['Owners', '35%', '', 'Renters', '65%'], ['Before 1960', '14%', '', 'Between 1961 and 1980', '64%', '', 'Between 1981 and 1990', '8%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '6%', '', 'Between 2011 and 2016', '2%'], ['Single-family homes', '6%', '', 'Semi-detached or row houses', '3%', '', 'Buildings with less than 5 floors', '83%', '', 'Buildings with 5 or more floors', '8%', '', 'Mobile homes', '0%'], ['University', '22%', '', 'College', '16%', '', 'Secondary (high

Page: 92
Start scraping new page...
UNITED: [('Pedestrian friendly', '10'), ('Vibrant', '9'), ('High Schools', '8'), ('Groceries', '10'), ('High Schools', '8'), ('Daycares', '8'), ('Cycling friendly', '10'), ('Daycares', '8'), ('Nightlife', '8'), ('Restaurants', '10'), ('Nightlife', '8'), ('Cafes', '8'), ('Shopping', '10'), ('Cafes', '8'), ('Greenery', '6'), ('Transit friendly', '9'), ('Greenery', '6'), ('Historic', '5'), ('Elementary Schools', '9'), ('Historic', '5'), ('Car friendly', '4'), ('Parks', '9'), ('Car friendly', '4'), ('Quiet', '3')]
SET: {('Transit friendly', '9'), ('High Schools', '8'), ('Cycling friendly', '10'), ('Daycares', '8'), ('Greenery', '6'), ('Quiet', '3'), ('Historic', '5'), ('Elementary Schools', '9'), ('Vibrant', '9'), ('Groceries', '10'), ('Restaurants', '10'), ('Shopping', '10'), ('Car friendly', '4'), ('Nightlife', '8'), ('Pedestrian friendly', '10'), ('Parks', '9'), ('Cafes', '8')}
Population: ['Population (2016) 139,590', 'Population variation between 20

DEMO. DATA: [['Less than $50,000', '52%', '', 'Between $50,000 and $80,000', '24%', '', 'Between $80,000 and $100,000', '9%', '', 'Between $100,000 and $150,000', '11%', '', 'More than $150,000', '5%'], ['1-person households', '45%', '', '2-person households', '32%', '', '3-person households', '12%', '', '4-person households', '8%', '', '5-person or more households', '4%'], ['Couples without children at home', '42%', '', 'Couples with children at home', '35%', '', 'Single-parent families', '23%'], ['Owners', '34%', '', 'Renters', '66%'], ['Before 1960', '43%', '', 'Between 1961 and 1980', '31%', '', 'Between 1981 and 1990', '10%', '', 'Between 1991 and 2000', '5%', '', 'Between 2001 and 2010', '7%', '', 'Between 2011 and 2016', '4%'], ['Single-family homes', '5%', '', 'Semi-detached or row houses', '5%', '', 'Buildings with less than 5 floors', '86%', '', 'Buildings with 5 or more floors', '4%', '', 'Mobile homes', '0%'], ['University', '29%', '', 'College', '17%', '', 'Secondary (high

Page: 99
Start scraping new page...
UNITED: [('Transit friendly', '9'), ('Restaurants', '8'), ('Greenery', '7'), ('Elementary Schools', '9'), ('Greenery', '7'), ('Car friendly', '6'), ('Daycares', '9'), ('Car friendly', '6'), ('Cycling friendly', '6'), ('Groceries', '9'), ('Cycling friendly', '6'), ('Cafes', '6'), ('Shopping', '9'), ('Cafes', '6'), ('Vibrant', '5'), ('High Schools', '8'), ('Vibrant', '5'), ('Nightlife', '5'), ('Pedestrian friendly', '8'), ('Nightlife', '5'), ('Historic', '2'), ('Parks', '8'), ('Historic', '2'), ('Quiet', '0')]
SET: {('Transit friendly', '9'), ('High Schools', '8'), ('Restaurants', '8'), ('Vibrant', '5'), ('Car friendly', '6'), ('Elementary Schools', '9'), ('Pedestrian friendly', '8'), ('Shopping', '9'), ('Groceries', '9'), ('Daycares', '9'), ('Cafes', '6'), ('Historic', '2'), ('Greenery', '7'), ('Cycling friendly', '6'), ('Nightlife', '5'), ('Parks', '8'), ('Quiet', '0')}
Population: ['Population (2016) 143,853', 'Population variation between 2011 and 

Unnamed: 0,Building area,Building style,Cafes,Car friendly,Condominium type,Cycling friendly,Daycares,Elementary Schools,Greenery,Groceries,...,Year built,address,bedrooms,lat,long,powder room,price,rooms,title,walk_scoreNet area
0,,,7,6,Divided,5,10,10,4,10,...,2011,"7639, Avenue Léonard-De Vinci, apt. 6, Montréa...",2,45.5660642000,-73.6001460700,,"$274,000",6,Condo for sale,
1,,,10,4,Divided,10,9,9,8,10,...,1900,"3719, Rue de Mentana, Montréal (Le Plateau-Mon...",3,45.5219340000,-73.5685080000,,"$535,000",7,Condo for sale,
2,,"Two or more storey, Attached",5,8,,7,5,5,2,8,...,2004,"2811, Avenue Ernest-Hemingway, Montréal (Saint...",4,45.5104960000,-73.7136220000,1,"$796,000",8,House for sale,
3,,,9,3,Divided,10,10,10,6,10,...,2019,"5620, Rue De La Roche, apt. 105, Montréal (Ros...",2,45.5350426700,-73.5927857500,1,"$539,000",9,Condo for sale,
4,,,10,5,Divided,10,9,9,3,10,...,2020,"1000, Rue Ottawa, apt. 606, Montréal (Le Sud-O...",,45.4947180000,-73.5586710000,,"$55,000",,Condo for sale,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,7,6,Divided,7,9,8,6,10,...,2014,"3100, Rue Rachel Est, apt. 415, Montréal (Rose...",2,45.5430273700,-73.5612345600,,"$489,800",4,Condo for sale,
96,,,5,7,Divided,6,9,5,5,7,...,1993,"1010, Rue Thierry, Montréal (LaSalle)",4,45.4330675400,-73.6069172100,,"$649,000",9,Condo for sale,
97,,,10,4,Divided,4,10,9,6,10,...,1988,"1080, Rue Saint-Mathieu, apt. 304, Montréal (V...",2,45.4919480000,-73.5763680000,,"$548,000",8,Condo for sale,
98,,Detached,6,6,,6,9,9,7,9,...,1955,"7930, Rue des Écores, Montréal (Villeray/Saint...",,45.5566430000,-73.6129840000,,"$975,000",,Quadruplex for sale,


# Expedited troubleshooting

In [831]:
centris.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 36 columns):
Building area              0 non-null float64
Building style             37 non-null object
Cafes                      100 non-null object
Car friendly               100 non-null object
Condominium type           65 non-null object
Cycling friendly           100 non-null object
Daycares                   100 non-null object
Elementary Schools         100 non-null object
Greenery                   100 non-null object
Groceries                  100 non-null object
High Schools               100 non-null object
Historic                   100 non-null object
Lot area                   30 non-null object
Main unit                  15 non-null object
Nightlife                  100 non-null object
Number of units            15 non-null object
Parking (total)            63 non-null object
Parks                      100 non-null object
Pedestrian friendly        100 non-null object
Pool      

In [442]:
centris.data.describe(include="all")

Unnamed: 0,title,address,price,lat,long,descriptions,neighbourhood_indicators,demographics
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
top,,,,,,,,
freq,,,,,,,,


In [443]:
# # Elements from top of scrollable list
#     neighbourhood_top = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM)
    
#     # DEBUGGING-----------------------------------
#     print("FIRST:", [x.text for x in centris.driver.find_elements_by_class_name("ll-header")])
#     for button in centris.driver.find_elements_by_class_name("ll-header"):
#         time.sleep(1)
#         button.click()
#     # DEBUGGING-----------------------------------
    
#     # Scroll to buttom of list to load remaining elements
#     scrollable_bar = centris.driver.find_element_by_xpath(\
#                                             "//div[@class='ps__thumb-y']")
#     scrollable_bar.send_keys(Keys.PAGE_DOWN)
    
#     # Elements from buttom of scrollable list
#     neighbourhood_buttom = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM_buttom)
    
#     # LOGGING----------------------
#     print("Top neighbourhood:", [x.text for x in neighbourhood_top])
#     print("Buttom neighbourhood:", [x.text for x in neighbourhood_buttom])

In [524]:
# neighbourhood_buttom = centris.driver.find_elements_by_xpath(\
#                              "//div[@class='ll-list ps ps--active-y']")
neighbourhood_buttom = centris.driver.find_elements_by_xpath("//div[@class='ll-list']")
#last_button = neighbourhood_buttom[-1]
print("ONE:", [x.text for x in neighbourhood_buttom])
# Scroll to buttom of list to load remaining elements
scrollable_bar = centris.driver.find_element_by_xpath(\
                                        "//div[@class='ps__thumb-y']")
scrollable_bar.send_keys(Keys.PAGE_DOWN)
scrollable_bar.click()
neighbourhood_buttom = centris.driver.find_elements_by_class_name("ll-list")
#last_button = neighbourhood_buttom[-1]
neighbourhood_buttom[0].text

ONE: []


ElementClickInterceptedException: Message: element click intercepted: Element <div class="ps__thumb-y focus-visible" tabindex="0" style="top: 174px; height: 244px;" data-focus-visible-added=""></div> is not clickable at point (303, 40). Other element would receive the click: <nav class="navbar navbar-expand fixed-top">...</nav>
  (Session info: chrome=84.0.4147.105)


In [515]:
# Scroll to buttom of list to load remaining elements
scrollable_bar = centris.driver.find_element_by_xpath(\
                                        "//div[@class='ps__thumb-y']")
scrollable_bar.send_keys(Keys.PAGE_DOWN)
#scrollable_bar.click()

In [477]:
last_button.click()

In [504]:
scrollable_bar.click()

In [657]:
description = "1 bathroom and 1 powder room"
numeric = re.findall("\(*[0-9]+\)*", description)
text = re.findall("[A-Za-z]+\-*\s*[A-Za-z]*", description)
numeric

['1', '1']

In [733]:
headers_of_interest = ["rooms", "bedrooms", "powder room", "Building style",\
                               "Condominium type", "Year built", "Building area", "Lot area",\
                     "Net area", "Parking", "Main unit", "Potential gross revenue", "Pool"]

descriptions = {'rooms': '16', 'bedrooms ': '4', 'in basement': '(1', 'bathrooms': '2', 'powder room': '1',\
                'Building style': 'Two or more storey, Detached', 'Year built': '1999', 'Lot area': '13,424 sqft',\
                'Parking (total)': 'Driveway (6), Garage (3)', 'Fireplace/Stove': 'Wood stove',\
                'Additional features': 'Basement 6 feet or +'}
description_table = pd.DataFrame()
dat = pd.DataFrame()
# Ensures consistency accross listings
for header in headers_of_interest:
    if header in descriptions.keys():
        value = descriptions[header]
#         print(header, "found")
#         print(descriptions[header])
    else:
#         print(header, "NOT FOUND!!!")
        value = np.nan
    description_table[header] = pd.Series(value)
    print(header, ":", value)
    print(description_table)
    
# Concat description information
new_data = pd.concat([new_data, description_table], axis=1)

# Append new data to existing data    
dat = dat.append(new_data, ignore_index=True)

rooms : 16
  rooms
0    16
bedrooms : nan
  rooms  bedrooms
0    16       NaN
powder room : 1
  rooms  bedrooms powder room
0    16       NaN           1
Building style : Two or more storey, Detached
  rooms  bedrooms powder room                Building style
0    16       NaN           1  Two or more storey, Detached
Condominium type : nan
  rooms  bedrooms powder room                Building style  Condominium type
0    16       NaN           1  Two or more storey, Detached               NaN
Year built : 1999
  rooms  bedrooms powder room                Building style  Condominium type  \
0    16       NaN           1  Two or more storey, Detached               NaN   

  Year built  
0       1999  
Building area : nan
  rooms  bedrooms powder room                Building style  Condominium type  \
0    16       NaN           1  Two or more storey, Detached               NaN   

  Year built  Building area  
0       1999            NaN  
Lot area : 13,424 sqft
  rooms  bedrooms powder

In [762]:
indicators = {('Transit friendly', '9'), ('Restaurants', '7'), ('Nightlife', '4'), ('Vibrant', '5'), ('Car friendly', '6'), ('Elementary Schools', '9'), ('Quiet', '8'), ('High Schools', '7'), ('Shopping', '9'), ('Pedestrian friendly', '10'), ('Cycling friendly', '8'), ('Daycares', '10'), ('Groceries', '10'), ('Cafes', '6'), ('Greenery', '7'), ('Parks', '8')}
data = pd.DataFrame()
for indicator in indicators:
    header = indicator[0]
    value = indicator[1]
    data[header] = pd.Series(value)

data

Unnamed: 0,Transit friendly,Restaurants,Vibrant,Car friendly,Elementary Schools,Quiet,High Schools,Shopping,Pedestrian friendly,Cycling friendly,Daycares,Groceries,Cafes,Greenery,Nightlife,Parks
0,9,7,5,6,9,8,7,9,10,8,10,10,6,7,4,8
