# Scraping

In [82]:
"""
Created on June 2020
author: Jahnic Beck-Joseph
url: https://github.com/Jahnic/Portfolio/tree/master/RealEstate
"""

import pandas as pd
import numpy as np
from random import randint
import random
import time
import re

# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

In [83]:
# Entry points for the scrapers (search-result pages of each site)
centris_url = (
    "https://www.centris.ca/en/properties~for-sale"
    "?view=Thumbnail"
)
duproprio = (
    "https://duproprio.com/en/search/list"
    "?search=true&is_for_sale=1&with_builders=1&parent=1"
    "&pageNumber=1&sort=-published_at"
)

In [130]:
class Centris:
    """
    Wraps a Chrome webdriver session on centris.ca and accumulates the
    scraped listings in a DataFrame.

    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.driver - Chrome webdriver (None until start_driver() is called)
    self.DRIVER_PATH - path to Chrome webdriver
    self.old_DOM - web elements found in previous DOM
    """

    def __init__(self, url=centris_url):
        self.url = url
        self.data = pd.DataFrame()
        # Path to Chromedriver
        self.DRIVER_PATH = "/usr/bin/chromedriver"
        self.driver = None
        # Verification of new DOM
        self.reset_old_DOM()

    def reset_old_DOM(self):
        """Forgets all elements remembered from the previously scraped page."""
        self.old_DOM = {
            'title': [],
            'address': [],
            'price': [],
            'lat': [],
            'long': [],
            'descriptions': [],
            'neighbourhood_top': [],
            'demographics_buttons': [],
        }

    def append_data(self, title, address, price,
                    lat, long, descriptions, neighbourhood_indicators,
                    population, demographics):
        """Appends one listing as a new row to self.data.

        Args:
        title - string
        address - string
        price - price text of the listing
        lat - latitude of the listing
        long - longitude of the listing
        descriptions - dict mapping description headers to their values
        neighbourhood_indicators - single-row DataFrame of neighbourhood ratings
        population - single-row DataFrame of population summaries
        demographics - single-row DataFrame of demographic percentages
        """
        new_data = pd.DataFrame({
            'title': title,
            'address': address,
            'price': price,
            'lat': lat,
            'long': long,
        }, index=[0])

        # DESCRIPTIONS: one column per header
        description_table = pd.DataFrame()
        for header, value in descriptions.items():
            description_table[header] = pd.Series(value)

        # Join all single-row tables side by side into one row
        new_data = pd.concat([new_data, neighbourhood_indicators,
                              description_table, population, demographics],
                             axis=1)

        # URL
        # NOTE(review): this stores the starting url of the search, not the
        # url of the individual listing - confirm that this is intended.
        new_data['url'] = self.url

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works across pandas versions.
        self.data = pd.concat([self.data, new_data], sort=False,
                              ignore_index=True)

    def get_data(self):
        """Returns the DataFrame holding everything scraped so far."""
        return self.data

    def start_driver(self):
        """
        Starts a Chrome webdriver, stores it in self.driver and opens
        the page link found in the url attribute.
        """
        # NOTE(review): these options (including --headless) are built but
        # never handed to webdriver.Chrome, so the browser opens with a
        # visible window. The notebook asks the user to interact with that
        # window, so the behavior is left unchanged here - confirm intent.
        options = Options()
        options.add_argument('--start-maximized')  # open Browser in maximized mode
        options.add_argument('--incognito')
        options.add_argument("--headless")
        options.add_argument("--disable-dev-shm-usage")  # overcome limited resource problems

        # Start driver with url
        self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH)
        self.driver.set_window_size(1120, 1000)
        self.driver.get(self.url)

    def sort_listings(self):
        """Sorts listings in webdriver from newest to oldest."""

        # Click drop down menu
        drop_down = self.driver.find_element_by_xpath(
            "//button[@id='dropdownSort']")
        drop_down.click()

        # Sort by most recent listings
        sort_by = self.driver.find_element_by_xpath('//a[@data-option-value="3"]')
        sort_by.click()

    def goto_first_page(self):
        """Clicks the go-to-first-page button; prints a notice if that fails."""
        try:
            first_page = self.driver.find_element_by_xpath(
                "//li[@class='goFirst']")
            first_page.click()
        except Exception:
            # Exception (not a bare except) so that Ctrl-C still interrupts
            print("goFirst button not available")

    def next_page(self):
        """Moves the browser to the next page of the search.

        The click is attempted twice, 0.5 sec. apart; a notice is printed
        if both attempts fail.
        """
        # Return to top of page, to access next-page button
        body = self.driver.find_element_by_tag_name("body")
        body.send_keys(Keys.HOME)
        for _ in range(2):
            body.send_keys(Keys.PAGE_UP)

        for attempt in range(2):
            try:
                self.driver.find_element_by_xpath(
                    "//li[@class='next']").click()
                return
            except Exception:
                if attempt == 0:
                    # Try again after waiting 0.5 sec.
                    time.sleep(0.5)
        print("Next-page button not found!")

    def get_page_position(self):
        '''Returns the current and last page of the current search.

        Returns
        tuple - (current_page, last_page), both int
        '''
        # Pager text is split on " / "; thousands separators are removed
        pages = self.driver.find_element_by_xpath(
            "//li[@class='pager-current']").text.split(" / ")

        current_page, last_page = (int(page.replace(",", "")) for page in pages)

        return (current_page, last_page)

    def refresh_page(self):
        "Refreshes current webdriver page."
        self.driver.refresh()
        print("Page is being refreshed.")
        # Wait until page fully loaded
        time.sleep(3)

    @staticmethod
    def distance(origin, destination):
        """Calculates distances from latitudinal/longitudinal data using
        the haversine formula.

        Args:
        origin - (latitude, longitude) in degrees
        destination - (latitude, longitude) in degrees

        Returns:
        distance in km (earth radius taken as 6371 km)
        """
        # Local import: math is not among the imports at the top of the file
        import math

        lat1, lon1 = origin
        lat2, lon2 = destination
        radius = 6371  # km

        # Convert from degrees to radians
        dlat = math.radians(lat2 - lat1)
        dlon = math.radians(lon2 - lon1)

        # Haversine formula
        a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
            * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

        return radius * c
                
                                                 
# Module-level Centris instance. The scraping functions defined further down
# read this global (centris.driver, centris.old_DOM) instead of receiving the
# object as an argument.
centris = Centris()

The following functions need to be defined outside of the class. wait_for_xpath() determines the appropriate time to call get_data(). Initially, both functions were part of the class object. It seems that after the get_data() call, the driver does not get updated within the class. In some cases this leads to an old DOM being accessed after the browser has already switched to the next page. To circumvent this issue, elements are looked up outside the class and retried until they are accessible. This allows the entire new DOM to be loaded before get_data() is called.

In [126]:
 def wait_for_xpath(xpath: str, old_element):
        """
        Wait until elements in new DOM are accessible.

        Polls the global centris driver every 0.2 sec. for up to ~10 sec.
        If the lookup is still empty, or still equal to old_element, the
        page is refreshed and the wait starts over.

        Arg.
        xpath - xpath to new element
        old_element - element at xpath from previous DOM (found in centris.old_DOM)

        Returns:
        current_element - list of elements found in the new DOM at xpath
        """

        centris_driver = centris.driver
        element_at_xpath = []

        # Ensure that the NEW rather than the previous or no DOM is active
        # Maximum wait time 10 sec.
        time_passed = 0

        while (
            (element_at_xpath == old_element or element_at_xpath == [])
            and time_passed <= 10
        ):
            # Wait for DOM to load
            time.sleep(0.2)
            time_passed += 0.2

            # Attempt to load new DOM
            try:
                element_at_xpath = centris_driver.find_elements_by_xpath(xpath)
                print('--------------Found element at xpath-------------------')
            except Exception:
                # Best effort: a failed lookup is simply retried on the
                # next poll. Exception (not bare) keeps Ctrl-C working.
                pass

        # Decide on the lookup result instead of the accumulated float time,
        # so an element found on the very last poll is not treated as a
        # timeout.
        if element_at_xpath == old_element or element_at_xpath == []:
            # After 10 seconds unlikely to load at all -> refresh and retry
            print("RuntimeError: element not found.")
            centris.driver.refresh()
            time.sleep(5)
            # Return the retried lookup; previously its result was discarded
            # and the stale/empty element was returned. The full re-scrape
            # via get_data_from_centris() was dropped here: the caller is
            # already in the middle of a scrape and continues afterwards,
            # so re-running it appended the same listing twice.
            return wait_for_xpath(xpath, old_element)

        return element_at_xpath

In [86]:
def scrape_description(old_DOM):
    """Scrapes the description section of the current listing (year built,
    price, net area, etc.) and returns it as a dictionary.

    Requires the instantiated global centris object. old_DOM is the
    description element list remembered from the previous page.
    """
    description_xpath = "//div[@class='col-lg-12 description']"
    found_elements = wait_for_xpath(description_xpath, old_DOM)

    # One line of text per entry; the first three lines are not relevant
    all_lines = found_elements[0].text.split("\n")
    relevant_lines = all_lines[3:]

    # Remember the elements so the next page can be told apart from this one
    centris.old_DOM['descriptions'] = found_elements

    return extract_descriptions(relevant_lines)

In [87]:
# The data found on this part of the page is inconsistent across listings
    # The first row may contain the number of rooms, bedrooms and bathrooms without headers, or it may be missing
    # Following rows have headers with associated values after a line break
    # The very last element may be a walk score without header
    # Listings without a first row may supply the first-row information in subsequent rows with headers
    # Because of these inconsistencies, two separate extractions need to be implemented: one for
    # first-row elements (if they exist) and another for subsequent rows

def extract_descriptions(descriptions_list):
    """Takes in data from scrape_description() and returns it
    as a dictionary.

    The list is read in two parts:
    1. Leading lines that contain numbers ("2 bedrooms and 1 bathroom")
       are split into one entry per number, keyed by the text next to it.
    2. The remaining lines are read as alternating header/value pairs. A
       single left-over line at the very end is stored as "walk_score".

    Args:
    descriptions_list - list of strings, one per line of the description

    Returns:
    dict mapping headers to values (all values are strings)
    """
    # Transformed data
    data_dict = {}
    # Starting point for second part of transformation
    second_row_index = 0

    # First Part
    # A single pass that stops at the first header line. The previous
    # while/for combination never terminated for an empty list, for a list
    # made up of numeric lines only, or after a key/value count mismatch.
    for line in descriptions_list:
        numeric = re.findall(r"\(*[0-9]+\)*", line)  # numbers
        text = re.findall(r"[A-Za-z]+[A-Za-z\s\-]*", line)  # text after/inbetween numbers

        if not numeric:
            # No numeric information implies header
            break

        # For each value there must be one text description
        if len(numeric) != len(text):
            print("Unequal number of first row keys and values!")
            print("Numbers:", numeric)
            print("Text:", text)
            break

        for label, value in zip(text, numeric):
            # Drop the conjunction "and" as a whole word only, so that
            # words merely containing "and" are left intact
            label_clean = re.sub(r"\band\b", "", label).strip()
            data_dict[label_clean] = value
        second_row_index += 1

    # Index range of second extraction
    # Headers are found at every second index (0,2,4,...)
    # Values are one index apart from their corresponding header (1,3,5,...)
    list_length = len(descriptions_list)
    # An odd remainder implies an element without header -> Walk Score
    walk_score_listed = (list_length - second_row_index) % 2 == 1
    end_point = list_length - 1 if walk_score_listed else list_length

    # Second Part
    for header_index in range(second_row_index, end_point, 2):
        # Headers as column names; value is found at the subsequent index
        header = descriptions_list[header_index]
        data_dict[header] = descriptions_list[header_index + 1]

    if walk_score_listed:
        data_dict["walk_score"] = descriptions_list[-1]

    return data_dict

In [88]:
# def scrape_neighbourhood(old_DOM_top, old_DOM_mid, old_DOM_buttom):
#     """ Scrapes and returns a list of ratings 
#     between 0-10 for a set of neighborhood indicators
#     such as groceries, parks, noise, etc.)
#     """
#     driver = centris.driver
    
#     # Extract elements from top section of scrollable list
#     neighbourhood_top = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM_top)
#     # Split into indicators and ranking values
#     top = [x.text for x in neighbourhood_top][0].split("\n")
    
#     # LOGGING----------------------
#     print("Top neighbourhood:", top)
    
#     # Extract middle section - only one element
#     # Scroll and activate scrollable bar container
#     scrollable_bar = driver.find_element_by_xpath(\
#                                             "//div[@class='ps__thumb-y']")
#     ActionChains(driver).\
#         move_to_element(scrollable_bar).\
#         send_keys(Keys.PAGE_DOWN).\
#         click(scrollable_bar).perform()

#     # Elements from buttom of scrollable list
#     neighbourhood_mid = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM_mid)
#     # Split into indicators and ranking values
    
#     # LOGGING----------------------
#     print("Neighbourhoud mid section:", neighbourhood_mid)
    
#     middle = [x.text for x in neighbourhood_mid][0].split("\n")
    
#     # Extract buttom section
#     # Scroll and load remaining elements
# #     scrollable_bar = driver.find_element_by_xpath(\
# #                                             "//div[@class='ps__thumb-y']")
#     ActionChains(driver).\
#         move_to_element(scrollable_bar).\
#         send_keys(Keys.PAGE_DOWN).\
#         click(scrollable_bar).perform()
    
#     # Elements from buttom of scrollable list
#     neighbourhood_buttom = wait_for_xpath(\
#                             "//div[@class='ll-list ps ps--active-y']",\
#                             old_DOM_buttom)
#     # Split into indicators and ranking values
#     buttom = [x.text for x in neighbourhood_buttom][0].split("\n")
    
#     # LOGGING----------------------
#     print("Buttom neighbourhood:", buttom)
    
#     # Unite all three sections by storing tuples of indicator names and corresponding values
#     united_list = []
#     list_length = len(top)
#     for i in range(0, list_length, 2):
#         united_list.append((top[i], top[i+1]))
#         united_list.append((middle[i], middle[i+1]))
#         united_list.append((buttom[i], buttom[i+1]))
    
#     # Create set of unique tuples
#     neighbourhood_indicators = set(united_list)
    
#     # LOGGING----------------------
#     print("Number of neighborhood indicators: ", len(neighbourhood_indicators))
#     print("UNITED:", united_list)
#     print("SET:", neighbourhood_indicators)
    
#     # Verify size and extract information as list
#     # If size unexpected, refresh page and restart process
#     if len(neighbourhood_indicators) < 8:
#             centris.refresh_page()
#             scrape_neighbourhood(old_DOM_top, old_DOM_buttom)

#     # Update old_DOM dictionary with new elements for next verification
#     centris.old_DOM['neighbourhood_top'] = neighbourhood_top
#     centris.old_DOM['neighbourhood_mid'] = neighbourhood_mid
#     centris.old_DOM['neighbourhood_buttom'] = neighbourhood_buttom

#     return extract_neighbourhood_indicators(neighbourhood_indicators)

In [134]:
def scrape_neighbourhood(old_DOM_top, max_retries=3):
    """Scrapes the neighbourhood indicators (groceries, parks, noise, etc.)
    of the current listing and returns their ratings as a DataFrame.

    If fewer than 30 text entries (indicator names plus ratings) are found,
    the page is refreshed and the list is read again, at most max_retries
    times. After that, whatever was read last is used.

    Args:
    old_DOM_top - neighbourhood element list remembered from the previous page
    max_retries - number of page refreshes allowed when the list is too short

    Returns:
    pandas.DataFrame produced by extract_neighbourhood_indicators()
    """
    for attempt in range(max_retries + 1):
        # Extract elements from top section of scrollable list
        neighbourhood_top = wait_for_xpath(
            '//ul[@class="ll-module__list"]',
            old_DOM_top)

        # Alternating indicator names and ranking values, one per line
        neighbourhood_indicators = neighbourhood_top[0].text.split("\n")

        # Verify size; if unexpected, refresh page and read the list again.
        # The previous recursive retry discarded its own result and could
        # recurse without bound.
        if len(neighbourhood_indicators) >= 30:
            break
        print("Incorrect number of neighbourhood indicators")
        if attempt < max_retries:
            centris.driver.refresh()
            time.sleep(3)

    # Update old_DOM dictionary with new elements for next verification
    centris.old_DOM['neighbourhood_top'] = neighbourhood_top
    return extract_neighbourhood_indicators(neighbourhood_indicators)

In [90]:
def extract_neighbourhood_indicators(indicators):
    """Turns the flat list produced by scrape_neighbourhood() into a
    single-row DataFrame.

    indicators alternates between indicator name and rating; every name
    becomes a column holding its rating. A trailing name without a rating
    is ignored.
    """
    table = pd.DataFrame()
    names = indicators[0::2]
    ratings = indicators[1::2]
    for name, rating in zip(names, ratings):
        table[name] = pd.Series(rating)
    return table

In [91]:
def scrape_population():
    """Reads the element with id 'info' from the global centris driver and
    returns its population summaries (density, variation etc.) as produced
    by extract_population()."""
    info_element = centris.driver.find_element_by_id('info')
    # One summary per line of the element's text
    summary_lines = info_element.text.split("\n")
    return extract_population(summary_lines)

In [92]:
def extract_population(population):
    """Takes in population data from scrape_population() and returns it
    in tabular form as a DataFrame object.

    For every line, the last number is the value and the leading text is
    the column name; any earlier numbers on the line are appended to the
    column name. Lines without a number or without text (e.g. blank lines)
    are skipped.

    Args:
    population - list of strings, one population summary per line

    Returns:
    single-row pandas.DataFrame with integer values
    """
    data = pd.DataFrame()
    for info in population:
        units_removed = info.replace("hab/km2", "").strip()
        # Numeric data (thousands separators allowed)
        numeric = re.findall(r"[0-9]+[0-9,]*", units_removed)
        # Text data for column names
        header = re.findall(r"[a-zA-Z\s]+", units_removed)

        # Nothing to extract from this line; indexing below would fail
        if not numeric or not header:
            continue

        numeric_clean = numeric[-1].replace(",", "")
        header_clean = header[0]
        # Add numeric data to header excluding the value at index -1
        for numeric_head_data in numeric[:-1]:
            header_clean = header_clean + str(numeric_head_data) + " "

        data[header_clean] = pd.Series(numeric_clean).astype("int")
    return data

In [93]:
def scrape_demographics(old_DOM):
    """Scrapes and returns demographic data found in a clickable list.

    Every demographics button is clicked in turn and the text of the
    'socioDemoLabel' element is collected after each click. The collected
    texts are split into lines and handed to extract_demographics().

    Args:
    old_DOM - demographics button elements remembered from the previous page

    Returns:
    the result of extract_demographics() for the collected data
    """
    
    driver = centris.driver
    # Clickable list containing demographic data
    menu = driver.find_element_by_id("menu")
    # Load menu by moving browser to it
    ActionChains(driver).\
    move_to_element(menu).perform()
    
    #Buttons to access demographics data (education, incomes, etc.)
    demographics_buttons = wait_for_xpath(\
                        "//div[@class='centrisSocioDemobutton']",\
                                                 old_DOM)

    # LOGGING------------------------
    # print("DEMO. BUTTONS:", demographics_buttons)

    # Collected text of the demographics panel, one entry per button
    demographics = []

    # Click buttons to access next demographics elements
    for button in demographics_buttons:
        # NOTE(review): the bare except below also catches KeyboardInterrupt.
        try: 
            button.click()
        except: 
            print("Demographics button missing!")
            # Reattempt loading buttons
            # NOTE(review): `menu` was located before this refresh; presumably
            # it is stale afterwards - confirm that move_to_element still works.
            centris.refresh_page()
            ActionChains(driver).\
            move_to_element(menu).\
            perform()
            time.sleep(2) # extra time to load
            # NOTE(review): rebinding the name does not change the list this
            # for loop is iterating; the re-fetched buttons are only used for
            # the old_DOM update at the end, and the failed button is not
            # clicked again.
            demographics_buttons = wait_for_xpath(\
                        "//div[@class='centrisSocioDemobutton']",\
                                                 old_DOM)
            
        # Get and append data after button click (also runs after a failed click)
        demographic_data = driver.find_element_by_class_name(\
                         "socioDemoLabel")
        demographics.append(demographic_data.text)
    
    # Split each demographic component into separate list
    # Example: splits "Occupation" data into -> ["Owners", "35%", "Renters", "65%"]
    demographics = [demo.split("\n") for demo in demographics]
    
    #LOGGING------------------------
#     print("DEMO. DATA:", demographics)
#     print("-"*50)
    
    # Update old_DOM dictionary with new elements for next verification
    centris.old_DOM['demographics_buttons'] = demographics_buttons
    return extract_demographics(demographics)

In [94]:
def extract_demographics(demographics):
    """Converts the nested lists collected by scrape_demographics() into a
    single-row DataFrame.

    Each inner list has the form [header, value, header, value, ...] and may
    contain empty strings, which are ignored. Headers get " (%)" appended;
    values lose their "%" sign and are stored as integers.
    """
    table = pd.DataFrame()
    for group in demographics:
        # Empty strings stem from splitting double line breaks \n\n
        entries = list(filter(lambda item: item != "", group))
        position = 0
        while position < len(entries):
            label = entries[position] + " (%)"  # units go into the column name
            share = entries[position + 1].replace("%", "")  # and out of the value
            table[label] = pd.Series(share).astype("int")
            position += 2

    return table

In [137]:
def get_data_from_centris():
    """
    Scrapes the listing currently open in the webdriver of the global
    centris object and appends it as a new row to that object's data.
    """
    previous = centris.old_DOM

    # Header fields of a listing, in the order in which they are awaited
    header_xpaths = (
        ('title', "//span[@data-id='PageTitle']"),
        ('address', "//h2[@itemprop='address']"),
        ('price', "//span[@itemprop='price']"),
        ('lat', "//meta[@itemprop='latitude']"),
        ('long', "//meta[@itemprop='longitude']"),
    )

    print("Start scraping new page...")
    found = {}
    for key, xpath in header_xpaths:
        found[key] = wait_for_xpath(xpath, previous[key])

    # Remember the new elements so the next page can be told apart
    centris.old_DOM.update(found)

    # Text of the visible fields, content attribute of the meta tags
    title = found['title'][0].text
    address = found['address'][0].text
    price = found['price'][0].text
    lat = found['lat'][0].get_attribute("content")
    lon = found['long'][0].get_attribute("content")

    # Remaining sections of the page
    descriptions = scrape_description(previous['descriptions'])
    neighbourhood_indicators = scrape_neighbourhood(previous['neighbourhood_top'])
    population = scrape_population()
    demographics = scrape_demographics(previous['demographics_buttons'])

    # Unify data in single dataframe and append to results table
    centris.append_data(title, address, price, lat, lon, descriptions,
                        neighbourhood_indicators, population, demographics)
        
        # # LOGGING--------------------------
        # print("GET DATA: DESCRIPTIONS:", descriptions)

## Initiate scraping

In [144]:
# Create a fresh Centris object (this replaces the global instance and
# starts with an empty data table), then open the browser on its start url
# and report how long the start-up took.
centris = Centris()
start = time.time()
centris.start_driver()
print("Execution time:", time.time() - start)

Execution time: 5.7538886070251465


Before running the next cell, search for the region(s) you want to scrape in the webdriver window.
This is not required but will narrow results and reduce runtime.

In [60]:
# # Do not include already scraped listings
# centris.sort_listings() # newest listings first
# already_scraped = pd.read_csv('data/centris_montreal_complete.csv')
# already_scraped = set(already_scraped.address)

In [145]:
# Scrapes every page from the current pager position to the last page,
# one listing per page, and prints a progress report at each new percent.
start = time.time()
current_page, last_page = centris.get_page_position()
pages_to_scrape = last_page - current_page + 1  # in case scraping was interrupted
# Set True to skip listings that were already scraped. This requires the
# `already_scraped` set of addresses to be defined beforehand.
exclude_old_data = False
last_reported_percent = 0  # highest whole percent already reported

print("Scraping initiated.")
print("Total number of pages to scrape:", pages_to_scrape)
print("Estimated runtime:", round(pages_to_scrape*((9.6)/(60*60)), 2), "hours")
print("="*50)

for i in range(pages_to_scrape):

    print("="*50)
    print("Page:", i+1)

    # Refresh every 20 pages to clear memory build-up
    if (i+1) % 20 == 0:
        print("Clearing memory")
        print("-"*50)

        # Each refresh frees some memory. Four seem to work best.
        # `_` keeps the page counter `i` of the outer loop intact.
        for _ in range(4):
            centris.driver.refresh()
        # Extra time after the last refresh ensures that DOM is fully loaded
        time.sleep(2)

    # Check whether listing was already scraped
    if exclude_old_data:
        current_address = wait_for_xpath("//h2[@itemprop='address']", centris.old_DOM['address'])[0]
        if current_address.text not in already_scraped:
            get_data_from_centris()  # Retrieve data only if not already scraped
        else:
            print(current_address.text, 'was already scraped!')
    else:
        get_data_from_centris()  # Retrieve data

    # Move on and give the next page time to load
    centris.next_page()
    time.sleep(2)  # wait for next page
    centris.driver.refresh()
    time.sleep(2)

    # Percent completed of this scraping run
    pages_done = i + 1
    percent_complete = int(100 * pages_done / pages_to_scrape)
    # Print whenever a new whole-percent mark has been reached
    if percent_complete > last_reported_percent:
        last_reported_percent = percent_complete
        execution_time = (time.time() - start) / pages_done  # seconds per page
        print(percent_complete, "%", "completed")
        print("Average execution time per page:", round(execution_time, 2), "sec.")
        print("Estimated remaining runtime:",
              round((pages_to_scrape - pages_done) * (execution_time / (60*60)), 1),
              "hours <", "-"*50)
        print("="*50)


# Wall-clock time of the whole run (not the per-page average)
print("Total runtime:", (time.time() - start)/(60*60), "hours")
centris.data

ment at xpath-------------------
Number of neighborhood indicators:  34
11
List: ['Car friendly', '10', 'Parks', '7', 'Grocery stores', '7', 'Coffee Shops', '6', 'Quiet', '5', 'Greenery', '5', 'Elementary schools', '5', 'Shopping', '5', 'Transit friendly', '4', 'Pedestrian friendly', '4', 'Nightlife', '4', 'Restaurants', '4', 'High schools', '3', 'Cycling friendly', '3', 'Daycares', '2', 'Historic', '2', 'Vibrancy', '2']
++++++++++++++++++++Demographics+++++++++++++++++++++++
Waiting for xpath
Old element: False
Empty element: True
--------------Found element at xpath-------------------
[['Less than $50,000', '29%', '', 'Between $50,000 and $80,000', '21%', '', 'Between $80,000 and $100,000', '11%', '', 'Between $100,000 and $150,000', '20%', '', 'More than $150,000', '18%'], ['1-person households', '30%', '', '2-person households', '31%', '', '3-person households', '15%', '', '4-person households', '17%', '', '5-person or more households', '7%'], ['Couples without children at home', '

AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
# Save data
# centris.data.to_csv("centris_montreal_complete.csv")

In [143]:
centris.data

Unnamed: 0,title,address,price,lat,long,Pedestrian friendly,Grocery stores,Vibrancy,Coffee Shops,Restaurants,...,Others languages (%),url,bathroom,Fireplace/Stove,powder room,Gross area,Additional features,bedroom,Building style,in basement
0,Condo for sale,"350, Rue Eleanor, apt. 323, Montréal (Le Sud-O...","$788,000",45.492995,-73.56258,10.0,10.0,10.0,10.0,10.0,...,15,https://www.centris.ca/en/properties~for-sale?...,,,,,,,,
1,Condo for sale,"350, Rue Eleanor, apt. 323, Montréal (Le Sud-O...","$788,000",45.492995,-73.56258,10.0,10.0,10.0,10.0,10.0,...,15,https://www.centris.ca/en/properties~for-sale?...,,,,,,,,
2,Condo for sale,"2486, Rue Duvernay, apt. 203, Montréal (Le Sud...","$638,000",45.482259,-73.574221,10.0,9.0,8.0,9.0,10.0,...,15,https://www.centris.ca/en/properties~for-sale?...,1.0,Fireplace - Other,,,,,,
3,Condo for sale,"1719, Rue Saint-Patrick, apt. PH401, Montréal ...","$5,750,000",45.48561639,-73.56354297,10.0,8.0,7.0,8.0,10.0,...,15,https://www.centris.ca/en/properties~for-sale?...,,Gas fireplace,1.0,"4,554 sqft",Elevator,,,
4,Condo for sale,"10224, boulevard Saint-Laurent, apt. 202, Mont...","$359,900",45.54790968,-73.66368966,10.0,10.0,5.0,6.0,8.0,...,25,https://www.centris.ca/en/properties~for-sale?...,1.0,,,960 sqft,,,,
5,Condo for sale,"3535, Avenue Papineau, apt. 408, Montréal (Le ...","$430,000",45.529408,-73.566471,,,,,,...,10,https://www.centris.ca/en/properties~for-sale?...,,,,,"Adapted for reduced mobility, Elevator",,,
6,Condo for sale,"3465, Chemin de la Côte-des-Neiges, apt. 85, M...","$332,000",45.496619,-73.583415,,,,,,...,20,https://www.centris.ca/en/properties~for-sale?...,,,,,Elevator,,,
7,Condo for sale,"4478, Rue Clark, apt. 3, Montréal (Le Plateau-...","$399,000",45.519038,-73.586264,10.0,10.0,10.0,10.0,10.0,...,10,https://www.centris.ca/en/properties~for-sale?...,1.0,,,,,,,
8,Condo for sale,"1254, Rue Saint-Marc, apt. 42, Montréal (Ville...","$425,000",45.492286,-73.579815,10.0,10.0,10.0,10.0,10.0,...,20,https://www.centris.ca/en/properties~for-sale?...,1.0,,,,Elevator,,,
9,Condo for sale,"1188, Rue Saint-Antoine Ouest, apt. 1101, Mont...","$459,000",45.49557984,-73.56791385,,,,,,...,20,https://www.centris.ca/en/properties~for-sale?...,1.0,,,,,1.0,,
