In [34]:
#Import required packages
import os
import time

import numpy as np
import pandas as pd
import requests

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [35]:
#Set chromedriver executable path.
# Raw string: the Windows path contains backslash sequences (\P, \G, \C, \A) that
# are invalid escapes in a normal string literal and trigger a SyntaxWarning on
# recent Python versions. The resulting value is unchanged.
chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

#Define function to extract walkability, transit, and bike indexes
def _scrape_neighborhood(driver, url, neighborhood):
    """Search Redfin for one neighborhood and return its result row.

    Returns a 5-item list: [neighborhood, walk, transit, bike, median sale price].
    If the results page does not load in time, an expected element is missing,
    or the score text holds fewer numbers than expected, the four data fields
    are NaN instead.
    """
    filler = [neighborhood, np.nan, np.nan, np.nan, np.nan]

    driver.get(url)

    try:
        search_bar = driver.find_element_by_id("search-box-input")
        search_bar.send_keys(neighborhood + ', San Francisco, CA')
        search_bar.send_keys(Keys.RETURN)

        #Wait up to 10 seconds for the browser to leave the start page
        WebDriverWait(driver, 10).until(EC.url_changes(url))

        #Find div containing walk-scores class and median sale price
        content_walk = driver.find_element_by_xpath("//div[contains(@class, 'walk-score')]")
        content_med_sale_price = driver.find_element_by_xpath("//div[contains(@class, 'trends')]//li[5]//div//span[2]//span")

        #Keep every purely numeric token from the div text
        numbers = [int(s) for s in content_walk.text.split() if s.isdigit()]

        # Indices 0, 2, 4 are kept and 1, 3 skipped; presumably the skipped
        # values are the "100" denominators -- TODO confirm against page layout.
        return [neighborhood, numbers[0], numbers[2], numbers[4], content_med_sale_price.text]

    except (NoSuchElementException, TimeoutException, IndexError):
        # Best effort: record the neighborhood with empty values and keep going
        # rather than aborting the whole run and losing the rows gathered so far.
        print('  No scores / sale price found for ' + neighborhood)
        return filler


def get_info_and_scores(url, neighborhood_list):
    """Scrape walk, transit and bike scores plus median sale price from Redfin.

    Parameters
    ----------
    url : str
        Page that holds the search box; it is reloaded before every search.
    neighborhood_list : list of str
        Neighborhood names; ', San Francisco, CA' is appended to each query.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `neighborhood_list` with columns 'neighborhood',
        'walk_score_of_100', 'transit_score_of_100', 'bike_score_of_100' and
        'median_sale_price_$'. Each new row is inserted at the top, so rows
        appear in the reverse order of `neighborhood_list`. Neighborhoods that
        could not be scraped have NaN in the four data columns.

    Notes
    -----
    Uses the module-level `chromedriver` path to start Chrome. The browser is
    always closed before returning, even if scraping raises.
    """
    driver = webdriver.Chrome(chromedriver)
    df = pd.DataFrame(columns=['neighborhood', 'walk_score_of_100','transit_score_of_100','bike_score_of_100','median_sale_price_$'])
    print('Scraping Neighborhoods from Redfin\n')

    try:
        for neighborhood in neighborhood_list:
            print('Scraping ' + neighborhood)
            row = _scrape_neighborhood(driver, url, neighborhood)

            #Prepend the row: insert at index -1, shift all labels up, re-sort
            df.loc[-1] = row
            df.index = df.index + 1
            df = df.sort_index()
    finally:
        # Release the browser process no matter how the loop ended.
        driver.quit()

    print('Scraping Done!')

    return df

#Define site URL for scraping
url = "https://www.redfin.com/"

#Define neighborhood list as available in the SF data
# Each name appears once: a repeated entry would be scraped twice and produce a
# duplicate row in the resulting table.
# NOTE(review): 'Marina Distric' looks like a typo for 'Marina District'. It is
# left unchanged because these names are meant to match the SF data -- confirm
# against that data before correcting it.
neighborhood_list = [
    'Western Addition', 'Bernal Heights', 'Haight Ashbury', 'Mission',
    'Potrero Hill', 'Civic Center / Van Ness', 'Castro', 'Upper Market',
    'Inner Sunset', 'South of Market', 'Noe Valley', 'Outer Richmond',
    'Presidio Heights', 'Nob Hill', 'Ocean View Terrace', 'Pacific Heights',
    'Financial District', 'Twin Peaks', 'Russian Hill', 'Outer Sunset',
    'North Beach', 'Glen Park', 'Marina Distric', 'Inner Richmond',
    'Excelsior', 'Seacliff', 'Chinatown', 'Bayview', 'Diamond Heights',
    'West of Twin Peaks', 'Outer Mission', 'Parkside', 'Lakeshore',
    'Crocker Amazon', 'Golden Gate Park', 'Visitacion Valley',
]


In [36]:
# Run the scraper over every neighborhood; progress is printed as it goes.
df = get_info_and_scores(url=url, neighborhood_list=neighborhood_list)

Scraping Neighborhoods from Redfin

Scraping Western Addition
Scraping Bernal Heights
Scraping Haight Ashbury
Scraping Mission
Scraping Potrero Hill
Scraping Civic Center / Van Ness
Scraping Castro
Scraping Upper Market
Scraping Inner Sunset
Scraping South of Market
Scraping Noe Valley
Scraping Outer Richmond
Scraping Presidio Heights
Scraping Nob Hill
Scraping Ocean View Terrace
Scraping Pacific Heights
Scraping Financial District
Scraping Twin Peaks
Scraping Russian Hill
Scraping Outer Sunset
Scraping North Beach
Scraping Glen Park
Scraping Marina Distric
Scraping Inner Richmond
Scraping Excelsior
Scraping Seacliff
Scraping Chinatown
Scraping Bayview
Scraping Diamond Heights
Scraping West of Twin Peaks
Scraping Outer Mission
Scraping Parkside
Scraping Lakeshore
Scraping Crocker Amazon
Scraping Golden Gate Park
Scraping Visitacion Valley
Scraping Presidio Heights
Scraping Done!


In [37]:
#Save data to csv file
# Build the path from its components instead of a backslash-separated literal:
# the old literal relied on invalid escape sequences (\D, \S, \s) and only
# resolved to the intended folder on Windows.
df.to_csv(
    os.path.join('..', 'Data', 'San Francisco Data', 'sf_walk_and_sale_df.csv'),
    index=False,
)