# Web Scraping using Selenium 

Here we demonstrate how to scrape the historical stock data available on Yahoo Finance using Selenium, a browser automation tool, with Python 3. The script, which I wrote directly in a Jupyter notebook, lets the user choose the browser type; it then opens a browser instance on the Yahoo Finance page and scrolls the page so that all available information is obtained.

Note: Yahoo Finance cannot be navigated with Firefox, so use Chrome to continue the demo. Other sites may behave differently.

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
from selenium import webdriver
import selenium.webdriver.chrome.service as svc

  return f(*args, **kwds)


In [2]:
cd YahooFinanceScrape

/home/user1/Scripts/Python27/repos/Projects/YahooFinanceScrape


In [4]:
# Grant read/write/execute permission on the ChromeDriver file via the shell.
# os.system returns the shell's status for the command; a non-zero value
# means the command did not succeed.
# NOTE(review): get_hist_data launches './chromedriver' (no '.exe' suffix) --
# confirm that 'chromedriver.exe' is the file name actually present here.
os.system("chmod +rwx chromedriver.exe")

256

In [8]:
from selenium import webdriver
import selenium.webdriver.chrome.service as svc


def get_hist_data(tick, five_years = False, browser = "firefox"):
    """Scrape Yahoo Finance's historical-price table for ``tick``.

    Opens the ticker's history page in a Selenium-driven browser, scrolls
    down in steps, grabs the first ``<table>`` on the page, writes every row
    that has exactly seven ``<span>`` cells to ``data/<tick>.csv`` and
    returns those rows as a DataFrame.

    Args:
        tick: Ticker symbol as it appears in the Yahoo Finance URL
            (e.g. ``"GOOG"``). It is also used as the CSV file name.
        five_years: When false, the site's default history view is loaded.
            When true, the URL carries the fixed ``period1``/``period2``
            window hard-coded below.
        browser: ``"chrome"`` starts ``./chromedriver`` and connects to it
            through a remote session; any other value starts Firefox.

    Returns:
        pandas.DataFrame with the columns ``Date, Open, High, Low, Close*,
        Adj Close**, Volume``, as parsed by ``pd.read_csv`` with its default
        settings from the file just written.

    Raises:
        KeyError: if the scraped header row lacks one of the expected
            column names.
        Selenium and pandas errors (e.g. no ``<table>`` found, or no usable
        rows so the CSV is empty) propagate unchanged.
    """
    # Local imports keep this notebook cell runnable on its own.
    import csv
    import os

    # Same scroll sequence as before; the repeated offsets are kept on purpose.
    # NOTE(review): presumably they give the lazily loaded rows time to
    # appear -- confirm before trimming.
    scroll_offsets = (
        5000,
        10000, 10000,
        20000, 20000,
        40000, 40000,
        80000, 80000,
        160000, 160000,
        320000,
    )
    expected_columns = ["Date", "Open", "High", "Low", "Close*", "Adj Close**", "Volume"]

    if not five_years:
        url = "https://finance.yahoo.com/quote/" + tick + "/history?p=" + tick
    else:
        url = "https://finance.yahoo.com/quote/" + tick + "/history?period1=1382932800&period2=1540699200&interval=1d&filter=history&frequency=1d"

    service = None
    driver = None
    try:
        if browser == "chrome":
            service = svc.Service(r'./chromedriver')
            service.start()
            capabilities = {'chrome.binary': r'C:\Users\Ian\Scripts\webscrape\driver'}
            driver = webdriver.Remote(service.service_url, capabilities)
        else:
            fp = webdriver.FirefoxProfile()
            fp.set_preference("https.response.timeout", 10)
            fp.set_preference("dom.max_script_run_time", 10)
            driver = webdriver.Firefox(firefox_profile=fp)

        driver.get(url)
        driver.implicitly_wait(10)
        for offset in scroll_offsets:
            driver.execute_script("window.scrollTo(0, %d)" % offset)

        # Copy the table markup out of the browser so that parsing and file
        # writing can happen after the session has been closed.
        table = driver.find_element_by_tag_name("table")
        table_html = table.get_attribute("innerHTML")
    finally:
        # Always release the browser and the driver service, even when the
        # page load or the table lookup fails; otherwise every call leaks
        # a browser window and a chromedriver process.
        try:
            if driver is not None:
                driver.quit()
        finally:
            if service is not None:
                service.stop()

    table_bs4 = BeautifulSoup(table_html, "html.parser")
    rows = table_bs4.findAll("tr")

    # Saving data to file. csv.writer handles quoting/escaping and does not
    # leave a trailing comma (which used to create an empty eighth column).
    os.makedirs("data", exist_ok=True)
    csv_path = os.path.join("data", tick + ".csv")
    # UTF-8 matches the default encoding pd.read_csv uses when reading back.
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL, lineterminator="\n")
        for row in rows:
            cells = [col.get_text() for col in row.findAll("span")]
            # Only rows with exactly seven span cells are kept.
            if len(cells) == 7:
                for cell in cells:
                    # Echo each scraped value, as the notebook output shows.
                    print(cell)
                writer.writerow(cells)

    # Additionally creating manipulable pandas dataframe
    data = pd.read_csv(csv_path)[expected_columns]
    return data
    

In [9]:
data = get_hist_data("GOOG", browser="chrome")

Date
Open
High
Low
Close*
Adj Close**
Volume
Dec 28, 2018
1,049.62
1,055.56
1,033.26
1,037.08
1,037.08
1,398,885
Dec 27, 2018
1,017.15
1,043.89
997.00
1,043.88
1,043.88
2,104,100
Dec 26, 2018
989.01
1,040.00
983.00
1,039.46
1,039.46
2,373,300
Dec 24, 2018
973.90
1,003.54
970.11
976.22
976.22
1,590,300
Dec 21, 2018
1,015.30
1,024.02
973.69
979.54
979.54
4,596,000
Dec 20, 2018
1,018.13
1,034.22
996.36
1,009.41
1,009.41
2,673,500
Dec 19, 2018
1,033.99
1,062.00
1,008.05
1,023.01
1,023.01
2,479,300
Dec 18, 2018
1,026.09
1,049.48
1,021.44
1,028.71
1,028.71
2,192,500
Dec 17, 2018
1,037.51
1,053.15
1,007.90
1,016.53
1,016.53
2,385,400
Dec 14, 2018
1,049.98
1,062.60
1,040.79
1,042.10
1,042.10
1,686,600
Dec 13, 2018
1,068.07
1,079.76
1,053.93
1,061.90
1,061.90
1,329,800
Dec 12, 2018
1,068.00
1,081.65
1,062.79
1,063.68
1,063.68
1,523,800
Dec 11, 2018
1,056.49
1,060.60
1,039.84
1,051.75
1,051.75
1,394,700
Dec 10, 2018
1,035.05
1,048.45
1,023.29
1,039.55
1,039.55
1,807,700
Dec 07, 2018
1,060.01
1,0