# Documentation and Logic
Yahoo finance does not explicitly forbid web scraping.  They do offer an API, but according to the documentation the historical data was limited to hard values like 1 month, 1 day, ect.  You could not specify dates.  There is also a yahoo finance web scrapping package for python, but I found that it failed to get a lot of the data.  Random dates would come up as NA even though the data existed.  I’m making a de novo scraping to get around that issue.  We are primarily interested in historical data, so the code focuses on fetching the historical data table for the specified entities.

In [1]:
import pandas as pd
import time

from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager

# Set the executable path and initialize Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless = True)



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Driver [C:\Users\karar\.wdm\drivers\chromedriver\win32\93.0.4577.63\chromedriver.exe] found in cache


In [2]:
market_tickers = [
    '%5EGSPC', # S&P 500
    '%5EIXIC' # NASDAQ Composite
]

# cryptcurrency tickers
crypto_tickers = [
#     'AAVE',
#     'BNB',
    'BTC',
#     'ADA',
#     'LINK',
#     'ATOM',
#     'CRO',
#     'DOGE',
#     'EOS',
#     'ETH',
#     'MIOTA',
#     'LTC',
#     'XMR',
#     'XEM',
#     'DOT',
#     'SOL',
#     'XLM',
#     'USDT',
#     'TRX',
#     'UNI',
#     'USDC',
#     'WBTC',
#     'XRP'
]

crypto_tickers = [f"{tick}-USD" for tick in crypto_tickers]

In [3]:
def get_historical_data(browser, ticker, start, end):
    """Given a list of crypto tickers and a start and end data (format = ‘mm/dd/yyyy’) return the historical price data from Yahoo Finance."""
    
    final_df = pd.DataFrame()
    
    # get the base URL and UNIX date ranges
    base_url = 'https://finance.yahoo.com/quote/'
    start = create_epoch(start) - 86400
    end = create_epoch(end)
    days_100 = 86400 * 100
    
    # get info in chunks of 100 because of how YF loads
    while start < end:
        
        # get 100 days past the start as the end
        current_end = end if start + days_100 > end else start + days_100
        
        # visit website and prase out data
        url = f"{base_url}{ticker}/history?period1={start}&period2={current_end}"
        browser.visit(url)
        html = browser.html
        df = parse_html(bs(html, 'html.parser'))
        
        # add data to final df and make the current end_date next start date 
        final_df = final_df.append(df)
        start = current_end
        
    # final df cleaning
    final_df['Date'] = pd.to_datetime(final_df['Date'])
    final_df = final_df.sort_values(by = 'Date')
    
    return final_df.reset_index(drop = True) 

In [4]:
def create_epoch(date):
    """Given data (format = ‘mm/dd/yyyy’) return the Epoch."""
    date_time = f"{date} 12:59:59"
    pattern = '%m/%d/%Y %H:%M:%S'
    return int(time.mktime(time.strptime(date_time, pattern)))

In [5]:
# This is the function that will have to change if Yahoo Finance changes their website
def parse_html(fin_soup):
    """Given html data from Yahoo Finance return a price history dataframe."""
    
    # Parse out the rows
    rows = []
    for section in fin_soup.find('table').children:
        for tr in section:
            row = []
            for tx in tr:
                row.append(tx.text)
            rows.append(row)

    # make dataframe
    return pd.DataFrame(rows[1:len(rows) - 1], columns = rows[0])

In [6]:
history = []
# for tick in market_tickers:
#     history.append(get_historical_data(browser, ticker, '10/20/2015', '10/30/2015'))
for tick in crypto_tickers:
    history.append(get_historical_data(browser, tick, '12/1/2017', '12/1/2018'))
browser.quit()
history[0].shape

(367, 7)

In [7]:
history[0].head(50)

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,2017-12-01,10198.6,11046.7,9694.65,10975.6,10975.6,6783119872
1,2017-12-02,10978.3,11320.2,10905.1,11074.6,11074.6,5138500096
2,2017-12-03,11082.7,11858.7,10862.0,11323.2,11323.2,6608309760
3,2017-12-04,11315.4,11657.2,11081.8,11657.2,11657.2,6132409856
4,2017-12-05,11685.7,12032.0,11604.6,11916.7,11916.7,6895260160
5,2017-12-06,11923.4,14369.1,11923.4,14291.5,14291.5,12656300032
6,2017-12-07,14266.1,17899.7,14057.3,17899.7,17899.7,17950699520
7,2017-12-08,17802.9,18353.4,14336.9,16569.4,16569.4,21135998976
8,2017-12-09,16523.3,16783.0,13674.9,15178.2,15178.2,13911300096
9,2017-12-10,15168.4,15850.6,13226.6,15455.4,15455.4,13433299968


In [8]:
history[0].tail(20)

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
347,2018-11-12,6411.76,6434.21,6360.47,6371.27,6371.27,4295770000
348,2018-11-13,6373.19,6395.27,6342.67,6359.49,6359.49,4503800000
349,2018-11-14,6351.24,6371.55,5544.09,5738.35,5738.35,7398940000
350,2018-11-15,5736.15,5774.82,5358.38,5648.03,5648.03,7032140000
351,2018-11-16,5645.32,5657.02,5498.94,5575.55,5575.55,5279320000
352,2018-11-17,5578.58,5578.58,5519.56,5554.33,5554.33,4303150000
353,2018-11-18,5559.74,5653.61,5559.74,5623.54,5623.54,4159680000
354,2018-11-19,5620.78,5620.78,4842.91,4871.49,4871.49,7039560000
355,2018-11-20,4863.93,4951.61,4272.11,4451.87,4451.87,8428290000
356,2018-11-21,4465.54,4675.73,4343.98,4602.17,4602.17,6120120000
