**Adaptation of Debra Ray's Yahoo Finance Stock Price Scraper, using lxml**

In [1]:
import io
import time
from datetime import datetime, timedelta

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html

In [2]:
def format_date(date_datetime):
    """Return the epoch timestamp of ``date_datetime`` as a string of digits.

    The datetime is converted with ``time.mktime``, which interprets the
    time tuple as local time; the result is truncated to whole seconds
    before being turned into a string.
    """
    epoch_seconds = time.mktime(date_datetime.timetuple())
    return str(int(epoch_seconds))

def subdomain(symbol, start, end, filter='history'):
    """Build the path-and-query part of a history URL for ``symbol``.

    ``start`` and ``end`` are inserted as the ``period1`` and ``period2``
    query values; ``filter`` is inserted as the ``filter`` query value.
    The returned string begins with ``/quote/``.
    """
    query = f'period1={start}&period2={end}&interval=1d&filter={filter}&frequency=1d'
    return f'/quote/{symbol}/history?{query}'
    
def header_function(subdomain):
    """Return the request-header dict used for a history-page request.

    Every entry is a fixed string except ``"path"``, which is set to the
    ``subdomain`` argument.
    """
    # NOTE(review): the "cookie" value looks like a placeholder — confirm
    # whether a real cookie is required.
    return {
        "authority": "finance.yahoo.com",
        "method": "GET",
        "path": subdomain,
        "scheme": "https",
        "accept": "text/html",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        "cookie": "Cookie:identifier",
        "dnt": "1",
        "pragma": "no-cache",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64)",
    }

In [3]:
def scrape_page(url, header, timeout=30):
    """Download ``url`` and parse the first HTML table on the page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    header : dict
        Request headers passed to ``requests.get``.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``requests.get`` so a stalled
        connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    list of pandas.DataFrame
        The result of ``pd.read_html`` applied to the first ``<table>``
        element found in the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the response contains no ``<table>`` element.
    """
    page = requests.get(url, headers=header, timeout=timeout)
    # Fail here on an error status instead of later on a missing table.
    page.raise_for_status()

    element_html = html.fromstring(page.content)
    tables = element_html.xpath('//table')
    if not tables:
        # Same exception type as the former bare ``table[0]`` lookup, but
        # with a message that says which request produced no table.
        raise IndexError(f'no <table> element found in response from {url}')

    # Serialise only the first table so pandas does not re-parse the whole page.
    table_xml = lxml.etree.tostring(tables[0], method='xml')

    # Passing literal markup to read_html is deprecated (pandas >= 2.1);
    # a file-like wrapper works on old and new versions alike.
    return pd.read_html(io.BytesIO(table_xml))

In [4]:
symbol = 'TSLA'

N_WINDOWS = 45                    # number of consecutive date windows to request
WINDOW = timedelta(days = 100)    # span covered by a single request

# No trailing slash here: subdomain() already returns a path that starts
# with '/', so a trailing slash would produce '//quote/...' in the URL.
base_url = 'https://finance.yahoo.com'

# Read the clock once so the first window is exactly WINDOW long.
dt_end = datetime.today()
dt_start = dt_end - WINDOW

# One DataFrame per scraped table, most recent window first.
price_history = []

for _ in range(N_WINDOWS):
    start = format_date(dt_start)
    end = format_date(dt_end)

    sub = subdomain(symbol, start, end)
    header = header_function(sub)

    url = base_url + sub
    price_history += scrape_page(url, header)

    # Slide the window back: the next request ends where this one started.
    dt_end = dt_start
    dt_start -= WINDOW

In [5]:
# Preview the first five rows of the first scraped table.
price_history[0].head(n=5)

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,"Jun 08, 2020",919.0,933.44,909.16,928.66,928.66,7985397
1,"Jun 05, 2020",877.84,886.52,866.2,885.66,885.66,7796100
2,"Jun 04, 2020",889.88,895.75,858.44,864.38,864.38,8887700
3,"Jun 03, 2020",888.12,897.94,880.1,882.96,882.96,7949500
4,"Jun 02, 2020",894.7,908.66,871.0,881.56,881.56,13565600


In [6]:
# Stack every scraped table into one frame.
history_raw = pd.concat(price_history, ignore_index=True)

# Keep only rows whose Date cell parses as a date such as "Jun 08, 2020".
# This replaces dropping the last five index labels in place: that approach
# removed rows by label with a hard-coded count, so it could discard real
# price rows or leave junk rows behind when the date range or symbol changes.
# NOTE(review): the rows previously dropped were presumably non-data footer
# rows from the page — confirm against a fresh scrape.
# NOTE(review): '%b' month names are assumed to be English — confirm locale.
is_price_row = pd.to_datetime(
    history_raw['Date'], format='%b %d, %Y', errors='coerce'
).notna()

df = history_raw.loc[is_price_row].set_index('Date')
df

Unnamed: 0_level_0,Open,High,Low,Close*,Adj Close**,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Jun 08, 2020",919.00,933.44,909.16,928.66,928.66,7985397
"Jun 05, 2020",877.84,886.52,866.20,885.66,885.66,7796100
"Jun 04, 2020",889.88,895.75,858.44,864.38,864.38,8887700
"Jun 03, 2020",888.12,897.94,880.10,882.96,882.96,7949500
"Jun 02, 2020",894.70,908.66,871.00,881.56,881.56,13565600
...,...,...,...,...,...,...
"Jul 06, 2010",20.00,20.00,15.83,16.11,16.11,6866900
"Jul 02, 2010",23.00,23.10,18.71,19.20,19.20,5139800
"Jul 01, 2010",25.00,25.92,20.27,21.96,21.96,8218800
"Jun 30, 2010",25.79,30.42,23.30,23.83,23.83,17187100


In [7]:
# Save the history next to the notebook as '<symbol>.csv' (e.g. 'TSLA.csv'),
# writing the Date index as the first column. The name follows `symbol` so
# the file cannot be mislabeled when a different ticker is scraped.
df.to_csv(f'{symbol}.csv', index = True)