# Yahoo Finance Web Scraper by Lidor ES

In [1]:
import re
import json
import csv
import requests
from io import StringIO
from bs4 import BeautifulSoup

In [2]:
# Yahoo Finance quote-page URL templates. Each one has two `{}` slots: the
# ticker in the path and the ticker again as the `p` query parameter.
url_stats, url_profile, url_financials = (
    'https://finance.yahoo.com/quote/{}/key-statistics?p={}',
    'https://finance.yahoo.com/quote/{}/profile?p={}',
    'https://finance.yahoo.com/quote/{}/financials?p={}',
)

In [3]:
# Ticker symbol passed to `.format()` when building the request URLs.
stock = 'F'

# Financials Data

In [4]:
# Download the financials page; both placeholders of the template (path
# segment and `p` parameter) receive the same ticker symbol.
financials_url = url_financials.format(stock, stock)
response = requests.get(financials_url)

In [5]:
# Parse the downloaded HTML with the 'html.parser' backend.
page_html = response.text
soup = BeautifulSoup(page_html, features='html.parser')

In [8]:
# The page embeds its data as JavaScript inside a <script> element whose text
# contains the marker comment "/* -- Data -- */"; the regex matches that marker.
pattern = re.compile(r'\s--\sData\s--\s')

# `string=` is the current name of the keyword formerly called `text=`.
script_tag = soup.find('script', string=pattern)
if script_tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError('no <script> element containing the "-- Data --" marker was found')

# First child of the element: the script's source text.
script_data = script_tag.contents[0]

In [9]:
# Beginning: the first 500 characters, showing the JavaScript wrapper that
# precedes the JSON object assigned to `root.App.main`.
script_data[:500]

'\n(function (root) {\n/* -- Data -- */\nroot.App || (root.App = {});\nroot.App.now = 1621176660770;\nroot.App.main = {"context":{"dispatcher":{"stores":{"PageStore":{"currentPageName":"quote","currentEvent":{"eventName":"NEW_PAGE_SUCCESS"},"currentRenderTargetId":"default","pagesConfigRaw":{"base":{"quote":{"layout":{"bundleName":"yahoodotcom-layout.TwoColumnLayout","name":"TwoColumnLayout","config":{"enableHeaderCollapse":true,"additionalBodyWrapperClasses":"Bgc($layoutBgColor)!","contentWrapperClas'

In [10]:
# The End: the last 500 characters, showing the JavaScript that trails the
# JSON object (`;\n}(this));\n`).
script_data[-500:]

'put":{"strings":1},"tdv2-applet-sponsored-moments":{"strings":1},"tdv2-applet-stream":{"strings":1},"tdv2-applet-stream-hero":{"strings":1},"tdv2-applet-swisschamp":{"strings":1},"tdv2-applet-uh":{"strings":1},"tdv2-applet-userintent":{"strings":1},"tdv2-applet-video-lightbox":{"strings":1},"tdv2-applet-video-modal":{"strings":1},"tdv2-wafer-adfeedback":{"strings":1},"tdv2-wafer-header":{"strings":1},"yahoodotcom-layout":{"strings":1}}},"options":{"defaultBundle":"td-app-finance"}}}};\n}(this));\n'

In [11]:
# The JSON object opens two characters before the first "context": one for
# the key's opening quote and one for the `{` itself.
context_key_at = script_data.find("context")
start = context_key_at - 2

In [12]:
# Cut the payload out of the script: from `start` up to, but excluding, the
# final 12 characters of trailing JavaScript (`;\n}(this));\n`), then parse it.
json_payload = script_data[start:-12]
json_data = json.loads(json_payload)

In [13]:
# Top-level sections of the parsed payload's 'context' object.
json_data['context'].keys()

dict_keys(['dispatcher', 'options', 'plugins'])

In [14]:
# Sections available in the QuoteSummaryStore of the financials page payload.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore'].keys()

dict_keys(['financialsTemplate', 'cashflowStatementHistory', 'balanceSheetHistoryQuarterly', 'earnings', 'price', 'incomeStatementHistoryQuarterly', 'incomeStatementHistory', 'balanceSheetHistory', 'cashflowStatementHistoryQuarterly', 'quoteType', 'summaryDetail', 'symbol', 'pageViews'])

In [16]:
# Every statement list sits under the same QuoteSummaryStore node, so look
# that node up once and index into it.
summary_store = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']

# Income statements
anual_is = summary_store['incomeStatementHistory']['incomeStatementHistory']
quarterly_is = summary_store['incomeStatementHistoryQuarterly']['incomeStatementHistory']

# Cash-flow statements
anual_cf = summary_store['cashflowStatementHistory']['cashflowStatements']
quarterly_cf = summary_store['cashflowStatementHistoryQuarterly']['cashflowStatements']

# Balance sheets
anual_bs = summary_store['balanceSheetHistory']['balanceSheetStatements']
quarterly_bs = summary_store['balanceSheetHistoryQuarterly']['balanceSheetStatements']

In [17]:
# First annual income statement as delivered: each field maps to a dict that
# is either empty or holds 'raw' and 'fmt' entries (often 'longFmt' as well).
print(anual_is[0])

{'researchDevelopment': {}, 'effectOfAccountingCharges': {}, 'incomeBeforeTax': {'raw': -1116000000, 'fmt': '-1.12B', 'longFmt': '-1,116,000,000'}, 'minorityInterest': {'raw': 121000000, 'fmt': '121M', 'longFmt': '121,000,000'}, 'netIncome': {'raw': -1279000000, 'fmt': '-1.28B', 'longFmt': '-1,279,000,000'}, 'sellingGeneralAdministrative': {'raw': 8628000000, 'fmt': '8.63B', 'longFmt': '8,628,000,000'}, 'grossProfit': {'raw': 6123000000, 'fmt': '6.12B', 'longFmt': '6,123,000,000'}, 'ebit': {'raw': -2505000000, 'fmt': '-2.5B', 'longFmt': '-2,505,000,000'}, 'endDate': {'raw': 1609372800, 'fmt': '2020-12-31'}, 'operatingIncome': {'raw': -2505000000, 'fmt': '-2.5B', 'longFmt': '-2,505,000,000'}, 'otherOperatingExpenses': {}, 'interestExpense': {'raw': -1651000000, 'fmt': '-1.65B', 'longFmt': '-1,651,000,000'}, 'extraordinaryItems': {}, 'nonRecurring': {}, 'otherItems': {}, 'incomeTaxExpense': {'raw': 160000000, 'fmt': '160M', 'longFmt': '160,000,000'}, 'totalRevenue': {'raw': 127144000000,

In [19]:
# A single field: 'raw' is the number, 'fmt' and 'longFmt' are display strings.
anual_is[0]['operatingIncome']

{'raw': -2505000000, 'fmt': '-2.5B', 'longFmt': '-2,505,000,000'}

In [22]:
# Flatten every annual income statement to {field: raw number}.
anual_is_stmts = []
for annual_statement in anual_is:
    raw_values = {}
    for field, entry in annual_statement.items():
        try:
            raw_values[field] = entry['raw']
        except (TypeError, KeyError):
            # Entry cannot be indexed by 'raw' (TypeError) or is a dict
            # without that key, such as {} (KeyError): leave the field out.
            pass
    anual_is_stmts.append(raw_values)

In [23]:
# First flattened annual income statement: field name -> raw number, with
# fields that had no 'raw' value omitted.
anual_is_stmts[0]

{'incomeBeforeTax': -1116000000,
 'minorityInterest': 121000000,
 'netIncome': -1279000000,
 'sellingGeneralAdministrative': 8628000000,
 'grossProfit': 6123000000,
 'ebit': -2505000000,
 'endDate': 1609372800,
 'operatingIncome': -2505000000,
 'interestExpense': -1651000000,
 'incomeTaxExpense': 160000000,
 'totalRevenue': 127144000000,
 'totalOperatingExpenses': 129649000000,
 'costOfRevenue': 121021000000,
 'totalOtherIncomeExpenseNet': 1389000000,
 'netIncomeFromContinuingOps': -1276000000,
 'netIncomeApplicableToCommonShares': -1279000000}

In [28]:
def _raw_statement_values(statements):
    """Flatten statements to plain ``{field: raw value}`` dictionaries.

    Each statement maps field names to entries that are either a dict holding
    a ``'raw'`` value (next to display strings such as ``'fmt'``), an empty
    dict, or a non-dict value. Only fields whose entry is a dict containing
    ``'raw'`` are kept; every other field is dropped.

    Args:
        statements: iterable of statement dicts as found in the parsed payload.

    Returns:
        A list with one flattened dict per statement, in input order.
    """
    return [
        {
            field: entry['raw']
            for field, entry in statement.items()
            if isinstance(entry, dict) and 'raw' in entry
        }
        for statement in statements
    ]


# Annual
anual_cf_stmts = _raw_statement_values(anual_cf)
# Quarterly
quarterly_cf_stmts = _raw_statement_values(quarterly_cf)

In [29]:
# First flattened annual cash-flow statement: field name -> raw number.
anual_cf_stmts[0]

{'investments': -7229000000,
 'changeToLiabilities': 6809000000,
 'totalCashflowsFromInvestingActivities': -18615000000,
 'netBorrowings': 3095000000,
 'totalCashFromFinancingActivities': 2315000000,
 'changeToOperatingActivities': 12104000000,
 'netIncome': -1279000000,
 'changeInCash': 8194000000,
 'endDate': 1609372800,
 'effectOfExchangeRate': 225000000,
 'totalCashFromOperatingActivities': 24269000000,
 'depreciation': 7312000000,
 'otherCashflowsFromInvestingActivities': 171000000,
 'dividendsPaid': -596000000,
 'changeToInventory': 148000000,
 'changeToAccountReceivables': -63000000,
 'otherCashflowsFromFinancingActivities': -184000000,
 'changeToNetincome': -762000000,
 'capitalExpenditures': -5742000000}

# Profile Data

In [57]:
# Download the profile page and extract the JSON object embedded in the
# <script> element marked with "/* -- Data -- */".
response = requests.get(url_profile.format(stock, stock))
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page

soup = BeautifulSoup(response.text, 'html.parser')
pattern = re.compile(r'\s--\sData\s--\s')

# `string=` is the current name of the keyword formerly called `text=`.
script_tag = soup.find('script', string=pattern)
if script_tag is None:
    raise ValueError('no <script> element containing the "-- Data --" marker was found')
script_data = script_tag.contents[0]

# The object opens two characters before the first "context" (`{"context"`).
# `index` raises ValueError when the key is absent, whereas `find` would
# return -1 and silently produce a wrong slice.
start = script_data.index("context") - 2

# -12 removes the JavaScript that trails the object (`;\n}(this));\n` on the
# financials page; assumed to be the same here - verify if parsing fails).
json_data = json.loads(script_data[start:-12])

In [54]:
# Sections available in the QuoteSummaryStore of the currently loaded payload.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore'].keys()

dict_keys(['financialsTemplate', 'price', 'secFilings', 'quoteType', 'calendarEvents', 'summaryDetail', 'symbol', 'assetProfile', 'pageViews'])

In [55]:
# Fields contained in the 'assetProfile' section.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile'].keys()

dict_keys(['zip', 'sector', 'fullTimeEmployees', 'compensationRisk', 'auditRisk', 'longBusinessSummary', 'city', 'phone', 'state', 'shareHolderRightsRisk', 'compensationAsOfEpochDate', 'governanceEpochDate', 'boardRisk', 'country', 'companyOfficers', 'website', 'maxAge', 'overallRisk', 'address1', 'industry'])

In [58]:
# Company officers: a list of dicts with name, title, age and pay details.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['companyOfficers']

[{'totalPay': {'raw': 3190109, 'fmt': '3.19M', 'longFmt': '3,190,109'},
  'exercisedValue': {'raw': 0, 'fmt': None, 'longFmt': '0'},
  'yearBorn': 1957,
  'name': 'Mr. William Clay Ford Jr.',
  'title': 'Exec. Chairman',
  'maxAge': 1,
  'fiscalYear': 2020,
  'unexercisedValue': {'raw': 0, 'fmt': None, 'longFmt': '0'},
  'age': 63},
 {'totalPay': {'raw': 2571416, 'fmt': '2.57M', 'longFmt': '2,571,416'},
  'exercisedValue': {'raw': 0, 'fmt': None, 'longFmt': '0'},
  'yearBorn': 1962,
  'name': 'Mr. James D. Farley Jr.',
  'title': 'Pres, CEO & Director',
  'maxAge': 1,
  'fiscalYear': 2020,
  'unexercisedValue': {'raw': 0, 'fmt': None, 'longFmt': '0'},
  'age': 58},
 {'totalPay': {'raw': 1072096, 'fmt': '1.07M', 'longFmt': '1,072,096'},
  'exercisedValue': {'raw': 0, 'fmt': None, 'longFmt': '0'},
  'yearBorn': 1966,
  'name': 'Mr. John T. Lawler',
  'title': 'VP & CFO',
  'maxAge': 1,
  'fiscalYear': 2020,
  'unexercisedValue': {'raw': 0, 'fmt': None, 'longFmt': '0'},
  'age': 54},
 {'t

In [34]:
# Free-text description of the company's business.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['assetProfile']['longBusinessSummary']

'Ford Motor Company designs, manufactures, markets, and services a range of Ford trucks, cars, sport utility vehicles, electrified vehicles, and Lincoln luxury vehicles worldwide. It operates through three segments: Automotive, Mobility, and Ford Credit. The Automotive segment sells Ford and Lincoln vehicles, service parts, and accessories through distributors and dealers, as well as through dealerships to commercial fleet customers, daily rental car companies, and governments. The Mobility segment designs and builds mobility services; and provides self-driving systems development services. The Ford Credit segment primarily engages in vehicle-related financing and leasing activities to and through automotive dealers. It provides retail installment sale contracts for new and used vehicles; and direct financing leases for new vehicles to retail and commercial customers, such as leasing companies, government entities, daily rental companies, and fleet customers. This segment also offers w

In [35]:
# SEC filings: a list of dicts with date, form type, title and EDGAR link.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['secFilings']['filings']

[{'date': '2021-04-29',
  'epochDate': 1619694715,
  'type': '10-Q',
  'title': 'Quarterly Report',
  'edgarUrl': 'https://yahoo.brand.edgar-online.com/DisplayFiling.aspx?TabIndex=2&dcn=0000037996-21-000026&nav=1&src=Yahoo',
  'maxAge': 1},
 {'date': '2021-04-28',
  'epochDate': 1619644387,
  'type': '8-K',
  'title': 'Results of Operations and Financial Condition, Financial Statements and Exhibits',
  'edgarUrl': 'https://yahoo.brand.edgar-online.com/DisplayFiling.aspx?TabIndex=2&dcn=0000037996-21-000025&nav=1&src=Yahoo',
  'maxAge': 1},
 {'date': '2021-04-14',
  'epochDate': 1618434853,
  'type': '8-K',
  'title': 'Other Events, Financial Statements and Exhibits',
  'edgarUrl': 'https://yahoo.brand.edgar-online.com/DisplayFiling.aspx?TabIndex=2&dcn=0000037996-21-000022&nav=1&src=Yahoo',
  'maxAge': 1},
 {'date': '2021-04-01',
  'epochDate': 1617309508,
  'type': '8-K',
  'title': 'Other Events, Financial Statements and Exhibits',
  'edgarUrl': 'https://yahoo.brand.edgar-online.com/Di

In [36]:
# Quote summary details; each value is a dict with 'raw'/'fmt' entries, an
# empty dict, or None.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['summaryDetail']

{'previousClose': {'raw': 11.55, 'fmt': '11.55'},
 'regularMarketOpen': {'raw': 11.65, 'fmt': '11.65'},
 'twoHundredDayAverage': {'raw': 10.67927, 'fmt': '10.68'},
 'trailingAnnualDividendYield': {'raw': 0, 'fmt': '0.00%'},
 'payoutRatio': {'raw': 0, 'fmt': '0.00%'},
 'volume24Hr': {},
 'regularMarketDayHigh': {'raw': 11.87, 'fmt': '11.87'},
 'navPrice': {},
 'averageDailyVolume10Day': {'raw': 51558871,
  'fmt': '51.56M',
  'longFmt': '51,558,871'},
 'totalAssets': {},
 'regularMarketPreviousClose': {'raw': 11.55, 'fmt': '11.55'},
 'fiftyDayAverage': {'raw': 12.039706, 'fmt': '12.04'},
 'trailingAnnualDividendRate': {'raw': 0, 'fmt': '0.00'},
 'open': {'raw': 11.65, 'fmt': '11.65'},
 'toCurrency': None,
 'averageVolume10days': {'raw': 51558871,
  'fmt': '51.56M',
  'longFmt': '51,558,871'},
 'expireDate': {},
 'yield': {},
 'algorithm': None,
 'dividendRate': {},
 'exDividendDate': {'raw': 1580256000, 'fmt': '2020-01-29'},
 'beta': {'raw': 1.16631, 'fmt': '1.17'},
 'circulatingSupply':

# Statistics

In [37]:
# Download the key-statistics page and extract the JSON object embedded in
# the <script> element marked with "/* -- Data -- */".
response = requests.get(url_stats.format(stock, stock))
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page

soup = BeautifulSoup(response.text, 'html.parser')
pattern = re.compile(r'\s--\sData\s--\s')

# `string=` is the current name of the keyword formerly called `text=`.
script_tag = soup.find('script', string=pattern)
if script_tag is None:
    raise ValueError('no <script> element containing the "-- Data --" marker was found')
script_data = script_tag.contents[0]

# The object opens two characters before the first "context" (`{"context"`).
# `index` raises ValueError when the key is absent, whereas `find` would
# return -1 and silently produce a wrong slice.
start = script_data.index("context") - 2

# -12 removes the JavaScript that trails the object (`;\n}(this));\n` on the
# financials page; assumed to be the same here - verify if parsing fails).
json_data = json.loads(script_data[start:-12])

In [38]:
# Key statistics for the ticker; values use the same 'raw'/'fmt' dict form,
# with empty dicts or None where no figure is present.
json_data['context']['dispatcher']['stores']['QuoteSummaryStore']['defaultKeyStatistics']

{'annualHoldingsTurnover': {},
 'enterpriseToRevenue': {'raw': 1.318, 'fmt': '1.32'},
 'beta3Year': {},
 'profitMargins': {'raw': 0.03081, 'fmt': '3.08%'},
 'enterpriseToEbitda': {'raw': 20.126, 'fmt': '20.13'},
 '52WeekChange': {'raw': 1.2297552, 'fmt': '122.98%'},
 'morningStarRiskRating': {},
 'forwardEps': {'raw': 1.69, 'fmt': '1.69'},
 'revenueQuarterlyGrowth': {},
 'sharesOutstanding': {'raw': 3920790016,
  'fmt': '3.92B',
  'longFmt': '3,920,790,016'},
 'fundInceptionDate': {},
 'annualReportExpenseRatio': {},
 'totalAssets': {},
 'bookValue': {'raw': 8.475, 'fmt': '8.48'},
 'sharesShort': {'raw': 65787961, 'fmt': '65.79M', 'longFmt': '65,787,961'},
 'sharesPercentSharesOut': {'raw': 0.0165, 'fmt': '1.65%'},
 'fundFamily': None,
 'lastFiscalYearEnd': {'raw': 1609372800, 'fmt': '2020-12-31'},
 'heldPercentInstitutions': {'raw': 0.54066, 'fmt': '54.07%'},
 'netIncomeToCommon': {'raw': 3976000000,
  'fmt': '3.98B',
  'longFmt': '3,976,000,000'},
 'trailingEps': {'raw': 0.994, 'fmt'

# Historical Stock Data

In [39]:
# Full CSV download URL for ticker F: a fixed epoch-second window
# (period1..period2), daily interval, price history with adjusted close.
stock_url = (
    'https://query1.finance.yahoo.com/v7/finance/download/F'
    '?period1=1589659203&period2=1621195203'
    '&interval=1d&events=history&includeAdjustedClose=true'
)

In [41]:
# Fetch the CSV using the fully spelled-out URL (query string included).
response = requests.get(url=stock_url)

In [43]:
# Raw CSV text returned by the download endpoint (header line, then one line per day).
response.text

'Date,Open,High,Low,Close,Adj Close,Volume\n2020-05-18,5.120000,5.330000,5.120000,5.310000,5.310000,136157700\n2020-05-19,5.270000,5.450000,5.150000,5.300000,5.300000,94836300\n2020-05-20,5.380000,5.540000,5.370000,5.490000,5.490000,87685400\n2020-05-21,5.450000,5.690000,5.440000,5.630000,5.630000,87098200\n2020-05-22,5.680000,5.730000,5.500000,5.650000,5.650000,76656500\n2020-05-26,5.920000,5.990000,5.820000,5.840000,5.840000,86046300\n2020-05-27,6.010000,6.090000,5.840000,6.030000,6.030000,110413900\n2020-05-28,6.050000,6.070000,5.810000,5.850000,5.850000,76930200\n2020-05-29,5.750000,5.800000,5.570000,5.710000,5.710000,92936300\n2020-06-01,5.730000,5.880000,5.670000,5.870000,5.870000,61879700\n2020-06-02,5.960000,6.000000,5.830000,5.900000,5.900000,80046000\n2020-06-03,5.990000,6.270000,5.920000,6.190000,6.190000,98876600\n2020-06-04,6.200000,6.580000,6.120000,6.570000,6.570000,115798600\n2020-06-05,7.240000,7.500000,7.020000,7.340000,7.340000,208648200\n2020-06-08,7.530000,7.740000

In [45]:
# Download endpoint as a template: the `{}` slot takes the ticker symbol via
# `stock_url.format(...)`. The ticker was previously hard-coded as "F", so
# formatting the URL had no effect and a different `stock` was ignored.
# The query string is not part of the template; it is supplied separately
# through `params`.
stock_url = 'https://query1.finance.yahoo.com/v7/finance/download/{}'

# Query parameters for an explicit date window given in epoch seconds.
params = {
    'period1': '1589659203',
    'period2': '1621195203',
    'interval': '1d',
    'events': 'history',
    'includeAdjustedClose': 'true',
}

In [48]:
# Replace the explicit period1/period2 window with a relative five-year range.
params = dict(
    range='5y',
    interval='1d',
    events='history',
    includeAdjustedClose='true',
)

In [49]:
# Request the history CSV; requests encodes `params` into the query string.
download_url = stock_url.format(stock)
response = requests.get(download_url, params=params)

In [50]:
# Raw CSV text for the request made with the '5y' range parameters.
response.text

'Date,Open,High,Low,Close,Adj Close,Volume\n2016-05-16,13.220000,13.380000,13.210000,13.320000,10.600907,21262700\n2016-05-17,13.140000,13.310000,13.070000,13.140000,10.457650,34165300\n2016-05-18,13.120000,13.340000,13.050000,13.160000,10.473568,23509700\n2016-05-19,13.100000,13.200000,13.020000,13.090000,10.417858,24117800\n2016-05-20,13.150000,13.260000,13.130000,13.190000,10.497443,19465000\n2016-05-23,13.140000,13.200000,13.110000,13.130000,10.449692,16234300\n2016-05-24,13.160000,13.300000,13.160000,13.290000,10.577031,19721600\n2016-05-25,13.330000,13.600000,13.330000,13.520000,10.760079,26188300\n2016-05-26,13.590000,13.680000,13.370000,13.460000,10.712329,18369900\n2016-05-27,13.480000,13.540000,13.400000,13.450000,10.704370,17263300\n2016-05-31,13.490000,13.560000,13.400000,13.490000,10.736203,26096000\n2016-06-01,13.430000,13.440000,12.970000,13.110000,10.433773,58219600\n2016-06-02,13.090000,13.320000,13.050000,13.210000,10.513361,41614600\n2016-06-03,13.140000,13.140000,12

In [51]:
# Wrap the CSV text in an in-memory file object so csv.reader can iterate it,
# read every row into a list, then print the header and the first four rows.
file = StringIO(response.text)
reader = csv.reader(file)
data = [record for record in reader]
preview_rows = data[:5]
for row in preview_rows:
    print(row)

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
['2016-05-16', '13.220000', '13.380000', '13.210000', '13.320000', '10.600907', '21262700']
['2016-05-17', '13.140000', '13.310000', '13.070000', '13.140000', '10.457650', '34165300']
['2016-05-18', '13.120000', '13.340000', '13.050000', '13.160000', '10.473568', '23509700']
['2016-05-19', '13.100000', '13.200000', '13.020000', '13.090000', '10.417858', '24117800']
