In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install bs4

Collecting bs4Note: you may need to restart the kernel to use updated packages.
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
Collecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Using legacy 'setup.py install' for bs4, since package 'wheel' is not installed.
Installing collected packages: soupsieve, beautifulsoup4, bs4
    Running setup.py install for bs4: started
    Running setup.py install for bs4: finished with status 'done'
Successfully installed beautifulsoup4-4.9.3 bs4-0.0.1 soupsieve-2.2.1



In [1]:
#importing libraries
import re
import json
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# --- scrape configuration ---

# ticker symbol of the stock to scrape
stock = 'MSFT'

# key-statistics page URL template; {} is filled in with the ticker symbol
url_stats = 'https://finance.yahoo.com/quote/{}/key-statistics'

# browser-like User-Agent header sent along with the page request
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

### Extracting & parsing html & json data

In [3]:
#request webpage, passing in the stock variable
# - the URL template has a single {} placeholder, so stock is passed once
# - the timeout (seconds) keeps the cell from hanging forever if the server
#   never answers; requests applies no timeout by default
r = requests.get(url_stats.format(stock), headers=headers, timeout=10)

In [4]:
# fail fast on a 4xx/5xx response instead of going on to parse an error page
r.raise_for_status()
print(r.status_code)

200


In [5]:
# build a BeautifulSoup tree from the response body with Python's built-in html parser
soup = BeautifulSoup(markup=r.text, features='html.parser')

Quoting from Izzy Analytics for my own reference:
'If you were to look at the raw html, you would notice that there is a lot of javascript code and not a lot of html to work with. You may also notice that embedded in the code there are json formatted text strings. Fortunately for us, there is a javascript function, appropriately commented with "--Data--". This function is located inside of a generic "script" tag. However, we can use regular expressions with BeautifulSoup in order to identify the script tag with the function we're looking for.'

In [6]:
# regex for the "-- Data --" marker that appears inside the script tag we want
pattern = re.compile(r'\s--\sData\s--\s')

# `string=` is the current name of the deprecated `text=` filter (bs4 >= 4.4)
data_script = soup.find('script', string=pattern)
if data_script is None:
    # soup.find returns None on a miss; raise a readable error instead of
    # letting `.contents` fail with an opaque AttributeError
    raise ValueError(
        'No <script> tag containing the "-- Data --" marker was found; '
        'the page layout may have changed or the request was blocked.'
    )

# first child of the tag: the javascript source text
script_data = data_script.contents[0]

'There's a lot of good json data here, but it's wrapped in a javascript function, as you can clearly see. However, if we can identify the starting and ending position of this json data, we can slice it and then parse it with the `json.loads` function.'

In [7]:
# find the starting position of the json string
# (the -2 offset assumes the text reads `{"context"...`, so the opening brace
#  sits two characters before the word itself)
context_pos = script_data.find("context")
if context_pos < 2:
    # str.find returns -1 on a miss; without this check the start index would
    # silently go negative and decoding would fail with a confusing error
    raise ValueError('Could not locate the "context" key in the script data.')
start = context_pos - 2

# decode exactly one JSON document beginning at `start`; raw_decode stops at
# the end of that document, so the trailing javascript does not have to be
# trimmed with a hand-counted end offset
json_data, _json_end = json.JSONDecoder().raw_decode(script_data, start)

### Exploring the data by inspecting the dictionary

In [8]:
# list the top-level keys stored under the 'context' entry of the parsed JSON
json_data['context'].keys()

dict_keys(['dispatcher', 'options', 'plugins'])

In [9]:
# drill down to the QuoteSummaryStore and list the sections it contains
quote_summary_store = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']
quote_summary_store.keys()

dict_keys(['defaultKeyStatistics', 'financialsTemplate', 'price', 'financialData', 'quoteType', 'calendarEvents', 'summaryDetail', 'symbol', 'pageViews'])

In [10]:
# pull the 'financialData' section out of the QuoteSummaryStore
stores = json_data['context']['dispatcher']['stores']
stats_fin_data = stores['QuoteSummaryStore']['financialData']

In [95]:
# take a look at the 'defaultKeyStatistics' section as well
summary_store = json_data['context']['dispatcher']['stores']['QuoteSummaryStore']
summary_store['defaultKeyStatistics']

{'annualHoldingsTurnover': {},
 'enterpriseToRevenue': {'raw': 13.027, 'fmt': '13.03'},
 'beta3Year': {},
 'profitMargins': {'raw': 0.35016, 'fmt': '35.02%'},
 'enterpriseToEbitda': {'raw': 27.574, 'fmt': '27.57'},
 '52WeekChange': {'raw': 0.38520312, 'fmt': '38.52%'},
 'morningStarRiskRating': {},
 'forwardEps': {'raw': 8.37, 'fmt': '8.37'},
 'revenueQuarterlyGrowth': {},
 'sharesOutstanding': {'raw': 7531570176,
  'fmt': '7.53B',
  'longFmt': '7,531,570,176'},
 'fundInceptionDate': {},
 'annualReportExpenseRatio': {},
 'totalAssets': {},
 'bookValue': {'raw': 17.853, 'fmt': '17.85'},
 'sharesShort': {'raw': 50822833, 'fmt': '50.82M', 'longFmt': '50,822,833'},
 'sharesPercentSharesOut': {'raw': 0.0067000003, 'fmt': '0.67%'},
 'fundFamily': None,
 'lastFiscalYearEnd': {'raw': 1593475200, 'fmt': '2020-06-30'},
 'heldPercentInstitutions': {'raw': 0.72033995, 'fmt': '72.03%'},
 'netIncomeToCommon': {'raw': 56014999552,
  'fmt': '56.01B',
  'longFmt': '56,014,999,552'},
 'trailingEps': {'r

In [13]:
# dump the whole financialData mapping as plain text to see its structure
print(stats_fin_data)

{'ebitdaMargins': {'raw': 0.47245, 'fmt': '47.24%'}, 'profitMargins': {'raw': 0.35016, 'fmt': '35.02%'}, 'grossMargins': {'raw': 0.68381, 'fmt': '68.38%'}, 'operatingCashflow': {'raw': 72703000576, 'fmt': '72.7B', 'longFmt': '72,703,000,576'}, 'revenueGrowth': {'raw': 0.191, 'fmt': '19.10%'}, 'operatingMargins': {'raw': 0.40150002, 'fmt': '40.15%'}, 'ebitda': {'raw': 75577999360, 'fmt': '75.58B', 'longFmt': '75,577,999,360'}, 'targetLowPrice': {'raw': 256.7, 'fmt': '256.70'}, 'recommendationKey': 'buy', 'grossProfits': {'raw': 96937000000, 'fmt': '96.94B', 'longFmt': '96,937,000,000'}, 'freeCashflow': {'raw': 37776875520, 'fmt': '37.78B', 'longFmt': '37,776,875,520'}, 'targetMedianPrice': {'raw': 300, 'fmt': '300.00'}, 'currentPrice': {'raw': 286.14, 'fmt': '286.14'}, 'earningsGrowth': {'raw': 0.45, 'fmt': '45.00%'}, 'currentRatio': {'raw': 2.294, 'fmt': '2.29'}, 'returnOnAssets': {'raw': 0.13508001, 'fmt': '13.51%'}, 'numberOfAnalystOpinions': {'raw': 33, 'fmt': '33', 'longFmt': '33'}

In [14]:
# each metric maps to a small dict holding the same number in several formats:
# 'raw' (numeric value), 'fmt' (short display string) and, for some metrics,
# 'longFmt' (full digits with thousands separators)
stats_fin_data['ebitdaMargins']

{'raw': 0.47245, 'fmt': '47.24%'}

In [15]:
#trying to loop through nested dictionary, stats_fin_data -- not using this
def get_all_values(nested_dictionary):
    """Print every leaf ``key : value`` pair of a nested dictionary.

    Values that are dictionaries are descended into recursively; every other
    value is printed, labelled only with its immediate key (parent keys are
    not shown).

    Args:
        nested_dictionary: mapping whose values may themselves be dicts.

    Returns:
        None. The pairs are written to stdout, one per line.
    """
    for key, value in nested_dictionary.items():
        # isinstance (as parse_dict uses) instead of an exact type check, so
        # dict subclasses such as OrderedDict are walked rather than printed
        # as one opaque blob
        if isinstance(value, dict):
            get_all_values(value)
        else:
            print(key, ':', value)
            
# walk stats_fin_data and print every leaf key/value pair
get_all_values(stats_fin_data)

raw : 0.47245
fmt : 47.24%
raw : 0.35016
fmt : 35.02%
raw : 0.68381
fmt : 68.38%
raw : 72703000576
fmt : 72.7B
longFmt : 72,703,000,576
raw : 0.191
fmt : 19.10%
raw : 0.40150002
fmt : 40.15%
raw : 75577999360
fmt : 75.58B
longFmt : 75,577,999,360
raw : 256.7
fmt : 256.70
recommendationKey : buy
raw : 96937000000
fmt : 96.94B
longFmt : 96,937,000,000
raw : 37776875520
fmt : 37.78B
longFmt : 37,776,875,520
raw : 300
fmt : 300.00
raw : 286.14
fmt : 286.14
raw : 0.45
fmt : 45.00%
raw : 2.294
fmt : 2.29
raw : 0.13508001
fmt : 13.51%
raw : 33
fmt : 33
longFmt : 33
raw : 298.92
fmt : 298.92
raw : 60.414
fmt : 60.41
raw : 0.44991
fmt : 44.99%
raw : 340
fmt : 340.00
raw : 125013000192
fmt : 125.01B
longFmt : 125,013,000,192
raw : 81260003328
fmt : 81.26B
longFmt : 81,260,003,328
raw : 159969001472
fmt : 159.97B
longFmt : 159,969,001,472
raw : 16.599
fmt : 16.6
financialCurrency : USD
maxAge : 86400
raw : 21.156
fmt : 21.16
raw : 2.096
fmt : 2.10
raw : 1.6
fmt : 1.60


In [16]:
def parse_dict(f, lkey=''):
    """Flatten a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their parent key with an
    underscore, e.g. ``{'a': {'raw': 1}}`` becomes ``{'a_raw': 1}``.

    Args:
        f: the (possibly nested) dictionary to flatten.
        lkey: prefix prepended to every key produced at this level; the
            recursive calls use it to carry the parent key path.

    Returns:
        A new flat dict mapping each joined key path to its leaf value.
        Nested dictionaries that are empty contribute no entries.
    """
    flat = {}
    for name, value in f.items():
        path = lkey + name
        if not isinstance(value, dict):
            # leaf value: store it under the full key path
            flat[path] = value
            continue
        # nested dict: flatten it with the current path as the new prefix
        flat.update(parse_dict(value, path + '_'))
    return flat

# flatten the nested financial data; lkey is left at its default of ''
stats_findatas = parse_dict(stats_fin_data)

In [17]:
# display the flattened dictionary returned by parse_dict
stats_findatas

{'ebitdaMargins_raw': 0.47245,
 'ebitdaMargins_fmt': '47.24%',
 'profitMargins_raw': 0.35016,
 'profitMargins_fmt': '35.02%',
 'grossMargins_raw': 0.68381,
 'grossMargins_fmt': '68.38%',
 'operatingCashflow_raw': 72703000576,
 'operatingCashflow_fmt': '72.7B',
 'operatingCashflow_longFmt': '72,703,000,576',
 'revenueGrowth_raw': 0.191,
 'revenueGrowth_fmt': '19.10%',
 'operatingMargins_raw': 0.40150002,
 'operatingMargins_fmt': '40.15%',
 'ebitda_raw': 75577999360,
 'ebitda_fmt': '75.58B',
 'ebitda_longFmt': '75,577,999,360',
 'targetLowPrice_raw': 256.7,
 'targetLowPrice_fmt': '256.70',
 'recommendationKey': 'buy',
 'grossProfits_raw': 96937000000,
 'grossProfits_fmt': '96.94B',
 'grossProfits_longFmt': '96,937,000,000',
 'freeCashflow_raw': 37776875520,
 'freeCashflow_fmt': '37.78B',
 'freeCashflow_longFmt': '37,776,875,520',
 'targetMedianPrice_raw': 300,
 'targetMedianPrice_fmt': '300.00',
 'currentPrice_raw': 286.14,
 'currentPrice_fmt': '286.14',
 'earningsGrowth_raw': 0.45,
 'ea

In [94]:
# check the type of the flattened result before converting it to a DataFrame
type(stats_findatas)

dict

In [22]:
# turn the flat dict into a two-column frame: one row per (key, value) pair
rows = list(stats_findatas.items())
df_stats_findatas = pd.DataFrame(rows, columns=['financial_data', 'value'])

In [23]:
# display the frame with its 'financial_data' and 'value' columns
df_stats_findatas

Unnamed: 0,financial_data,value
0,ebitdaMargins_raw,0.47245
1,ebitdaMargins_fmt,47.24%
2,profitMargins_raw,0.35016
3,profitMargins_fmt,35.02%
4,grossMargins_raw,0.68381
...,...,...
60,revenuePerShare_fmt,21.16
61,quickRatio_raw,2.096
62,quickRatio_fmt,2.10
63,recommendationMean_raw,1.6


In [104]:
# keep only the rows whose key contains "_fmt" (the display-formatted values)
has_fmt = df_stats_findatas['financial_data'].str.contains("_fmt")
df_stats_findatas[has_fmt]

Unnamed: 0,financial_data,value
1,ebitdaMargins_fmt,47.24%
3,profitMargins_fmt,35.02%
5,grossMargins_fmt,68.38%
7,operatingCashflow_fmt,72.7B
10,revenueGrowth_fmt,19.10%
12,operatingMargins_fmt,40.15%
14,ebitda_fmt,75.58B
17,targetLowPrice_fmt,256.70
20,grossProfits_fmt,96.94B
23,freeCashflow_fmt,37.78B
