# SEC - Filtering for Ratios and Expenses

In [43]:
import os
import json

JSON_folder_path = './Json'
files = os.listdir(JSON_folder_path)
file = files[0]


with open(JSON_folder_path + "/" + file) as jsonFile:
    data = json.load(jsonFile)
    us_gaap = data['facts']['us-gaap']
    keys=[]
    formatted_data = {}
    formatted_data['entity_name'] = data['entityName']
    for key in us_gaap:
        keys.append(key.lower())
        formatted_data[key] = us_gaap[key]

search_key = ["cash", "receivable", "payable", "asset", "liabili", "loan", "equit", "income", "expense", "interest", "deposit"]
print(keys)

results={}
for key in search_key:
    matches = []
    for account_name in keys:
        if key in account_name:
            matches.append(account_name)
    results[key]=matches

print(results['cash'])

['accountsreceivablegrosscurrent', 'accretionamortizationofdiscountsandpremiumsinvestments', 'accruedincometaxescurrent', 'accumulateddepreciationdepletionandamortizationpropertyplantandequipment', 'accumulatedothercomprehensiveincomelossnetoftax', 'additionalpaidincapital', 'adjustmentstoadditionalpaidincapitalincometaxdeficiencyfromsharebasedcompensation', 'adjustmentstoadditionalpaidincapitalsharebasedcompensationrequisiteserviceperiodrecognitionvalue', 'adjustmentstoadditionalpaidincapitaltaxeffectfromsharebasedcompensation', 'adjustmenttoadditionalpaidincapitalincometaxeffectfromsharebasedcompensationnet', 'affordablehousingtaxcreditsandothertaxbenefitsamount', 'allowanceforloanandleaselossesforeigncurrencytranslation', 'allowanceforloanandleaselossesperiodincreasedecrease', 'allowanceforloanandleaselossesprovisionforlossnet', 'allowanceforloanandleaselosseswriteoffs', 'allowanceforloanandleaselossrecoveryofbaddebts', 'alternativeinvestmentsfairvaluedisclosure', 'amortizationmetho

# Yahoo Finance Webscraping w/ BeautifulSoup and Selenium

How to use:
1. Update login() with username and password params
2. Update stock_tickers_arr to include tickers of companies to be scraped
3. Hit run

In [42]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Array of stock tickers to scrape
stock_tickers_arr = [
    "SIVBQ","JPM","FRCB",
    "WFC","BAC","DFS",
    "MS","PNC","CS",
    "DB","COF","C",
    "KEY","BK","GS",
    "HSBC","MTB","TD",
    "FHN","OVLY","PACW",
    "WAL","SBNY","ZION",
    "CMA","FCNCA","EWBC",
    "CFG","MCBC","NYCB",
]

# Initialize default variables - don't change these
path_to = {
    'income_statement': "/financials?p=",
    'balance_sheet':"/balance-sheet?p=",
    'cashflow_statement':"/cash-flow?p=",
    'key-statistics':"/key-statistics?p="
}
stock_ticker = "JPM"
stock_URL = (
    "https://finance.yahoo.com/quote/" + stock_ticker + "/financials?p=" + stock_ticker
)

# Functions

# Login to account. Yahoo Finance only gives the last 4 reporting periods, need Yahoo Finance Plus for complete hitorical FS data
def login(username:str, password:str):
    driver.get(stock_URL)

    # Login to my account
    sign_in_button=driver.find_element(By.ID, 'header-signin-link')
    sign_in_button.click()
    wait = WebDriverWait(driver, 10)  
    element = wait.until(EC.presence_of_element_located((By.ID, 'login-username')))
    login_username=driver.find_element(By.ID, 'login-username')

    # Enter username
    login_username.send_keys(username)
    wait = WebDriverWait(driver, 10)  
    element = wait.until(EC.presence_of_element_located((By.ID, 'login-signin')))
    login_sign_in_button=driver.find_element(By.ID, 'login-signin')
    login_sign_in_button.click()

    # Maximum wait time of 10 seconds, wait for element to load
    wait = WebDriverWait(driver, 10)  
    element = wait.until(EC.presence_of_element_located((By.ID, 'login-passwd')))
    login_passwd = driver.find_element(By.ID, 'login-passwd')

    # Enter password
    # FIXME: Please delete password before pushing
    login_passwd.send_keys(password)
    login_sign_in_button=driver.find_element(By.ID, 'login-signin')
    login_sign_in_button.click()

def get_URL_to(ticker: str, financial_statement_name: str):
    stock_ticker = ticker
    path = path_to[financial_statement_name]
    URL = (
        "https://finance.yahoo.com/quote/" + stock_ticker + path + stock_ticker
    )
    return URL

#Only go to URL if it is different than the current URL
def go_to_URL(URL):
    current_url = driver.current_url
    driver.get(URL)

def expand_all_get_html():
    time.sleep(2)
    # Wait for expand button to load before clicking
    wait = WebDriverWait(driver, 10)  
    element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'expandPf')))
    expand_button = driver.find_element(By.CLASS_NAME, 'expandPf')
    expand_button.click()

    # Let page load new html before extracting
    time.sleep(2)

    # Download HTML file
    html_source = driver.page_source

    return html_source

def create_directory_if_not_exists(ticker):
    directory = f'./CSV/{ticker}'
    # Create the directory if it doesn't exist
    if not os.path.exists(directory):
        os.makedirs(directory)

def scrape_and_save_to_csv(html, ticker:str, financial_statement_name:str, period:str):
    soup = BeautifulSoup(html, "html.parser")

    # Find all table headers (Dates)
    table_headers = soup.find("div", {"class": "D(tbhg)"})
    table_headers_row = table_headers.find_all("div", {"class": "D(ib)"})
    table_headers_names = []

    for div in table_headers_row:
        span = div.span
        name = span.get_text()
        table_headers_names.append(name)

    table_column_length = len(table_headers_names)

    # Find all table rows (Amounts)
    table_rows = soup.find("div", {"class": "D(tbrg)"})
    table_rows_data = table_rows.find_all("div", {"class": "D(tbr)"})
    table_rows_data_arrays = []

    for table_row in table_rows_data:
        row_header_text = table_row.find("div", {'class':"Ta(start)"}).find("span").get_text()
        row_data_array = table_row.find_all("div", {'class':"Ta(c)"})
        row_data_text_array = [row_header_text]

        for div in row_data_array:
            span = div.find("span") if div.find("span") != None else div
            text = span.get_text()
            row_data_text_array.append(text)
        table_rows_data_arrays.append(row_data_text_array)


    data_frame_object = {table_headers_names[0]:table_headers_names[1:]}

    # First element in row array contains the row header
    # Map it to an object and then convert into a dataframe
    for data in table_rows_data_arrays:
        data_frame_object[data[0]] = data[1:]

    df = pd.DataFrame(data_frame_object)
    # Save CSV
    df.to_csv(f'./CSV/{ticker}/{financial_statement_name}_{period}.csv')

def save_one_financial_statement(ticker:str, financial_statement_name:str, period:str):
    create_directory_if_not_exists(ticker)
    if period == 'quarterly':
        URL = get_URL_to(ticker, financial_statement_name)
        go_to_URL(URL)
        wait = WebDriverWait(driver, 10)  
        element = wait.until(EC.presence_of_element_located((By.XPATH, '//button[div[span[text()="Quarterly"]]]')))
        quarterly_button = driver.find_element(By.XPATH, '//button[div[span[text()="Quarterly"]]]')
        quarterly_button.click()
        html_source = expand_all_get_html()
        scrape_and_save_to_csv(html_source, ticker, financial_statement_name, period)
    else:
        URL = get_URL_to(ticker, financial_statement_name)
        go_to_URL(URL)
        html_source = expand_all_get_html()
        scrape_and_save_to_csv(html_source, ticker, financial_statement_name, period)

def save_all_financial_statements(ticker:str):
    save_one_financial_statement(ticker, 'income_statement', 'annual')
    save_one_financial_statement(ticker, 'income_statement', 'quarterly')
    save_one_financial_statement(ticker, 'balance_sheet', 'annual')
    save_one_financial_statement(ticker, 'balance_sheet', 'quarterly')
    save_one_financial_statement(ticker, 'cashflow_statement', 'annual')
    save_one_financial_statement(ticker, 'cashflow_statement', 'quarterly')

def save_market_cap(ticker:str, period:str):
    URL = get_URL_to(ticker, 'key-statistics')
    go_to_URL(URL)
    wait = WebDriverWait(driver, 10)  
    element = wait.until(EC.presence_of_element_located((By.XPATH, f'//button[div[span[text()="{period}"]]]')))
    button = driver.find_element(By.XPATH, f'//button[div[span[text()="{period}"]]]')
    button.click()
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, "html.parser")

    # Find all table headers (Dates)
    table_headers = soup.find("thead")
    table_headers_row = table_headers.find_all("th")
    table_headers_names = []
    for th in table_headers_row:
        print(th)
        span = th.find('span')
        if span:
            text = span.get_text()
        else:
            text = ""
        table_headers_names.append(text)

    table_column_length = len(table_headers_names)

    # Find all table rows (Amounts)
    table_rows = soup.find("tbody")
    table_rows_data = table_rows.find_all("tr")
    table_rows_data_arrays = []
    for tr in table_rows_data:
        row_data_array = []
        for i, td in enumerate(tr):
            if i == 0:
                row_data_array.append(td.span.get_text())
            else:
                row_data_array.append(td.get_text())
        table_rows_data_arrays.append(row_data_array)
    
    data_frame_object = {table_headers_names[0]:table_headers_names[1:]}

    # First element in row array contains the row header
    # Map it to an object and then convert into a dataframe
    for data in table_rows_data_arrays:
        data_frame_object[data[0]] = data[1:]

    df = pd.DataFrame(data_frame_object)
    # Save CSV
    df.to_csv(f'./CSV/{ticker}/market_stats.csv')

def save_market_cap(ticker:str):
    URL = get_URL_to(ticker, 'key-statistics')
    go_to_URL(URL)
    # wait = WebDriverWait(driver, 10)  
    # element = wait.until(EC.presence_of_element_located((By.XPATH, f'//button[div[span[text()="{period}"]]]')))
    # button = driver.find_element(By.XPATH, f'//button[div[span[text()="{period}"]]]')
    # button.click()
    time.sleep(1)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, "html.parser")

    # Find all table headers (Dates)
    table_headers = soup.find("thead")
    table_headers_row = table_headers.find_all("th")
    table_headers_names = []
    print(table_headers)
    for th in table_headers_row:
        print(th)
        span = th.find('span')
        if span:
            text = span.get_text()
        else:
            text = "Date"
        table_headers_names.append(text)
    print(table_headers_names)
    table_column_length = len(table_headers_names)

    # Find all table rows (Amounts)
    table_rows = soup.find("tbody")
    table_rows_data = table_rows.find_all("tr")
    table_rows_data_arrays = []
    for tr in table_rows_data:
        row_data_array = []
        for i, td in enumerate(tr):
            if i == 0:
                row_data_array.append(td.span.get_text())
            else:
                row_data_array.append(td.get_text())
        table_rows_data_arrays.append(row_data_array)
    
    data_frame_object = {table_headers_names[0]:table_headers_names[1:]}

    # First element in row array contains the row header
    # Map it to an object and then convert into a dataframe
    for data in table_rows_data_arrays:
        data_frame_object[data[0]] = data[1:]

    df = pd.DataFrame(data_frame_object)
    # Save CSV
    df.to_csv(f'./CSV/{ticker}/market_stats.csv')


In [44]:

# Script execution
stock_tickers_arr = [
    "FCNCA", "OVLY"
]

# Install and set ChromeDriver
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
login("kevinator112233@yahoo.com", "")

for stock_ticker in stock_tickers_arr:
    save_all_financial_statements(stock_ticker)
    save_market_cap(stock_ticker)
driver.quit()

<thead><tr class="Bdtw(0px) C($primaryColor)"><th class="Fw(400) Pend(10px) Pos(st) Start(0) Pend(10px) Bgc($lv2BgColor) Z(1)"> <div class="W(3px) Pos(a) Start(100%) T(0) H(100%) Bg($pfColumnFakeShadowGradient) Pe(n) Pend(5px)"></div></th><th class="Fw(b) Ta(c) Pstart(6px) Pend(4px) Py(6px) Miw(fc) Miw(fc)--pnclg Bgc($lv1BgColor) Pend(0)"><span class="Pos(r) smplTblTooltip"><span class="Pos(a) Z(3) Bgc($lv3BgColor) Bd($featurePromoBorder) Bxsh($boxAreaShadow) smplTblTooltip:h_V(v) V(h) W(150px) P(10px) D(ib) Fz(12px) C($tertiaryColor) Fw(500) Mt(25px)">As of Date: 5/9/2023<div class="Pos(a) H(0) W(0) Bdbc($seperatorColor) End(100%) Bds(s) Bdw(10px) Bdstartc(t) Bdendc(t) Bdtc(t)" style="left:41px;top:-20px"></div><div class="Pos(a) H(0) W(0) Bdbc($lv3BgColor) End(100%) Bds(s) Bdw(10px) Bdstartc(t) Bdendc(t) Bdtc(t)" style="left:42px;top:-20px"></div></span><span>Current</span><div class="Lh(14px) D(ib) Va(m) Pstart(1px)"><svg class="Fill($iconColor)! Stk($iconColor)! Cur(p)" data-icon="

In [34]:
import requests

# replace the "demo" apikey below with your own key from https://www.alphavantage.co/support/#api-key
url = 'https://www.alphavantage.co/query?function=INCOME_STATEMENT&symbol=SIVBQ&apikey=YE0A8ZO3DU8J7SNU'
r = requests.get(url)
data = r.json()
data

{}