# SEC - Filtering for Ratios and Expenses

In [43]:
import os
import json

# Folder holding the downloaded JSON files; only the first directory entry is inspected
JSON_folder_path = './Json'
files = os.listdir(JSON_folder_path)
file = files[0]


# Load the whole document first; everything below works on the parsed dict
with open(f"{JSON_folder_path}/{file}") as jsonFile:
    data = json.load(jsonFile)

# Every account reported under facts -> us-gaap
us_gaap = data['facts']['us-gaap']

# Lower-cased account names, used for case-insensitive substring search
keys = [account.lower() for account in us_gaap]

# Entity name followed by every us-gaap account under its original (cased) key
formatted_data = {'entity_name': data['entityName']}
formatted_data.update(us_gaap)

# Substrings to look for; some are deliberately truncated stems ("liabili", "equit")
search_key = ["cash", "receivable", "payable", "asset", "liabili", "loan", "equit", "income", "expense", "interest", "deposit"]
print(keys)

# Map each search term to the lower-cased account names containing it
results = {}
for key in search_key:
    results[key] = [account_name for account_name in keys if key in account_name]

print(results['cash'])

['accountsreceivablegrosscurrent', 'accretionamortizationofdiscountsandpremiumsinvestments', 'accruedincometaxescurrent', 'accumulateddepreciationdepletionandamortizationpropertyplantandequipment', 'accumulatedothercomprehensiveincomelossnetoftax', 'additionalpaidincapital', 'adjustmentstoadditionalpaidincapitalincometaxdeficiencyfromsharebasedcompensation', 'adjustmentstoadditionalpaidincapitalsharebasedcompensationrequisiteserviceperiodrecognitionvalue', 'adjustmentstoadditionalpaidincapitaltaxeffectfromsharebasedcompensation', 'adjustmenttoadditionalpaidincapitalincometaxeffectfromsharebasedcompensationnet', 'affordablehousingtaxcreditsandothertaxbenefitsamount', 'allowanceforloanandleaselossesforeigncurrencytranslation', 'allowanceforloanandleaselossesperiodincreasedecrease', 'allowanceforloanandleaselossesprovisionforlossnet', 'allowanceforloanandleaselosseswriteoffs', 'allowanceforloanandleaselossrecoveryofbaddebts', 'alternativeinvestmentsfairvaluedisclosure', 'amortizationmetho

# Yahoo Finance Webscraping w/ BeautifulSoup and Selenium

In [57]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Shared Chrome session used by every function below. The value returned by
# ChromeDriverManager().install() is handed to the Chrome Service.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Initialize default variables - don't change these
# Tickers to scrape, processed in this order
stock_tickers_arr = [
    "SIVBQ", "JPM", "FRCB", "WFC", "BAC", "DFS",
    "MS", "PNC", "CS", "DB", "COF", "C",
    "KEY", "BNY", "GS", "HSBC", "MTB", "TD",
    "FHN", "OVLY", "PACW", "WAL", "SBNY", "ZION",
    "CMA", "FCNCA", "EWBC", "CFG", "MCBC", "NYCB",
]

# URL fragment placed between the two occurrences of the ticker, per statement
path_to = dict(
    income_statement="/financials?p=",
    balance_sheet="/balance-sheet?p=",
    cashflow_statement="/cash-flow?p=",
)

# Page opened first (the income statement of the default ticker)
stock_ticker = "JPM"
stock_URL = f"https://finance.yahoo.com/quote/{stock_ticker}/financials?p={stock_ticker}"

# Functions

def login():
    """Sign in to Yahoo Finance through the header sign-in link.

    Yahoo Finance only gives the last 4 reporting periods; Yahoo Finance Plus
    is needed for complete historical financial-statement data, hence the login.

    Credentials are read from the ``YAHOO_USERNAME`` and ``YAHOO_PASSWORD``
    environment variables so they never have to be written into (and pushed
    with) this file. An unset variable falls back to an empty string.

    Raises:
        selenium.common.exceptions.TimeoutException: if one of the awaited
            elements does not show up within 10 seconds.
    """
    username = os.environ.get('YAHOO_USERNAME', '')
    password = os.environ.get('YAHOO_PASSWORD', '')

    # Maximum wait time of 10 seconds for each element to load
    wait = WebDriverWait(driver, 10)

    driver.get(stock_URL)

    # Open the login form
    sign_in_button = wait.until(EC.element_to_be_clickable((By.ID, 'header-signin-link')))
    sign_in_button.click()

    # Enter username; wait.until returns the element, so no second lookup is needed
    login_username = wait.until(EC.presence_of_element_located((By.ID, 'login-username')))
    login_username.send_keys(username)
    driver.find_element(By.ID, 'login-signin').click()

    # Enter password; the field only appears after the username step is submitted
    login_passwd = wait.until(EC.presence_of_element_located((By.ID, 'login-passwd')))
    login_passwd.send_keys(password)
    driver.find_element(By.ID, 'login-signin').click()

def get_URL_to(ticker: str, financial_statement_name: str):
    """Build the Yahoo Finance URL of one financial statement for ``ticker``.

    ``financial_statement_name`` must be a key of the module-level ``path_to``
    mapping; an unknown name raises ``KeyError``.
    """
    statement_path = path_to[financial_statement_name]
    # Layout: <base><ticker><statement path ending in "?p="><ticker>
    pieces = ("https://finance.yahoo.com/quote/", ticker, statement_path, ticker)
    return "".join(pieces)

def go_to_URL(URL):
    """Navigate the shared driver to ``URL`` unless it is already the current URL."""
    if driver.current_url == URL:
        # Already there: skip the navigation
        return
    driver.get(URL)

def expand_all_get_html():
    """Click the element with class ``expandPf`` and return the page HTML.

    Returns:
        The driver's ``page_source`` taken half a second after the click.

    Raises:
        selenium.common.exceptions.TimeoutException: if the expand button is
            not clickable within 10 seconds.
    """
    # Wait until the button can actually be clicked (not merely present in the
    # DOM) and click the element the wait returns instead of looking it up again
    expand_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'expandPf'))
    )
    expand_button.click()

    # Let page load new html before extracting
    time.sleep(0.5)

    return driver.page_source

def create_directory_if_not_exists(ticker):
    """Ensure the output folder ``./CSV/<ticker>`` exists (relative to the CWD).

    Missing parent folders are created as well. Calling it again for an
    existing folder is a no-op.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the folder between check and creation
    os.makedirs(f'./CSV/{ticker}', exist_ok=True)

# Need to refactor this into a separate module
def scrape_and_save_to_csv(html, ticker: str, financial_statement_name: str, period: str):
    """Parse the statement table out of ``html`` and write it as a CSV file.

    The table header group (``div.D(tbhg)``) supplies the column labels and the
    row group (``div.D(tbrg)``) supplies one labelled row per line item. In the
    resulting DataFrame the first header label names a column holding the
    remaining header labels, and every line item becomes its own column.

    Args:
        html: Page source containing the table.
        ticker: Used for the output folder ``./CSV/<ticker>``, which must exist.
        financial_statement_name: First part of the output file name.
        period: Second part of the output file name.

    Raises:
        ValueError: if the header group or the row group is not in ``html``.
    """
    soup = BeautifulSoup(html, "html.parser")

    header_group = soup.find("div", {"class": "D(tbhg)"})
    row_group = soup.find("div", {"class": "D(tbrg)"})
    if header_group is None or row_group is None:
        # Fail with context instead of an AttributeError on None further down
        raise ValueError(
            f"No financial statement table found for {ticker} "
            f"({financial_statement_name}, {period})"
        )

    # Find all table headers (Dates)
    table_headers_names = [
        div.span.get_text()
        for div in header_group.find_all("div", {"class": "D(ib)"})
    ]

    # First header labels the column that holds the remaining headers
    data_frame_object = {table_headers_names[0]: table_headers_names[1:]}

    # Find all table rows (Amounts); the row header becomes the column name
    for table_row in row_group.find_all("div", {"class": "D(tbr)"}):
        row_header_text = table_row.find("div", {'class': "Ta(start)"}).find("span").get_text()

        row_values = []
        for cell in table_row.find_all("div", {'class': "Ta(c)"}):
            # Some cells wrap their text in a <span>, others hold it directly
            span = cell.find("span")
            text_holder = span if span is not None else cell
            row_values.append(text_holder.get_text())

        data_frame_object[row_header_text] = row_values

    df = pd.DataFrame(data_frame_object)
    # Save CSV
    df.to_csv(f'./CSV/{ticker}/{financial_statement_name}_{period}.csv')

def save_one_financial_statement(ticker: str, financial_statement_name: str, period: str):
    """Scrape one financial statement of ``ticker`` and save it as CSV.

    Args:
        ticker: Stock ticker whose page is opened.
        financial_statement_name: Key of ``path_to`` selecting the statement page.
        period: ``'quarterly'`` clicks the "Quarterly" button before scraping;
            any other value scrapes the page as it is displayed after loading.

    Raises:
        selenium.common.exceptions.TimeoutException: if the "Quarterly" button
            is not clickable within 10 seconds.
    """
    create_directory_if_not_exists(ticker)
    go_to_URL(get_URL_to(ticker, financial_statement_name))

    if period == 'quarterly':
        # Wait until the button is clickable and click the element the wait
        # returns, rather than locating it a second time
        quarterly_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//button[div[span[text()="Quarterly"]]]'))
        )
        quarterly_button.click()

    # Shared by both periods: expand every row, then parse and save
    html_source = expand_all_get_html()
    scrape_and_save_to_csv(html_source, ticker, financial_statement_name, period)

def save_all_financial_statements(ticker: str):
    """Save the annual and quarterly version of all three statements for ``ticker``.

    Order: income statement, balance sheet, cash-flow statement, each one
    annual first and quarterly second.
    """
    statement_names = ('income_statement', 'balance_sheet', 'cashflow_statement')
    periods = ('annual', 'quarterly')
    for statement_name in statement_names:
        for period in periods:
            save_one_financial_statement(ticker, statement_name, period)

# Login, then scrape every ticker in order
try:
    login()
    for stock_ticker in stock_tickers_arr:
        save_all_financial_statements(stock_ticker)
finally:
    # Always close the browser, even when login or a scrape raises, so no
    # Chrome session is left running
    driver.quit()

# for ticker in stock_tickers_arr:
# We iterate through a stock ticker array
# On each iteration we navigate to the stock page IS, expand all, and get source.
# Click on quarterly, expand all, get source
# Navigate to B/S, repeat
# Navigate to CF, repeat



