### Extract data from the SEC website using the Selenium package

In [209]:
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configuration variables
# CIK is the company identifier inserted into the EDGAR browse URL.
# NOTE(review): confirm this CIK really belongs to `companyName`; the saved
# file names are built from `companyName`, not from the CIK.
CIK = "1696025"
companyName = "Amazon"  # label embedded in the renamed workbook file names
downloadDir = "D:\\pythonProject\\DCF\\Data"  # where renamed workbooks are moved to
tempDownloadDir = "D:\\pythonProject\\DCF\\Data\\Temp"  # Chrome's download folder (see prefs below)
maxFilings = 10  # upper bound on the number of 10-K rows the scraping loop visits

# Set up WebDriver
# Point Chrome's default download directory at the temp folder so that each
# downloaded file can be located there and then moved/renamed.
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", {'download.default_directory': tempDownloadDir})
# Path to the local chromedriver executable used to drive Chrome.
service = Service('D:/pythonProject/DCF/Scrapping/chromedriver.exe')
# NOTE: the browser is launched here, at module level; every function below
# uses this shared `driver` instance.
driver = webdriver.Chrome(service=service, options=chromeOptions)

def wait_for_element(driver, by, value, timeout=10):
    """Wait until the element located by ``(by, value)`` is clickable.

    Args:
        driver: WebDriver instance to poll.
        by: Locator strategy (a ``By`` constant).
        value: Locator expression matching the strategy.
        timeout: Maximum number of seconds to wait.

    Returns:
        Whatever ``WebDriverWait.until`` returns for the clickable condition
        (the located element).
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    clickable = EC.element_to_be_clickable(locator)
    return waiter.until(clickable)

def _wait_for_download(directory, knownFiles, timeout=30, pollInterval=0.5):
    """Return the newest finished file that appeared in ``directory``.

    Args:
        directory: Folder the browser downloads into.
        knownFiles: Names that were already present before the download was
            triggered; they are ignored so a stale file is never picked up.
        timeout: Maximum number of seconds to wait for a new file.
        pollInterval: Seconds to sleep between directory scans.

    Returns:
        Full path of the most recently created new file.

    Raises:
        FileNotFoundError: If no finished new file shows up within ``timeout``.
    """
    deadline = time.monotonic() + timeout
    while True:
        finished = [
            os.path.join(directory, name)
            for name in os.listdir(directory)
            # Skip in-progress downloads (Chrome's partial-file suffixes).
            if name not in knownFiles and not name.endswith((".crdownload", ".tmp"))
        ]
        if finished:
            return max(finished, key=os.path.getctime)
        if time.monotonic() >= deadline:
            raise FileNotFoundError("No files found in the temporary download directory.")
        time.sleep(pollInterval)


def process_filing(row, index):
    """Download the Excel workbook of one filing row and move it to ``downloadDir``.

    The filing index page is opened in a new browser tab, the interactive-data
    view is opened, and "View Excel Document" is clicked.  The downloaded file
    is then moved to ``downloadDir`` as ``SEC_<companyName> <filingDate>.xlsx``.

    Any error is reported via ``print`` and swallowed so the caller can carry
    on with the next filing.

    Args:
        row: Table-row element of the filing (third cell holds the filing date).
        index: Zero-based position of the filing, used only in error messages.
    """
    # Remember where we came from so we can always return to it.
    mainHandle = driver.current_window_handle
    tabOpened = False
    try:
        filingDate = row.find_element(By.XPATH, ".//td[3]").text
        filingLink = row.find_element(By.XPATH, ".//a[contains(@href, 'index.htm')]")
        filingURL = filingLink.get_attribute('href')

        # Open the filing in a new tab
        handlesBefore = set(driver.window_handles)
        driver.execute_script("window.open(arguments[0], '_blank');", filingURL)
        newHandles = [h for h in driver.window_handles if h not in handlesBefore]
        if not newHandles:
            raise RuntimeError("The filing tab could not be opened.")
        driver.switch_to.window(newHandles[-1])
        tabOpened = True

        # Snapshot the folder first so only the file produced by this click counts.
        knownFiles = set(os.listdir(tempDownloadDir))

        # Interact with the interactive data and download the Excel file
        wait_for_element(driver, By.ID, "interactiveDataBtn").click()
        wait_for_element(driver, By.LINK_TEXT, "View Excel Document").click()

        # Poll for the finished download instead of sleeping a fixed time
        latestFile = _wait_for_download(tempDownloadDir, knownFiles)

        newFileName = os.path.join(downloadDir, f"SEC_{companyName} {filingDate}.xlsx")
        shutil.move(latestFile, newFileName)
        print(f"File has been downloaded and renamed to {newFileName}")

    except Exception as e:
        print(f"An error occurred during processing filing {index + 1}: {e}")
    finally:
        # Close the filing tab only if it was actually opened; otherwise the
        # current window is still the main tab and must stay open.
        if tabOpened:
            driver.close()
        driver.switch_to.window(mainHandle)

def main():
    """Walk the company's EDGAR filing list and download up to ``maxFilings`` 10-Ks.

    The filing table is filtered with the search box, and the matching rows
    are looked up again before every download because ``process_filing``
    leaves and returns to this tab in between.  The browser is shut down when
    the loop ends, whether it finished normally or raised.
    """
    tenkRowXpath = "//td[normalize-space(text())='10-K']/parent::tr"
    try:
        driver.get(f"https://www.sec.gov/edgar/browse/?CIK={CIK}")
        wait_for_element(driver, By.ID, "btnViewAllFilings").click()
        wait_for_element(driver, By.ID, "searchbox").send_keys("10-K")

        filingIndex = 0
        while filingIndex < maxFilings:
            matchingRows = driver.find_elements(By.XPATH, tenkRowXpath)

            # Stop as soon as the table has fewer 10-K rows than requested.
            if len(matchingRows) <= filingIndex:
                print("No more '10-K' filings found.")
                break

            process_filing(matchingRows[filingIndex], filingIndex)

            # Re-apply the search filter before handling the next filing
            wait_for_element(driver, By.ID, "searchbox").clear()
            wait_for_element(driver, By.ID, "searchbox").send_keys("10-K")
            tableLocator = (By.ID, "filingsTable")
            WebDriverWait(driver, 10).until(EC.presence_of_element_located(tableLocator))

            filingIndex += 1

    finally:
        driver.quit()


if __name__ == "__main__":
    main()


File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2024-05-15.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2023-05-19.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2022-05-17.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2021-04-30.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2020-04-30.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2019-04-29.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon 2018-04-27.xlsx
No more '10-K' filings found.


#### Extract data from the SEC Excel workbook using the pandas package

In [171]:
import json
import pandas as pd
from dateutil.parser import parse

def parseDate(key, value):
    """Return the first of ``key`` / ``value`` that parses as a date.

    ``key`` is tried before ``value``.  A candidate that raises ``ValueError``
    or ``TypeError`` while being parsed or formatted is skipped.

    Returns:
        The date formatted as ``'%b. %d, %Y'``, or ``None`` when neither
        candidate can be parsed.
    """
    for candidate in (key, value):
        try:
            return parse(candidate).strftime('%b. %d, %Y')
        except (ValueError, TypeError):
            continue
    return None

def processSheet(filePath, sheetId, attrNames, dataMap):
    """Collect selected line items from one sheet of an Excel workbook.

    The first sheet whose name contains ``sheetId`` (case-insensitive) is
    read.  For every row, the first column is treated as the line-item label
    and the second column as its value.  A row is kept when its label, or the
    first two words of its label, equals one of ``attrNames``
    (case-insensitive).  The reporting date is taken from the first row whose
    second-column header or value parses as a date (see ``parseDate``).

    Args:
        filePath: Path of the workbook.
        sheetId: Substring identifying the sheet name.
        attrNames: Iterable of line-item names to extract.
        dataMap: Dict mapping a date string to a dict of extracted items; it
            is updated in place.

    Returns:
        The same ``dataMap`` object, with the extracted items merged under the
        detected date.  It is returned unchanged when no sheet matches or no
        date can be detected.
    """
    sheetKey = sheetId.lower()
    # Labels are compared lower-cased, so normalise the wanted names as well.
    wanted = {name.lower() for name in attrNames}

    # The context manager guarantees the workbook handle is released, and the
    # already-open handle is reused for reading instead of reopening the file.
    with pd.ExcelFile(filePath) as xls:
        sheetName = next(
            (sheet for sheet in xls.sheet_names if sheetKey in str(sheet).lower()),
            None,
        )
        if sheetName is None:
            print(f"No sheet found containing '{sheetId}'.")
            return dataMap
        df = pd.read_excel(xls, sheet_name=sheetName)

    print(f"Data from sheet: {sheetName}")

    # Round-trip through JSON to obtain plain Python records (NaN -> None).
    records = json.loads(df.to_json(orient='records'))

    dataKey = None
    innerMap = {}
    for row in records:
        cells = list(row.items())
        if len(cells) < 2:
            # Without a second column there is neither a date nor a value.
            continue

        label = cells[0][1]
        header, value = cells[1]

        if not dataKey:
            dataKey = parseDate(header, value)

        # Non-string labels (numbers, None) cannot name a line item.
        if not isinstance(label, str) or not label:
            continue

        lowered = label.lower()
        words = lowered.split()
        if lowered in wanted or (len(words) >= 2 and " ".join(words[:2]) in wanted):
            innerMap[label] = value

    # Merging once at the end gives the same result as merging after every row.
    if dataKey:
        dataMap.setdefault(dataKey, {}).update(innerMap)

    return dataMap


# NOTE(review): companyName says "Amazon" while filePath points at the
# Microsoft workbook — confirm which company is intended.
companyName = "Amazon"
filePath = "D:\\pythonProject\\DCF\\Data\\SEC_Microsoft.xlsx"

# Line items to extract from each statement.  Sheet labels are lower-cased
# before being compared, so every entry here must be lower-case too.
dataMap = {}
# "accounts receivable," keeps its trailing comma on purpose: it is compared
# against the first two words of the label, comma included.
balanceSheet = ["cash", "cash and cash equivalents", "short-term investments", "accounts receivable," ,"inventories", "total assets", 
                "accounts payable" ,"short-term debt", "long-term debt", "total liabilities", "retained earnings", "total stockholders’ equity"]
incomeStatement = ["revenue", "other revenue", "cost of revenue", "gross margin", "gross profit", "selling, general and administrative",
                    "sales and marketing", "general and administrative", "general & administrative expenses", "selling expenses", 
                    "research and development", "interest expense", "income before income taxes", "net loss before income tax", 
                    "(benefit from) provision for income taxes", "provision for income taxes", "net income"]
cashFlow = ["net cash from operations", "net cash used in by operating activities", "net cash provided by operating activities",
             "net cash used in financing", "net cash provided by (used in) financing activities", "net cash provided by financing activities", 
             "net cash used in investing"]

# Each call merges its findings into the same dataMap, keyed by reporting date.
dataMap = processSheet(filePath, 'balance sheet', balanceSheet, dataMap)
dataMap = processSheet(filePath, 'income statements', incomeStatement, dataMap)
dataMap = processSheet(filePath, 'cash flows', cashFlow, dataMap)

# Print everything that was collected, grouped by date.
for date, data in dataMap.items():
    print(f"Date: {date}")
    for key, value in data.items():
        print(f" {key}: {value}")


Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Data from sheet: CASH FLOWS STATEMENTS
Date: Jun. 30, 2024
 Cash and cash equivalents: 18315
 Short-term investments: 57228
 Accounts receivable, net of allowance for doubtful accounts of $830 and $650: 56924
 Inventories: 1246
 Total assets: 512163
 Accounts payable: 21996
 Short-term debt: 6693
 Long-term debt: 42688
 Total liabilities: 243686
 Retained earnings: 173144
 Total stockholders’ equity: 268477
 Total liabilities and stockholders’ equity: 512163
 Revenue: 180349
 Cost of revenue: 58842
 Gross margin: 171008
 Research and development: 29510
 Sales and marketing: 24456
 General and administrative: 7609
 Income before income taxes: 107787
 Provision for income taxes: 19651
 Net income: 88136
 Net cash from operations: 118548
 Net cash used in financing: -37757
 Net cash used in investing: -96970


#### Calculate and plot results from extracted information

In [None]:
%pip install openturns

### --- Import Packages --- ###
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import datetime as dt
import scipy.stats
import sys
import openturns as ot
from IPython.display import display


# Start every valuation variable at zero.  Chained assignment is required
# here: `a, b = 0` tries to unpack the integer 0 and raises TypeError.
ebitda = revenue = expense = adminExp = amortization = otherRev = otherExp = 0
ebit = 0
nopat = incomeTax = 0
capEx = chok = liquidExp = 0
discountRate = 0
dcf = terminalVal = 0
equityVal = enterpriseVal = 0



try:
    # Open the Company's SEC filings page
    driver.get("https://www.sec.gov/edgar/browse/?CIK=" + CIK)

    # Wait until the page loads and the 10-K section is present
    tenK = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "js-10k-body"))
    )

    # Make the 10-K filings visible
    driver.execute_script("arguments[0].style.display = 'block';", tenK)

    # Find the first 10-K link
    firstTenK = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.XPATH, "(//a[contains(text(), '10-K')])[1]"))
    )

    # Find the nearby "Filing" link
    filing = firstTenK.find_element(By.XPATH, "following-sibling::a[contains(text(), 'Filing')]")

    # Get the URL of the "Filing" link and redirect to the Filing URL
    filing_url = filing.get_attribute('href')
    driver.execute_script("window.location.href = arguments[0];", filing_url)

    # Wait for the "Interactive Data" link and click it (reduced timeout to 5 seconds)
    interactive_data_link = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "interactiveDataBtn"))
    )

    # Get the URL of the "Interactive Data" link and redirect to the Interactive Data URL
    interactive_data_url = interactive_data_link.get_attribute('href')
    driver.execute_script("window.location.href = arguments[0];", interactive_data_url)

    # Wait for the "View Excel Document" link and click it
    TenKData = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.LINK_TEXT, "View Excel Document"))
    ).click()

    # Allow time for the download to complete
    time.sleep(10)

    # List and print files in the temporary download directory
    list_files = os.listdir(tempDownloadDir)
    print(f"Files in temp download directory: {list_files}")

    # Check if the directory is not empty
    if not list_files:
        raise FileNotFoundError("No files found in the temporary download directory.")

    # Get the latest downloaded file
    full_path = [os.path.join(tempDownloadDir, file) for file in list_files]
    latest_file = max(full_path, key=os.path.getctime)
    print(f"Latest downloaded file: {latest_file}")

    # Rename and move the downloaded file
    new_filename = os.path.join(downloadDir, (f"SEC_{companyName}.xlsx"))
    shutil.move(latest_file, new_filename)
    print(f"File has been downloaded and renamed to {new_filename}")
