### Extract data from the SEC website using Selenium

In [41]:
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Enter Company CIK and Name from SEC website
# NOTE(review): the balance sheet printed further down (Total Assets of 162606 USD)
# does not look like Amazon's -- confirm that this CIK really matches companyName.
CIK = "1696025"
companyName = "Amazon"

# Set up the download directory
downloadDir = "D:\\pythonProject\\DCF\\Data"
tempDownloadDir = "D:\\pythonProject\\DCF\\Data\\Temp"

# Timeouts, in seconds
PAGE_TIMEOUT = 5        # wait for page elements to appear
DOWNLOAD_TIMEOUT = 60   # wait for the Excel download to finish

# Suffixes Chrome gives to files that are still being written
INCOMPLETE_SUFFIXES = (".crdownload", ".tmp")


def wait_for_new_download(directory, known_files, timeout=DOWNLOAD_TIMEOUT, poll_interval=0.5):
    """Block until a new, fully written file shows up in `directory`.

    Args:
        directory: Folder Chrome downloads into.
        known_files: Names that were already in `directory` before the download
            started; they are ignored so a stale file is never returned.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        Full path of the newest completed file that is not in `known_files`.

    Raises:
        FileNotFoundError: If no completed new file appears within `timeout`.
    """
    deadline = time.monotonic() + timeout
    while True:
        fresh = [name for name in os.listdir(directory) if name not in known_files]
        pending = [name for name in fresh if name.lower().endswith(INCOMPLETE_SUFFIXES)]
        finished = [name for name in fresh if not name.lower().endswith(INCOMPLETE_SUFFIXES)]

        # Only accept the result once nothing is still being written.
        if finished and not pending:
            candidates = [os.path.join(directory, name) for name in finished]
            return max(candidates, key=os.path.getctime)

        if time.monotonic() >= deadline:
            raise FileNotFoundError(
                f"No completed download appeared in {directory} within {timeout} seconds."
            )
        time.sleep(poll_interval)


# Make sure both folders exist before Chrome tries to write into them
os.makedirs(downloadDir, exist_ok=True)
os.makedirs(tempDownloadDir, exist_ok=True)

# Configure Chrome options
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", {'download.default_directory': tempDownloadDir})

# Set up the WebDriver (make sure to specify the path to your WebDriver executable)
service = Service('D:/pythonProject/DCF/Scrapping/chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chromeOptions)

try:
    # Open the Company's SEC filings page
    driver.get("https://www.sec.gov/edgar/browse/?CIK=" + CIK)

    # Wait until the page loads and the 10-K section is present
    tenK = WebDriverWait(driver, PAGE_TIMEOUT).until(
        EC.presence_of_element_located((By.CLASS_NAME, "js-10k-body"))
    )

    # Make the 10-K filings visible
    driver.execute_script("arguments[0].style.display = 'block';", tenK)

    # Find the first 10-K link
    firstTenK = WebDriverWait(driver, PAGE_TIMEOUT).until(
        EC.presence_of_element_located((By.XPATH, "(//a[contains(text(), '10-K')])[1]"))
    )

    # Find the nearby "Filing" link
    filing = firstTenK.find_element(By.XPATH, "following-sibling::a[contains(text(), 'Filing')]")

    # Get the URL of the "Filing" link and navigate to it.
    # driver.get() blocks until the new page has loaded, unlike setting window.location.href.
    filing_url = filing.get_attribute('href')
    driver.get(filing_url)

    # Wait for the "Interactive Data" link (reduced timeout to 5 seconds)
    interactive_data_link = WebDriverWait(driver, PAGE_TIMEOUT).until(
        EC.presence_of_element_located((By.ID, "interactiveDataBtn"))
    )

    # Get the URL of the "Interactive Data" link and navigate to it
    interactive_data_url = interactive_data_link.get_attribute('href')
    driver.get(interactive_data_url)

    # Remember what is already in the temp folder so leftovers from earlier runs
    # cannot be mistaken for the file downloaded by this run.
    files_before = set(os.listdir(tempDownloadDir))

    # Wait for the "View Excel Document" link to become clickable and click it
    WebDriverWait(driver, PAGE_TIMEOUT).until(
        EC.element_to_be_clickable((By.LINK_TEXT, "View Excel Document"))
    ).click()

    # Wait for the download to complete instead of sleeping a fixed time
    latest_file = wait_for_new_download(tempDownloadDir, files_before)

    # List and print files in the temporary download directory
    list_files = os.listdir(tempDownloadDir)
    print(f"Files in temp download directory: {list_files}")
    print(f"Latest downloaded file: {latest_file}")

    # Rename and move the downloaded file
    new_filename = os.path.join(downloadDir, f"SEC_{companyName}.xlsx")
    shutil.move(latest_file, new_filename)
    print(f"File has been downloaded and renamed to {new_filename}")

finally:
    # Close the browser after the operations are complete
    driver.quit()


Files in temp download directory: ['Financial_Report (1).xlsx', 'Financial_Report.xlsx']
Latest downloaded file: D:\pythonProject\DCF\Data\Temp\Financial_Report (1).xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Amazon.xlsx


#### Extract data from the SEC Excel workbook using pandas

In [58]:
import pandas as pd
from dateutil.parser import parse
import json

companyName = "Amazon"
file_path = f"D:\\pythonProject\\DCF\\Data\\SEC_{companyName}.xlsx"

# Output key -> row label to look up in the first column of the balance sheet.
BALANCE_SHEET_FIELDS = {
    'Total Assets': 'Total Assets',
    'Total Liabilities': 'Total Liabilities',
    'Total Current Assets': 'Total Current Assets',
    'Total Current Liabilities': 'Total Current Liabilities',
    'Total Long-term Liabilities': 'Total Long-term Liabilities',
    'Stockholders Deficit': 'Total Stockholders Deficit',
    # Add more fields as needed
}


def find_sheet_name(sheet_names, keyword='Balance Sheet'):
    """Return the first entry of `sheet_names` that contains `keyword`, or None."""
    return next((name for name in sheet_names if keyword in name), None)


def extract_balance_sheet(df, fields=None):
    """Collect selected balance-sheet line items for every dated column.

    The sheet is laid out with the line-item labels in the first column and one
    column per reporting date, the date being the column header (for example
    "Jan. 31, 2024"). Columns whose header does not parse as a date are skipped.

    Args:
        df: DataFrame read from the balance-sheet worksheet.
        fields: Mapping of output key -> row label; defaults to
            BALANCE_SHEET_FIELDS. Labels are matched case-insensitively after
            stripping surrounding whitespace; the first matching row wins.

    Returns:
        dict mapping each parsed header date (datetime) to a dict of
        {output key: cell value}. A value is None when the row label is not
        present or the cell is empty (NaN).
    """
    # Imported here so the helper is self-contained when reused outside this cell.
    from dateutil.parser import parse

    if fields is None:
        fields = BALANCE_SHEET_FIELDS

    result = {}
    if df.shape[1] < 2:
        # Need a label column plus at least one value column.
        return result

    # Row position of each label (first occurrence wins).
    row_of = {}
    for position, raw_label in enumerate(df.iloc[:, 0]):
        row_of.setdefault(str(raw_label).strip().casefold(), position)

    for col_position in range(1, df.shape[1]):
        header = df.columns[col_position]
        try:
            report_date = parse(str(header), fuzzy=False)
        except (ValueError, TypeError, OverflowError):
            # Header is not a date (e.g. an "Unnamed" column) -- skip it.
            continue

        column = df.iloc[:, col_position]
        figures = {}
        for key, row_label in fields.items():
            position = row_of.get(row_label.strip().casefold())
            value = column.iloc[position] if position is not None else None
            if value is not None and pd.isna(value):
                value = None
            figures[key] = value
        result[report_date] = figures

    return result


dataMap = {}
sheet_name = None
df = None
workbook_found = True

try:
    # The context manager closes the workbook handle once the sheet is read.
    with pd.ExcelFile(file_path) as xls:
        # Find the sheet that contains 'Balance Sheet'
        sheet_name = find_sheet_name(xls.sheet_names)
        if sheet_name:
            df = pd.read_excel(xls, sheet_name=sheet_name)
except FileNotFoundError:
    workbook_found = False
    print(f"Excel file not found: {file_path}")

if df is not None:
    print(f"Data from sheet: {sheet_name}")
    print(df)

    # Convert the DataFrame to JSON
    dfJson = df.to_json(orient='records')
    data = json.loads(dfJson)

    # Map each reporting date (taken from the column headers) to its figures
    dataMap = extract_balance_sheet(df)

    # Print the map for verification
    for report_date, figures in dataMap.items():
        print(f"Date: {report_date}")
        for key, value in figures.items():
            print(f"  {key}: {value}")

elif workbook_found:
    print("No sheet found containing 'Balance Sheet'.")



Data from sheet: Consolidated Balance Sheets
                Consolidated Balance Sheets - USD ($) Jan. 31, 2024  \
0                                     Current Assets:                 
1                                                Cash          9647   
2                 Accounts receivable, net - unbilled         20557   
3                                    Prepaid expenses         17550   
4                                Total Current Assets         47754   
5                  Property, plant and equipment, net          6127   
6                              Intangible Assets, net        108725   
7                                  Total Other Assets        114852   
8                                        Total Assets        162606   
9                                 Current Liabilities                 
10                                   Accounts payable        267435   
11                                   Accrued interest         19358   
12                              

In [None]:
%pip install openturns

### --- Import Packages --- ###
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import datetime as dt
import scipy.stats
import sys
import openturns as ot
from IPython.display import display


# Initialise every placeholder to 0.
# Chained assignment is used because `a, b = 0` raises
# "TypeError: cannot unpack non-iterable int object".
ebitda = revenue = expense = adminExp = amortization = otherRev = otherExp = 0
ebit = 0
nopat = incomeTax = 0
capEx = chok = liquidExp = 0
discountRate = 0
dcf = terminalVal = 0
equityVal = enterpriseVal = 0