### Extract data from the SEC website using the Selenium package

In [61]:
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Download the Excel workbook attached to the first 10-K filing listed on a
# company's SEC EDGAR page and store it as SEC_<companyName>.xlsx in downloadDir.

# Enter Company CIK and Name from SEC website
CIK = "1696025"
companyName = "Amazon"

# Set up the download directory
downloadDir = "D:\\pythonProject\\DCF\\Data"
tempDownloadDir = "D:\\pythonProject\\DCF\\Data\\Temp"

# Make sure the temporary directory exists before Chrome is pointed at it
# (this also creates downloadDir, its parent).
os.makedirs(tempDownloadDir, exist_ok=True)

# Remember what is already in the temp directory so that a leftover file from
# an earlier run is never mistaken for the new download.
filesBefore = set(os.listdir(tempDownloadDir))

# Chrome keeps an unfinished download under a temporary suffix.
partialSuffixes = (".crdownload", ".tmp")
downloadTimeout = 60   # seconds to wait for the download to finish
pollInterval = 0.5     # seconds between directory checks

# Configure Chrome options
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", {'download.default_directory': tempDownloadDir})

# Set up the WebDriver (make sure to specify the path to your WebDriver executable)
service = Service('D:/pythonProject/DCF/Scrapping/chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chromeOptions)

try:
    # Open the Company's SEC filings page
    driver.get("https://www.sec.gov/edgar/browse/?CIK=" + CIK)

    # Wait until the page loads and the 10-K section is present
    tenK = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CLASS_NAME, "js-10k-body"))
    )

    # Make the 10-K filings visible
    driver.execute_script("arguments[0].style.display = 'block';", tenK)

    # Find the first 10-K link
    firstTenK = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.XPATH, "(//a[contains(text(), '10-K')])[1]"))
    )

    # Find the nearby "Filing" link
    filing = firstTenK.find_element(By.XPATH, "following-sibling::a[contains(text(), 'Filing')]")

    # Get the URL of the "Filing" link and redirect to the Filing URL
    filing_url = filing.get_attribute('href')
    driver.execute_script("window.location.href = arguments[0];", filing_url)

    # Wait for the "Interactive Data" link
    interactive_data_link = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "interactiveDataBtn"))
    )

    # Get the URL of the "Interactive Data" link and redirect to the Interactive Data URL
    interactive_data_url = interactive_data_link.get_attribute('href')
    driver.execute_script("window.location.href = arguments[0];", interactive_data_url)

    # Wait for the "View Excel Document" link and click it.
    # click() returns None, so its result is intentionally not stored.
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.LINK_TEXT, "View Excel Document"))
    ).click()

    # Poll the temp directory until a new, fully written file shows up instead
    # of sleeping for a fixed time: this returns as soon as the download is
    # done and does not pick up a half-written file on a slow connection.
    deadline = time.monotonic() + downloadTimeout
    while True:
        list_files = os.listdir(tempDownloadDir)
        still_downloading = any(name.endswith(partialSuffixes) for name in list_files)
        new_files = [
            name for name in list_files
            if name not in filesBefore and not name.endswith(partialSuffixes)
        ]
        if new_files and not still_downloading:
            break
        if time.monotonic() >= deadline:
            raise FileNotFoundError("No files found in the temporary download directory.")
        time.sleep(pollInterval)

    # List and print files in the temporary download directory
    print(f"Files in temp download directory: {list_files}")

    # Get the latest downloaded file among the files created by this run
    full_path = [os.path.join(tempDownloadDir, name) for name in new_files]
    latest_file = max(full_path, key=os.path.getctime)
    print(f"Latest downloaded file: {latest_file}")

finally:
    # Close the browser after the operations are complete
    driver.quit()

# Rename and move the downloaded file only after the browser has been closed:
# on Windows the move fails with "PermissionError: [WinError 32]" while
# another process (presumably Chrome here) still holds a handle on the file.
# This point is reached only if the try block above finished without raising.
new_filename = os.path.join(downloadDir, (f"SEC_{companyName}.xlsx"))
shutil.move(latest_file, new_filename)
print(f"File has been downloaded and renamed to {new_filename}")


Files in temp download directory: ['Financial_Report.xlsx']
Latest downloaded file: D:\pythonProject\DCF\Data\Temp\Financial_Report.xlsx


PermissionError: [WinError 32] Процесс не может получить доступ к файлу, так как этот файл занят другим процессом

#### Extract data from the SEC Excel workbook using the pandas package

In [154]:
import pandas as pd
from dateutil.parser import parse
import json

def parseDate(key, value):
    """Return the first of *key* and *value* that parses as a date.

    *key* is tried before *value*. The result is formatted with
    '%b. %d, %Y' (for example 'Jun. 30, 2024'). Candidates for which the
    parser raises ValueError or TypeError are skipped; None is returned
    when neither candidate can be parsed.
    """
    for candidate in (key, value):
        try:
            return parse(candidate).strftime('%b. %d, %Y')
        except (ValueError, TypeError):
            # Not a date (or not even a string): try the next candidate.
            continue
    return None

def processSheet(filePath, sheetIdentifier, valueNames, dataMap):
    """Collect selected line items from one sheet of an Excel workbook.

    The first sheet whose name contains *sheetIdentifier* (case-insensitive)
    is read. For every row, the first column is treated as the line-item
    label and the second column as its value; rows whose label matches one
    of *valueNames* (case-insensitive) are kept. The values are stored under
    a date key obtained by passing the second column's header and cell value
    to parseDate; the first pair that yields a date wins.

    Args:
        filePath: Path of the Excel workbook.
        sheetIdentifier: Text that must occur in the sheet name.
        valueNames: Iterable of line-item labels to extract.
        dataMap: Dict mapping a date string to a dict of label -> value.
            It is updated in place.

    Returns:
        The same *dataMap* object. It is left unchanged when no sheet
        matches or no date could be determined.
    """
    wantedLabels = {name.lower() for name in valueNames}
    identifier = sheetIdentifier.lower()

    # Open the workbook once and close it again on exit, so no file handle
    # is left behind (an open handle keeps the file locked on Windows).
    with pd.ExcelFile(filePath) as xls:
        # Find the sheet that contains the specified identifier
        sheetName = next(
            (sheet for sheet in xls.sheet_names if identifier in sheet.lower()),
            None,
        )
        if sheetName is None:
            print(f"No sheet found containing '{sheetIdentifier}'.")
            return dataMap
        df = xls.parse(sheetName)

    print(f"Data from sheet: {sheetName}")

    # Round-trip through JSON to get plain Python records: one dict per row,
    # keyed by column header, with missing cells turned into None.
    records = json.loads(df.to_json(orient='records'))

    dataKey = None
    innerMap = {}
    for row in records:
        cells = list(row.items())
        if len(cells) < 2:
            # No value column in this row: nothing to extract.
            continue
        label = cells[0][1]
        header, amount = cells[1]

        # Keep the first date found; later rows reuse it.
        if not dataKey:
            dataKey = parseDate(header, amount)

        # Labels can be None or numeric; only strings can be matched.
        if isinstance(label, str) and label.lower() in wantedLabels:
            innerMap[label] = amount

    if dataKey:
        dataMap.setdefault(dataKey, {}).update(innerMap)

    return dataMap


# Company whose downloaded workbook is analysed below. The workbook path is
# derived from the name so the two cannot drift apart; the resolved path is
# unchanged (SEC_Microsoft.xlsx).
companyName = "Microsoft"
filePath = f"D:\\pythonProject\\DCF\\Data\\SEC_{companyName}.xlsx"

# Maps a reporting date to a dict of line-item label -> value.
dataMap = {}

# Line items (lowercase) to pull from each statement.
balanceSheet = ["cash", "cash and cash equivalents", "short-term investments", "total assets", "short-term debt", "long-term debt", 
                "total liabilities", "retained earnings", "total stockholders’ equity"]
incomeStatement = ["revenue", "other revenue", "cost of revenue", "gross margin", "gross profit", "general and administrative",
                "research and development", "net income"]

# Both calls add to the same dataMap.
dataMap = processSheet(filePath, 'balance sheet', balanceSheet, dataMap)
dataMap = processSheet(filePath, 'income statements', incomeStatement, dataMap)

# Print the collected values grouped by date
for date, data in dataMap.items():
    print(f"Date: {date}")
    for key, value in data.items():
        print(f" {key}: {value}")


Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Date: Jun. 30, 2024
 Cash and cash equivalents: 18315
 Short-term investments: 57228
 Total assets: 512163
 Short-term debt: 6693
 Long-term debt: 42688
 Total liabilities: 243686
 Retained earnings: 173144
 Total stockholders’ equity: 268477
 Revenue: 180349
 Cost of revenue: 58842
 Gross margin: 171008
 Research and development: 29510
 General and administrative: 7609
 Net income: 88136


In [None]:
%pip install openturns

### --- Import Packages --- ###
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import datetime as dt
import scipy.stats
import sys
import openturns as ot
from IPython.display import display


# Initialise every DCF input and result to 0.
# Chained assignment is used because tuple unpacking from a single int
# ("a, b = 0") raises "TypeError: cannot unpack non-iterable int object".
ebitda = revenue = expense = adminExp = amortization = otherRev = otherExp = 0
ebit = 0
nopat = incomeTax = 0
capEx = chok = liquidExp = 0
discountRate = 0
dcf = terminalVal = 0
equityVal = enterpriseVal = 0




# Extract the balance-sheet totals from the workbook at filePath and merge
# them into dataMap under the reporting date.

# Load the Excel file; the context manager closes the workbook handle again
# so the file is not left locked.
with pd.ExcelFile(filePath) as xls:
    # Find the sheet that contains 'Balance Sheet'
    sheetName = next(
        (sheet for sheet in xls.sheet_names if 'balance sheet' in sheet.lower()),
        None,
    )
    df = xls.parse(sheetName) if sheetName else None

if sheetName:
    print(f"Data from sheet: {sheetName}")

    # Convert the DataFrame to JSON to get one plain dict per row
    dfJson = df.to_json(orient='records')
    data = json.loads(dfJson)

    totalLabels = ["total assets", "total liabilities", "total stockholders’ equity"]

    # Defined before the loop so it is bound even when a row has fewer than
    # two columns; the first date found is kept for all rows.
    dataKey = None
    innerMap = {}

    # Iterate over each row to find the date in column B and the totals
    for row in data:
        items = list(row.items())
        if len(items) < 2:
            continue
        label = items[0][1]
        header, amount = items[1]

        if not dataKey:
            dataKey = parseDate(header, amount)

        # Column A can hold None (blank cell) or numbers; only strings can
        # be compared against the wanted labels.
        if isinstance(label, str) and label.lower() in totalLabels:
            innerMap[label] = amount

    if dataKey:
        dataMap.setdefault(dataKey, {}).update(innerMap)

    # Print the resulting map
    for date, values in dataMap.items():
        print(f"Date: {date}")
        for key, value in values.items():
            print(f" {key}: {value}")
else:
    print("No sheet found containing 'Balance Sheet'.")