### Extract data from the SEC website using the Selenium package

In [1]:
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Scraper configuration ---------------------------------------------------
# CIK is interpolated into the EDGAR browse URL opened by main().
CIK = "0000789019"
# Used when naming the downloaded workbooks ("SEC_<companyName> <date>.xlsx").
companyName = "Microsoft"
# Upper bound on the number of 10-K rows main() will try to process.
maxFilings = 10
# Final destination of renamed workbooks, and the folder Chrome downloads into.
downloadDir = "D:\\pythonProject\\DCF\\Data"
tempDownloadDir = "D:\\pythonProject\\DCF\\Data\\Temp"
# Filled in by extract_company_info(); remain None until it succeeds.
sicCode = None
sicDesc = None

# --- Browser session ---------------------------------------------------------
# Point Chrome's default download folder at the temporary directory so that
# process_filing() can find freshly downloaded files there.
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option(
    "prefs",
    {'download.default_directory': tempDownloadDir},
)
service = Service('D:/pythonProject/DCF/Scrapping/chromedriver.exe')
driver = webdriver.Chrome(options=chromeOptions, service=service)


def wait_for_element(driver, by, value, timeout=10):
    """Wait until the element located by ``(by, value)`` is clickable and return it.

    Args:
        driver: WebDriver instance to poll.
        by: Locator strategy (a ``By`` constant).
        value: Locator expression matching the strategy.
        timeout: Maximum number of seconds handed to ``WebDriverWait``.

    Returns:
        Whatever ``WebDriverWait.until`` returns for the clickable condition.
    """
    waiter = WebDriverWait(driver, timeout)
    clickable = EC.element_to_be_clickable((by, value))
    return waiter.until(clickable)

def extract_company_info():
    """Read the SIC code and description from the company page into globals.

    Expands the "entityInformationHeader" section, then stores the text of
    the SIC link in ``sicCode`` and the cleaned description in ``sicDesc``.
    Any exception is caught and reported with ``print``; globals assigned
    before the failure keep their new values.
    """
    global sicCode, sicDesc
    try:
        # Click through JavaScript rather than WebElement.click().
        infoHeader = wait_for_element(driver, By.ID, "entityInformationHeader")
        driver.execute_script("arguments[0].click();", infoHeader)

        sicSection = wait_for_element(driver, By.ID, "sicSection")

        codeLink = sicSection.find_element(By.XPATH, ".//span[@id='SIC']/a")
        sicCode = codeLink.text

        # str.strip treats its argument as a character set: this removes any
        # run of spaces and hyphens from both ends of the description.
        descNode = sicSection.find_element(By.ID, "SICDescription")
        sicDesc = descNode.text.strip(" - ")

        print(f"SIC Code: {sicCode}, Description: {sicDesc}")

    except Exception as e:
        print(f"An error occurred while extracting company information: {e}")

def _wait_for_download(directory, alreadyPresent, timeout=30, pollInterval=0.5):
    """Poll ``directory`` until a finished download not in ``alreadyPresent`` shows up.

    Names ending in ``.crdownload`` or ``.tmp`` are treated as in-progress
    downloads and skipped.

    Returns:
        Full path of the most recently created new file, or None if nothing
        appeared within ``timeout`` seconds.
    """
    deadline = time.time() + timeout
    while True:
        candidates = [
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name not in alreadyPresent
            and not name.endswith(('.crdownload', '.tmp'))
        ]
        if candidates:
            return max(candidates, key=os.path.getctime)
        if time.time() >= deadline:
            return None
        time.sleep(pollInterval)


def process_filing(row, index):
    """Download the Excel report behind one filings-table row and file it away.

    Opens the row's filing index page in a new tab, clicks through to
    "View Excel Document", waits for the download to land in
    ``tempDownloadDir`` and moves it to
    ``downloadDir/SEC_<companyName> <filingDate>.xlsx``.

    Args:
        row: Table-row WebElement of the filing.
        index: Zero-based position of the row; only used in the error message.

    Errors during processing are caught and printed; the function always
    tries to leave the browser focused on the tab it started from.
    """
    # Remember the starting tab so cleanup never closes it by mistake.
    mainHandle = driver.current_window_handle
    try:
        filingDate = row.find_element(By.XPATH, ".//td[3]").text
        filingLink = row.find_element(By.XPATH, ".//a[contains(@href, 'index.htm')]")
        filingURL = filingLink.get_attribute('href')

        # Open the filing in a new tab
        driver.execute_script("window.open(arguments[0], '_blank');", filingURL)
        driver.switch_to.window(driver.window_handles[-1])

        # Snapshot the download folder first so that leftovers from earlier
        # runs are never mistaken for this filing's workbook.
        os.makedirs(tempDownloadDir, exist_ok=True)
        alreadyPresent = set(os.listdir(tempDownloadDir))

        # Interact with the interactive data and download the Excel file
        wait_for_element(driver, By.ID, "interactiveDataBtn").click()
        wait_for_element(driver, By.LINK_TEXT, "View Excel Document").click()

        # Poll instead of sleeping a fixed time: returns as soon as the
        # download is complete and ignores partially written files.
        latestFile = _wait_for_download(tempDownloadDir, alreadyPresent)
        if latestFile is None:
            raise FileNotFoundError("No files found in the temporary download directory.")

        newFileName = os.path.join(downloadDir, f"SEC_{companyName} {filingDate}.xlsx")
        shutil.move(latestFile, newFileName)
        print(f"File has been downloaded and renamed to {newFileName}")

    except Exception as e:
        print(f"An error occurred during processing filing {index + 1}: {e}")
    finally:
        # Only close a tab this call actually switched to. If the failure
        # happened before the new tab existed, closing here would shut the
        # main window and break every later iteration.
        if driver.current_window_handle != mainHandle:
            driver.close()
        driver.switch_to.window(mainHandle)

def main():
    """Drive the scrape: open the company page, then fetch up to ``maxFilings`` 10-Ks.

    The filings table is re-queried on every pass and the search box is
    cleared and refilled after each filing. The browser is always shut down
    on exit, whether or not an error occurred.
    """
    tenkRowXPath = "//td[normalize-space(text())='10-K']/parent::tr"
    try:
        driver.get(f"https://www.sec.gov/edgar/browse/?CIK={CIK}")

        # Company details (including the SIC code) are read before filtering.
        extract_company_info()

        wait_for_element(driver, By.ID, "btnViewAllFilings").click()
        wait_for_element(driver, By.ID, "searchbox").send_keys("10-K")

        for position in range(maxFilings):
            rows = driver.find_elements(By.XPATH, tenkRowXPath)

            if len(rows) <= position:
                print("No more '10-K' filings found.")
                break

            process_filing(rows[position], position)

            # Re-apply the "10-K" filter before the next pass; the search box
            # is looked up afresh for each action.
            wait_for_element(driver, By.ID, "searchbox").clear()
            wait_for_element(driver, By.ID, "searchbox").send_keys("10-K")
            tablePresent = EC.presence_of_element_located((By.ID, "filingsTable"))
            WebDriverWait(driver, 10).until(tablePresent)

    finally:
        driver.quit()

if __name__ == "__main__":
    # Run the scrape, then echo the SIC description captured along the way
    # (prints "None" if extract_company_info() never set it).
    main()
    print(sicDesc)


SIC Code: 7372, Description: Services-Prepackaged Software
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2024-07-30.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2023-07-27.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2022-07-28.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2021-07-29.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2020-07-30.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2019-08-01.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2018-08-03.xlsx
No more '10-K' filings found.
Services-Prepackaged Software


#### Extract data from the SEC Excel files using the pandas package

In [2]:
import json
import pandas as pd
from dateutil.parser import parse

def parseDate(key, value):
    """Return the first of ``key`` / ``value`` that parses as a date, formatted.

    ``key`` is tried before ``value``. The result is rendered with
    ``'%b. %d, %Y'``. Returns None when neither argument can be parsed
    (``ValueError`` / ``TypeError`` from the parser).
    """
    for candidate in (key, value):
        try:
            return parse(candidate).strftime('%b. %d, %Y')
        except (ValueError, TypeError):
            continue
    return None

def processSheet(filePath, sheetIds, attrNames, dataMap):
    """Pull selected line items from one statement sheet of a workbook into ``dataMap``.

    The first sheet whose lower-cased name contains any entry of ``sheetIds``
    is read. For every row, the first column is treated as the line-item
    label and the second column as its value. A label is kept when its
    lower-cased text, or its first two lower-cased words, appear in
    ``attrNames``. The period key is the first date that ``parseDate`` can
    extract from the second column's header or one of its cells.

    Args:
        filePath: Path of the Excel workbook.
        sheetIds: Lower-case substrings identifying the wanted sheet.
        attrNames: Lower-case labels (or two-word label prefixes) to keep.
        dataMap: Dict of ``{period: {label: value}}``; updated in place.

    Returns:
        The same ``dataMap`` object, for call chaining.
    """
    dataKey = None
    innerMap = {}

    # Open the workbook once and close it deterministically: the context
    # manager releases the file handle, and the sheet is parsed from the
    # already-open workbook instead of re-opening the file.
    with pd.ExcelFile(filePath) as xls:
        # Find the first sheet that matches any of the specified identifiers
        sheetName = next(
            (
                sheet
                for sheet in xls.sheet_names
                if any(sheetId in sheet.lower() for sheetId in sheetIds)
            ),
            None,
        )
        if not sheetName:
            print(f"No sheet found containing any of the identifiers: {sheetIds}")
            return dataMap
        df = pd.read_excel(xls, sheet_name=sheetName)

    print(f"Data from sheet: {sheetName}")

    # Round-trip through JSON to get plain dict rows (column name -> value).
    rows = json.loads(df.to_json(orient='records'))

    for row in rows:
        cells = list(row.items())
        if len(cells) < 2:
            # Without a second column there is neither a date nor a value.
            continue
        label = cells[0][1]
        header, cellValue = cells[1]

        # Keep probing column B until a date is found.
        if not dataKey:
            dataKey = parseDate(header, cellValue)

        # Labels can be numbers or null in malformed sheets; only text labels
        # can match an attribute name.
        if not isinstance(label, str) or not label:
            continue

        lowered = label.lower()
        words = lowered.split()
        if lowered in attrNames or (
            len(words) >= 2 and (words[0] + " " + words[1]) in attrNames
        ):
            innerMap[label] = cellValue

    # One merge at the end is equivalent to merging after every row.
    if dataKey:
        dataMap.setdefault(dataKey, {}).update(innerMap)

    return dataMap

def processAllFiles(companyName, directory):
    """Extract balance-sheet, income-statement and cash-flow items from every company workbook.

    Every file in ``directory`` whose name starts with ``SEC_<companyName>``
    is run through ``processSheet`` three times, once per statement type.

    Args:
        companyName: Company name used in the ``SEC_<companyName>`` file prefix.
        directory: Folder containing the downloaded workbooks.

    Returns:
        Dict of ``{period: {label: value}}`` accumulated over all files;
        empty when no file matches.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the
    # processing order, and therefore the key order of the result, stable.
    files = sorted(f for f in os.listdir(directory) if f.startswith(f"SEC_{companyName}"))
    dataMap = {}

    # All names are lower-case because labels are lower-cased before lookup.
    # "accounts receivable," keeps its comma: it is matched against the first
    # two words of a label.
    balanceSheet = [
        "cash", "cash and cash equivalents", "short-term investments",
        "accounts receivable,", "inventories", "total assets",
        "accounts payable", "short-term debt", "long-term debt",
        "operating lease", "total liabilities", "retained earnings",
        "total stockholders’ equity",
    ]
    incomeStatement = [
        "revenue", "other revenue", "cost of revenue", "gross margin",
        "gross profit", "selling, general and administrative",
        "sales and marketing", "general and administrative",
        "general & administrative expenses", "selling expenses",
        "research and development", "interest expense",
        "income before income taxes", "net loss before income tax",
        "(benefit from) provision for income taxes",
        "provision for income taxes", "net income",
    ]
    cashFlow = [
        "net cash from operations", "net cash used in by operating activities",
        "net cash provided by operating activities",
        "net cash used in financing",
        "net cash provided by (used in) financing activities",
        "net cash provided by financing activities",
        "net cash used in investing",
    ]

    # Define possible sheet identifiers for each statement
    balanceSheetIds = ['balance sheet', 'consolidated balance sheets']
    incomeStatementIds = ['income statements', 'consolidated statements of oper', 'statements of operations']
    cashFlowIds = ['cash flows', 'consolidated statements of cash', 'statements of cash fl']

    for file in files:
        filePath = os.path.join(directory, file)
        print(f"Processing file: {filePath}")
        dataMap = processSheet(filePath, balanceSheetIds, balanceSheet, dataMap)
        dataMap = processSheet(filePath, incomeStatementIds, incomeStatement, dataMap)
        dataMap = processSheet(filePath, cashFlowIds, cashFlow, dataMap)

    return dataMap

# Specify the company name and directory to search
# (only files named "SEC_<companyName>..." inside `directory` are read).
companyName = "Microsoft"
directory = "D:\\pythonProject\\DCF\\Data"

# Process all files matching the pattern; the result maps each period key to
# a dict of {line-item label: value}.
dataMap = processAllFiles(companyName, directory)

# Output the results, one period at a time
for date, data in dataMap.items():
    print(f"Date: {date}")
    for key, value in data.items():
        print(f" {key}: {value}")     


Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2018-08-03.xlsx
Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Data from sheet: CASH FLOWS STATEMENTS
Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2019-08-01.xlsx
Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Data from sheet: CASH FLOWS STATEMENTS
Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2020-07-30.xlsx
Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Data from sheet: CASH FLOWS STATEMENTS
Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2021-07-29.xlsx
Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Data from sheet: CASH FLOWS STATEMENTS
Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2022-07-28.xlsx
Data from sheet: BALANCE SHEETS
Data from sheet: INCOME STATEMENTS
Data from sheet: CASH FLOWS STATEMENTS
Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2023-07-27.xlsx
Data from sheet: BALANCE SHEETS


#### Calculate additional variables from extracted information

In [3]:
from sentence_transformers import SentenceTransformer, util

# Accumulators filled in by the R&D computation further down:
# rdValList collects (date, value) pairs, rdVal holds the running result.
rdValList = []
rdVal = 0

# Load the amortizable-lives look-up table from the R&D workbook
filePathRD = "D:\\pythonProject\\DCF\\Data\\Core\\R&DConv.xlsx"
df = pd.read_excel(filePathRD, sheet_name="Amortizable Lives Look-up Table")

# Convert the DataFrame to a list of plain dict records via JSON
dfJson = df.to_json(orient='records')
data = json.loads(dfJson)

# Initialize the Sentence Transformer model used by
# find_most_similar_description() to embed descriptions
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to find the most similar description based on sentence embeddings
def find_most_similar_description(sicDesc, data):
    """Return the record in ``data`` whose text is closest to ``sicDesc``.

    Every string-valued field of every record is embedded with the
    module-level ``model`` and compared with the embedding of ``sicDesc``
    by cosine similarity.

    Args:
        sicDesc: Description text to match.
        data: Iterable of dict records.

    Returns:
        ``{"entry": <best record>, "similarity_score": <float>}``, or None
        when no string field scored above -1.
    """
    query_vec = model.encode(sicDesc, convert_to_tensor=True)

    best_entry = None
    best_score = -1

    for record in data:
        for text in record.values():
            # Non-text fields are not embedded.
            if not isinstance(text, str):
                continue

            candidate_vec = model.encode(text, convert_to_tensor=True)
            score = util.pytorch_cos_sim(query_vec, candidate_vec).item()

            # Strict comparison: on ties the earliest best match wins.
            if score > best_score:
                best_score = score
                best_entry = record

    if best_entry is None:
        return None
    return {
        "entry": best_entry,
        "similarity_score": best_score
    }

# Find the most similar description in the data using Sentence Transformers.
# sicDesc is passed through str(), so an unset (None) value is matched as the
# literal text "None".
most_similar_description = find_most_similar_description(str(sicDesc), data)

# Derive the R&D figure from the matched amortization period, then report it
if most_similar_description:
    # NOTE(review): int() raises TypeError if the matched record has no
    # "Amortization Period" field — confirm the look-up table always has one.
    amorPeriod = int(most_similar_description["entry"].get("Amortization Period"))
    dates = list(dataMap.keys())
    # Walk the periods in reverse insertion order and collect R&D values,
    # stopping once the list holds amorPeriod + 1 entries.
    for date in reversed(dates):
        if 'Research and development' in dataMap[date]:
            rdValList.append((date, dataMap[date]['Research and development']))
        if len(rdValList) > amorPeriod:
            break
    # -1 marks "no first value seen yet".
    # NOTE(review): if rdValList is empty the sentinel leaks into the result
    # (rdVal becomes -1), and a genuine first value of -1 would be treated as
    # unset — confirm whether that matters.
    first = -1
    for date, value in rdValList:
        print(date, value)
        if first == -1:
            # The first collected entry is kept whole...
            first = value
        else:
            # ...every later entry contributes value / amorPeriod.
            rdVal += (value / amorPeriod)
    # Result: first entry minus the accumulated shares of the later entries.
    rdVal = first - rdVal
    print(rdVal)
    print("Most Similar Entry Found:")
    print(json.dumps(most_similar_description["entry"], indent=2))
    print(f"Similarity Score: {most_similar_description['similarity_score']:.4f}")
else:
    print("No similar entry found.")

  from tqdm.autonotebook import tqdm, trange







Jun. 30, 2024 29510
Jun. 30, 2023 27195
Jun. 30, 2022 24512
Jun. 30, 2021 20716
Jun. 30, 2020 19269
-1054.0
Most Similar Entry Found:
{
  "Industry Name": "Computer Software & Services",
  "Amortization Period": 3
}
Similarity Score: 0.5620


#### Plot results

In [None]:
%pip install openturns

### --- Import Packages --- ###
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import datetime as dt
import scipy.stats
import sys
from IPython.display import display


# Valuation placeholders, all initialised to zero.
# Chained assignment is used because `a, b = 0` tries to unpack the integer
# and raises TypeError.
ebitda = revenue = expense = adminExp = amortization = otherRev = otherExp = 0
ebit = 0
nopat = incomeTax = 0
capEx = chok = liquidExp = 0
discountRate = 0
dcf = terminalVal = 0
equityVal = enterpriseVal = 0
