### Extract data from the SEC website using the Selenium package

In [1]:
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configuration variables
CIK = "0000789019"  # interpolated into the EDGAR browse URL opened by main()
companyName = "Microsoft"  # used to name the renamed workbook in process_filing()
downloadDir = "D:\\pythonProject\\DCF\\Data"  # destination of renamed workbooks
tempDownloadDir = "D:\\pythonProject\\DCF\\Data\\Temp"  # Chrome's default download directory (set below)
maxFilings = 10  # upper bound on the number of 10-K rows main() will process
sicCode = None  # populated by extract_company_info()
sicDesc = None  # populated by extract_company_info()

# Set up WebDriver
# The "prefs" option points Chrome's default download directory at tempDownloadDir.
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", {'download.default_directory': tempDownloadDir})
service = Service('D:/pythonProject/DCF/Scrapping/chromedriver.exe')
# Module-level driver shared by every function in this cell; main() quits it when done.
driver = webdriver.Chrome(service=service, options=chromeOptions)


def wait_for_element(driver, by, value, timeout=10):
    """Wait until the element located by ``(by, value)`` is clickable and return it.

    Args:
        driver: WebDriver instance to poll.
        by: Locator strategy (a ``By`` constant).
        value: Locator value matching the strategy.
        timeout: Maximum number of seconds handed to ``WebDriverWait``.

    Returns:
        Whatever ``WebDriverWait.until`` yields for the clickable condition.
    """
    locator = (by, value)
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(EC.element_to_be_clickable(locator))

def extract_company_info():
    """Read the SIC code and description from the page and store them globally.

    Clicks the company-information header, then reads the SIC link text and
    the SIC description from the "sicSection" element. Results go into the
    module-level ``sicCode`` and ``sicDesc``. Any exception is reported with
    ``print`` and otherwise swallowed.
    """
    global sicCode, sicDesc
    try:
        # Open the collapsible company-information panel via a JS click.
        header = wait_for_element(driver, By.ID, "entityInformationHeader")
        driver.execute_script("arguments[0].click();", header)

        section = wait_for_element(driver, By.ID, "sicSection")

        codeLink = section.find_element(By.XPATH, ".//span[@id='SIC']/a")
        sicCode = codeLink.text

        descNode = section.find_element(By.ID, "SICDescription")
        # str.strip treats " - " as a set of characters (space, hyphen) to
        # remove from both ends of the text.
        sicDesc = descNode.text.strip(" - ")

        print(f"SIC Code: {sicCode}, Description: {sicDesc}")

    except Exception as e:
        print(f"An error occurred while extracting company information: {e}")

def _wait_for_new_download(directory, existing, timeout=30.0, poll=0.5):
    """Return the path of the newest finished file in ``directory`` not in ``existing``.

    Polls until at least one new file is present and no partial download
    (``.crdownload`` / ``.tmp``) remains, or until ``timeout`` seconds elapse.

    Raises:
        FileNotFoundError: If no finished new file shows up before the timeout.
    """
    partialSuffixes = ('.crdownload', '.tmp')
    deadline = time.monotonic() + timeout
    while True:
        current = os.listdir(directory) if os.path.isdir(directory) else []
        inProgress = any(name.endswith(partialSuffixes) for name in current)
        fresh = [name for name in current
                 if name not in existing and not name.endswith(partialSuffixes)]
        if fresh and not inProgress:
            return max((os.path.join(directory, name) for name in fresh), key=os.path.getctime)
        if time.monotonic() >= deadline:
            raise FileNotFoundError("No files found in the temporary download directory.")
        time.sleep(poll)


def process_filing(row, index):
    """Download the Excel workbook for one filing row and move it into ``downloadDir``.

    Opens the filing index page in a new tab, clicks through to the
    "View Excel Document" link, waits for the download to finish, and renames
    the file to ``SEC_<companyName> <filingDate>.xlsx``. Errors are printed and
    swallowed so the caller can continue with the next filing.

    Args:
        row: Table-row element of the filings table for this filing.
        index: Zero-based position of the filing; used only in error messages.
    """
    # Track whether a new tab really exists. Without this, a failure while
    # reading the row would make the cleanup below close the main window.
    tabOpened = False
    try:
        filingDate = row.find_element(By.XPATH, ".//td[3]").text
        filingLink = row.find_element(By.XPATH, ".//a[contains(@href, 'index.htm')]")
        filingURL = filingLink.get_attribute('href')

        # Open the filing in a new tab
        driver.execute_script("window.open(arguments[0], '_blank');", filingURL)
        driver.switch_to.window(driver.window_handles[-1])
        tabOpened = True

        # Snapshot the download directory so that only a file created by this
        # filing is picked up, never a leftover from an earlier run.
        if os.path.isdir(tempDownloadDir):
            existing = set(os.listdir(tempDownloadDir))
        else:
            existing = set()

        # Interact with the interactive data and download the Excel file
        wait_for_element(driver, By.ID, "interactiveDataBtn").click()
        wait_for_element(driver, By.LINK_TEXT, "View Excel Document").click()

        # Poll for the finished download instead of sleeping a fixed time, so
        # a slow download is not moved while still partial.
        latestFile = _wait_for_new_download(tempDownloadDir, existing)

        newFileName = os.path.join(downloadDir, f"SEC_{companyName} {filingDate}.xlsx")
        shutil.move(latestFile, newFileName)
        print(f"File has been downloaded and renamed to {newFileName}")

    except Exception as e:
        print(f"An error occurred during processing filing {index + 1}: {e}")
    finally:
        # Close the tab (only if one was opened) and return to the main tab
        if tabOpened:
            driver.close()
        driver.switch_to.window(driver.window_handles[0])

def main():
    """Drive the scrape: open the company page, then download up to ``maxFilings`` 10-Ks.

    The filings table is filtered to "10-K" and re-queried on every pass so
    that each row element handed to ``process_filing`` is freshly located.
    The browser is always shut down on exit.
    """
    rowsXPath = "//td[normalize-space(text())='10-K']/parent::tr"
    try:
        driver.get(f"https://www.sec.gov/edgar/browse/?CIK={CIK}")

        # Company metadata first (sets the sicCode / sicDesc globals)
        extract_company_info()

        wait_for_element(driver, By.ID, "btnViewAllFilings").click()
        wait_for_element(driver, By.ID, "searchbox").send_keys("10-K")

        position = 0
        while position < maxFilings:
            rows = driver.find_elements(By.XPATH, rowsXPath)

            if position >= len(rows):
                print("No more '10-K' filings found.")
                break

            process_filing(rows[position], position)

            # Re-apply the filter before looking up the next row
            wait_for_element(driver, By.ID, "searchbox").clear()
            wait_for_element(driver, By.ID, "searchbox").send_keys("10-K")
            tableReady = EC.presence_of_element_located((By.ID, "filingsTable"))
            WebDriverWait(driver, 10).until(tableReady)

            position += 1

    finally:
        driver.quit()

if __name__ == "__main__":
    # Run the scraper, then echo the SIC description stored by extract_company_info()
    # (prints "None" if that step failed, because sicDesc keeps its initial value).
    main()
    print(str(sicDesc))


SIC Code: 7372, Description: Services-Prepackaged Software
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2024-07-30.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2023-07-27.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2022-07-28.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2021-07-29.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2020-07-30.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2019-08-01.xlsx
File has been downloaded and renamed to D:\pythonProject\DCF\Data\SEC_Microsoft 2018-08-03.xlsx
No more '10-K' filings found.
Services-Prepackaged Software


#### Extract data from the SEC Excel workbooks using the pandas package

In [66]:
import json
import os
import pandas as pd
from dateutil.parser import parse
import re

def parse_date(key, value):
    """Return the first of ``key`` / ``value`` that parses as a date, else ``None``.

    ``key`` is tried before ``value``. A successful parse is formatted with
    ``'%b. %d, %Y'``. ``ValueError`` and ``TypeError`` from parsing or
    formatting are treated as "not a date".
    """
    for candidate in (key, value):
        try:
            return parse(candidate).strftime('%b. %d, %Y')
        except (ValueError, TypeError):
            continue
    return None

def find_attr(value, attrNames):
    """Match a row label against the known attribute names.

    Args:
        value: Label text taken from a worksheet cell. Anything that is not a
            string (numbers, ``None``) is treated as "no match" instead of
            raising on ``.lower()``.
        attrNames: Iterable of lowercase attribute names to look for.

    Returns:
        ``(name, partial)``: ``name`` is the matched attribute capitalised via
        ``str.capitalize`` (or ``None`` when nothing matches). ``partial`` is
        ``False`` only for an exact, case-insensitive match. When several
        names are substrings of the label, the last one in ``attrNames`` wins.
    """
    if not isinstance(value, str):
        return None, True

    lowered = value.lower()
    partialMatch = None
    for attr in attrNames:
        if attr == lowered:
            # An exact match always takes precedence over any substring match.
            return attr.capitalize(), False
        if attr in lowered:
            partialMatch = attr.capitalize()
    return partialMatch, True

# Break the line after any run of digits that is followed by whitespace and a letter
def insert_newlines(text):
    """Replace the whitespace between a number and a following word with a newline."""
    def _digits_then_break(found):
        return found.group(1) + '\n'

    return re.sub(r'(\d+)\s+(?=[a-zA-Z])', _digits_then_break, text)

def extract_lease_info(filePath):
    """Extract the operating lease cost and lease maturities from a filing workbook.

    Looks for the first sheet whose name contains "leases (tables)", flattens
    each row into one lowercase string, and scans it with regular expressions.

    Args:
        filePath: Path of the Excel workbook to read.

    Returns:
        ``(opLease, maturities)``. ``opLease`` is the first integer found after
        "operating lease cost ... $" (or ``None``). ``maturities`` maps a year
        token to the integer that follows it, taken from rows mentioning a
        maturity keyword. ``(None, None)`` is returned when the sheet is
        missing or any error occurs (the error is printed).
    """
    try:
        # The context manager closes the workbook handle, and parsing from the
        # open workbook avoids reading the file a second time.
        with pd.ExcelFile(filePath) as xls:
            # Find the sheet that contains "leases (tables)" in its name
            sheetName = next((sheet for sheet in xls.sheet_names if "leases (tables)" in sheet.lower()), None)
            if not sheetName:
                print("No sheet found containing 'leases (tables)'.")
                return None, None

            print(f"Found Leases Tables Sheet: {sheetName}")
            df = xls.parse(sheetName)

        opLease = None
        maturities = {}
        maturityKeywords = ["maturities of lease liabilities", "maturity", "maturities"]

        for _, row in df.iterrows():
            # Flatten the row to one lowercase string without embedded newlines,
            # then insert newlines after numbers followed by a word.
            strRow = insert_newlines(' '.join(row.astype(str)).replace('\n', ' ').lower())

            if not opLease:
                # Match the pattern with or without a space after the "$" and
                # extract the full number with commas.
                match = re.search(r'operating lease cost.*?\$\s*([\d,]+)', strRow, re.IGNORECASE)
                if match:
                    opLease = int(match.group(1).replace(',', ''))

            if not any(keyword in strRow for keyword in maturityKeywords):
                continue

            # Drop bare "$" tokens so each year is directly followed by its amount.
            tokens = [token for token in strRow.split() if token != '$']
            for year, amount in zip(tokens, tokens[1:]):
                # A token that starts with four digits is treated as a year.
                if not re.match(r'\d{4}', year):
                    continue
                try:
                    maturities[year] = int(amount.replace(',', ''))
                except ValueError:
                    print(f"Skipping invalid value: '{amount}' for year {year}")
        return opLease, maturities

    except Exception as e:
        print(f"Error while extracting lease information: {e}")
        return None, None

def processSheet(filePath, sheetIds, attrNames, dataMap):
    """Pull the requested attributes from one statement sheet into ``dataMap``.

    The first sheet whose lowercase name contains any of ``sheetIds`` is read.
    Column A supplies row labels and column B supplies values. The period key
    is the first date that ``parse_date`` can get from column B's header or
    from one of its cells.

    Args:
        filePath: Path of the Excel workbook.
        sheetIds: Lowercase substrings identifying the wanted sheet.
        attrNames: Lowercase attribute names passed to ``find_attr``.
        dataMap: Mapping ``period -> {attribute: value}``; updated in place.

    Returns:
        The same ``dataMap`` object. It is unchanged when no sheet matches or
        no period date is found.
    """
    # The context manager closes the workbook handle, and parsing from the
    # open workbook avoids reading the file a second time.
    with pd.ExcelFile(filePath) as xls:
        # List all sheet names to confirm the correct sheet exists
        print(f"Available sheets in {filePath}: {xls.sheet_names}")

        # Find the first sheet that matches any of the specified identifiers
        sheetName = next((sheet for sheet in xls.sheet_names if any(sheetId in sheet.lower() for sheetId in sheetIds)), None)
        if not sheetName:
            print(f"No sheet found containing any of the identifiers: {sheetIds}")
            return dataMap

        df = xls.parse(sheetName)
    print(f"Data from sheet: {sheetName}")

    dataKey = None
    innerMap = {}
    matched = {}  # attribute -> True when its stored value came from a partial match

    # Round-trip through JSON to get plain Python records (missing cells become None)
    data = json.loads(df.to_json(orient='records'))

    for row in data:
        cells = list(row.items())
        if len(cells) < 2:
            # Without a second column there is neither a date nor a value to read.
            continue
        (_, label), (header, cell) = cells[0], cells[1]

        if not dataKey:
            dataKey = parse_date(header, cell)

        if not label:
            continue
        attr, partial = find_attr(label, attrNames)
        if not attr:
            continue

        # Keep the first value seen, except that an exact label match may
        # replace a value stored earlier from a partial match.
        if attr not in innerMap or (matched[attr] and not partial):
            innerMap[attr] = cell
            matched[attr] = partial

    # Merge once after the scan. The final contents equal those of merging
    # after every row, because innerMap only grows and dataKey never changes
    # once it is set.
    if dataKey:
        dataMap.setdefault(dataKey, {}).update(innerMap)
    return dataMap

def processAllFiles(companyName, directory):
    """Build ``period -> {attribute: value}`` from every ``SEC_<companyName>*`` workbook.

    Each workbook contributes balance-sheet, income-statement, cash-flow and
    other-income attributes via ``processSheet``. Lease data from
    ``extract_lease_info`` is attached to the most recently inserted period.

    Args:
        companyName: Company name used in the ``SEC_<companyName>`` file prefix.
        directory: Directory that holds the downloaded workbooks.

    Returns:
        The accumulated mapping; empty when no matching file exists.
    """
    # os.listdir gives no ordering guarantee. Sort the names so that files
    # named with ISO dates ("SEC_<name> YYYY-MM-DD.xlsx") are handled oldest
    # first and the last inserted period is the latest one.
    files = sorted(f for f in os.listdir(directory) if f.startswith(f"SEC_{companyName}"))
    dataMap = {}

    # Define possible sheet identifiers and attributes for each statement
    sheetIds = {
        'balanceSheet': ['balance sheet', 'consolidated balance sheets'],
        'incomeStatement': ['income statements', 'consolidated statements of oper', 'statements of operations'],
        'cash_flow': ['cash flows', 'consolidated statements of cash', 'statements of cash fl']
    }
    attrNames = {
        'balanceSheet': ["cash", "cash and cash equivalents", "goodwill", "total current assets", "marketable securities", 
                         "short-term investments", "total assets", "short-term debt", "long-term debt", "total current liabilities", 
                         "total liabilities", "retained earnings", "total stockholders’ equity"],
        'incomeStatement': ["revenue", "other revenue", "cost of revenue", "gross margin", "gross profit", "selling, general and administrative",
                            "sales and marketing", "general and administrative", "general & administrative expenses", "selling expenses", 
                            "research and development", "interest expense", "operating income", "income before taxes", "income before income taxes", 
                            "net loss before income tax", "(benefit from) provision for income taxes", "provision for income taxes", "net income"],
        'cash_flow': ["depreciation, amortization, and other", "net cash from operations", "net cash used in by operating activities", 
                      "net cash provided by operating activities", "net cash used in financing", "net cash from (used in) financing", 
                      "net cash provided by (used in) financing activities", "net cash provided by financing activities", 
                      "additions to property and equipment", "net cash used in investing"]
    }

    for file in files:
        filePath = os.path.join(directory, file)
        print(f"Processing file: {filePath}")

        for statement in ('balanceSheet', 'incomeStatement', 'cash_flow'):
            dataMap = processSheet(filePath, sheetIds[statement], attrNames[statement], dataMap)
        dataMap = processSheet(filePath, ['components of other income'], ['interest expense'], dataMap)

        # Extract lease information
        opLease, maturities = extract_lease_info(filePath)
        if not (opLease or maturities):
            continue
        if not dataMap:
            # No period has been parsed yet, so there is nothing to attach the
            # lease data to. Skip it instead of raising IndexError.
            print(f"No reporting period found for {filePath}; lease information skipped.")
            continue

        latest = list(dataMap.keys())[-1]
        innerMap = {}
        if opLease:
            innerMap["Operating Lease Cost"] = opLease
        if maturities:
            innerMap["Lease Maturities"] = maturities
        dataMap[latest].update(innerMap)
    return dataMap

# Specify the company name and directory to search
companyName = "Microsoft"
directory = "D:\\pythonProject\\DCF\\Data"

# Process all files matching the pattern
# dataMap maps a formatted period date to a dict of extracted attributes and is
# reused by the cells below.
dataMap = processAllFiles(companyName, directory)

# Output the results
for key, value in dataMap.items():
    print(f"{key}:")
    if isinstance(value, dict):
        # One indented "attribute: value" line per extracted field
        for sub_key, sub_value in value.items():
            print(f"  {sub_key}: {sub_value}")
    else:
        print(f"  {value}")


Processing file: D:\pythonProject\DCF\Data\SEC_Microsoft 2018-08-03.xlsx
Available sheets in D:\pythonProject\DCF\Data\SEC_Microsoft 2018-08-03.xlsx: ['Document and Entity Information', 'INCOME STATEMENTS', 'COMPREHENSIVE INCOME STATEMENTS', 'BALANCE SHEETS', 'BALANCE SHEETS (Parenthetical)', 'CASH FLOWS STATEMENTS', "STOCKHOLDERS' EQUITY STATEMENTS", 'ACCOUNTING POLICIES', 'EARNINGS PER SHARE', 'OTHER INCOME (EXPENSE), NET', 'INVESTMENTS', 'DERIVATIVES', 'FAIR VALUE MEASUREMENTS', 'INVENTORIES', 'PROPERTY AND EQUIPMENT', 'BUSINESS COMBINATIONS', 'GOODWILL', 'INTANGIBLE ASSETS', 'DEBT', 'INCOME TAXES', 'RESTRUCTURING CHARGES', 'UNEARNED REVENUE', 'LEASES', 'CONTINGENCIES', "STOCKHOLDERS' EQUITY", 'ACCUMULATED OTHER COMPREHENSIVE', 'EMPLOYEE STOCK AND SAVINGS PLAN', 'SEGMENT INFORMATION AND GEOGRAP', 'QUARTERLY INFORMATION (UNAUDITE', 'ACCOUNTING POLICIES (Policies)', 'ACCOUNTING POLICIES (Tables)', 'EARNINGS PER SHARE (Tables)', 'OTHER INCOME (EXPENSE), NET (Ta', 'INVESTMENTS (Tables)'

#### Calculate additional variables from extracted information

In [67]:
from sentence_transformers import SentenceTransformer, util

def extractExcel(filePath, sheetName):
    """Load one worksheet and return its rows as a list of plain dicts.

    Args:
        filePath: Path of the Excel workbook.
        sheetName: Name of the sheet to read.

    Returns:
        The sheet as JSON-style records (one dict per row), or ``None`` if
        anything goes wrong; the error is printed.
    """
    try:
        frame = pd.read_excel(filePath, sheet_name=sheetName)
        records = json.loads(frame.to_json(orient='records'))
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return records

def calc_rev(data):
    """Return total revenue: 'Revenue' plus 'Other revenue', each defaulting to 0."""
    primary = data.get('Revenue', 0)
    other = data.get('Other revenue', 0)
    return primary + other
    
def calc_rev_growth(rev1, rev2):
    """Return the change from ``rev2`` to ``rev1`` relative to ``|rev2|`` (0 if ``rev2`` is falsy)."""
    if not rev2:
        return 0
    return (rev1 - rev2) / abs(rev2)

def calc_tax_rate(data):
    """Return 'Provision for income taxes' divided by pre-tax income (0 if that is falsy).

    Pre-tax income is 'Income before income taxes' plus 'Income before taxes'
    minus 'Net loss before income tax'; each key defaults to 0.
    """
    pretaxIncome = data.get('Income before income taxes', 0)
    pretaxIncome = pretaxIncome + data.get('Income before taxes', 0)
    pretaxIncome = pretaxIncome - data.get('Net loss before income tax', 0)
    if not pretaxIncome:
        return 0
    return data.get('Provision for income taxes', 0) / pretaxIncome

def calc_gross_margin(data, rev):
    """Return ``(rev - 'Cost of revenue') / rev``.

    Args:
        data: Attribute dict for one period; 'Cost of revenue' defaults to 0.
        rev: Total revenue for the same period.

    Returns:
        The gross margin as a fraction of revenue, or 0 when ``rev`` is
        zero or missing. This matches the zero fallback of the sibling growth
        and tax-rate helpers and avoids a ZeroDivisionError.
    """
    if not rev:
        return 0
    return (rev - data.get('Cost of revenue', 0)) / rev

def calc_debt(data):
    """Return 'Short-term debt' plus 'Long-term debt', each defaulting to 0."""
    shortTerm = data.get('Short-term debt', 0)
    longTerm = data.get('Long-term debt', 0)
    return shortTerm + longTerm

def calc_int_coverage(data, debt):
    """Return ``debt`` divided by the sign-flipped 'Interest expense'.

    Args:
        data: Attribute dict for one period. 'Interest expense' defaults to 0
            and is multiplied by -1 before use.
        debt: Numerator of the ratio (the caller passes total debt).

    Returns:
        The ratio, or ``float('inf')`` when there is no interest expense.
        That reads as "unbounded coverage" and avoids a ZeroDivisionError.

    NOTE(review): interest coverage is conventionally EBIT / interest expense,
    but this divides *debt* by interest. Confirm that is the intended input
    for the synthetic-rating lookup.
    """
    interest = data.get('Interest expense', 0) * -1
    if not interest:
        return float('inf')
    return debt / interest

def calc_rd(similarDesc, dataMap, dates):
    """Return the R&D adjustment and the capitalised R&D value.

    Walks ``dates`` from last to first, collecting 'Research and development'
    values until more than "Amortization Period" of them are gathered.

    Args:
        similarDesc: Result of the similarity lookup; its ``"entry"`` dict
            supplies "Amortization Period". Falsy means no match.
        dataMap: Mapping ``period -> {attribute: value}``.
        dates: Keys of ``dataMap`` in the order to walk (last one first).

    Returns:
        ``(rdVal, capitalized)``. With several values, ``rdVal`` is the first
        collected value minus the sum of each older value divided by the
        period. ``capitalized`` is the first value plus each older value
        weighted by ``1 - age / period``. With one value both equal it.
        ``(0, 0.0)`` when no entry or no R&D value is available.
    """
    if not similarDesc:
        print("No similar entry found.")
        return 0, 0.0

    amorPeriod = int(similarDesc["entry"].get("Amortization Period"))

    history = []
    for date in reversed(dates):
        periodData = dataMap[date]
        if 'Research and development' in periodData:
            history.append((date, periodData['Research and development']))
        if len(history) > amorPeriod:
            break

    if not history:
        return 0, 0.0

    current = history[0][1]
    yearlyShare = 1.0 / amorPeriod
    older = history[1:]
    if not older:
        return current, float(current)

    amortized = sum(spend / amorPeriod for _, spend in older)
    unamortized = sum(
        float(spend) * (1.0 - (yearlyShare * age))
        for age, (_, spend) in enumerate(older, start=1)
    )
    unamortized += float(current)
    return current - amortized, unamortized

# Pick the table entry whose text is closest to the SIC description (sentence embeddings)
def find_most_similar_description(sicDesc, data):
    """Return the entry of ``data`` with a string field most similar to ``sicDesc``.

    Every string value of every entry is embedded with the module-level
    ``model`` and compared to the embedding of ``sicDesc`` by cosine
    similarity.

    Returns:
        ``{"entry": <entry>, "similarity_score": <float>}`` for the best
        score (the first one on ties), or ``None`` if nothing was compared.
    """
    best = None
    bestScore = -1

    # Embed the query once; only the candidates change inside the loop.
    target = model.encode(sicDesc, convert_to_tensor=True)

    for entry in data:
        texts = [field for field in entry.values() if isinstance(field, str)]
        for text in texts:
            candidate = model.encode(text, convert_to_tensor=True)
            score = util.pytorch_cos_sim(target, candidate).item()

            # Only a strictly higher score replaces the current best.
            if not score > bestScore:
                continue
            bestScore = score
            best = {
                "entry": entry,
                "similarity_score": score
            }

    return best

def find_bond_rating(data, similarDesc, rev, intCoverage):
    """Look up a synthetic bond rating and default spread for a coverage ratio.

    The ratings sheet is searched for the row whose
    'Inputs for synthetic rating estimation' cell equals the firm category.
    The rows from 3 to 17 positions below it are treated as the rating table,
    with columns (lower bound, upper bound, rating, spread) in that order.

    Args:
        data: Ratings sheet as a list of row dicts (``None`` is tolerated).
        similarDesc: Industry name; decides whether the financial-firm table
            is used.
        rev: Revenue; below 5000 selects the smaller/riskier-firm table.
        intCoverage: Coverage ratio to locate in the table.

    Returns:
        ``(rating, spread)`` of the first row with
        ``lower < intCoverage <= upper``. ``("", 0)`` is returned when the
        sheet is missing, the category is not found, or no row matches
        (the original raised TypeError in the first two cases).
    """
    finance = ("Banking", "Banking (Canadian)", "Banking (Foreign)", "Banking (Midwest)", "Financial Services", "Insurance (Diversified)",
               "Insurance (Life)", "Insurance (Property/Casualty)", "Investment Companies (Domestic)", "Investment Companies (Foreign)",
               "Investment Companies (Income)", "Securities Brokerage", "Thrift Institutions")
    category = "For large non-financial service firms"

    if rev < 5000:
        category = "For smaller and riskier firms"
    if similarDesc in finance:
        # This literal must equal the sheet's cell text exactly, spelling included.
        category = "For financial service firms (default spreads are slighty different)"

    rows = data or []
    tableRows = []
    for index, row in enumerate(rows):
        if row.get('Inputs for synthetic rating estimation') == category:
            # No break: as before, the last matching header row wins.
            tableRows = rows[index + 3:index + 18]

    for row in tableRows:
        cells = list(row.values())
        if len(cells) < 4:
            continue
        lower, upper, rating, spread = cells[:4]
        # Skip blank or text rows inside the slice instead of failing the comparison.
        if not isinstance(lower, (int, float)) or not isinstance(upper, (int, float)):
            continue
        if lower < intCoverage <= upper:
            return rating, spread
    return "", 0

def calc_deprec_leased_assets(data, preTaxCostDebt):
    """Convert lease commitments to a debt value and a straight-line depreciation.

    The first five commitments are discounted individually at
    ``preTaxCostDebt``. Any further commitments are summed into one lump,
    spread over ``cnt`` years (lump divided by the average of the first five,
    rounded), valued as an annuity and discounted.

    Args:
        data: Attribute dict for one period; "Lease Maturities" maps labels
            to amounts in schedule order.
        preTaxCostDebt: Discount rate. It must be non-zero when a lump
            beyond the fifth commitment has to be annuitised.

    Returns:
        ``(depreciation, leaseDebt)`` where depreciation is
        ``leaseDebt / (cnt + 5)``. ``(0.0, 0.0)`` is returned when there are
        no lease commitments (the original raised in that case).
    """
    commitments = data.get("Lease Maturities") or {}
    amounts = list(commitments.values())
    if not amounts:
        return 0.0, 0.0

    growth = 1 + preTaxCostDebt
    leaseDebt = 0
    nearTermTotal = 0
    lump = 0

    for index, value in enumerate(amounts):
        if index < 5:
            leaseDebt += value / (growth ** (index + 1))
            nearTermTotal += value
        else:
            lump += value

    # Number of years the lump is spread over. Guard against a zero average
    # so all-zero near-term payments do not divide by zero.
    averagePayment = nearTermTotal / 5
    cnt = round(lump / averagePayment) if averagePayment else 0

    if cnt > 0 and lump > 0:
        lump /= cnt
        # Annuity value of the yearly amount, discounted using the position
        # of the last commitment. Kept as in the original.
        lump = (lump * (1 - growth ** (-cnt)) / preTaxCostDebt) / (growth ** (index + 1))
    leaseDebt += lump

    return leaseDebt / (cnt + 5), leaseDebt


# Look-up tables loaded as lists of row dicts (extractExcel returns None on failure)
data = extractExcel("D:\\pythonProject\\DCF\\Data\\Core\\R&DConv.xlsx", "Amortizable Lives Look-up Table")
dataBond = extractExcel("D:\\pythonProject\\DCF\\Data\\Core\\ratings.xlsx", "Start here Ratings sheet")

# Initialize the Sentence Transformer model
# (read as a global by find_most_similar_description)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Date indexes for dataMap
# The last two keys of dataMap are taken as the latest and previous periods,
# so this relies on dataMap's insertion order and needs at least two periods.
dates = list(dataMap.keys())
latest = dataMap[dates[-1]]
prev = dataMap[dates[-2]]

# Revenue and Revenue Growth
latestRev = calc_rev(latest)
prevRev = calc_rev(prev)
revGrowth = calc_rev_growth(latestRev, prevRev)

# Tax Rate
# NOTE(review): find_attr stores keys via str.capitalize(), so these lowercase
# lookups presumably always fall back to 0; `tax` is not read again in this cell.
tax = latest.get("(benefit from) provision for income taxes", 0) + latest.get("provision for income taxes", 0)
taxRate = calc_tax_rate(latest)

# Research and Development
# Find the most similar description in the data using Sentence Transformers
similarDesc = find_most_similar_description(str(sicDesc), data)
rdVal, capRd = calc_rd(similarDesc, dataMap, dates)

# Debt and Cost of Debt
debt = calc_debt(latest)
intCoverage = calc_int_coverage(latest, debt)
govBondRate = 0.0384  # hard-coded base rate added to the default spread below
bondRating, defaultSpread = find_bond_rating(dataBond, similarDesc["entry"]["Industry Name"], latestRev, intCoverage)
preTaxCostDebt = govBondRate + defaultSpread

# Find depreciation on leased assets
# Both periods are discounted at the same (latest) pre-tax cost of debt.
deprecLeased, capLeased = calc_deprec_leased_assets(latest, preTaxCostDebt)
prevDeprecLeased, prevCapLeased = calc_deprec_leased_assets(prev, preTaxCostDebt)

# Adjusted EBIT: operating income plus the R&D adjustment, plus the operating
# lease cost net of the depreciation computed above.
ebit = latest.get("Operating income", 0)
adjEbit = ebit + rdVal
adjEbit += latest.get("Operating Lease Cost", 0) - deprecLeased

# After-tax operating income, with rdVal * taxRate added back
nopat = adjEbit * (1 - taxRate)
adjNopat = nopat + (rdVal * taxRate)

netInc = latest.get("Net income", 0)
adjNetInc = netInc + rdVal + (rdVal * taxRate)

# Margins
grossMargin = calc_gross_margin(latest, latestRev)
ebitMargin = adjEbit / latestRev
netMargin = adjNetInc / latestRev
fcffMargin = 0  # placeholder; not computed in this cell

# Equity figures; .get() without a default yields None when the key is missing
eq = latest.get("Total stockholders’ equity")
prevEq = prev.get("Total stockholders’ equity")
adjPrevEq = prevEq - prev.get("Goodwill", 0) + capRd
prevDebt = calc_debt(prev)

prevNonOpAssets = prev.get("Cash and cash equivalents", 0) + prev.get("Short-term investments", 0)
nonOpAssets = latest.get("Cash and cash equivalents", 0) + latest.get("Short-term investments", 0)

# Invested capital is built from the previous period's balances.
# NOTE(review): "Minority interests" is not among the attribute names passed to
# processSheet in this file, so this term presumably always contributes 0.
investedCapital = prevDebt + prevEq + prev.get("Minority interests", 0) - prev.get("Goodwill", 0) - prevNonOpAssets
adjInvCap = investedCapital + prevCapLeased + capRd

roe = adjNetInc / adjPrevEq
roic = adjNopat / adjInvCap 
salesCap = latestRev / adjInvCap

# Working Capital and Change in Working Capital
wc1 = latest.get("Total current assets", 0) - latest.get("Total current liabilities", 0) 
wc2 = prev.get("Total current assets", 0) - prev.get("Total current liabilities", 0)
changeWC = wc1 - wc2

# Report: the matched look-up entry first, then every derived figure
print("Most Similar Entry Found:")
print(json.dumps(similarDesc["entry"], indent=2))
print(f"Similarity Score: {similarDesc['similarity_score']:.4f}\n")

print(f"Revenue Growth: {revGrowth}")
print(f"Effective Tax Rate: {taxRate}")
print(f"Total Debt: {debt}")
print(f"Bond Rating: {bondRating}")
print(f"Pre-Tax Cost of Debt: {preTaxCostDebt}")
print(f"Working Capital: {wc1}")
print(f"Change in WC: {changeWC}\n")
print(f"Research and Development Value: {rdVal}")
print(f"Capitalized Research and Development: {capRd}")
print(f"Depreciation on Leased Asset: {deprecLeased}")   
# The next line prints the previous period's lease debt value (prevCapLeased)
print(f"Capitalized Depreciation on Leased Asset: {prevCapLeased}\n")   
print(f"EBIT: {ebit}")    
print(f"Net Income: {netInc}")      
print(f"Adjusted EBIT: {adjEbit}")   
print(f"Adjusted After-tax Operating Income: {adjNopat}")   
print(f"Adjusted Net Income: {adjNetInc}\n")   
print(f"Gross Margin: {grossMargin}")
print(f"EBIT Margin: {ebitMargin}")
print(f"Net Margin: {netMargin}\n")
print(f"Previous Year Total Equity: {prevEq}")
print(f"Previous Year Total Debt: {prevDebt}")
print(f"Invested Capital: {investedCapital}")
print(f"Adjusted Invested Capital: {adjInvCap}")
print(f"Adjusted ROE: {roe}")
print(f"Adjusted ROIC: {roic}")
print(f"Sales to Capital: {salesCap}")

  warn("""Cannot parse header or footer so it will be ignored""")


Most Similar Entry Found:
{
  "Industry Name": "Computer Software & Services",
  "Amortization Period": 3
}
Similarity Score: 0.5620

Revenue Growth: 0.15669962013071279
Effective Tax Rate: 0.18231326597827197
Total Debt: 49381
Bond Rating: Aaa/AAA
Pre-Tax Cost of Debt: 0.0443
Working Capital: 34448
Change in WC: -45660

Research and Development Value: 5369.0
Capitalized Research and Development: 55810.66666666667
Depreciation on Leased Asset: 2678.4174903689195
Capitalized Depreciation on Leased Asset: 9597.480260318855

EBIT: 109433
Net Income: 88136
Adjusted EBIT: 115678.58250963109
Adjusted After-tax Operating Income: 95567.68225360058
Adjusted Net Income: 94483.83992503735

Gross Margin: 0.6976444382797138
EBIT Margin: 0.47192248149750365
Net Margin: 0.38545638467798626

Previous Year Total Equity: 206223
Previous Year Total Debt: 41990
Invested Capital: 69065
Adjusted Invested Capital: 134473.14692698553
Adjusted ROE: 0.48665967274928534
Adjusted ROIC: 0.710682277001301
Sales to 

#### Plot results

In [1]:
### --- Import Packages --- ###
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import datetime as dt
import scipy.stats
import sys
from IPython.display import display


# Placeholders for the DCF quantities; every name starts at zero.
# Chained assignment is required here: tuple unpacking such as `a, b = 0`
# raises "TypeError: cannot unpack non-iterable int object".
ebitda = revenue = expense = adminExp = amortization = otherRev = otherExp = 0
ebit = 0
nopat = incomeTax = 0
capEx = chok = liquidExp = 0
discountRate = 0
dcf = terminalVal = 0
equityVal = enterpriseVal = 0


from sentence_transformers import SentenceTransformer, util

# Period keys of dataMap (built by the extraction cell); n is their count
dates = list(dataMap.keys())
n = len(dates)

# Result holders, reset to neutral values before the calculations below
rev = {}
taxRate = {}
revGrowth = 0

defaultRating = ""
intCoverage = 0
preTaxCostDebt = 0
debt = 0

grossMargin = 0
fcffMargin = 0
netMargin = 0

roe = 0
roic = 0
salesCap = 0

wc = {}
changeWC = 0

# R&D values collected newest-first further down
rdValList = []
rdVal = 0

# Total revenue ("Revenue" + "Other revenue") for the last two periods in dataMap
rev[dates[n - 1]] = (dataMap[dates[n - 1]].get('Revenue', 0) + dataMap[dates[n - 1]].get('Other revenue', 0))
rev[dates[n - 2]] = (dataMap[dates[n - 2]].get('Revenue', 0) + dataMap[dates[n - 2]].get('Other revenue', 0))

# Convert the merged dictionary to a list of tuples
# (index 0 is the last period, index 1 the one before it)
rev = list(rev.items())
if len(rev) >= 2:
    revGrowth = (rev[0][1] - rev[1][1]) / abs(rev[1][1])

# Tax provision over pre-tax income; raises ZeroDivisionError if the denominator is 0
taxRate[dates[n - 1]] = dataMap[dates[n - 1]].get('Provision for income taxes', 0) / (dataMap[dates[n - 1]].get('Income before income taxes', 0) 
                     + dataMap[dates[n - 1]].get('Income before taxes', 0) 
                     - dataMap[dates[n - 1]].get('Net loss before income tax', 0))

grossMargin = (rev[0][1] - dataMap[dates[n - 1]].get('Cost of revenue', 0)) / rev[0][1]

# Working capital as receivables + inventories - payables.
# NOTE(review): the key 'Accounts receivable,' ends with a comma, and none of
# these three keys is among the attribute names extracted in this file, so the
# lookups presumably all fall back to 0. Confirm.
wc[dates[n - 1]] = (dataMap[dates[n - 1]].get('Accounts receivable,', 0) + dataMap[dates[n - 1]].get('Inventories', 0) 
                    - dataMap[dates[n - 1]].get('Accounts payable', 0))
wc[dates[n - 2]] = (dataMap[dates[n - 2]].get('Accounts receivable,', 0) + dataMap[dates[n - 2]].get('Inventories', 0) 
                    - dataMap[dates[n - 2]].get('Accounts payable', 0))
changeWC = wc[dates[n - 1]] - wc[dates[n - 2]]

# Total debt, and the sign-flipped interest expense as a fraction of it
# (raises ZeroDivisionError when debt is 0)
debt = (dataMap[dates[n - 1]].get('Short-term debt', 0) + dataMap[dates[n - 1]].get('Long-term debt', 0))
preTaxCostDebt = (dataMap[dates[n - 1]].get('Interest expense', 0) * -1) / debt

# Load the Excel file and convert it to JSON
filePathRD = "D:\\pythonProject\\DCF\\Data\\Core\\R&DConv.xlsx"
df = pd.read_excel(filePathRD, sheet_name="Amortizable Lives Look-up Table")

# Convert the DataFrame to JSON
# (round-trip yields a list with one plain dict per row)
dfJson = df.to_json(orient='records')
data = json.loads(dfJson)

# Initialize the Sentence Transformer model
# (read as a global by find_most_similar_description)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pick the table entry whose text is closest to the SIC description (sentence embeddings)
def find_most_similar_description(sicDesc, data):
    """Return the entry of ``data`` with a string field most similar to ``sicDesc``.

    Every string value of every entry is embedded with the module-level
    ``model`` and compared to the embedding of ``sicDesc`` by cosine
    similarity.

    Returns:
        ``{"entry": <entry>, "similarity_score": <float>}`` for the best
        score (the first one on ties), or ``None`` if nothing was compared.
    """
    best = None
    bestScore = -1

    # Embed the query once; only the candidates change inside the loop.
    target = model.encode(sicDesc, convert_to_tensor=True)

    for entry in data:
        texts = [field for field in entry.values() if isinstance(field, str)]
        for text in texts:
            candidate = model.encode(text, convert_to_tensor=True)
            score = util.pytorch_cos_sim(target, candidate).item()

            # Only a strictly higher score replaces the current best.
            if not score > bestScore:
                continue
            bestScore = score
            best = {
                "entry": entry,
                "similarity_score": score
            }

    return best

# Find the most similar description in the data using Sentence Transformers
most_similar_description = find_most_similar_description(str(sicDesc), data)

# Derive the R&D adjustment from the matched entry, then print the match
if most_similar_description:
    amorPeriod = int(most_similar_description["entry"].get("Amortization Period"))
    # Collect R&D values newest-first until more than amorPeriod are gathered
    for date in reversed(dates):
        if 'Research and development' in dataMap[date]:
            rdValList.append((date, dataMap[date]['Research and development']))
        if len(rdValList) > amorPeriod:
            break
    # -1 marks "first value not seen yet": the first collected value is kept
    # aside and every later one adds value / amorPeriod to rdVal.
    first = -1
    for date, value in rdValList:
        if first == -1:
            first = value
        else:
            rdVal += (value / amorPeriod)
    # NOTE: if rdValList is empty, first stays -1 and rdVal becomes -1 here.
    rdVal = first - rdVal
    
    print("Most Similar Entry Found:")
    print(json.dumps(most_similar_description["entry"], indent=2))
    print(f"Similarity Score: {most_similar_description['similarity_score']:.4f}\n")
else:
    print("No similar entry found.")

# Summary of the figures computed in this cell
print(f"Revenue Growth: {revGrowth}")
print(f"Gross Margin: {grossMargin}")
print(f"Effective Tax Rate: {taxRate[dates[n - 1]]}")
print(f"Pre-Tax Cost of Debt: {preTaxCostDebt}")
print(f"Working Capital: {wc[dates[n - 1]]}")
print(f"Change in WC: {changeWC}")
print(f"Research and Development Value: {rdVal}")


KeyboardInterrupt: 