In [7]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests
import shutil
import yfinance
from bs4 import BeautifulSoup

In [34]:

class SECSubmissions:
    def __init__(self,ticker):
        self.ticker = ticker 
        self.CIK = self.getCIK()
        self.filings = self.getSECFilings(self.CIK)
        self.filings = self.findDate(self.filings)
        self.fiscalYear = self.findFiscalYear(self.filings)
        self.filings = self.mergeFiscalYearWithFilings(self.filings, self.fiscalYear)
        self.filings = self.createSubmissionName(self.filings)
       
        
    def getCIK(self): #get SEC code for company based on ticker
        headers={"User-Agent": "Mozilla/5.0"}
        symbol_to_cik = requests.get("https://www.sec.gov/files/company_tickers.json").json() #returns a json dictionary with a indexed list of all different companies 
        ciks = {info["ticker"]:info["cik_str"] for key,info in symbol_to_cik.items()} #create dictionary indexable by ticker
        return ciks[self.ticker]
    
        
    #input: CIK
    #request the data on every submission to SEC website for the company
    def getSECFilings(self,CIK):
        headers={"User-Agent": "Mozilla/5.0"}
        edgar_filings = requests.get(f"https://data.sec.gov/submissions/CIK{CIK:0>10}.json", headers=headers).json()
        filings = pd.DataFrame(edgar_filings["filings"]["recent"])
        filings = filings.loc[filings["reportDate"]>"2014-01-01"]#filings pre-2014 are unable to be downloaded due to different excel format
        filings = filings.loc[filings["form"].isin(["10-Q","10-K"])].reset_index(drop=True) #drop all filings that are not 10Q or 10K
        return filings 
    
    #input filings
    #output filings with mo and year
    def findDate(self,filings):
        filings["Month"] = pd.to_datetime(filings["reportDate"]).dt.strftime("%m").astype("int") #get Month of all filings 
        filings["Year"] = pd.to_datetime(filings["reportDate"]).dt.strftime("%Y").astype("int") #get Year of all filings
        filings["Date"] = pd.to_datetime(filings["reportDate"]).dt.strftime("%m/%d/%Y")
        return filings
    
    #input a list of 10k filings with mo and year
    #find one fiscal year of data via 10k, then find all possible months filed and organize to fiscal period
    def findFiscalYear(self,filings):
        #get a fiscal Year of data by finding the last 10K and the three 10Qs before it
        TenKIndex = filings.loc[filings["form"]=="10-K"].index[0] 
        fiscalYearKey = filings.loc[TenKIndex:TenKIndex+3,["Month","form","Date"]].copy().reset_index(drop=True)
        fiscalYearKey["Period"] = ["Year Ended", "Q3","Q2","Q1"]
        
        #fiscal year data for certain period can vary between months ie. both Jan & Feb, therefore I found all possible months of filings and found appropriate period for each month
        fiscalYear = pd.DataFrame({"Month":filings.Month.unique()})
        fiscalYear.index = list(fiscalYear.Month.apply(lambda x: np.argmin(np.abs(fiscalYearKey.Month-x)))) #using fiscal year key, I found the index of the period in fiscalYearKey with closest month to each month 
        fiscalYear = pd.merge(fiscalYear,fiscalYearKey, left_index=True, right_index=True, suffixes=("","_x")) #I merged together together all of the possible months with their corresponding period based on the key
        fiscalYear = fiscalYear.drop(columns="Month_x")
        
        return fiscalYear
    
    #input filings of 10k/q
    #merge by month to find which fiscal period, if filing of 10q is later in year than 10k, then adjust year so it is a part of right fiscal year
    def mergeFiscalYearWithFilings(self, filings, fiscalYear):
        #merge fiscal year labels with filings by the month that they were reported
        filings = pd.merge(filings,fiscalYear,on=["Month","form"],how="left")    
        #set Fiscal Year
        filings["Fiscal Year"] = filings["Year"]
        #get Quarters that are a part of different fiscal Year than report date
        TenKMo = fiscalYear.loc[fiscalYear["form"]=="10-K"]["Month"].iloc[0]
        filings.loc[(filings["Month"]>TenKMo)&(filings["form"]!="10-K"),"Fiscal Year"] += 1 #the 10K is reported in a month before the 10Q so therefore the 10Q is the next fiscal year 
        
        return filings
    
    

    #input filings
    #take the fiscal period and year and create a string, if different fiscal Year than filing date use parenthesis
    def createSubmissionName(self,filings):
        filings["Name"] = filings.apply(self.submissionName,axis=1)
        return filings
        
    def submissionName(self,row):
        period = row["Period"]
        fiscalYear = row["Fiscal Year"]
        Year = row["Year"]
        if fiscalYear != Year:
                name = f"{period} {fiscalYear} ({Year})"
        else: 
                name = f"{period} {fiscalYear}"
        return name
    
    
    
    def printUrl(self):
        for i,row in self.filings.iterrows():
            name = row["Name"]
            accessionNum = row["accessionNumber"].replace("-","")
            doc = row["primaryDocument"]
            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/{doc}"
            print(f"{name} {url}")
           
        

In [35]:
sub = SECSubmissions("AAPL")
sub.printUrl()

Q3 2022 https://www.sec.gov/Archives/edgar/data/320193/000032019322000070/aapl-20220625.htm
Q2 2022 https://www.sec.gov/Archives/edgar/data/320193/000032019322000059/aapl-20220326.htm
Q1 2022 (2021) https://www.sec.gov/Archives/edgar/data/320193/000032019322000007/aapl-20211225.htm
Year Ended 2021 https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm
Q3 2021 https://www.sec.gov/Archives/edgar/data/320193/000032019321000065/aapl-20210626.htm
Q2 2021 https://www.sec.gov/Archives/edgar/data/320193/000032019321000056/aapl-20210327.htm
Q1 2021 (2020) https://www.sec.gov/Archives/edgar/data/320193/000032019321000010/aapl-20201226.htm
Year Ended 2020 https://www.sec.gov/Archives/edgar/data/320193/000032019320000096/aapl-20200926.htm
Q3 2020 https://www.sec.gov/Archives/edgar/data/320193/000032019320000062/aapl-20200627.htm
Q2 2020 https://www.sec.gov/Archives/edgar/data/320193/000032019320000052/a10-qq220203282020.htm
Q1 2020 (2019) https://www.sec.gov/Archives/e

In [4]:
class HelperFunctions():
    #input: a list of quarters such as "Q2 2022"
    #iterates through a finds a date/year in the file
    def getYears(self,quarters): 
        return [int(re.findall("\d{4}",quarter)[0]) for quarter in quarters]
    
    def getUniqueYears(self,quarters):
        return set(self.getYears(quarters))
    
    #input: compiledStatement
    #returns all quarters but removes first col of label
    def getQuarters(self, compiledStatement): 
        return list(compiledStatement.columns[1:])
    
    def getPossibleFilingsFromYears(self, years): 
        possible = [q + " " + str(year) for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"]]
        possible += [q + " " + str(year) for year in years for q in ["6mo","9mo"]]
        return possible
    
    #input: compiledStatement
    #find all years in statement and get all possible filings in order then only keep the ones in actually in the statement
    def reorderQuarters(self,compiledStatement): 
        quarters = self.getQuarters(compiledStatement)
        years = self.getUniqueYears(quarters)
        cols = self.getPossibleFilingsFromYears(years)
        cols = [col for col in cols if col in quarters]
        compiledStatement = compiledStatement[["label"] + cols]
        return compiledStatement
    
    
    
    

In [337]:

class FindStockPrice:
    def __init__(self,ticker, fromDate=2017,endDate=2022):
        self.ticker = ticker
        self.fromDate = fromDate
        self.endDate = endDate
        self.path =  os.path.join("../input",f"Financial Statement {self.ticker}")
        self.file = os.path.join(self.path,"Stock Price.xlsx")
        
        self.fiscalYear = pd.read_csv(os.path.join(self.path,"fiscalYear.csv"))
        self.day = pd.to_datetime(self.fiscalYear["Date"]).dt.strftime("%d")[0]
        self.fiscalYear = self.fiscalYear.groupby("Period")["Month"].first()
        
        
        self.data()
        
        self.FinalStockPrices.to_excel(self.file,index=False)
    def data(self):
        FinalStockPrices = pd.DataFrame({"label":["Stock Price"]})
        TenKMo = self.fiscalYear.loc["Year Ended"]
        
        date = f"{self.fromDate-1}-{TenKMo:02d}-{self.day}"
        price = yfinance.download(self.ticker,start=date,progress=False)["Close"].iloc[0]
        FinalStockPrices[f"Year Ended {self.fromDate-1}"] = [price]
        
        for fiscalYear in range(self.fromDate, self.endDate+1):
            for period,mo in self.fiscalYear.iteritems():
                year = fiscalYear
                if mo > TenKMo:
                    year = year-1
                date = f"{fiscalYear}-{mo:02d}-{self.day}"
                if datetime.strptime(date, "%Y-%m-%d") > datetime.now():
                    continue
                price = yfinance.download(self.ticker,start=date,progress=False)["Close"].iloc[0]
                if period=="Year Ended":
                    FinalStockPrices[f"Q4 {year}"] = [price]
                FinalStockPrices[f"{period} {year}"] = [price]
               
        self.FinalStockPrices = FinalStockPrices
        


In [338]:
FindStockPrice("AAPL")

<__main__.FindStockPrice at 0x1235f4c10>

In [304]:
yfinance.download("AAPL",start="2016-09-25",progress=False)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-09-26,27.910000,28.347500,27.887501,28.219999,26.354940,119477600
2016-09-27,28.250000,28.295000,28.084999,28.272499,26.403969,98429600
2016-09-28,28.422501,28.660000,28.357500,28.487499,26.604757,118564400
2016-09-29,28.290001,28.450001,27.950001,28.045000,26.191505,143548000
2016-09-30,28.115000,28.342501,27.950001,28.262501,26.394630,145516400
...,...,...,...,...,...,...
2022-07-29,161.240005,163.630005,159.500000,162.509995,162.509995,101689200
2022-08-01,161.009995,163.589996,160.889999,161.509995,161.509995,67829400
2022-08-02,160.100006,162.410004,159.630005,160.009995,160.009995,59907000
2022-08-03,160.839996,166.589996,160.750000,166.130005,166.130005,82507500


In [192]:
class WriteSales(SECSubmissions, HelperFunctions):
    def __init__(self,ticker):
        super().__init__(ticker)
        self.path =  os.path.join("../input",f"Financial Statement {self.ticker}")
        self.file = os.path.join(self.path,"Sales By Segment.xlsx")
        self.Excel = self.readExcel()
        self.type = "Geography"
        self.Excel[self.type] = self.writeSalesData(self.filings)
        self.writeExcel()
        
    def writeExcel(self):
        Excel = pd.ExcelWriter(self.file)
        for name, table in self.Excel.items():  
            table.to_excel(Excel, index=False, sheet_name=name)
        Excel.save()
        
    def readExcel(self):
        if not os.path.isfile(self.file): return {}
        Excel = pd.read_excel(self.file,sheet_name=None)
        return Excel
        
    def filterTables(self,edgar_str,strings):
        finaltable = False
        soup = BeautifulSoup(edgar_str, 'html.parser')
        for table in soup.find_all('table'):
            table = [list(tr.stripped_strings) for tr in table.find_all('tr') if tr.text]
            table_string = ""
            for row in table: 
                for col in row: table_string+=col
            if all([s.lower() in table_string.lower() for s in strings]):
                finaltable = table
        if not finaltable: return False
        finaltable = [[x.strip("(") for x in row if x!="%" and x!="$" and "("!=x and ")" not in x] for row in finaltable]
        finaltable = [row for row in finaltable if row!=[]]
        
        nums = [[x for x in row if len(re.findall("[0-9]",x))>0 and len(re.findall("[a-zA-Z]",x))==0] for row in finaltable]
        nums = [row[0] if len(row)>0 else np.nan for row in nums]
        labels = [row[0] for row in finaltable]
        finaltable = list(zip(labels,nums))
        print(finaltable)

        
        return finaltable
    
    def tableToFrame(self,table,name):
        df = pd.DataFrame(table).iloc[:,[0,1]]
        df.columns = ["label",name]
        df[name] = pd.to_numeric(df[name].replace(r",","",regex=True))
        return df
    
    def parseForUnitsTable(self,edgar_str, name):
        table = self.filterTables(edgar_str,["Unit Sales by Product:"])
        if not table: return pd.DataFrame()
        table = [row for row in table if row!=[]]
        indexstart = [i for i, row in enumerate(table) if "Unit Sales by Product:" in row[0]][0]
        table = table[indexstart:]
        table = self.tableToFrame(table,name)
        return table 
    
    def parseForGeographyTable(self,edgar_str, name):
        table = self.filterTables(edgar_str,["Americas","Total net sales"])
        
        indexstart = [i for i, row in enumerate(table) if "Americas" in row[0]][0]
        table = table[indexstart:]
        indexend = [i for i, row in enumerate(table) if "Total net sales" in row[0]][0]
        table = table[:indexend+1]
        
        table = self.tableToFrame(table,name)
        return table 
    
    def parseForProductsTable(self,edgar_str, name):
        table = self.filterTables(edgar_str,["iPhone","Total net sales"])
        
        indexstart = [i for i, row in enumerate(table) if "iPhone" in row[0]][0]
        table = table[indexstart:]
        indexend = [i for i, row in enumerate(table) if "Total net sales" in row[0]][0]
        table = table[:indexend+1]

        table = self.tableToFrame(table,name)
        return table 
    
    def parseForCapitalTable(self,edgar_str, name):
        table = self.filterTables(edgar_str,["Gross property, plant and equipment"])
        
        if not table: return pd.DataFrame()
        indexstart = 1
        table = table[indexstart:]
        table = self.tableToFrame(table,name)
        return table 
    
    def parseForLeaseTable(self,edgar_str, name):
        table = self.filterTables(edgar_str,["Operating leases","Total lease liabilities"])

        if not table: return pd.DataFrame()
        indexstart = 1
        table = table[indexstart:]
        
        table = self.tableToFrame(table,name)
        table["label"] = [l + str(i) for i, l in enumerate(list(table.label))]
        return table 

    
    #input: filings
    #iterate through and use the accession number to grab filings from SEC, compile into excel spreadsheet
    def writeSalesData(self,filings):
        finalSales = pd.DataFrame(columns=["label"])
        
        for i,row in filings.iterrows():
            name = row["Name"]
            if row["Fiscal Year"]<=2016 and name != "Year Ended 2016":
                break
            name = re.sub("\s\([\w\W]+?\)","",name)
            accessionNum = row["accessionNumber"].replace("-","")
            doc = row["primaryDocument"]
            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/{doc}"
            print(url)
            req = requests.get(url,headers={"User-Agent": "Mozilla/5.0"})
            edgar_str = req.text
            
            if self.type == "Geography":
                sales = self.parseForGeographyTable(edgar_str, name)
            
            if self.type == "Product":
                sales = self.parseForProductsTable(edgar_str, name)
            
            if self.type == "Units":
                sales = self.parseForUnitsTable(edgar_str, name)
                
            if self.type == "Capital":
                sales = self.parseForCapitalTable(edgar_str, name)
                
            if self.type == "Lease":
                sales = self.parseForLeaseTable(edgar_str, name)

            merge = ["label"]
            if not sales.empty:
                finalSales = pd.merge(finalSales, sales, on=merge, how="outer")
            
        
        if self.type in ["Capital"]:
            finalSales =self.performBalanceMath(finalSales)
        else:
            finalSales = self.performIncomeMath(finalSales)
        finalSales = self.reorderQuarters(finalSales)

        return finalSales
    
    def performBalanceMath(self, compiledStatement):
        quarters = self.getQuarters(compiledStatement)
        years = self.getUniqueYears(quarters)
        
        for year in years:
            year = str(year)
            if "Year Ended "+year in quarters and "Q3 "+year in quarters: 
                compiledStatement["Q4 "+year] = compiledStatement["Year Ended "+year]
        return compiledStatement
    
    def performIncomeMath(self,compiledStatement): 
        quarters = self.getQuarters(compiledStatement)
        years = self.getUniqueYears(quarters)
        
         #replace the nan values with 0 so that the subtraction between an nan value does not result in nan
        c = compiledStatement.copy().replace(r'\s+', np.nan, regex=True).fillna(0).drop(columns="label")
        
        for year in years:
            year = str(year)
            if "Year Ended "+year in quarters and "Q3 "+year in quarters: 
                compiledStatement["Q4 "+year] = c["Year Ended "+year] - c["Q3 "+year] - c["Q2 "+year] - c["Q1 "+year]
        
        compiledStatement = compiledStatement.replace(0,np.nan)
        return compiledStatement
            
        

In [193]:
obj = WriteSales("AAPL")

https://www.sec.gov/Archives/edgar/data/320193/000032019322000070/aapl-20220625.htm
[('Three Months Ended', nan), ('June 25,', '2022'), ('Net sales by reportable segment:', nan), ('Americas', '37,472'), ('Europe', '19,287'), ('Greater China', '14,604'), ('Japan', '5,446'), ('Rest of Asia Pacific', '6,150'), ('Total net sales', '82,959')]
https://www.sec.gov/Archives/edgar/data/320193/000032019322000059/aapl-20220326.htm
[('Three Months Ended', nan), ('March 26,', '2022'), ('Net sales by reportable segment:', nan), ('Americas', '40,882'), ('Europe', '23,287'), ('Greater China', '18,343'), ('Japan', '7,724'), ('Rest of Asia Pacific', '7,042'), ('Total net sales', '97,278')]
https://www.sec.gov/Archives/edgar/data/320193/000032019322000007/aapl-20211225.htm
[('Three Months Ended', nan), ('December 25,', '2021'), ('Net sales by reportable segment:', nan), ('Americas', '51,496'), ('Europe', '29,749'), ('Greater China', '25,783'), ('Japan', '7,107'), ('Rest of Asia Pacific', '9,810'), ('Tota

[('Three Months Ended', nan), ('April\xa01,', '2017'), ('Net Sales by Operating Segment:', nan), ('Americas', '21,157'), ('Europe', '12,733'), ('Greater China', '10,726'), ('Japan', '4,485'), ('Rest of Asia Pacific', '3,795'), ('Total net sales', '52,896'), ('Net Sales by Product:', nan), ('iPhone', '33,249'), ('iPad', '3,889'), ('Mac', '5,844'), ('Services', '7,041'), ('Other Products', '2,873'), ('Total net sales', '52,896'), ('Unit Sales by Product:', nan), ('iPhone', '50,763'), ('iPad', '8,922'), ('Mac', '4,199')]
https://www.sec.gov/Archives/edgar/data/320193/000162828017000717/a10-qq1201712312016.htm
[('Three Months Ended', nan), ('December\xa031,', '2016'), ('Net Sales by Operating Segment:', nan), ('Americas', '31,968'), ('Europe', '18,521'), ('Greater China', '16,233'), ('Japan', '5,766'), ('Rest of Asia Pacific', '5,863'), ('Total net sales', '78,351'), ('Net Sales by Product:', nan), ('iPhone', '54,378'), ('iPad', '5,533'), ('Mac', '7,244'), ('Services', '7,172'), ('Other Pr

['1', '0']

In [None]:
Operating leases