In [7]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests
import shutil
import yfinance
from bs4 import BeautifulSoup

In [2]:

class SECSubmissions:
    """Collect a company's recent 10-K / 10-Q filings from SEC EDGAR and label each one
    with its fiscal period ("Q1", "Q2", "Q3", "Year Ended") and fiscal year.

    After construction:
        self.CIK        -- the company's SEC identifier, looked up from the ticker
        self.filings    -- one row per 10-K/10-Q with Month, Year, Date, Period,
                           "Fiscal Year" and Name columns added
        self.fiscalYear -- lookup table mapping a report month to its fiscal period
    """

    # Sent with every request. NOTE(review): SEC's fair-access guidance asks for a
    # descriptive User-Agent; confirm this generic value is acceptable.
    HEADERS = {"User-Agent": "Mozilla/5.0"}

    def __init__(self,ticker):
        self.ticker = ticker
        self.CIK = self.getCIK()
        self.filings = self.getSECFilings(self.CIK)
        self.filings = self.findDate(self.filings)
        self.fiscalYear = self.findFiscalYear(self.filings)
        self.filings = self.mergeFiscalYearWithFilings(self.filings, self.fiscalYear)
        self.filings = self.createSubmissionName(self.filings)

    def getCIK(self):
        """Return the SEC CIK for self.ticker.

        Raises KeyError when the ticker is not present in SEC's ticker list and
        requests.HTTPError when the download fails.
        """
        # The headers were previously built but never sent with this request.
        response = requests.get("https://www.sec.gov/files/company_tickers.json", headers=self.HEADERS)
        response.raise_for_status()
        symbol_to_cik = response.json() #json dictionary holding one entry per company
        ciks = {info["ticker"]:info["cik_str"] for info in symbol_to_cik.values()} #make it indexable by ticker
        return ciks[self.ticker]

    def getSECFilings(self,CIK):
        """Download the "recent" submissions for CIK and keep the 10-Q / 10-K rows
        whose reportDate is after 2014-01-01."""
        response = requests.get(f"https://data.sec.gov/submissions/CIK{CIK:0>10}.json", headers=self.HEADERS)
        response.raise_for_status()
        filings = pd.DataFrame(response.json()["filings"]["recent"])
        filings = filings.loc[filings["reportDate"]>"2014-01-01"] #filings pre-2014 are unable to be downloaded due to different excel format
        filings = filings.loc[filings["form"].isin(["10-Q","10-K"])].reset_index(drop=True) #drop all filings that are not 10Q or 10K
        return filings

    def findDate(self,filings):
        """Add integer Month and Year columns and an mm/dd/YYYY Date column derived
        from reportDate. The frame is modified in place and also returned."""
        reportDates = pd.to_datetime(filings["reportDate"]) #parse once, reuse three times
        filings["Month"] = reportDates.dt.month.astype("int")
        filings["Year"] = reportDates.dt.year.astype("int")
        filings["Date"] = reportDates.dt.strftime("%m/%d/%Y")
        return filings

    def findFiscalYear(self,filings):
        """Build the month -> fiscal period lookup table.

        The first 10-K in `filings` and the three rows that follow it are taken as one
        fiscal year ("Year Ended", "Q3", "Q2", "Q1" in that order). This assumes the
        frame is ordered newest first and that those three rows are the 10-Qs of that
        year -- TODO confirm for companies with irregular filings.

        Every distinct report month is then assigned the period whose key month is
        closest, so a period that drifts between two adjacent months still resolves.

        Raises IndexError when there is no 10-K and ValueError when fewer than three
        filings follow it.
        """
        tenKRows = filings.index[filings["form"]=="10-K"]
        if len(tenKRows) == 0:
            raise IndexError("no 10-K found in filings; cannot derive the fiscal year")
        TenKIndex = tenKRows[0]
        fiscalYearKey = filings.loc[TenKIndex:TenKIndex+3,["Month","form","Date"]].copy().reset_index(drop=True)
        if len(fiscalYearKey) != 4:
            raise ValueError("need the 10-K and the three filings after it to derive the fiscal year")
        fiscalYearKey["Period"] = ["Year Ended", "Q3","Q2","Q1"]

        keyMonths = fiscalYearKey["Month"].to_numpy()

        def nearestPeriod(month):
            # Months wrap around: January is 1 month from December, not 11. A plain
            # absolute difference would send a January filing to the March quarter.
            gap = np.abs(keyMonths - month)
            return int(np.argmin(np.minimum(gap, 12 - gap)))

        fiscalYear = pd.DataFrame({"Month":filings.Month.unique()})
        fiscalYear.index = [nearestPeriod(month) for month in fiscalYear["Month"]] #row position of the matching period in fiscalYearKey
        fiscalYear = pd.merge(fiscalYear,fiscalYearKey, left_index=True, right_index=True, suffixes=("","_x")) #attach form/Date/Period of the matched key row
        fiscalYear = fiscalYear.drop(columns="Month_x")

        return fiscalYear

    def mergeFiscalYearWithFilings(self, filings, fiscalYear):
        """Attach the fiscal Period to every filing (matched on Month and form) and add
        a "Fiscal Year" column.

        A non-10-K filing reported in a later calendar month than the 10-K month
        belongs to the following fiscal year, so its fiscal year is Year + 1.
        """
        filings = pd.merge(filings,fiscalYear,on=["Month","form"],how="left")
        filings["Fiscal Year"] = filings["Year"]
        TenKMo = fiscalYear.loc[fiscalYear["form"]=="10-K"]["Month"].iloc[0]
        filings.loc[(filings["Month"]>TenKMo)&(filings["form"]!="10-K"),"Fiscal Year"] += 1

        return filings

    def createSubmissionName(self,filings):
        """Add a Name column built by submissionName for every row."""
        filings["Name"] = filings.apply(self.submissionName,axis=1)
        return filings

    def submissionName(self,row):
        """Return "<Period> <Fiscal Year>", followed by the calendar year in
        parentheses when it differs from the fiscal year."""
        period = row["Period"]
        fiscalYear = row["Fiscal Year"]
        Year = row["Year"]
        if fiscalYear != Year:
            return f"{period} {fiscalYear} ({Year})"
        return f"{period} {fiscalYear}"

    def printUrl(self,filings=None):
        """Print the EDGAR URL of the primary document of each filing.

        `filings` defaults to self.filings, so `obj.printUrl()` works; previously the
        argument was mandatory yet ignored.
        """
        if filings is None:
            filings = self.filings
        for _,row in filings.iterrows():
            accessionNum = row["accessionNumber"].replace("-","")
            doc = row["primaryDocument"]
            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/{doc}"
            print(url)
           
        

In [3]:
# Build the filing table for Apple and list the URL of each filing's primary document.
sub = SECSubmissions("AAPL")
# printUrl is declared as printUrl(self, filings); pass the filings explicitly so the
# call does not fail with a missing-argument TypeError.
sub.printUrl(sub.filings)

Unnamed: 0,Month,form,Date,Period
0,9,10-K,09/25/2021,Year Ended
1,6,10-Q,06/26/2021,Q3
1,7,10-Q,06/26/2021,Q3
2,3,10-Q,03/27/2021,Q2
2,4,10-Q,03/27/2021,Q2
3,12,10-Q,12/26/2020,Q1


In [4]:
class HelperFunctions():
    """Helpers for compiled statements: DataFrames whose first column is "label" and
    whose remaining columns are named by period and year, e.g. "Q2 2022"."""

    def getYears(self,quarters):
        """Return the first 4-digit number found in each period name, as ints.

        Raises IndexError when a name contains no 4-digit number.
        """
        # raw string: "\d" in a normal string literal is an invalid escape sequence
        return [int(re.findall(r"\d{4}",quarter)[0]) for quarter in quarters]

    def getUniqueYears(self,quarters):
        """Return the set of distinct years appearing in the period names."""
        return set(self.getYears(quarters))

    def getQuarters(self, compiledStatement):
        """Return every column name except the first (the label column)."""
        return list(compiledStatement.columns[1:])

    def getPossibleFilingsFromYears(self, years):
        """Return every period name that could exist for `years`, in the order given:
        Q1..Q4 and "Year Ended" for each year first, then "6mo" and "9mo" for each year."""
        possible = [q + " " + str(year) for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"]]
        possible += [q + " " + str(year) for year in years for q in ["6mo","9mo"]]
        return possible

    def reorderQuarters(self,compiledStatement):
        """Return the statement with "label" first and the period columns in
        chronological order.

        Columns whose name is not one of the names produced by
        getPossibleFilingsFromYears are dropped.
        """
        quarters = self.getQuarters(compiledStatement)
        # getUniqueYears returns a set, whose iteration order is arbitrary; sort it so
        # the columns really come out oldest year first.
        years = sorted(self.getUniqueYears(quarters))
        cols = self.getPossibleFilingsFromYears(years)
        cols = [col for col in cols if col in quarters]
        compiledStatement = compiledStatement[["label"] + cols]
        return compiledStatement
    
    
    
    

In [337]:

class FindStockPrice:
    """Look up one closing price per fiscal period and write them to
    "<path>/Stock Price.xlsx" as a single-row sheet.

    Reads "<path>/fiscalYear.csv", which must provide Period, Month and Date columns.
    Constructing the object performs the downloads and writes the file.

    ticker   -- symbol passed to yfinance and used in the input folder name
    fromDate -- first fiscal year to price (the fiscal year end before it is priced too)
    endDate  -- last fiscal year to price (inclusive)
    """

    def __init__(self,ticker, fromDate=2017,endDate=2022):
        self.ticker = ticker
        self.fromDate = fromDate
        self.endDate = endDate
        self.path =  os.path.join("../input",f"Financial Statement {self.ticker}")
        self.file = os.path.join(self.path,"Stock Price.xlsx")

        self.fiscalYear = pd.read_csv(os.path.join(self.path,"fiscalYear.csv"))
        # day of month of the first row's Date, reused for every period
        self.day = pd.to_datetime(self.fiscalYear["Date"]).dt.strftime("%d")[0]
        # Series: Period -> first Month listed for that period
        self.fiscalYear = self.fiscalYear.groupby("Period")["Month"].first()

        self.data()

        self.FinalStockPrices.to_excel(self.file,index=False)

    def _periodDate(self, year, month):
        """Return "YYYY-MM-DD" for self.day in the given month, clamped to the month's
        last day so that e.g. day 30 never produces an invalid February date."""
        lastDay = pd.Timestamp(year=int(year), month=int(month), day=1).days_in_month
        day = min(int(self.day), lastDay)
        return f"{int(year)}-{int(month):02d}-{day:02d}"

    def _firstClose(self, date):
        """Return the first "Close" value yfinance reports on or after `date`.

        NOTE(review): raises IndexError if the download comes back empty, and assumes
        the "Close" column is a flat Series -- confirm against the installed yfinance.
        """
        return yfinance.download(self.ticker,start=date,progress=False)["Close"].iloc[0]

    def data(self):
        """Build self.FinalStockPrices: a "label" column plus one column per period,
        named "<Period> <fiscal year>". Each "Year Ended" price is also stored as "Q4"."""
        FinalStockPrices = pd.DataFrame({"label":["Stock Price"]})
        TenKMo = self.fiscalYear.loc["Year Ended"]

        # fiscal year end preceding the requested range
        date = self._periodDate(self.fromDate-1, TenKMo)
        FinalStockPrices[f"Year Ended {self.fromDate-1}"] = [self._firstClose(date)]

        for fiscalYear in range(self.fromDate, self.endDate+1):
            # Series.items() replaces iteritems(), which pandas 2.0 removed
            for period,mo in self.fiscalYear.items():
                # A period whose month comes after the 10-K month ended in the previous
                # calendar year (e.g. Q1 of fiscal 2021 ends in December 2020). The
                # calendar year is used for the lookup date and the fiscal year for the
                # column label; the two were previously swapped.
                calendarYear = fiscalYear-1 if mo > TenKMo else fiscalYear
                date = self._periodDate(calendarYear, mo)
                if datetime.strptime(date, "%Y-%m-%d") > datetime.now():
                    continue #period has not happened yet
                price = self._firstClose(date)
                if period=="Year Ended":
                    FinalStockPrices[f"Q4 {fiscalYear}"] = [price]
                FinalStockPrices[f"{period} {fiscalYear}"] = [price]

        self.FinalStockPrices = FinalStockPrices
        


In [338]:
# Download Apple's per-period stock prices and write "Stock Price.xlsx".
FindStockPrice(ticker="AAPL")

<__main__.FindStockPrice at 0x1235f4c10>

In [304]:
# Ad-hoc check: display the raw yfinance price history for AAPL starting 2016-09-25.
yfinance.download("AAPL",start="2016-09-25",progress=False)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-09-26,27.910000,28.347500,27.887501,28.219999,26.354940,119477600
2016-09-27,28.250000,28.295000,28.084999,28.272499,26.403969,98429600
2016-09-28,28.422501,28.660000,28.357500,28.487499,26.604757,118564400
2016-09-29,28.290001,28.450001,27.950001,28.045000,26.191505,143548000
2016-09-30,28.115000,28.342501,27.950001,28.262501,26.394630,145516400
...,...,...,...,...,...,...
2022-07-29,161.240005,163.630005,159.500000,162.509995,162.509995,101689200
2022-08-01,161.009995,163.589996,160.889999,161.509995,161.509995,67829400
2022-08-02,160.100006,162.410004,159.630005,160.009995,160.009995,59907000
2022-08-03,160.839996,166.589996,160.750000,166.130005,166.130005,82507500


In [30]:
class WriteGeographySales(SECSubmissions, HelperFunctions):
    """Scrape the net-sales-by-geography, net-sales-by-product and unit-sales tables out
    of each filing's primary document and write them, one column per fiscal period,
    to "<path>/Sales By Segment.xlsx".

    Constructing the object downloads every filing and writes the file. The table
    markers ("Americas", "iPhone", "Unit Sales by Product:") are hard-coded.
    """

    def __init__(self,ticker):
        super().__init__(ticker)
        self.path =  os.path.join("../input",f"Financial Statement {self.ticker}")
        self.file = os.path.join(self.path,"Sales By Segment.xlsx")

        self.writeGeographySalesData(self.filings)

    def filterTables(self,edgar_str,strings):
        """Return the last HTML table whose concatenated text contains every entry of
        `strings`, as a list of rows (lists of cell strings), or False if none match.

        Cells equal to "%" or "$" and cells containing "(" are removed, then rows left
        empty are dropped.
        """
        finaltable = False
        soup = BeautifulSoup(edgar_str, 'html.parser')
        for table in soup.find_all('table'):
            rows = [list(tr.stripped_strings) for tr in table.find_all('tr') if tr.text]
            table_string = "".join(cell for row in rows for cell in row)
            if all(s in table_string for s in strings):
                finaltable = rows #keep going: a later matching table replaces this one
        if not finaltable: return False
        finaltable = [[x for x in row if x!="%" and x!="$" and "(" not in x] for row in finaltable]
        finaltable = [row for row in finaltable if row!=[]]
        return finaltable

    def _parseSalesTable(self, edgar_str, name, required, startLabel, endLabel=None):
        """Shared implementation of the three parseFor*Table methods.

        Finds the table containing all `required` strings, keeps the rows from the
        first one whose first cell contains `startLabel` through the first following
        row whose first cell contains `endLabel` (to the end of the table when
        `endLabel` is None), and returns a two-column frame: "label" and `name`, the
        latter converted to numbers after stripping thousands separators.

        Returns an empty DataFrame when the table or either marker row is missing.
        """
        table = self.filterTables(edgar_str, required)
        if not table: return pd.DataFrame()

        starts = [i for i, row in enumerate(table) if startLabel in row[0]]
        if not starts: return pd.DataFrame()
        table = table[starts[0]:]

        if endLabel is not None:
            ends = [i for i, row in enumerate(table) if endLabel in row[0]]
            if not ends: return pd.DataFrame()
            table = table[:ends[0]+1]

        df = pd.DataFrame(table).iloc[:,[0,1]] #first cell is the label, second the current-period value
        df.columns = ["label",name]
        df[name] = pd.to_numeric(df[name].replace(r",","",regex=True))
        return df

    def parseForUnitsTable(self,edgar_str, name):
        """Unit sales by product for period `name`; empty frame when not reported."""
        return self._parseSalesTable(edgar_str, name, ["Unit Sales by Product:"], "Unit Sales by Product:")

    def parseForGeographyTable(self,edgar_str, name):
        """Net sales by geography ("Americas" through "Total net sales") for period
        `name`; empty frame when the table is not found (previously a TypeError)."""
        return self._parseSalesTable(edgar_str, name, ["Americas","Total net sales"], "Americas", "Total net sales")

    def parseForProductsTable(self,edgar_str, name):
        """Net sales by product ("iPhone" through "Total net sales") for period `name`;
        empty frame when the table is not found (previously a TypeError)."""
        return self._parseSalesTable(edgar_str, name, ["iPhone","Total net sales"], "iPhone", "Total net sales")

    def writeGeographySalesData(self,filings):
        """Download each filing, extract the three tables, merge them by label across
        periods, derive Q4, order the columns and write the Excel file.

        Iteration stops at the first filing of fiscal year 2016 or earlier other than
        "Year Ended 2016"; this presumes `filings` is ordered newest first -- TODO confirm.
        """
        finalGeography = pd.DataFrame(columns=["label"])
        finalProduct = pd.DataFrame(columns=["label"])
        finalUnits = pd.DataFrame(columns=["label"])

        for i,row in filings.iterrows():
            name = row["Name"]
            if row["Fiscal Year"]<=2016 and name != "Year Ended 2016":
                break
            name = re.sub(r"\s\([\w\W]+?\)","",name) #drop the "(calendar year)" suffix
            accessionNum = row["accessionNumber"].replace("-","")
            doc = row["primaryDocument"]
            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/{doc}"
            print(url)
            req = requests.get(url,headers={"User-Agent": "Mozilla/5.0"})
            req.raise_for_status() #fail on an HTTP error instead of parsing the error page
            edgar_str = req.text

            # A table missing from one filing leaves that period out of the section
            # rather than aborting the run; an empty frame has no "label" to merge on.
            geography_sales = self.parseForGeographyTable(edgar_str, name)
            if not geography_sales.empty:
                finalGeography = pd.merge(finalGeography, geography_sales, on="label", how="outer")
            product_sales = self.parseForProductsTable(edgar_str, name)
            if not product_sales.empty:
                finalProduct = pd.merge(finalProduct, product_sales, on="label", how="outer")
            unit_sales = self.parseForUnitsTable(edgar_str, name)
            if not unit_sales.empty:
                finalUnits = pd.merge(finalUnits, unit_sales, on="label", how="outer")

        finalSales = pd.concat([pd.concat([finalGeography,finalProduct]),finalUnits])
        finalSales = self.performIncomeMath(finalSales)
        finalSales = self.reorderQuarters(finalSales)

        finalSales.to_excel(self.file, index=False)

    def performIncomeMath(self,compiledStatement):
        """Add a "Q4 <year>" column (Year Ended - Q3 - Q2 - Q1) for every year that has
        all four source columns, then turn zeros into NaN.

        Years missing any of the four columns are skipped; previously only
        "Year Ended" and "Q3" were checked, so a missing Q1 or Q2 raised KeyError.
        """
        quarters = self.getQuarters(compiledStatement)
        years = self.getUniqueYears(quarters)

        #replace the nan values with 0 so that the subtraction between an nan value does not result in nan
        c = compiledStatement.copy().replace(r'\s+', np.nan, regex=True).fillna(0).drop(columns="label")

        for year in years:
            year = str(year)
            yearEnded, q3, q2, q1 = "Year Ended "+year, "Q3 "+year, "Q2 "+year, "Q1 "+year
            if all(col in quarters for col in (yearEnded, q3, q2, q1)):
                compiledStatement["Q4 "+year] = c[yearEnded] - c[q3] - c[q2] - c[q1]

        compiledStatement = compiledStatement.replace(0,np.nan)
        return compiledStatement
            
        

In [31]:
# Scrape Apple's segment sales tables and write "Sales By Segment.xlsx".
WriteGeographySales(ticker="AAPL")

https://www.sec.gov/Archives/edgar/data/320193/000032019322000070/aapl-20220625.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019322000059/aapl-20220326.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019322000007/aapl-20211225.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019321000105/aapl-20210925.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019321000065/aapl-20210626.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019321000056/aapl-20210327.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019321000010/aapl-20201226.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019320000096/aapl-20200926.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019320000062/aapl-20200627.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019320000052/a10-qq220203282020.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/a10-qq1202012282019.htm
https://www.sec.gov/Archives/edgar/data/320193/000032019319000119

<__main__.WriteGeographySales at 0x12f3bbf10>

In [394]:
# Scratch check of DataFrame.empty: a frame holding two rows is not empty,
# so nothing is printed.
df = pd.DataFrame([1, 2])
if df.empty:
    print("he")