In [385]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests
import shutil


In [399]:
class scrapeSECData:
    """Download a company's 10-K / 10-Q ``Financial_Report.xlsx`` workbooks from SEC EDGAR.

    Constructing an instance does all of the work:

    1. look up the company's CIK from its ticker (``getCIK``),
    2. list its recent 10-K / 10-Q filings and label each one with a fiscal
       period and fiscal year (``getSECFilings``),
    3. download every workbook into ``input/Financial Statement <ticker>/raw``
       and save the month -> period key as ``fiscalYear.csv`` (``writeFilings``).

    Attributes:
        ticker: ticker symbol exactly as passed in (also used in folder names).
        CIK: value of ``cik_str`` for the ticker in SEC's company_tickers.json.
        filings: DataFrame of the selected filings with the added columns
            ``Month``, ``Year``, ``Period`` and ``Fiscal Year``.
        fiscalYear: DataFrame mapping (``Month``, ``form``) to ``Period``.
    """

    # Sent with every request. NOTE(review): SEC's fair-access guidance asks automated
    # clients to declare a User-Agent; consider replacing this generic value with a
    # contact string.
    HEADERS = {"User-Agent": "Mozilla/5.0"}
    TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the notebook

    def __init__(self,ticker):
        self.ticker = ticker
        self.CIK = self.getCIK()
        self.filings, self.fiscalYear = self.getSECFilings()
        self.writeFilings()

    def getCIK(self):
        """Return the SEC CIK for ``self.ticker``.

        Raises:
            KeyError: the ticker is not present in company_tickers.json.
            requests.HTTPError: the SEC endpoint answered with an error status.
        """
        response = requests.get(
            "https://www.sec.gov/files/company_tickers.json",
            headers=self.HEADERS,  # previously built but never sent
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()

        # The payload is a dict of numbered entries; re-key it by ticker.
        ciks = {info["ticker"]: info["cik_str"] for info in response.json().values()}
        if self.ticker not in ciks:
            raise KeyError(f"Ticker {self.ticker!r} was not found in SEC company_tickers.json")
        return ciks[self.ticker]

    def getSECFilings(self):
        """Fetch the recent filings list and label every 10-K / 10-Q with its fiscal period.

        Returns:
            (filings, fiscalYear): ``filings`` holds one row per 10-K / 10-Q with a
            report date after 2014-01-01 plus the columns ``Month``, ``Year``,
            ``Period`` and ``Fiscal Year``; ``fiscalYear`` is the month -> period key.

        Raises:
            ValueError: no 10-K is available, or fewer than three filings follow
                the most recent 10-K, so a full fiscal-year key cannot be built.
            requests.HTTPError: the SEC endpoint answered with an error status.
        """
        response = requests.get(
            f"https://data.sec.gov/submissions/CIK{self.CIK:0>10}.json",
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()

        filings = pd.DataFrame(response.json()["filings"]["recent"])
        # filings pre-2014 are unable to be downloaded due to different excel format
        filings = filings.loc[filings["reportDate"]>"2014-01-01"]
        # drop all filings that are not 10Q or 10K
        filings = filings.loc[filings["form"].isin(["10-Q","10-K"])].reset_index(drop=True)

        reportDates = pd.to_datetime(filings["reportDate"])
        filings["Month"] = reportDates.dt.month.astype("int")
        filings["Year"] = reportDates.dt.year.astype("int")

        # Build a one-fiscal-year key from the first 10-K in the list and the three rows after it.
        # NOTE(review): this assumes the list is newest-first and that those three rows
        # are the preceding 10-Qs - confirm against the EDGAR payload.
        tenKRows = filings.index[filings["form"]=="10-K"]
        if len(tenKRows) == 0:
            raise ValueError(f"No 10-K filing after 2014-01-01 found for {self.ticker}")
        TenKIndex = tenKRows[0]
        fiscalYearKey = filings.loc[TenKIndex:TenKIndex+3,["Month","form"]].copy().reset_index(drop=True)
        if len(fiscalYearKey) < 4:
            raise ValueError(
                f"Need the latest 10-K plus three earlier filings to build the fiscal year key "
                f"for {self.ticker}; only {len(fiscalYearKey)} row(s) available"
            )
        fiscalYearKey["Period"] = ["Year Ended", "Q3","Q2","Q1"]

        # A given period can be reported in neighbouring months (e.g. both Jan & Feb), so map
        # every month that occurs to the key row with the numerically closest month.
        # NOTE(review): the distance is not circular, so December and January count as
        # 11 months apart - verify for fiscal years ending around the new year.
        keyMonths = fiscalYearKey["Month"].to_numpy()
        fiscalYear = pd.DataFrame({"Month":filings["Month"].unique()})
        fiscalYear.index = [int(np.abs(keyMonths - month).argmin()) for month in fiscalYear["Month"]]
        fiscalYear = pd.merge(fiscalYear,fiscalYearKey,left_index=True,right_index=True,suffixes=("","_x"))
        fiscalYear = fiscalYear.drop(columns="Month_x")

        # merge fiscal year labels with filings by the month that they were reported
        filings = pd.merge(filings,fiscalYear,on=["Month","form"],how="left")

        if filings["Period"].isnull().any():
            print("WARNING THERE IS A FILING WITH A NULL FISCAL PERIOD LABEL")

        # Fiscal year defaults to the calendar year of the report date; filings reported in a
        # later month than the 10-K are counted towards the next fiscal year.
        filings["Fiscal Year"] = filings["Year"]
        TenKMo = fiscalYear.loc[fiscalYear["form"]=="10-K","Month"].iloc[0]
        filings.loc[filings["Month"]>TenKMo,"Fiscal Year"] += 1

        return filings, fiscalYear

    def writeFilings(self):
        """Download each filing's Financial_Report.xlsx into a freshly emptied ``raw`` folder.

        Files are named ``"<Period> <Fiscal Year>.xlsx"``, with ``" (<Year>)"`` appended
        when the fiscal year differs from the calendar year of the report date.
        A filing whose workbook cannot be downloaded is skipped with a warning so
        that an HTTP error page is never saved under an ``.xlsx`` name.
        """
        path = os.path.join("input",f"Financial Statement {self.ticker}")
        folder = os.path.join(path,"raw")
        os.makedirs(path, exist_ok=True)  # also creates "input" on a fresh checkout
        if os.path.isdir(folder):
            shutil.rmtree(folder)  # start from an empty folder on every run
        os.mkdir(folder)

        # iterate through each filing, grab the accession number to access the filing and write to excel file
        for _, row in self.filings.iterrows():
            period = row["Period"]
            fiscalYear = row["Fiscal Year"]
            Year = row["Year"]
            accessionNum = row["accessionNumber"].replace("-","")

            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/Financial_Report.xlsx"
            req = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
            if req.status_code != 200:
                print(f"WARNING: could not download {url} (HTTP {req.status_code}); skipping {period} {fiscalYear}")
                continue

            # only include the parenthesis actual year if the fiscal year is different than the actual filing date
            if fiscalYear != Year:
                name = os.path.join(folder,f"{period} {fiscalYear} ({Year}).xlsx")
            else:
                name = os.path.join(folder,f"{period} {fiscalYear}.xlsx")

            with open(name,"wb") as file:  # closed even if the write fails
                file.write(req.content)

        self.fiscalYear.to_csv(os.path.join(path,"fiscalYear.csv"))
    
        

In [400]:
# Constructing the scraper runs the whole pipeline for SVC: CIK lookup, filing list
# download, and writing the workbooks under "input/Financial Statement SVC/raw".
d = scrapeSECData("SVC")



In [411]:
class FinancialReport():
    """One filing workbook, with helpers that pull out its three main statements.

    Every ``find*`` method returns a two-column DataFrame: ``label`` plus the first
    value column of the statement. The methods work on a copy of the cached sheet,
    so they can be called repeatedly with the same result.

    Attributes:
        filePath: path of the Excel workbook that was read.
        statements: dict of sheet name -> DataFrame for every sheet in the workbook.
    """

    def __init__(self,filePath):
        self.filePath = filePath
        self.readFile()

    def readFile(self):
        """Read every sheet of the workbook into ``self.statements``."""
        self.statements = pd.read_excel(self.filePath,sheet_name=None)

    def _findSheet(self, keywords):
        """Return a copy of the first sheet whose first column header contains any keyword."""
        for sheet in self.statements.values():
            if len(sheet.columns) == 0:
                continue  # an empty sheet has no header to inspect
            header = str(sheet.columns[0]).lower()
            if any(keyword in header for keyword in keywords):
                # Copy so that relabelling columns below never alters the cached sheet.
                return sheet.copy()
        raise IndexError(f"No sheet in {self.filePath!r} has a header containing any of {keywords}")

    @staticmethod
    def _promoteFirstRow(sheet):
        """Use the first data row as the column names and call the first column ``label``."""
        sheet.columns = sheet.iloc[0]
        sheet = sheet.drop(index=sheet.index[0])
        sheet.columns = ["label"]+list(sheet.columns[1:])
        return sheet

    def findBalanceSheet(self):
        """Return the balance sheet as ``label`` + first value column.

        Raises:
            IndexError: no sheet header contains "balance".
        """
        # find sheet where the first sheet value (header) includes balance
        balance = self._findSheet(["balance"])

        balance.columns = ["label"]+list(balance.columns[1:])
        balance = balance.loc[:, balance.columns.notna()]

        return balance.iloc[:,:2]

    def findIncomeSheet(self):
        """Return the income statement as ``label`` + first value column.

        Raises:
            IndexError: no sheet header contains "income" or "operations".
        """
        # find sheet where the first sheet value (header) includes income
        income = self._findSheet(["income", "operations"])

        # remove the first row which doesn't say the date but instead says the length of time period ie. 3 mo ended
        income = self._promoteFirstRow(income)
        income = income.dropna(axis=0, how="all")

        # remove rows where the label is just [1] or [2] because they are empty
        isFootnote = income.iloc[:, 0].astype(str).str.match(r"^\s*\[\d+\]\s*$")
        income = income.loc[~isFootnote]

        # keep only columns with at least 5 values; drops near-empty columns such as a note marker like [1]
        income = income.dropna(axis=1, thresh=5)

        return income.iloc[:,:2]

    def findCashflowSheet(self):
        """Return the cash flow statement as ``label`` + first value column.

        Raises:
            IndexError: no sheet header contains "cash".
        """
        # find sheet where the first sheet value (header) includes cash
        cash = self._findSheet(["cash"])

        # remove the first row which doesn't say the date but instead says the length of time period ie. 3 mo ended
        cash = self._promoteFirstRow(cash)
        cash = cash.loc[:, cash.columns.notna()]

        return cash.iloc[:,:2]
        
        

        
   
    

In [412]:
# Read one previously downloaded workbook (the GPS "Year Ended 2021" file).
# NOTE(review): this file is not produced by any cell shown here - presumably an
# earlier scrapeSECData("GPS") run created it; confirm before Restart & Run All.
obj = FinancialReport("input/Financial Statement GPS/raw/Year Ended 2021.xlsx")


# Extract the income statement; the bare name on the last line displays it as the cell output.
i = obj.findIncomeSheet()
i

Unnamed: 0,label,"Jan. 30, 2021"
1,Revenues,13800000000.0
2,Cost of goods sold and occupancy expenses,9095000000.0
3,Gross profit,4705000000.0
4,Operating expenses,5567000000.0
5,Operating income (loss),-862000000.0
6,Gain (Loss) on Extinguishment of Debt,58000000.0
7,Interest expense,192000000.0
8,Interest income,-10000000.0
9,Income (loss) before income taxes,-1102000000.0
10,Income taxes,-437000000.0


In [413]:
class compileFilings:
    """Combine a ticker's downloaded filings into one workbook per statement type.

    Constructing an instance reads every workbook in
    ``input/Financial Statement <ticker>/raw`` that falls after ``fromDate`` and writes
    ``IncomeStatements-<ticker>.xlsx``, ``CashflowStatements-<ticker>.xlsx`` and
    ``BalanceStatements-<ticker>.xlsx`` next to the ``raw`` folder, one sheet per filing.

    Attributes:
        ticker: ticker symbol used in the folder and output file names.
        fromDate: year (int); filings from later years are kept, plus the
            "Year Ended" filing of this year itself.
        path / folder: the ticker's folder and its ``raw`` sub-folder.
        filings: selected file names, ordered by year and period.
        files: ``filings`` joined onto ``folder``.
    """

    # Order of the periods within one year when sorting file names.
    PERIODS = ["Q1","Q2","Q3","Q4","Year Ended"]

    def __init__(self,ticker,fromDate):
        self.ticker = ticker
        self.fromDate = fromDate
        self.path = os.path.join("input",f"Financial Statement {self.ticker}")
        self.folder = os.path.join(self.path,"raw")

        self.filings = self.getFilings()
        self.filings = self.getFilingsPostDate()
        self.filings = self.orderFilings()
        self.files = self.getFilePathsFromFilings()

        self.writeIncomeExcel()
        self.writeCashflowExcel()
        self.writeBalanceExcel()

    def getFilings(self):
        """Return the file names in the raw folder, without ``.DS_Store`` and any name containing ``~``."""
        return [
            filing for filing in os.listdir(self.folder)
            if filing != ".DS_Store" and "~" not in filing
        ]

    def getDatesFromFilings(self,filings):
        """Return the first four-digit number found in each file name as an int.

        Raises:
            IndexError: a file name contains no four-digit number.
        """
        return [int(re.findall(r"\d{4}",file)[0]) for file in filings]

    def getFilingsPostDate(self):
        """Keep files dated after ``fromDate``, plus the "Year Ended" file of ``fromDate`` itself."""
        fileDates = self.getDatesFromFilings(self.filings)
        return [
            file for file, date in zip(self.filings, fileDates)
            if date>self.fromDate or (date==self.fromDate and "Year Ended" in file)
        ]

    def orderFilings(self):
        """Return the files ordered by year, then Q1, Q2, Q3, Q4, Year Ended.

        A file is picked up when "<period> <year>" occurs in its name, so
        "Q3 2020" matches "Q3 2020 (2021).xlsx". Files matching no such name are dropped.
        """
        # sorted(): iterating a bare set does not guarantee chronological order
        years = sorted(set(self.getDatesFromFilings(self.filings)))
        names = [f"{period} {year}" for year in years for period in self.PERIODS]
        return [filing for name in names for filing in self.filings if name in filing]

    def getFilePathsFromFilings(self):
        """Return the path of every selected file inside the raw folder."""
        return [os.path.join(self.folder,filing) for filing in self.filings]

    def _writeStatements(self, prefix, extract):
        """Write ``<prefix>-<ticker>.xlsx`` with one sheet per filing.

        ``extract`` takes a FinancialReport and returns the DataFrame to store.
        """
        if not self.files:
            raise ValueError(f"No filings selected for {self.ticker}; nothing to write to {prefix}")

        target = os.path.join(self.path, f"{prefix}-{self.ticker}.xlsx")
        # The context manager saves and closes the workbook (ExcelWriter.save() no longer
        # exists in pandas 2.x) and also releases the file if a filing fails to parse.
        with pd.ExcelWriter(target) as writer:
            for file in self.files:
                statement = extract(FinancialReport(file))
                # Sheet name = file name without folder or extension, on any OS.
                name = os.path.splitext(os.path.basename(file))[0]
                statement.to_excel(writer, index=False, sheet_name=name)

    def writeIncomeExcel(self):
        """Write the income statement of every filing to ``IncomeStatements-<ticker>.xlsx``."""
        self._writeStatements("IncomeStatements", lambda report: report.findIncomeSheet())

    def writeCashflowExcel(self):
        """Write the cash flow statement of every filing to ``CashflowStatements-<ticker>.xlsx``."""
        self._writeStatements("CashflowStatements", lambda report: report.findCashflowSheet())

    def writeBalanceExcel(self):
        """Write the balance sheet of every filing to ``BalanceStatements-<ticker>.xlsx``."""
        self._writeStatements("BalanceStatements", lambda report: report.findBalanceSheet())
        
        

In [415]:
# Build the combined income / cash flow / balance workbooks for XOM from the
# filings dated after 2017 plus the "Year Ended 2017" filing.
# NOTE(review): requires "input/Financial Statement XOM/raw" to exist already -
# no cell shown here downloads XOM; confirm before Restart & Run All.
compileFilings("XOM",2017)

<__main__.compileFilings at 0x169e68250>