In [3]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests

In [119]:
class scrapeSECData:
    """Download a company's recent 10-K / 10-Q ``Financial_Report.xlsx`` files from SEC EDGAR.

    Instantiating the class runs the whole pipeline: look up the CIK for
    ``ticker``, list the recent 10-K/10-Q filings, label each with a fiscal
    period and fiscal year, and save every workbook under
    ``input/Financial Statement <ticker>/raw``.

    Attributes:
        ticker: ticker symbol exactly as passed by the caller.
        CIK: the ``cik_str`` value listed for the ticker in the SEC ticker file.
        filings: DataFrame of labelled 10-K/10-Q filings, newest report first.
        fiscalYear: DataFrame with ``Month``/``form``/``Period`` for the latest
            10-K and the three filings listed after it.
    """

    # Every request to the SEC endpoints is sent with an explicit User-Agent.
    HEADERS = {"User-Agent": "Mozilla/5.0"}
    # Seconds to wait on a request before giving up instead of hanging the cell.
    TIMEOUT = 30

    def __init__(self,ticker):
        self.ticker = ticker
        self.CIK = self.getCIK()
        self.filings, self.fiscalYear = self.getSECFilings()
        self.writeFilings()

    def getCIK(self):
        """Return the SEC CIK code for ``self.ticker``.

        Raises:
            requests.HTTPError: if the ticker file cannot be downloaded.
            KeyError: if the ticker is not present in the SEC ticker file.
        """
        response = requests.get(
            "https://www.sec.gov/files/company_tickers.json",
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()
        # The payload is a dict of numbered entries; re-key it by ticker.
        ciks = {info["ticker"]: info["cik_str"] for info in response.json().values()}
        if self.ticker not in ciks:
            raise KeyError(f"ticker {self.ticker!r} not found in SEC company_tickers.json")
        return ciks[self.ticker]

    def getSECFilings(self):
        """List recent 10-K/10-Q filings and label them with fiscal periods.

        The latest 10-K and the three filings listed after it are labelled
        ``Year Ended``, ``Q3``, ``Q2``, ``Q1``. This assumes those three rows
        are the 10-Qs of the same fiscal year; the code does not verify it.
        Older filings inherit the label of the row with the same report month
        and form.

        Returns:
            tuple: ``(filings, fiscalYear)`` DataFrames.

        Raises:
            requests.HTTPError: if the submissions file cannot be downloaded.
            IndexError: if the recent filings contain no 10-K.
            ValueError: if fewer than four filings are available from the
                latest 10-K onwards.
        """
        response = requests.get(
            f"https://data.sec.gov/submissions/CIK{self.CIK:0>10}.json",
            headers=self.HEADERS,
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()
        filings = pd.DataFrame(response.json()["filings"]["recent"])

        # Keep only quarterly and annual reports.
        filings = filings.loc[filings["form"].isin(["10-Q","10-K"])].reset_index(drop=True)
        reportDates = pd.to_datetime(filings["reportDate"])
        filings["Month"] = reportDates.dt.month.astype("int")
        filings["Year"] = reportDates.dt.year.astype("int")

        # One fiscal year of labels: the latest 10-K and the three rows after it.
        tenKRows = filings.index[filings["form"] == "10-K"]
        if len(tenKRows) == 0:
            raise IndexError(f"no 10-K found in the recent filings of {self.ticker}")
        TenKIndex = tenKRows[0]
        periods = ["Year Ended", "Q3", "Q2", "Q1"]
        fiscalYear = filings.loc[TenKIndex:TenKIndex+3, ["Month","form"]].copy()
        if len(fiscalYear) != len(periods):
            raise ValueError(
                f"expected {len(periods)} filings from the latest 10-K onwards, "
                f"found {len(fiscalYear)} for {self.ticker}"
            )
        fiscalYear["Period"] = periods

        # Attach the period label to every filing by report month and form.
        filings = pd.merge(fiscalYear, filings, on=["Month","form"]).sort_values(by="reportDate", ascending=False)

        # A quarter whose report month is later than the 10-K month belongs to
        # the fiscal year that ends in the following calendar year.
        filings["Fiscal Year"] = filings["Year"]
        TenKMo = fiscalYear.loc[TenKIndex, "Month"]
        laterMonths = fiscalYear.loc[fiscalYear["Month"] > TenKMo, "Month"]
        filings.loc[filings["Month"].isin(laterMonths), "Fiscal Year"] += 1

        return filings, fiscalYear

    def writeFilings(self):
        """Download each filing's ``Financial_Report.xlsx`` into the ``raw`` folder.

        Files are named ``<Period> <Fiscal Year>.xlsx``, with the calendar year
        appended in parentheses when it differs from the fiscal year. A filing
        whose download does not succeed is reported and skipped, so no
        error page is saved under an ``.xlsx`` name.
        """
        path = os.path.join("input", f"Financial Statement {self.ticker}")
        folder = os.path.join(path, "raw")
        # Creates "input", the ticker folder and "raw" as needed; no-op if present.
        os.makedirs(folder, exist_ok=True)

        for _, row in self.filings.iterrows():
            period = row["Period"]
            fiscalYear = row["Fiscal Year"]
            Year = row["Year"]
            accessionNum = row["accessionNumber"].replace("-","")

            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/Financial_Report.xlsx"
            req = requests.get(url, headers=self.HEADERS, timeout=self.TIMEOUT)
            if not req.ok:
                print(f"Skipping {period} {fiscalYear}: HTTP {req.status_code} for {url}")
                continue

            # Only show the calendar year when it differs from the fiscal year.
            if fiscalYear != Year:
                name = os.path.join(folder,f"{period} {fiscalYear} ({Year}).xlsx")
            else:
                name = os.path.join(folder,f"{period} {fiscalYear}.xlsx")

            with open(name, "wb") as file:
                file.write(req.content)
        

In [120]:
# Constructing the object runs the whole pipeline for XOM: CIK lookup,
# filing list, and download of each filing's Financial_Report.xlsx.
d = scrapeSECData("XOM")

# Show the latest 10-K plus the three following filings used as period labels.
d.fiscalYear


Unnamed: 0,Month,form,Period
1,12,10-K,Year Ended
2,9,10-Q,Q3
3,6,10-Q,Q2
4,3,10-Q,Q1


In [182]:
class FinancialReport():
    """Read a ``Financial_Report.xlsx`` workbook and extract its main statements.

    All sheets are loaded once into ``self.statements`` (sheet name ->
    DataFrame). The ``find*`` methods work on copies, so they can be called
    repeatedly and leave the loaded sheets unchanged.
    """

    def __init__(self,filePath):
        self.filePath = filePath
        self.readFile()

    def readFile(self):
        """Load every sheet of the workbook into ``self.statements``."""
        self.statements = pd.read_excel(self.filePath,sheet_name=None)

    def _findSheet(self, keyword):
        """Return a copy of the first sheet whose first column header contains ``keyword``.

        The match is case-insensitive on the header text.

        Raises:
            IndexError: if no sheet matches.
        """
        for sheet in self.statements.values():
            if len(sheet.columns) and keyword in str(sheet.columns[0]).lower():
                # Copy so that relabelling columns never touches the cached sheet.
                return sheet.copy()
        raise IndexError(f"no sheet whose header contains {keyword!r} in {getattr(self, 'filePath', None)!r}")

    @staticmethod
    def _promoteFirstRow(sheet):
        """Use the first data row as column names and call the first column ``label``.

        The original header row holds the period length (e.g. "12 Months
        Ended"); the first data row holds the actual dates.
        """
        header = list(sheet.iloc[0])
        sheet = sheet.drop(sheet.index[0])
        sheet.columns = ["label"] + header[1:]
        return sheet

    def findBalanceSheet(self):
        """Return the balance sheet with its first column renamed to ``label``."""
        balance = self._findSheet("balance")

        balance.columns = ["label"]+list(balance.columns[1:])
        balance = balance.loc[:, balance.columns.notna()]

        return balance

    def findIncomeSheet(self):
        """Return the income statement as ``label`` plus the first remaining value column."""
        income = self._promoteFirstRow(self._findSheet("income"))
        income = income.dropna(axis=0, how="all")

        # Drop footnote rows such as "[1]"; rows without a label are dropped too.
        income = income.loc[income["label"].str.contains("[a-zA-Z]", regex=True, na=False)]

        # Keep only columns with at least 5 non-empty cells (removes sparse note columns).
        income = income.dropna(axis=1, thresh=5)

        income = income.iloc[:,:2]
        return income

    def findCashFlowSheet(self):
        """Return the cash flow statement with dated column headers."""
        cash = self._promoteFirstRow(self._findSheet("cash"))
        # Columns whose promoted header is empty carry no date; drop them.
        cash = cash.loc[:, cash.columns.notna()]

        return cash
        
        

        
   
    

In [183]:
# Load the XOM "Year Ended 2018" workbook; this file must already exist under input/.
obj = FinancialReport("input/Financial Statement XOM/raw/Year Ended 2018.xlsx")


# Extract the income statement and display it.
i = obj.findIncomeSheet()
i

Unnamed: 0,label,"Dec. 31, 2018"
1,Revenues and other income,
2,Sales and other operating revenue,279332.0
3,Income from equity affiliates,7355.0
4,Other income,3525.0
5,Total revenues and other income,290212.0
6,Costs and other deductions,
7,Crude oil and product purchases,156172.0
8,Production and manufacturing expenses,36682.0
9,"Selling, general and administrative expenses",11480.0
10,Depreciation and depletion,18745.0


In [246]:
class writeIncomeStatement:
    """Combine the downloaded filings of one ticker into a single income-statement workbook.

    Instantiating the class reads every workbook in
    ``input/Financial Statement <ticker>/raw`` whose file name carries a year
    of at least ``fromDate``, and writes one sheet per filing to
    ``input/Financial Statement <ticker>/IncomeStatements-<ticker>.xlsx``.

    Args:
        ticker: ticker symbol used in the folder and output file names.
        fromDate: earliest year (int) to include, compared against the first
            four-digit number in each file name.
    """

    def __init__(self,ticker,fromDate):
        self.ticker = ticker
        self.fromDate = fromDate
        self.path = os.path.join("input",f"Financial Statement {self.ticker}")
        self.folder = os.path.join(self.path,"raw")

        self.filings = self.getFilings()
        self.filings = self.orderFilings()

        self.files = [os.path.join(self.folder,filing) for filing in self.filings]

        self.writeIncomeExcel()

    @staticmethod
    def _fileYear(fileName):
        """Return the first four-digit number in ``fileName`` as an int, or None if there is none."""
        found = re.findall(r"\d{4}", fileName)
        return int(found[0]) if found else None

    def getFilings(self):
        """Return the file names in the raw folder dated ``fromDate`` or later.

        ``.DS_Store``, names containing ``~`` (e.g. Excel lock files) and names
        without a four-digit year are ignored.
        """
        kept = []
        for fileName in os.listdir(self.folder):
            if fileName == ".DS_Store" or "~" in fileName:
                continue
            year = self._fileYear(fileName)
            if year is not None and year >= self.fromDate:
                kept.append(fileName)
        return kept

    def orderFilings(self):
        """Return ``self.filings`` ordered by year, then Q1, Q2, Q3, Q4, Year Ended.

        A file is matched by containing "<period> <year>", so
        "Q3 2020 (2021).xlsx" is placed under Q3 2020.
        """
        # sorted(): iterating a bare set does not guarantee chronological order.
        years = sorted({self._fileYear(f) for f in self.filings} - {None})
        names = [q + " " + str(year) for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"]]
        return [filing for name in names for filing in self.filings if name in filing]

    def writeIncomeExcel(self):
        """Write the income statement of every selected filing to one workbook, one sheet each.

        Raises:
            ValueError: if no filings were selected, since a workbook needs
                at least one sheet.
        """
        if not self.files:
            raise ValueError(f"no filings from {self.fromDate} onwards found in {self.folder!r}")

        outPath = os.path.join(self.path, f"IncomeStatements-{self.ticker}.xlsx")
        # The context manager saves and closes the workbook, also on error.
        with pd.ExcelWriter(outPath) as writer:
            for filingPath in self.files:
                income = FinancialReport(filingPath).findIncomeSheet()
                # Sheet name is the file name without folder or extension.
                sheetName = os.path.splitext(os.path.basename(filingPath))[0]
                income.to_excel(writer, index=False, sheet_name=sheetName)
        
        

In [248]:
# Builds IncomeStatements-GPS.xlsx from filings dated 2017 or later.
# NOTE(review): this reads input/Financial Statement GPS/raw, which no visible
# cell creates (only XOM is scraped above) - presumably scrapeSECData("GPS")
# was run earlier; confirm so the notebook survives Restart & Run All.
writeIncomeStatement("GPS",2017)

<__main__.writeIncomeStatement at 0x15700d910>

In [237]:
# Scratch check of nested-comprehension order: the first `for` (y) is the
# outer loop, so x varies fastest in the result.
[[x,y] for y in [3,4] for x in [1,2]]

[[1, 3], [2, 3], [1, 4], [2, 4]]

In [203]:
def mergeSort(data):
    """Sort ``data`` in ascending order with merge sort and return it.

    The list is sorted in place and the same list object is returned. The
    sort is stable: elements that compare equal keep their original relative
    order. Only ``<`` is used to compare elements.

    Args:
        data: a mutable sequence supporting slicing (e.g. a list).

    Returns:
        The same ``data`` object, now sorted.
    """
    # Zero or one element is already sorted; this also ends the recursion
    # for an empty input.
    if len(data) <= 1:
        return data

    mid = len(data) // 2
    left = mergeSort(data[:mid])
    right = mergeSort(data[mid:])

    l = r = k = 0
    while l < len(left) and r < len(right):
        # Take from the right half only when it is strictly smaller, so ties
        # keep the left (earlier) element first.
        if right[r] < left[l]:
            data[k] = right[r]
            r += 1
        else:
            data[k] = left[l]
            l += 1
        k += 1

    # Exactly one half still has elements; append them.
    data[k:] = left[l:] + right[r:]
    return data
    
# Quick sanity check of mergeSort on a small unsorted list.
mergeSort([2,3,12,5])
    

[2, 3, 5, 12]