In [73]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests

In [74]:
class HelperFunctions:
    def clean_up_string(self,string):
        string = string.strip()
        space_positions = [x.start() for x in re.finditer('\ ',  string)]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string
    
    def merge_number(self,a,b):
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)
    
    def getFiles(self):
        files = os.listdir("financial statements")
        files.remove(".DS_Store")
        files = [file for file in files if file[0] != "~"]
        return files

    def getFinancialStatements(self):
        files = self.getFiles()
        FinancialReports = [FinancialReport(file) for file in files]
        return FinancialReports
        

In [75]:
class ProcessQuarters:
    
    def getQuarterHeaders(self):
        quarters = [col for col in list(self.finalStatement.columns) if col not in ["title","label","title_order"]]
        return quarters
    def includesYear(self, string):
        return string[-4:].isdigit()
        
    def getLabelHeaders(self):
        headers = [col for col in list(self.finalStatement.columns) if not self.includesYear(col)]
        return headers
    
    def setQuarters(self):
        dates = self.getQuarterHeaders()
        headers = self.getLabelHeaders()
        dates.sort(key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'))
        self.finalStatement = self.finalStatement[headers+dates]
        self.finalStatement.columns = headers+[self.parseQuarter(date) for date in self.getQuarterHeaders()]
    
    def reorderQuarters(self):
        years = self.getYears()
        headers = self.getLabelHeaders()
        #years = [year for year in years if "Q1 "+year in self.getQuarterHeaders()]
        quarters = self.getQuarterHeaders()
        cols = [q + " " + year for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"] if q + " " + year in quarters]
        cols = headers + cols
        self.finalStatement = self.finalStatement[cols]
        
    def getYears(self):
        years = list(set([x.split(" ")[-1] for x in self.getQuarterHeaders()]))
        years.sort()
        return years
    
    
   

In [76]:
class FinancialDataMerge:
    
    def addData(self, FinancialReports):
        for [sheet,file] in FinancialReports:
            print(file)
            dates = [col for col in sheet.columns[1:] if col not in self.finalStatement.columns] ## remove columns of data already in main sheet sheet
            if len(dates) == 0:
                continue
            cols = ["label"]+dates
            sheet = sheet[cols]
            
            sheet = self.parseStatement(sheet)
            
            self.finalStatement = pd.merge(self.finalStatement, sheet, how="outer",on=["label"],suffixes=('', '_x'))
            
            self.finalStatement.title_order = [self.merge_number(self.finalStatement.title_order[i],self.finalStatement.title_order_x[i]) for i in range(len(self.finalStatement))] #merge the number for the titles. use greatest num bc bigger num means there is section before it
            self.finalStatement = self.finalStatement.drop(["title_order_x","title_x"],axis=1)
    
    def rearrangeFinalStatement(self):
        self.finalStatement = self.finalStatement.replace(" ",np.nan)
        self.finalStatement = self.finalStatement.loc[~self.finalStatement[self.getQuarterHeaders()].isnull().all(axis=1)]
        
        #Make sure each title has same max number index
        self.reconcileTitleOrder()
        
        # put Totals at the bottom of each section
        self.reconcileLabelOrderWithTotals()
        
        self.finalStatement[["title_order","label_order"]] = self.finalStatement[["title_order","label_order"]].astype("int")
        self.finalStatement = self.finalStatement.sort_values(["title_order","label_order"]) # order by title and then location
        
        self.finalStatement = self.finalStatement.drop(["title_order","label_order"],axis=1).reset_index(drop=True)
        
        self.finalStatement[self.getQuarterHeaders()] = self.finalStatement[self.getQuarterHeaders()].fillna(0)
        
    def parseStatement(self,sheet):
        
        sheet = sheet.loc[(~sheet.drop(columns="label").isnull().all(axis=1) | sheet.label.str.contains(":"))]
        num = 0
        index = []
        title = None 
        for label in sheet.label:
            if ":" in label:
                title = re.sub("\([\w\W]+\)","",label[:-1]).replace("  "," ")
                num += 1
                continue
            new_label = label.lower()
            new_label = new_label.replace("gain","loss").replace("decrease","increase")
            new_label = re.sub("\([\w\W]+?\)","",new_label).replace("  "," ").replace("—"," - ").strip()
            new_label = re.sub(" \$|\s*\d+\,*", "", new_label) 
            new_label = self.clean_up_string(new_label)
            
            is_total = self.total_keyword == new_label.split(" ")[0]
            
            index.append([title,num,label,new_label])
                
            if is_total:
                title = None 
                num += 1
                
        sheet = sheet.loc[sheet.label.isin(np.array(index)[:,2])].copy()
        sheet["title"] = np.array(index)[:,0].tolist()
        sheet["title_order"] = np.array(index)[:,1].tolist()
        sheet["label"] = np.array(index)[:,3].tolist()
        sheet = sheet.set_index(["title","title_order","label"]).reset_index()
        self.sheet = sheet
        return sheet
    
        
    def reconcileTitleOrder(self):
        title_groupby = self.finalStatement.groupby("title")["title_order"].max().reset_index().dropna(axis=0, how='any')
        self.finalStatement = self.finalStatement.merge(title_groupby, on="title", how="left", suffixes=('', '_x') )
        self.finalStatement["title_order"] = [self.merge_number(self.finalStatement.title_order[i],self.finalStatement.title_order_x[i]) for i in range(len(self.finalStatement))] #merge the number for the titles. 
        self.finalStatement = self.finalStatement.drop(["title_order_x"],axis=1)
        
    def reconcileLabelOrderWithTotals(self):
        totals = self.finalStatement.loc[self.finalStatement.label.str.startswith(self.total_keyword)].reset_index(drop=True) 
        self.finalStatement = self.finalStatement.loc[~self.finalStatement.label.str.startswith(self.total_keyword)].reset_index(drop=True)
        self.finalStatement = pd.concat([self.finalStatement,totals]) 
        self.finalStatement = self.finalStatement.reset_index(drop=True).reset_index().rename(columns={"index":"label_order"})
        
            

In [230]:
class Income(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    def __init__(self,ticker,fromDate):
        
        self.ticker = ticker
        self.fromDate = fromDate
        self.compiledStatement = pd.DataFrame(columns=["label"])
        
        self.IncomeExcel = self.readIncomeExcel()
        self.IncomeExcel = self.getFilingsPostDate()
        
        self.addData()
        self.performIncomeMath()
        self.reorderQuarters()
        
    def readIncomeExcel(self):
        self.path = os.path.join("input",f"Financial Statement {self.ticker}")
        file = os.path.join(self.path, f"IncomeStatements-{self.ticker}.xlsx")
        IncomeExcel = pd.read_excel(file,sheet_name=None)
        return IncomeExcel
    
    def addData(self):
        for quarter,sheet in list(self.IncomeExcel.items()):
            quarter =  re.sub("\([\w\W]+?\)","",quarter) #remove the parenthesis ie. (2021) from the quarter
            sheet.columns = ["label",quarter]
            
            sheeti = statementi = 0
            newStatement = pd.DataFrame(columns=["label"])
             
            #merge the current finalstatement and new sheet into one statement 
            #go through each row of the two statements and keep track of the indexes of each statement
            #continue until one of either the compiled statement or sheet ends
            while sheeti < len(sheet) and statementi < len(self.compiledStatement):
              
                sheet_label = sheet.iloc[sheeti].label
                statement_label = self.compiledStatement.iloc[statementi].label
                newRowStatement = self.compiledStatement.iloc[statementi:statementi+1].reset_index(drop=True) #grab the row and drop the index so that they merge
                newRowSheet = sheet.iloc[sheeti:sheeti+1].reset_index(drop=True)
                
                #check if the labels are the same 
                if statement_label == sheet_label:
                    #if labels are the same then create a new row by concating the rows
                    newRowSheet = newRowSheet.drop(columns="label")
                    newRow = pd.concat([newRowStatement,newRowSheet],axis=1)
                    
                    #concat the newRow to the newstatement
                    newStatement = pd.concat([newStatement, newRow])
                    #move on to the next row
                    sheeti+=1
                    statementi+=1
                    
                else:
                    #check if the sheet index is the last one so doesn't cause error
                    if sheeti+1<len(sheet):
                        #next, before I append the final statement label, I want to check if the statement label corresponds to the next label for the sheet
                        #if it corresponds, then I will move the sheet up instead
                        if statement_label == sheet.iloc[sheeti+1].label: 
                            newStatement = pd.concat([newStatement,newRowSheet]) #concat the sheet row to the new statement
                            sheeti += 1
                            continue
                    
                    #if the two labels don't match, default is to add the row of the final statment
                    newStatement = pd.concat([newStatement,newRowStatement])
                    statementi += 1
                    
                    
                    
            #if either the finalstatement or the new sheet hasn't finished, then add the end of the statement to the new one
            newStatement = pd.concat([newStatement,sheet.iloc[sheeti:len(sheet)]])
            newStatement = pd.concat([newStatement,self.compiledStatement.iloc[statementi:len(self.compiledStatement)]])
            
            self.compiledStatement = newStatement
            
        self.compiledStatement = self.compiledStatement.reset_index(drop=True)
    
    def getYears(self,quarters): #go through each file name and find the year of filing
        return [int(re.findall("\d{4}",quarter)[0]) for quarter in quarters]
    
    def getFilingsPostDate(self): #use IncomeExcel dictionary to get all dates and only keep where date is later than year specified 
        quarters = self.IncomeExcel.keys()
        fileDates = self.getYears(quarters) 
        files = [file for file, date in zip(self.IncomeExcel, fileDates) if date>self.fromDate or (date==self.fromDate and "Year Ended" in file)]
        IncomeExcel = {file:self.IncomeExcel[file] for file in files}
        return IncomeExcel
        
    def reorderQuarters(self):
        quarters = self.getQuarters()
        years = self.getYears(quarters)
        cols = [q + " " + str(year) for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"] if q + " " + str(year) in quarters]
        self.compiledStatement = self.compiledStatement[["label"] + cols]
        
    def getQuarters(self):
        return list(self.compiledStatement.columns[1:])
    
    def performIncomeMath(self):
        quarters = self.getQuarters()
        years = self.getYears(quarters)
        
        for year in years:
            year = str(year)
            if "Year Ended "+year in quarters and "Q3 "+year in quarters: 
                
                self.compiledStatement["Q4 "+year] = self.compiledStatement["Year Ended "+year] - self.compiledStatement["Q3 "+year]
                self.compiledStatement.loc[self.compiledStatement.label.str.startswith("weighted-average"),"Q4 "+year] = self.compiledStatement.loc[self.compiledStatement.label.str.startswith("weighted-average"),"Year Ended "+year]


            

In [231]:
income = Income("MSFT",2018)
finalIncome = income.compiledStatement
finalIncome.to_excel("output/raw/income.xlsx",index=False)
# finalIncome
finalIncome


['Year Ended 2018', 'Q1 2019 ', 'Q2 2019 ', 'Q3 2019', 'Year Ended 2019', 'Q1 2020 ', 'Q2 2020 ', 'Q3 2020', 'Year Ended 2020', 'Q1 2021 ', 'Q2 2021 ', 'Q3 2021', 'Year Ended 2021', 'Q1 2022 ', 'Q2 2022 ', 'Q3 2022', 'Q4 2019', 'Q4 2020', 'Q4 2021']
False
['Year Ended 2018', 'Q3 2019', 'Q4 2019', 'Year Ended 2019', 'Q3 2019', 'Q4 2019', 'Year Ended 2019', 'Q3 2019', 'Q4 2019', 'Year Ended 2019', 'Q3 2019', 'Q4 2019', 'Year Ended 2019', 'Q3 2020', 'Q4 2020', 'Year Ended 2020', 'Q3 2020', 'Q4 2020', 'Year Ended 2020', 'Q3 2020', 'Q4 2020', 'Year Ended 2020', 'Q3 2020', 'Q4 2020', 'Year Ended 2020', 'Q3 2021', 'Q4 2021', 'Year Ended 2021', 'Q3 2021', 'Q4 2021', 'Year Ended 2021', 'Q3 2021', 'Q4 2021', 'Year Ended 2021', 'Q3 2021', 'Q4 2021', 'Year Ended 2021', 'Q3 2022', 'Q3 2022', 'Q3 2022', 'Q3 2019', 'Q4 2019', 'Year Ended 2019', 'Q3 2020', 'Q4 2020', 'Year Ended 2020', 'Q3 2021', 'Q4 2021', 'Year Ended 2021']


Unnamed: 0,label,Year Ended 2018,Q3 2019,Q4 2019,Year Ended 2019,Q3 2019.1,Q4 2019.1,Year Ended 2019.1,Q3 2019.2,Q4 2019.2,...,Q3 2022,Q3 2019.3,Q4 2019.3,Year Ended 2019.2,Q3 2020,Q4 2020,Year Ended 2020,Q3 2021,Q4 2021,Year Ended 2021
0,Revenue,110360.0,30571.0,95272.0,125843.0,30571.0,95272.0,125843.0,30571.0,95272.0,...,49360.0,30571.0,95272.0,125843.0,35021.0,107994.0,143015.0,41706.0,126382.0,168088.0
1,Cost of revenue,38353.0,10170.0,32740.0,42910.0,10170.0,32740.0,42910.0,10170.0,32740.0,...,15615.0,10170.0,32740.0,42910.0,10975.0,35103.0,46078.0,13045.0,39187.0,52232.0
2,Gross margin,72007.0,20401.0,62532.0,82933.0,20401.0,62532.0,82933.0,20401.0,62532.0,...,33745.0,20401.0,62532.0,82933.0,24046.0,72891.0,96937.0,28661.0,87195.0,115856.0
3,Research and development,14726.0,4316.0,12560.0,16876.0,4316.0,12560.0,16876.0,4316.0,12560.0,...,6306.0,4316.0,12560.0,16876.0,4887.0,14382.0,19269.0,5204.0,15512.0,20716.0
4,Sales and marketing,17469.0,4565.0,13648.0,18213.0,4565.0,13648.0,18213.0,4565.0,13648.0,...,5595.0,4565.0,13648.0,18213.0,4911.0,14687.0,19598.0,5082.0,15035.0,20117.0
5,General and administrative,4754.0,1179.0,3706.0,4885.0,1179.0,3706.0,4885.0,1179.0,3706.0,...,1480.0,1179.0,3706.0,4885.0,1273.0,3838.0,5111.0,1327.0,3780.0,5107.0
6,Impairment and restructuring,0.0,,,,,,,,,...,,,,,,,,,,
7,Restructuring,,,,0.0,,,0.0,,,...,,,,0.0,,,,,,
8,Operating income,35058.0,10341.0,32618.0,42959.0,10341.0,32618.0,42959.0,10341.0,32618.0,...,20364.0,10341.0,32618.0,42959.0,12975.0,39984.0,52959.0,17048.0,52868.0,69916.0
9,"Other income (expense), net",1416.0,,,,,,,,,...,-174.0,,,,-132.0,,,188.0,,


In [195]:
income.IncomeExcel.keys()

dict_keys(['Year Ended 2017', 'Q1 2018 (2017)', 'Q2 2018 (2017)', 'Q3 2018', 'Year Ended 2018', 'Q1 2019 (2018)', 'Q2 2019 (2018)', 'Q3 2019', 'Year Ended 2019', 'Q1 2020 (2019)', 'Q2 2020 (2019)', 'Q3 2020', 'Year Ended 2020', 'Q1 2021 (2020)', 'Q2 2021 (2020)', 'Q3 2021', 'Year Ended 2021', 'Q1 2022 (2021)', 'Q2 2022 (2021)', 'Q3 2022'])

In [170]:
k = {1:2,3:4}
i = [1,2]
list(zip(i,k))

[(1, 1), (2, 3)]

In [None]:
pd.concat()

In [118]:



pd.concat([income.newRowStatement,income.newRowSheet.reset_index(drop=True)],axis=1)

Unnamed: 0,label,Year Ended 2017,Q1 2018 (2017)
24,Cash dividends declared per common share,1.56,
0,,,0.42


In [132]:
class Balance(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    def __init__(self):
        self.total_keyword = "total"
        self.finalStatement = pd.DataFrame(columns=["title","title_order","label"])
        
        self.BalanceStatements = self.getBalanceStatements()
        self.addData(self.BalanceStatements)
        
        self.rearrangeFinalStatement()
        
        self.setQuarters()
        
        self.performBalanceMath()
        
        self.reorderQuarters()
        
      
    def getBalanceStatements(self):
        FinancialReports = self.getFinancialStatements()
        BalanceStatements = [[report.getBalanceSheet().copy(), report.file] for report in FinancialReports]
        return BalanceStatements
    
    def parseQuarter(self,date):
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["Apr","May"]:
            return "Q1 "+str(int(year)+1)
        
        if mo in ["Jul","Aug"]:
            return "Q2 "+str(int(year)+1)
        
        if mo in ["Oct","Nov"]:
            return "Q3 "+str(int(year)+1)
        
        if mo in ["Jan","Feb"]:
            return "Year Ended "+str(int(year))
        
        
    def performBalanceMath(self):
        years = self.getYears()[1:]
        for year in years:
            if "Year Ended "+year not in self.getQuarterHeaders(): 
                continue
            self.finalStatement["Q4 "+year] = self.finalStatement["Year Ended "+year]
            
    def getAccumulatedDepreciation(self):
        Depreciation = {}
        quarters = []
        for sheet, file in self.BalanceStatements:
            sheet.label = [label.lower() for label in sheet.label]
            date = sheet.columns[1]
            quarter = self.parseQuarter(date) 
            quarters.append(quarter)
            label = sheet.loc[sheet.label.str.contains("property and equipment, net of accumulated depreciation"),"label"]
            if len(label)>0:
                label = label.iloc[0]
                dep = re.findall("\d+,*\d*",label)
                if len(dep) > 0:
                    dep = int(dep[0].replace(",",""))
                    Depreciation[quarter] = dep
                    continue
                   
            dep = sheet.loc[(sheet.label == "property and equipment, accumulated depreciation") | (sheet.label=="accumulated depreciation, depletion and amortization, property, plant, and equipment"), date]
            if len(dep) > 0:
                dep = dep.iloc[0]
                Depreciation[quarter] = dep
                if "Year Ended" in quarter:
                    Depreciation["Q4 "+quarter.split(" ")[-1]] = dep
                continue
        #dates = sorted(Depreciation,key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'))
        years = list(set([x.split(" ")[-1] for x in quarters]))
        years.sort()
        cols = { q+" "+year : [Depreciation.get(q + " " + year, np.nan)] for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"] if q + " " + year in quarters}
        self.Depreciation = pd.DataFrame(cols)
        return self.Depreciation
            
            
        

In [133]:
balance = Balance()
balance.getAccumulatedDepreciation().to_excel("output/raw/depreciation.xlsx")


q3 2022 (2021).xlsx
q3 2019 (2018).xlsx
q3 2021 (2020).xlsx
q3 2018 (2017).xlsx
q1 2020 (2019).xlsx
q2 2022 (2021).xlsx
q2 2018 (2017).xlsx
q2 2021 (2020).xlsx
10k 2020 (2019).xlsx
q2 2019 (2018).xlsx
q1 2023 (2022).xlsx
q1 2022 (2021).xlsx
q3 2020 (2019).xlsx
q1 2021 (2020)xlsx.xlsx
q1 2018 (2017).xlsx
q1 2019 (2018).xlsx
10k 2022 (2021).xlsx
10k 2019 (2018).xlsx
q2 2020 (2019).xlsx
10k 2017 (2016).xlsx
10k 2018 (2017).xlsx
10k 2021 (2020).xlsx


In [134]:
balance = Balance()
finalBalance = balance.finalStatement
finalBalance.to_excel("output/raw/balance.xlsx")
finalBalance


q3 2022 (2021).xlsx
q3 2019 (2018).xlsx
q3 2021 (2020).xlsx
q3 2018 (2017).xlsx
q1 2020 (2019).xlsx
q2 2022 (2021).xlsx
q2 2018 (2017).xlsx
q2 2021 (2020).xlsx
10k 2020 (2019).xlsx
q2 2019 (2018).xlsx
q1 2023 (2022).xlsx
q1 2022 (2021).xlsx
q3 2020 (2019).xlsx
q1 2021 (2020)xlsx.xlsx
q1 2018 (2017).xlsx
q1 2019 (2018).xlsx
10k 2022 (2021).xlsx
10k 2019 (2018).xlsx
q2 2020 (2019).xlsx
10k 2017 (2016).xlsx
10k 2018 (2017).xlsx
10k 2021 (2020).xlsx


Unnamed: 0,title,label,Year Ended 2017,Q1 2018,Q2 2018,Q3 2018,Q4 2018,Year Ended 2018,Q1 2019,Q2 2019,...,Q2 2021,Q3 2021,Q4 2021,Year Ended 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022,Year Ended 2022,Q1 2023
0,,cash and cash equivalents,1783.0,1583.0,1609.0,1353.0,1783.0,1783.0,1210.0,1322.0,...,2188.0,2471.0,0.0,0.0,2066.0,2375.0,801.0,877.0,877.0,845.0
1,,short-term investments,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,410.0,410.0,0.0,0.0,275.0,0.0,0.0,0.0
2,,merchandise inventory,1830.0,1961.0,2051.0,2476.0,1997.0,1997.0,2035.0,2202.0,...,2242.0,2747.0,2451.0,2451.0,2370.0,2281.0,2721.0,3018.0,3018.0,3169.0
3,,other current assets,702.0,575.0,598.0,654.0,788.0,788.0,778.0,780.0,...,882.0,966.0,0.0,0.0,1091.0,1201.0,1410.0,1270.0,1270.0,991.0
4,,"available-for-sale securities, current",0.0,0.0,0.0,0.0,0.0,0.0,164.0,286.0,...,25.0,178.0,0.0,0.0,475.0,337.0,0.0,0.0,0.0,0.0
5,,"cash and cash equivalents, at carrying value",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1988.0,1988.0,0.0,0.0,0.0,0.0,0.0,0.0
6,,"other assets, current",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1159.0,1159.0,0.0,0.0,0.0,0.0,0.0,0.0
7,,total current assets,4315.0,4119.0,4258.0,4483.0,4568.0,4568.0,4187.0,4590.0,...,5337.0,6362.0,6008.0,6008.0,6002.0,6194.0,5207.0,5165.0,5165.0,5005.0
8,,"property and equipment, net of accumulated dep...",0.0,2605.0,2643.0,2686.0,0.0,0.0,2791.0,2832.0,...,2895.0,2846.0,2841.0,2841.0,2839.0,2897.0,2924.0,3037.0,3037.0,2791.0
9,,"operating lease, right-of-use asset",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4689.0,4460.0,4217.0,4217.0,4060.0,3975.0,3788.0,0.0,0.0,3587.0


In [137]:
class CashFlow(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    def __init__(self):
        self.total_keyword = "net"
        self.finalStatement = pd.DataFrame(columns=["title","title_order","label"])
        
        self.CashFlowStatements = self.getCashFlowStatements()
        self.addData(self.CashFlowStatements)
        
        self.rearrangeFinalStatement()
        
        self.setQuarters()
        
        self.performCashFlowMath()
        
        self.reorderQuarters()
        
      
    def getCashFlowStatements(self):
        FinancialReports = self.getFinancialStatements()
        CashFlowStatements = [[report.getCashFlowSheet().copy(), report.file] for report in FinancialReports]
        return CashFlowStatements
    
    def parseQuarter(self,date):
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["Apr","May"]:
            return "Q1 "+str(int(year)+1)
        
        if mo in ["Jul","Aug"]:
            return "6mo "+str(int(year)+1)
        
        if mo in ["Oct","Nov"]:
            return "9mo "+str(int(year)+1)
        
        if mo in ["Jan","Feb"]:
            return "Year Ended "+str(int(year))     
    def performCashFlowMath(self):
        years = self.getYears()[1:]
        begin_cash = self.finalStatement.label.apply(lambda label: "cash" in label and "beginning of period" in label)
        data = [None, "cash at beginning of period"] + self.finalStatement.loc[begin_cash,self.getQuarterHeaders()].sum().tolist()
        self.finalStatement = self.finalStatement.drop(self.finalStatement.loc[begin_cash].index)
        self.finalStatement.loc[len(self.finalStatement)] = data

        end_cash = self.finalStatement.label.apply(lambda label: "cash" in label and "end of period" in label)
        data = [None, "cash at end of period"] + self.finalStatement.loc[end_cash,self.getQuarterHeaders()].sum().tolist()
        self.finalStatement = self.finalStatement.drop(self.finalStatement.loc[end_cash].index)
        self.finalStatement.loc[len(self.finalStatement)] = data
        
        self.finalStatement = self.finalStatement.set_index("label")
        for year in years:
            if "Year Ended "+year not in self.getQuarterHeaders(): 
                continue
            # find Q2 
            self.finalStatement["Q2 "+year] = self.finalStatement["6mo "+year] - self.finalStatement["Q1 "+year]
            self.finalStatement.loc["cash at beginning of period","Q2 "+year] = self.finalStatement.loc["cash at end of period","Q1 "+year]
            self.finalStatement.loc["cash at end of period","Q2 "+year] = self.finalStatement.loc["cash at end of period","6mo "+year]
            
            #find Q3 
            self.finalStatement["Q3 "+year] = self.finalStatement["9mo "+year] - self.finalStatement["6mo "+year]
            self.finalStatement.loc["cash at beginning of period","Q3 "+year] = self.finalStatement.loc["cash at end of period", "6mo "+year]
            self.finalStatement.loc["cash at end of period","Q3 "+year] = self.finalStatement.loc["cash at end of period","9mo "+year]
            
            
            #find Q3 
            self.finalStatement["Q4 "+year] = self.finalStatement["Year Ended "+year] - self.finalStatement["9mo "+year]
            self.finalStatement.loc["cash at beginning of period","Q4 "+year] = self.finalStatement["9mo "+year].loc["cash at end of period"]
            self.finalStatement.loc["cash at end of period","Q4 "+year] = self.finalStatement.loc["cash at end of period","Year Ended "+year]
        
        self.finalStatement = self.finalStatement.reset_index()
        self.finalStatement = self.finalStatement[["title","label"]+self.getQuarterHeaders()]
        

In [138]:
cashflow = CashFlow()
finalCashflow = cashflow.finalStatement
finalCashflow.to_excel("output/cashflow.xlsx")
finalCashflow

q3 2022 (2021).xlsx
q3 2019 (2018).xlsx
q3 2021 (2020).xlsx
q3 2018 (2017).xlsx
q1 2020 (2019).xlsx
q2 2022 (2021).xlsx
q2 2018 (2017).xlsx
q2 2021 (2020).xlsx
10k 2020 (2019).xlsx
q2 2019 (2018).xlsx
q1 2023 (2022).xlsx
q1 2022 (2021).xlsx
q3 2020 (2019).xlsx
q1 2021 (2020)xlsx.xlsx
q1 2018 (2017).xlsx
q1 2019 (2018).xlsx
10k 2022 (2021).xlsx
10k 2019 (2018).xlsx
q2 2020 (2019).xlsx
10k 2017 (2016).xlsx
10k 2018 (2017).xlsx
10k 2021 (2020).xlsx


Unnamed: 0,title,label,Year Ended 2017,Q1 2018,Q2 2018,Q3 2018,Q4 2018,Year Ended 2018,Q1 2019,Q2 2019,...,Q2 2021,Q3 2021,Q4 2021,Year Ended 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022,Year Ended 2022,Q1 2023
0,,net income,676,143,271,229,205,848,164,297,...,-62,95,234,-665,166,258,-152,-16,256,-162
1,,depreciation and amortization,593,138,141,139,141,559,140,140,...,126,125,126,507,120,124,128,132,504,130
2,,share-based compensation,76,20,22,18,27,87,21,27,...,17,20,22,77,36,36,25,42,139,1
3,,"operating lease, impairment loss",0,0,0,0,0,0,0,0,...,1,0,30,391,5,1,0,-6,0,0
4,,other asset impairment charges,107,0,0,0,28,28,0,0,...,3,0,8,135,0,1,0,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,income taxes paid,0,0,0,0,0,0,0,0,...,0,0,20,20,0,0,0,0,0,0
668,,"cash paid for income taxes during the period, ...",488,35,95,130,310,570,19,42,...,16,-45,-8,0,20,127,34,34,215,-420
669,,cash at beginning of period,1370,1783,1583,1609,1353,1783,1799,1229,...,1048,2241,2499,1381,2016,2096,2407,829,2016,902
670,,"operating lease, payments",0,0,0,0,0,0,0,0,...,0,0,1096,1096,0,0,0,0,0,0


In [848]:
finalCashflow["label"].tolist()

['cash and cash equivalents',
 'short-term investments',
 'merchandise inventory',
 'other current assets',
 'available-for-sale securities, current',
 'total current assets',
 'property and equipment, net of accumulated depreciation',
 'operating lease, right-of-use asset',
 'other long-term assets',
 'accumulated depreciation, depletion and amortization, property, plant, and equipment',
 'operating lease assets',
 'total assets',
 'line of credit facility, fair value of amount outstanding',
 'accounts payable',
 'accrued expenses and other current liabilities',
 'operating lease, liability, current',
 'income taxes payable',
 'current portion of operating lease liabilities',
 'total current liabilities',
 'long-term debt',
 'lease incentives and other long-term liabilities',
 'long-term operating lease liabilities',
 'other long-term liabilities',
 'total long-term debt',
 'total long-term liabilities',
 'operating lease, liability, noncurrent',
 'lease incentives and other long-term

In [856]:
finalCashflow

Unnamed: 0,title,label,Year Ended 2016,Q1 2017,6mo 2017,9mo 2017,Year Ended 2017,Q1 2018,6mo 2018,9mo 2018,...,9mo 2020,Year Ended 2020,Q1 2021,6mo 2021,9mo 2021,Year Ended 2021,Q1 2022,6mo 2022,9mo 2022,Year Ended 2022
0,Cash flows from operating activities,net income,920,127,252,456,676,143,414,643,...,535,351,-932,-994,-899,-665,166,424,272,256
1,Adjustments to reconcile net income to net cas...,depreciation and amortization,592,148,303,449,593,138,279,418,...,417,557,130,256,381,507,120,244,372,504
2,Adjustments to reconcile net income to net cas...,share-based compensation,76,15,36,55,76,20,42,60,...,64,68,18,35,55,77,36,72,97,139
3,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",0,0,0,0,0,0,0,0,...,1,239,360,361,361,0,5,6,6,0
4,Adjustments to reconcile net income to net cas...,other asset impairment charges,54,0,0,0,107,0,0,0,...,9,98,124,127,127,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",452,43,143,318,488,35,130,260,...,117,176,37,53,8,20,20,147,181,215
95,Supplemental disclosure of cash flow information,cash paid for interest during the period,78,39,41,80,82,38,38,76,...,0,76,0,0,0,145,0,0,0,180
96,Supplemental disclosure of cash flow information,"operating lease, payments",0,0,0,0,0,0,0,0,...,0,1244,0,0,0,0,0,0,0,0
97,Supplemental disclosure of cash flow information,cash paid for operating lease liabilities,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1096,0,0,0,1061
