In [7]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import tabulate
lemmatizer = WordNetLemmatizer()

In [8]:
class FinancialReport():
    def __init__(self,file):
        self.file = file
        self.readFile()
        self.findBalanceSheet()
        self.findCashFlowSheet()
    def readFile(self):
        path = os.path.join("financial statements",self.file)
        self.statements = pd.read_excel(path,sheet_name=None)
    def getSheetNames(self):
        return self.statements.keys()
    def findBalanceSheet(self):
        balanceSheetName = [name for name in self.getSheetNames() if "BALANCE" in name][0]
        self.balance = self.statements[balanceSheetName]
        self.balance.columns = ["label"]+list(self.balance.columns[1:])
    def findCashFlowSheet(self):
        cashflowSheetName = [name for name in self.getSheetNames() if "CASH" in name]
        if len(cashflowSheetName) > 0:
            self.cashflow = self.statements[cashflowSheetName[0]]
        else:
            cashflowSheetName = [name for name in self.getSheetNames() if "CONDENSED" in name][-1]
            self.cashflow = self.statements[cashflowSheetName]
        self.cashflow.columns = self.cashflow.iloc[0]
        self.cashflow = self.cashflow.drop(0)
        self.cashflow.columns = ["label"]+list(self.cashflow.columns[1:])
    def getBalanceSheet(self):
        return self.balance
    def getCashFlowSheet(self):
        return self.cashflow
    

In [12]:
class HelperFunctions:
    def clean_up_string(self,string):
        string = string.strip()
        space_positions = [x.start() for x in re.finditer('\ ',  string)]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string
    
    def merge_number(self,a,b):
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)
    
    def getQuarterHeaders(self):
        quarters = [col for col in list(self.finalCashFlow.columns) if col not in ["title","label","num_title"]]
        return quarters
        
    def setQuarters(self):
        dates = self.getQuarterHeaders()
        dates.sort(key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'))
        self.finalCashFlow = self.finalCashFlow[["title","label","num_title"]+dates]
        self.finalCashFlow.columns = ["title","label","num_title"]+[self.parseQuarter(date) for date in self.getQuarterHeaders()]
    
    def parseQuarter(self,date):
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["May"]:
            return "Q1 "+str(int(year)+1)
        
        if mo in ["Jul","Aug"]:
            return "6mo "+str(int(year)+1)
        
        if mo in ["Oct","Nov"]:
            return "9mo "+str(int(year)+1)
        
        if mo in ["Jan","Feb"]:
            return "Year Ended "+str(int(year))
        

In [25]:
class CashFlow(HelperFunctions):
    def __init__(self):
        self.finalCashFlow = pd.DataFrame(columns=["title","num_title","label"])
        self.setFinancialStatements()
        self.addDataToCashFlowSheet()
        self.setQuarters()
        self.rearrangeFinalCashFlowSheet()
        self.performCashFlowMath()
        
      
    
    def getFiles(self):
        files = os.listdir("financial statements")
        files.remove(".DS_Store")
        files = [file for file in files if file[0] != "~"]
        return files

    def setFinancialStatements(self):
        self.files = self.getFiles()
        self.FinancialReports = [FinancialReport(file) for file in self.files]
        
    def addDataToCashFlowSheet(self):
        
        for FinancialStatements in self.FinancialReports:
            print(FinancialStatements.file)
            cashflow = FinancialStatements.getCashFlowSheet().copy()
            
            cols = ["label"]+[col for col in cashflow.columns[1:] if col not in self.finalCashFlow.columns] ## remove columns of data already in main balance sheet
            cashflow = cashflow[cols]
            
            cashflow = self.parseCashFlowStatement(cashflow)
            
            self.finalCashFlow = pd.merge(self.finalCashFlow, cashflow, how="outer",on=["label","title"],suffixes=('', '_x'))
            
            self.finalCashFlow.num_title = [self.merge_number(self.finalCashFlow.num_title[i],self.finalCashFlow.num_title_x[i]) for i in range(len(self.finalCashFlow))] #merge the number for the titles. use greatest num bc bigger num means there is section before it
            self.finalCashFlow = self.finalCashFlow.drop(["num_title_x"],axis=1)

    def rearrangeFinalCashFlowSheet(self):
        self.finalCashFlow = self.finalCashFlow.replace(" ",np.nan)
        self.finalCashFlow = self.finalCashFlow.loc[~self.finalCashFlow[self.getQuarterHeaders()].isnull().all(axis=1)]
        
        #Make sure each title has same max number index
        self.reconcileTitleOrder()
        
        # put Totals at the bottom of each section
        self.reconcileLabelOrderWithTotals()
        
        self.finalCashFlow = self.finalCashFlow.sort_values(["num_title","num_label"]) # order by title and then location
        
        self.finalCashFlow = self.finalCashFlow.drop(["num_title","num_label"],axis=1)
        
        self.finalCashFlow = self.finalCashFlow.fillna(0)
        
    def parseCashFlowStatement(self,balance):
        num = 0
        index = []
        title = False
        for label in balance.label:
            if ":" in label:
                title = re.sub("\([\w\W]+\)","",label[:-1]).replace("  "," ")
                num += 1
                continue
            if title:
                new_label = label.lower()
                new_label = new_label.replace("Gain","Loss")
                new_label = re.sub("\([\w\W]+\)","",new_label).replace("  "," ").strip()
                index.append([title,num,label,new_label])
            
            if "net" == label[:3]:
                title = np.nan
                num += 1
    
        balance = balance.loc[balance.label.isin(np.array(index)[:,2])].copy()
        balance.loc[:,["title","num_title","label"]] = list(np.array(index)[:,[0,1,3]])
        balance = balance.set_index(["title","num_title","label"]).reset_index()
        return balance
    
        
    def reconcileTitleOrder(self):
        title_groupby = self.finalCashFlow.groupby("title")["num_title"].max().reset_index()
        self.finalCashFlow = self.finalCashFlow.merge(title_groupby, on="title", how="left", suffixes=('', '_x') )
        self.finalCashFlow["num_title"] = [self.merge_number(self.finalCashFlow.num_title[i],self.finalCashFlow.num_title_x[i]) for i in range(len(self.finalCashFlow))] #merge the number for the titles. 
        self.finalCashFlow = self.finalCashFlow.drop(["num_title_x"],axis=1)
        
    def reconcileLabelOrderWithTotals(self,keyword="net"):
        totals = self.finalCashFlow.loc[self.finalCashFlow.label.str.startswith(keyword)].reset_index(drop=True) 
        self.finalCashFlow = self.finalCashFlow.loc[~self.finalCashFlow.label.str.startswith(keyword)].reset_index(drop=True)
        self.finalCashFlow = pd.concat([self.finalCashFlow,totals]) 
        self.finalCashFlow = self.finalCashFlow.reset_index(drop=True).reset_index().rename(columns={"index":"num_label"})
        
    def getYears(self):
        return set([x.split(" ")[-1] for x in self.getQuarterHeaders()])
    def getDataPoint(self):
        pass
    def performCashFlowMath(self):
        years = self.getYears()
        for year in years:
            if "Q1 "+year not in self.getQuarterHeaders(): 
                continue
            # find Q2 
            self.finalCashFlow["Q2 "+year-1] = self.finalCashFlow["6mo "+year-1] - self.finalCashFlow["Q1 "+year-1]
            #find Q3 
            self.finalCashFlow["Q2 "+year-1] = self.finalCashFlow["6mo "+year-1] - self.finalCashFlow["Q1 "+year-1]
    
        
        

In [24]:
cashflow = CashFlow()
finalCashFlow = cashflow.finalCashFlow
finalCashFlow


q3 2021.xlsx
q3 2020.xlsx
q1 2019.xlsx
q2 2021.xlsx
q2 2020.xlsx
10k 2020 (2019).xlsx
10k 2019 (2020).xlsx
q1 2021.xlsx
q2 2019.xlsx
q1 2020.xlsx
10k 2022 (2021).xlsx
q3 2019.xlsx
10k 2021 (2022).xlsx


Unnamed: 0,title,label,Year Ended 2017,Year Ended 2018,Q1 2019,6mo 2019,9mo 2019,Year Ended 2019,Q1 2020,6mo 2020,...,9mo 2021,Year Ended 2021,Q1 2022,6mo 2022,9mo 2022,Year Ended 2022,Q2 2020,Q2 2019,Q2 2022,Q2 2021
78,Cash flows from operating activities,net income,676,848,164,461,727,1003,227,395,...,-899,-665,166,424,272,256,168,297,258,-62
0,Adjustments to reconcile net income to net cas...,depreciation and amortization,593,559,140,280,425,578,138,277,...,381,507,120,244,372,504,139,140,124,126
1,Adjustments to reconcile net income to net cas...,share-based compensation,76,87,21,48,72,91,24,47,...,55,77,36,72,97,139,23,27,36,17
2,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",,0,,,,0,,0,...,361,,5,6,6,,,,1,1
3,Adjustments to reconcile net income to net cas...,other asset impairment charges,107,28,,,,14,,3,...,127,,0,1,1,,,,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,Supplemental disclosure of cash flow information,"interest paid, excluding capitalized interest,...",,,,38,77,,,38,...,41,,2,102,178,,,,100,1
35,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",488,570,19,61,73,143,18,90,...,8,20,20,147,181,215,72,42,127,16
45,Supplemental disclosure of cash flow information,cash paid for interest during the period,82,76,38,,,76,38,,...,,145,,,,180,,,,
46,Supplemental disclosure of cash flow information,"operating lease, payments",,0,0,0,0,0,301,,...,,,,,,,,0,,


In [485]:
finalCashFlow.to_csv("finalCashFlow.csv")

In [499]:
3 - np.nan

nan

In [20]:
finalCashFlow

Unnamed: 0,title,label,Year Ended 2017,Year Ended 2018,Q1 2019,6mo 2019,9mo 2019,Year Ended 2019,Q1 2020,6mo 2020,9mo 2020,Year Ended 2020,Q1 2021,6mo 2021,9mo 2021,Year Ended 2021,Q1 2022,6mo 2022,9mo 2022,Year Ended 2022
78,Cash flows from operating activities,net income,676,848,164,461,727,1003,227,395,535,351,-932,-994,-899,-665,166,424,272,256
0,Adjustments to reconcile net income to net cas...,depreciation and amortization,593,559,140,280,425,578,138,277,417,557,130,256,381,507,120,244,372,504
1,Adjustments to reconcile net income to net cas...,share-based compensation,76,87,21,48,72,91,24,47,64,68,18,35,55,77,36,72,97,139
2,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",,0,,,,0,,0,1,239,360,361,361,,5,6,6,
3,Adjustments to reconcile net income to net cas...,other asset impairment charges,107,28,,,,14,,3,9,98,124,127,127,,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,Supplemental disclosure of cash flow information,"interest paid, excluding capitalized interest,...",,,,38,77,,,38,75,,38,39,41,,2,102,178,
35,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",488,570,19,61,73,143,18,90,117,176,37,53,8,20,20,147,181,215
45,Supplemental disclosure of cash flow information,cash paid for interest during the period,82,76,38,,,76,38,,,76,,,,145,,,,180
46,Supplemental disclosure of cash flow information,"operating lease, payments",,0,0,0,0,0,301,,,1244,,,,,,,,
