In [2]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import tabulate
# Module-level WordNet lemmatizer instance.
# NOTE(review): none of the cells visible in this notebook reference it — confirm it is still needed.
lemmatizer = WordNetLemmatizer()

In [3]:
class FinancialReport():
    """One Excel workbook of financial statements, split into its three sheets.

    The workbook is read from the ``financial statements`` directory with every
    sheet loaded; the balance, cash-flow and income sheets are then located by
    keyword in the sheet name and exposed through the ``get*Sheet`` accessors.
    In every exposed frame the first column is renamed to ``"label"``.
    """

    def __init__(self,file):
        """Load ``file`` and locate its balance, cash-flow and income sheets.

        Args:
            file: workbook file name, relative to the ``financial statements`` directory.

        Raises:
            IndexError: if one of the three sheets cannot be found by name.
        """
        self.file = file
        self.readFile()
        self.findBalanceSheet()
        self.findCashFlowSheet()
        self.findIncomeSheet()

    def readFile(self):
        """Read every sheet of the workbook into ``self.statements`` (sheet name -> DataFrame)."""
        path = os.path.join("financial statements",self.file)
        self.statements = pd.read_excel(path,sheet_name=None)

    def getSheetNames(self):
        """Return the sheet names of the loaded workbook."""
        return self.statements.keys()

    def _sheetNamesContaining(self, keyword):
        """Return the sheet names that contain ``keyword``, in workbook order."""
        return [name for name in self.getSheetNames() if keyword in name]

    def _pickSheetName(self, matches, keyword, position):
        """Return ``matches[position]``, or raise a descriptive IndexError when ``matches`` is empty."""
        if not matches:
            # Same exception type as the bare list lookup would raise, but with
            # enough context to tell which workbook and keyword failed.
            raise IndexError(
                "{}: no sheet name contains {!r} (available sheets: {})".format(
                    self.file, keyword, list(self.getSheetNames())
                )
            )
        return matches[position]

    @staticmethod
    def _useFirstRowAsHeader(frame):
        """Promote the first data row to column headers and name the first column ``label``.

        The column labels of the frame passed in are overwritten with its first
        row; the returned frame is a new one with the row labelled 0 dropped.
        """
        frame.columns = frame.iloc[0]
        frame = frame.drop(0)
        frame.columns = ["label"]+list(frame.columns[1:])
        return frame

    def findBalanceSheet(self):
        """Store the first sheet whose name contains ``BALANCE`` as ``self.balance``.

        Only the first column is renamed; the existing header row is kept.
        """
        matches = self._sheetNamesContaining("BALANCE")
        self.balance = self.statements[self._pickSheetName(matches, "BALANCE", 0)]
        self.balance.columns = ["label"]+list(self.balance.columns[1:])

    def findIncomeSheet(self):
        """Store the first sheet whose name contains ``INCOME`` as ``self.income``.

        The first data row is promoted to the header. The file name is printed first.
        """
        print(self.file)
        matches = self._sheetNamesContaining("INCOME")
        sheet = self.statements[self._pickSheetName(matches, "INCOME", 0)]
        self.income = self._useFirstRowAsHeader(sheet)

    def findCashFlowSheet(self):
        """Store the cash-flow sheet as ``self.cashflow``.

        Uses the first sheet whose name contains ``CASH``; when there is none,
        falls back to the last sheet whose name contains ``CONDENSED``. The
        first data row is promoted to the header.
        """
        cashMatches = self._sheetNamesContaining("CASH")
        if len(cashMatches) > 0:
            sheet = self.statements[cashMatches[0]]
        else:
            condensedMatches = self._sheetNamesContaining("CONDENSED")
            sheet = self.statements[self._pickSheetName(condensedMatches, "CASH' or 'CONDENSED", -1)]
        self.cashflow = self._useFirstRowAsHeader(sheet)

    def getBalanceSheet(self):
        """Return the balance sheet DataFrame."""
        return self.balance

    def getCashFlowSheet(self):
        """Return the cash-flow DataFrame."""
        return self.cashflow

    def getIncomeSheet(self):
        """Return the income statement DataFrame."""
        return self.income
    

In [4]:
class HelperFunctions:
    """Small utilities shared by the statement-processing classes."""

    def clean_up_string(self,string):
        """Strip whitespace and drop trailing ``of`` / ``and`` words from ``string``.

        The spaces are visited from right to left; whenever the text after a
        space is exactly ``"of"`` or ``"and"`` the string is cut at that space,
        so a run of such trailing words is removed one by one.
        """
        string = string.strip()
        space_positions = [i for i, char in enumerate(string) if char == " "]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string

    def merge_number(self,a,b):
        """Return the larger of ``a`` and ``b``, ignoring a null operand.

        If ``a`` is null ``b`` is returned (even when ``b`` is null too); if
        only ``b`` is null ``a`` is returned.
        """
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)

    def getFiles(self):
        """List the workbook file names in the ``financial statements`` directory.

        ``.DS_Store`` and names starting with ``~`` are excluded. Unlike a plain
        ``list.remove``, this does not fail when ``.DS_Store`` is absent.
        """
        files = os.listdir("financial statements")
        return [file for file in files if file != ".DS_Store" and not file.startswith("~")]

    def getFinancialStatements(self):
        """Build one ``FinancialReport`` per file returned by ``getFiles``."""
        files = self.getFiles()
        FinancialReports = [FinancialReport(file) for file in files]
        return FinancialReports
        

In [5]:
class ProcessQuarters:
    """Mixin that orders and renames the period columns of ``self.finalStatement``.

    Expects the host class to provide ``self.finalStatement`` (a DataFrame) and,
    for ``setQuarters``, a ``parseQuarter(date)`` method.
    """

    def getQuarterHeaders(self):
        """Return every column except ``title``, ``label`` and ``title_order``."""
        quarters = [col for col in list(self.finalStatement.columns) if col not in ["title","label","title_order"]]
        return quarters

    def includesYear(self, string):
        """Return True when the last four characters of ``string`` are all digits."""
        return string[-4:].isdigit()

    def getLabelHeaders(self):
        """Return the columns whose name does not end in a four-digit year."""
        headers = [col for col in list(self.finalStatement.columns) if not self.includesYear(col)]
        return headers

    def setQuarters(self):
        """Sort the period columns chronologically and rename them via ``parseQuarter``.

        Column names are parsed as ``'%b %d, %Y'`` after removing any ``.``.
        """
        dates = self.getQuarterHeaders()
        headers = self.getLabelHeaders()
        dates.sort(key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'))
        self.finalStatement = self.finalStatement[headers+dates]
        # After the reorder getQuarterHeaders() yields the dates in sorted order.
        self.finalStatement.columns = headers+[self.parseQuarter(date) for date in self.getQuarterHeaders()]

    def reorderQuarters(self):
        """Reorder the period columns year by year as Q1, Q2, Q3, Q4, Year Ended.

        Only years that have a ``Q1 <year>`` column are kept. Years are visited
        in ascending order; ``getYears`` returns a set, whose iteration order is
        arbitrary, so it is sorted here to make the column order deterministic.

        Raises:
            KeyError: if one of the five expected columns is missing for a kept year.
        """
        headers = self.getLabelHeaders()
        quarterHeaders = self.getQuarterHeaders()
        years = [year for year in sorted(self.getYears()) if "Q1 "+year in quarterHeaders]
        cols = [q + " " + year for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"]]
        cols = headers + cols
        self.finalStatement = self.finalStatement[cols]

    def getYears(self):
        """Return the set of last space-separated tokens of the period column names."""
        return set([x.split(" ")[-1] for x in self.getQuarterHeaders()])
    
    
   

In [6]:
class FinancialDataMerge:
    """Mixin that merges per-report statement sheets into ``self.finalStatement``.

    Expects the host class to provide ``self.finalStatement``,
    ``self.total_keyword``, ``merge_number``, ``clean_up_string`` and
    ``getQuarterHeaders``.
    """

    def addData(self, FinancialReports):
        """Outer-merge every ``[sheet, file]`` pair into ``self.finalStatement``.

        Columns already present in the accumulated statement are dropped from
        each sheet before it is parsed and merged on ``label`` and ``title``.
        When both sides carry a ``title_order`` the larger one is kept.
        """
        for [sheet,file] in FinancialReports:
            print(file)
            cols = ["label"]+[col for col in sheet.columns[1:] if col not in self.finalStatement.columns] ## remove columns of data already in main sheet sheet
            sheet = sheet[cols]

            sheet = self.parseStatement(sheet)

            self.finalStatement = pd.merge(self.finalStatement, sheet, how="outer",on=["label","title"],suffixes=('', '_x'))

            self.finalStatement.title_order = [self.merge_number(self.finalStatement.title_order[i],self.finalStatement.title_order_x[i]) for i in range(len(self.finalStatement))] #merge the number for the titles. use greatest num bc bigger num means there is section before it
            self.finalStatement = self.finalStatement.drop(["title_order_x"],axis=1)

    def rearrangeFinalStatement(self):
        """Drop empty rows and sort the merged statement by section, then by row.

        Cells equal to ``" "`` are treated as missing; rows with no value in any
        period column are removed. The helper columns ``title_order`` and
        ``label_order`` are used for sorting and then dropped.
        """
        self.finalStatement = self.finalStatement.replace(" ",np.nan)
        self.finalStatement = self.finalStatement.loc[~self.finalStatement[self.getQuarterHeaders()].isnull().all(axis=1)]

        #Make sure each title has same max number index
        self.reconcileTitleOrder()

        # put Totals at the bottom of each section
        self.reconcileLabelOrderWithTotals()

        self.finalStatement[["title_order","label_order"]] = self.finalStatement[["title_order","label_order"]].astype("int")
        self.finalStatement = self.finalStatement.sort_values(["title_order","label_order"]) # order by title and then location

        self.finalStatement = self.finalStatement.drop(["title_order","label_order"],axis=1)

        #self.finalStatement = self.finalStatement.fillna(0)

    def parseStatement(self,sheet):
        """Attach a section title, section order and normalised label to each data row.

        A label containing ``:`` starts a new section: its text minus the last
        character and any parenthesised part becomes the title, and the section
        counter is incremented. Rows before the first title, title rows, and
        rows whose label is not a string are dropped. For the remaining rows the
        label is lower-cased, ``gain`` is replaced by ``loss``, parenthesised
        text, `` $`` and digit runs are removed, and ``clean_up_string`` is
        applied. A row whose cleaned label starts with ``self.total_keyword``
        closes the section: later rows get the title ``"nan"`` and the counter
        is incremented.

        Returns:
            A new DataFrame with columns ``title``, ``title_order`` (int),
            ``label`` followed by the remaining columns of ``sheet``.
        """
        parenthesised = r"\([\w\W]+\)"
        num = 0
        rows = []  # (row position in sheet, title, title_order, cleaned label)
        title = False
        for position, label in enumerate(sheet.label):
            if not isinstance(label, str):
                # Blank cells arrive as NaN; they carry no label to match on.
                continue
            if ":" in label:
                title = re.sub(parenthesised,"",label[:-1]).replace("  "," ")
                num += 1
                continue
            if title:
                new_label = label.lower()
                new_label = new_label.replace("gain","loss")
                new_label = re.sub(parenthesised,"",new_label).replace("  "," ").strip()
                new_label = re.sub(r" \$|\s*\d+\,*", "", new_label)
                new_label = self.clean_up_string(new_label)
                # str() keeps the "nan" title spelling that reconcileTitleOrder filters on.
                rows.append((position, str(title), num, new_label))

                if self.total_keyword == new_label.split(" ")[0]:
                    title = np.nan
                    num += 1

        # Select by position rather than by label so a label that also occurs
        # before the first title cannot pull in an extra row, and keep
        # title_order numeric so later max() comparisons are not lexicographic.
        parsed = sheet.iloc[[row[0] for row in rows]].copy()
        parsed["title"] = [row[1] for row in rows]
        parsed["title_order"] = [row[2] for row in rows]
        parsed["label"] = [row[3] for row in rows]
        parsed = parsed.set_index(["title","title_order","label"]).reset_index()
        return parsed

    def reconcileTitleOrder(self):
        """Give every row of a title the largest ``title_order`` seen for that title.

        Rows whose title is the string ``"nan"`` are excluded from the per-title
        maximum and keep their own order.
        """
        title_groupby = self.finalStatement.groupby("title")["title_order"].max().reset_index().dropna(axis=0, how='any')
        title_groupby = title_groupby.loc[title_groupby.title != "nan"]
        self.finalStatement = self.finalStatement.merge(title_groupby, on="title", how="left", suffixes=('', '_x') )
        self.finalStatement["title_order"] = [self.merge_number(self.finalStatement.title_order[i],self.finalStatement.title_order_x[i]) for i in range(len(self.finalStatement))] #merge the number for the titles.
        self.finalStatement = self.finalStatement.drop(["title_order_x"],axis=1)

    def reconcileLabelOrderWithTotals(self):
        """Move rows whose label starts with ``self.total_keyword`` to the end and number all rows.

        The resulting position is stored in a new ``label_order`` column.
        """
        totals = self.finalStatement.loc[self.finalStatement.label.str.startswith(self.total_keyword)].reset_index(drop=True)
        self.finalStatement = self.finalStatement.loc[~self.finalStatement.label.str.startswith(self.total_keyword)].reset_index(drop=True)
        self.finalStatement = pd.concat([self.finalStatement,totals])
        self.finalStatement = self.finalStatement.reset_index(drop=True).reset_index().rename(columns={"index":"label_order"})
        
            

In [9]:
class Balance(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    """Merged balance sheet built from every workbook returned by ``getFinancialStatements``.

    After construction the result is available as ``self.finalStatement`` with
    period columns renamed by ``parseQuarter``.
    """

    def __init__(self):
        self.total_keyword = "total"
        self.finalStatement = pd.DataFrame(columns=["title","title_order","label"])

        # Attribute name kept for backward compatibility; it holds the
        # [sheet, file] pairs of the balance sheets.
        self.IncomeStatements = self.getBalanceStatements()
        self.addData(self.IncomeStatements)

        self.rearrangeFinalStatement()

        self.setQuarters()

        #self.reorderQuarters()

    def getBalanceStatements(self):
        """Return ``[balance sheet copy, file name]`` for every financial report.

        ``__init__`` calls this method; without it construction fails with
        AttributeError because no base class defines it.
        """
        FinancialReports = self.getFinancialStatements()
        BalanceStatements = [[report.getBalanceSheet().copy(), report.file] for report in FinancialReports]
        return BalanceStatements

    def getIncomeStatements(self):
        """Return ``[income sheet copy, file name]`` for every financial report."""
        FinancialReports = self.getFinancialStatements()
        IncomeStatements = [[report.getIncomeSheet().copy(), report.file] for report in FinancialReports]
        return IncomeStatements

    def parseQuarter(self,date):
        """Map a column date such as ``"Jan. 29, 2022"`` to a period label.

        The month is the first space-separated token with dots stripped; the
        year is the text after the last ``", "``.

        * May          -> ``"Q1 <year+1>"``
        * Jul, Aug     -> ``"Q2 <year+1>"``
        * Oct, Nov     -> ``"Q3 <year+1>"``
        * Jan, Feb     -> ``"Year Ended <year>"``

        Any other month falls through and returns None.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["May"]:
            return "Q1 "+str(int(year)+1)

        if mo in ["Jul","Aug"]:
            return "Q2 "+str(int(year)+1)

        if mo in ["Oct","Nov"]:
            return "Q3 "+str(int(year)+1)

        if mo in ["Jan","Feb"]:
            return "Year Ended "+str(int(year))
        

In [10]:
# NOTE(review): `Income` is not defined in any cell visible in this notebook, and the
# recorded output of this cell is `NameError: name 'Income' is not defined`.
# Confirm whether the defining cell was deleted or this cell is stale.
income = Income()
finalIncome = income.finalIncome
finalIncome


NameError: name 'Income' is not defined

In [33]:
# Write the cash-flow frame to finalCashFlow.csv in the working directory.
# NOTE(review): `finalCashFlow` is not created in any cell visible here, and this cell's
# execution count (33) is out of sequence with its neighbours — confirm it survives Restart & Run All.
finalCashFlow.to_csv("finalCashFlow.csv")

In [20]:
# Display the cash-flow frame.
# NOTE(review): `finalCashFlow` is not defined in the visible cells — confirm its source.
finalCashFlow

Unnamed: 0,title,label,Year Ended 2017,Year Ended 2018,Q1 2019,6mo 2019,9mo 2019,Year Ended 2019,Q1 2020,6mo 2020,9mo 2020,Year Ended 2020,Q1 2021,6mo 2021,9mo 2021,Year Ended 2021,Q1 2022,6mo 2022,9mo 2022,Year Ended 2022
78,Cash flows from operating activities,net income,676,848,164,461,727,1003,227,395,535,351,-932,-994,-899,-665,166,424,272,256
0,Adjustments to reconcile net income to net cas...,depreciation and amortization,593,559,140,280,425,578,138,277,417,557,130,256,381,507,120,244,372,504
1,Adjustments to reconcile net income to net cas...,share-based compensation,76,87,21,48,72,91,24,47,64,68,18,35,55,77,36,72,97,139
2,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",,0,,,,0,,0,1,239,360,361,361,,5,6,6,
3,Adjustments to reconcile net income to net cas...,other asset impairment charges,107,28,,,,14,,3,9,98,124,127,127,,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,Supplemental disclosure of cash flow information,"interest paid, excluding capitalized interest,...",,,,38,77,,,38,75,,38,39,41,,2,102,178,
35,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",488,570,19,61,73,143,18,90,117,176,37,53,8,20,20,147,181,215
45,Supplemental disclosure of cash flow information,cash paid for interest during the period,82,76,38,,,76,38,,,76,,,,145,,,,180
46,Supplemental disclosure of cash flow information,"operating lease, payments",,0,0,0,0,0,301,,,1244,,,,,,,,
