In [9]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import tabulate
# Module-level WordNet lemmatizer instance.
# NOTE(review): none of the cells visible in this section reference `lemmatizer` - confirm it is still needed.
lemmatizer = WordNetLemmatizer()

In [48]:
class FinancialReport():
    """One workbook of financial statements, split into balance, cash-flow and income sheets.

    The workbook is read from the ``financial statements`` folder and every sheet is
    loaded into ``self.statements`` (sheet name -> DataFrame). Each statement is then
    located by a case-sensitive keyword in its sheet name and its first column is
    renamed to ``label``.
    """
    def __init__(self,file):
        """Read ``file`` and locate its balance, cash-flow and income sheets.

        Args:
            file: workbook file name inside the ``financial statements`` folder.

        Raises:
            IndexError: if a required sheet cannot be found by name.
        """
        self.file = file
        self.readFile()
        self.findBalanceSheet()
        self.findCashFlowSheet()
        self.findIncomeSheet()

    def readFile(self):
        """Load every sheet of the workbook into ``self.statements``."""
        path = os.path.join("financial statements",self.file)
        # sheet_name=None makes read_excel return all sheets keyed by sheet name.
        self.statements = pd.read_excel(path,sheet_name=None)

    def getSheetNames(self):
        """Return the sheet names of the loaded workbook."""
        return self.statements.keys()

    def _findSheetNames(self,keyword):
        """Return the sheet names containing ``keyword`` (case-sensitive), in ``self.statements`` order."""
        return [name for name in self.getSheetNames() if keyword in name]

    def _missingSheet(self,*keywords):
        """Build the IndexError raised when no sheet name contains any of ``keywords``.

        IndexError is kept (it is what the bare ``[0]`` lookup used to raise) but the
        message now names the workbook, the keyword(s) and the sheets that do exist.
        """
        wanted = " or ".join(repr(keyword) for keyword in keywords)
        return IndexError("{}: no sheet name contains {}; available sheets: {}".format(
            self.file, wanted, list(self.getSheetNames())))

    @staticmethod
    def _promoteFirstRowToHeader(sheet):
        """Use row 0 of ``sheet`` as its header, drop that row and name the first column ``label``."""
        sheet.columns = sheet.iloc[0]
        sheet = sheet.drop(0)
        sheet.columns = ["label"]+list(sheet.columns[1:])
        return sheet

    def findBalanceSheet(self):
        """Store the first sheet whose name contains ``BALANCE`` as ``self.balance``.

        Unlike the other statements, the existing header row is kept; only the first
        column is renamed to ``label``.

        Raises:
            IndexError: if no sheet name contains ``BALANCE``.
        """
        matches = self._findSheetNames("BALANCE")
        if not matches:
            raise self._missingSheet("BALANCE")
        self.balance = self.statements[matches[0]]
        self.balance.columns = ["label"]+list(self.balance.columns[1:])

    def findIncomeSheet(self):
        """Store the first sheet whose name contains ``INCOME`` as ``self.income``.

        Raises:
            IndexError: if no sheet name contains ``INCOME``.
        """
        print(self.file)
        matches = self._findSheetNames("INCOME")
        if not matches:
            raise self._missingSheet("INCOME")
        self.income = self._promoteFirstRowToHeader(self.statements[matches[0]])

    def findCashFlowSheet(self):
        """Store the cash-flow sheet as ``self.cashflow``.

        The first sheet whose name contains ``CASH`` is used; when there is none, the
        last sheet whose name contains ``CONDENSED`` is used instead.

        Raises:
            IndexError: if no sheet name contains ``CASH`` or ``CONDENSED``.
        """
        matches = self._findSheetNames("CASH")
        if matches:
            sheetName = matches[0]
        else:
            condensed = self._findSheetNames("CONDENSED")
            if not condensed:
                raise self._missingSheet("CASH", "CONDENSED")
            sheetName = condensed[-1]
        self.cashflow = self._promoteFirstRowToHeader(self.statements[sheetName])

    def getBalanceSheet(self):
        """Return the balance sheet DataFrame."""
        return self.balance

    def getCashFlowSheet(self):
        """Return the cash-flow DataFrame."""
        return self.cashflow

    def getIncomeSheet(self):
        """Return the income statement DataFrame."""
        return self.income
    

In [49]:
class HelperFunctions:
    """Shared helpers for the statement builders: file discovery, string clean-up and quarter headers.

    ``setFinancialStatements`` stores ``self.files`` and ``self.FinancialReports``.
    The quarter helpers read and write ``self.finalCashFlow``.
    """

    def getFiles(self):
        """Return the file names found in the ``financial statements`` folder.

        ``.DS_Store`` and names starting with ``~`` are skipped. ``.DS_Store`` is
        filtered rather than removed so a folder without it does not raise ValueError.
        """
        return [
            file for file in os.listdir("financial statements")
            if file != ".DS_Store" and file[0] != "~"
        ]

    def setFinancialStatements(self):
        """Load every workbook returned by ``getFiles`` into ``self.FinancialReports``."""
        self.files = self.getFiles()
        self.FinancialReports = [FinancialReport(file) for file in self.files]

    def clean_up_string(self,string):
        """Strip ``string`` and remove trailing ``of`` / ``and`` words.

        Example: ``"cost of goods sold and "`` becomes ``"cost of goods sold"``.
        """
        string = string.strip()
        space_positions = [x.start() for x in re.finditer(" ", string)]
        # Walk the spaces right to left: truncating only ever shortens the tail, so
        # the remaining (smaller) positions stay valid.
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string

    def merge_number(self,a,b):
        """Return the non-null value of ``a`` and ``b``, or the larger one when both are set."""
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)

    def getQuarterHeaders(self):
        """Return the columns of ``self.finalCashFlow`` other than title, label and title_order."""
        fixed = ["title","label","title_order"]
        return [col for col in list(self.finalCashFlow.columns) if col not in fixed]

    def setQuarters(self):
        """Order the period columns of ``self.finalCashFlow`` by date and rename them via ``parseQuarter``.

        Period headers are parsed with the format ``'%b %d, %Y'`` after removing dots
        (for example ``"Jan. 29, 2022"``).
        """
        fixed = ["title","label","title_order"]
        dates = sorted(
            self.getQuarterHeaders(),
            key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'),
        )
        reordered = self.finalCashFlow[fixed+dates]
        reordered.columns = fixed+[self.parseQuarter(date) for date in dates]
        self.finalCashFlow = reordered

    def parseQuarter(self,date):
        """Map a period-end date header to a period label.

        The month abbreviation decides the label: ``May`` -> ``Q1``, ``Jul``/``Aug`` ->
        ``6mo``, ``Oct``/``Nov`` -> ``9mo`` (each with year + 1) and ``Jan``/``Feb`` ->
        ``Year Ended`` with the year unchanged.

        Returns:
            The label, or ``None`` when the month is not one of the above.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["May"]:
            return "Q1 "+str(int(year)+1)

        if mo in ["Jul","Aug"]:
            return "6mo "+str(int(year)+1)

        if mo in ["Oct","Nov"]:
            return "9mo "+str(int(year)+1)

        if mo in ["Jan","Feb"]:
            return "Year Ended "+str(int(year))

        # Unrecognised month: the caller receives None, as before.
        return None
        

In [50]:
class Income(HelperFunctions):
    """Combine the income statements of every loaded workbook into ``self.finalIncome``.

    ``finalIncome`` has the columns ``title``, ``title_order`` and ``label`` followed by
    one column per reporting period.
    """
    def __init__(self):
        """Load all workbooks and merge their income statements."""
        self.finalIncome = pd.DataFrame(columns=["title","title_order","label"])
        self.setFinancialStatements()
        self.addDataToIncomeSheet()
        # Remaining pipeline steps, not enabled yet:
        #   self.setQuarters()
        #   self.rearrangefinalIncomeSheet()
        #   self.performCashFlowMath()
        #   self.reorderQuarters()

    @property
    def finalCashFlow(self):
        """Alias of ``finalIncome``.

        The inherited ``getQuarterHeaders`` and ``setQuarters`` read and write
        ``self.finalCashFlow``; without this alias they raise AttributeError here.
        """
        return self.finalIncome

    @finalCashFlow.setter
    def finalCashFlow(self,value):
        self.finalIncome = value

    def addDataToIncomeSheet(self):
        """Outer-merge each workbook's parsed income statement into ``self.finalIncome``."""
        for report in self.FinancialReports:
            print(report.file)
            income = report.getIncomeSheet().copy()

            # Drop period columns that are already present in the combined sheet.
            cols = ["label"]+[col for col in income.columns[1:] if col not in self.finalIncome.columns]
            income = self.parseStatement(income[cols])

            merged = pd.merge(self.finalIncome, income, how="outer",on=["label","title"],suffixes=('', '_x'))

            # Keep the greatest section number: a bigger number means there is a
            # section before it.
            merged["title_order"] = [
                self.merge_number(current,incoming)
                for current,incoming in zip(merged["title_order"],merged["title_order_x"])
            ]
            self.finalIncome = merged.drop(["title_order_x"],axis=1)

    def rearrangefinalIncomeSheet(self):
        """Drop rows with no period data, order rows by section and position, then drop the ordering columns."""
        self.finalIncome = self.finalIncome.replace(" ",np.nan)
        self.finalIncome = self.finalIncome.loc[~self.finalIncome[self.getQuarterHeaders()].isnull().all(axis=1)]

        # Make sure each title has the same (max) section number.
        self.reconcileTitleOrder()

        # Put totals at the bottom of the sheet ordering.
        self.reconcileLabelOrderWithTotals()

        self.finalIncome = self.finalIncome.sort_values(["title_order","label_order"]) # order by title and then location

        self.finalIncome = self.finalIncome.drop(["title_order","label_order"],axis=1)

    def parseStatement(self,sheet):
        """Tag each line item of ``sheet`` with its section title and section number.

        A label containing ``":"`` starts a new section: its text without the last
        character and without any parenthesised part becomes the title, and the row
        itself is not kept. Rows before the first section are skipped. Kept labels are
        lower-cased, ``gain`` is replaced by ``loss`` and parenthesised parts are
        removed. A raw label starting with lower-case ``net`` (case-sensitive) ends the
        current section: later rows get a NaN title until the next section starts.

        Rows are selected by position, so duplicate labels stay aligned, and
        ``title_order`` is kept as an integer so later ``max``/sort comparisons are
        numeric. Non-string labels (blank cells) are skipped.

        Args:
            sheet: DataFrame with a ``label`` column followed by period columns.

        Returns:
            DataFrame with columns ``title``, ``title_order``, ``label`` and then the
            period columns of ``sheet``; empty when no row is kept.
        """
        section_num = 0
        title = False  # stays False until the first "...:" section heading is seen
        kept_positions = []
        titles = []
        orders = []
        new_labels = []
        for position,label in enumerate(sheet.label):
            if not isinstance(label,str):
                continue
            if ":" in label:
                title = re.sub(r"\([\w\W]+\)","",label[:-1]).replace("  "," ")
                section_num += 1
                continue
            if title:  # NaN is truthy, so rows after a "net" line are kept with a NaN title
                new_label = label.lower()
                new_label = new_label.replace("gain","loss")
                new_label = re.sub(r"\([\w\W]+\)","",new_label).replace("  "," ").strip()
                kept_positions.append(position)
                titles.append(title)
                orders.append(section_num)
                new_labels.append(new_label)

            if "net" == label[:3]:
                title = np.nan
                section_num += 1

        parsed = sheet.iloc[kept_positions].copy()
        # Object dtype keeps string titles and NaN side by side without coercing either.
        parsed["title"] = np.array(titles,dtype=object)
        parsed["title_order"] = orders
        parsed["label"] = new_labels
        # Round-trip through the index to move the three key columns to the front.
        parsed = parsed.set_index(["title","title_order","label"]).reset_index()
        return parsed

    def reconcileTitleOrder(self):
        """Give every row of a title the largest ``title_order`` seen for that title."""
        title_groupby = self.finalIncome.groupby("title")["title_order"].max().reset_index()
        merged = self.finalIncome.merge(title_groupby, on="title", how="left", suffixes=('', '_x') )
        merged["title_order"] = [
            self.merge_number(current,group_max)
            for current,group_max in zip(merged["title_order"],merged["title_order_x"])
        ]
        self.finalIncome = merged.drop(["title_order_x"],axis=1)

    def reconcileLabelOrderWithTotals(self,keyword="net"):
        """Move rows whose label starts with ``keyword`` to the end and number all rows in ``label_order``."""
        is_total = self.finalIncome.label.str.startswith(keyword)
        totals = self.finalIncome.loc[is_total]
        others = self.finalIncome.loc[~is_total]
        combined = pd.concat([others,totals]).reset_index(drop=True)
        self.finalIncome = combined.reset_index().rename(columns={"index":"label_order"})

    def getYears(self):
        """Return the set of last space-separated tokens of the period headers."""
        return set([x.split(" ")[-1] for x in self.getQuarterHeaders()])

    def reorderQuarters(self):
        """Keep ``title``, ``label`` and the Q1-Q4 / Year Ended columns of every year that has a Q1 column.

        Years are sorted so the column order does not depend on set iteration order.
        """
        headers = self.getQuarterHeaders()
        years = sorted(year for year in self.getYears() if "Q1 "+year in headers)
        cols = [q + " " + year for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"]]
        cols = ["title","label"] + cols
        self.finalIncome = self.finalIncome[cols]

    def performCashFlowMath(self):
        """Derive the Q2, Q3 and Q4 columns from the cumulative 6mo / 9mo / Year Ended columns."""
        headers = self.getQuarterHeaders()
        for year in self.getYears():
            if "Q1 "+year not in headers:
                continue
            # find Q2
            self.finalIncome["Q2 "+year] = self.finalIncome["6mo "+year] - self.finalIncome["Q1 "+year]

            # find Q3
            self.finalIncome["Q3 "+year] = self.finalIncome["9mo "+year] - self.finalIncome["6mo "+year]

            # find Q4
            self.finalIncome["Q4 "+year] = self.finalIncome["Year Ended "+year] - self.finalIncome["9mo "+year]
    
            
        

In [51]:
# Build the combined income table; constructing Income loads every workbook in
# the "financial statements" folder and merges their income sheets.
income = Income()
finalIncome = income.finalIncome
# Bare last expression so the notebook renders the combined table.
finalIncome


q3 2021.xlsx


IndexError: list index out of range

In [33]:
# Write the combined cash-flow table to finalCashFlow.csv in the working directory.
# NOTE(review): `finalCashFlow` is not assigned in any cell visible in this section - confirm this cell still runs on a fresh kernel.
finalCashFlow.to_csv("finalCashFlow.csv")

In [20]:
# Display the combined cash-flow table.
# NOTE(review): `finalCashFlow` is not assigned in any cell visible in this section - confirm where it is built.
finalCashFlow

Unnamed: 0,title,label,Year Ended 2017,Year Ended 2018,Q1 2019,6mo 2019,9mo 2019,Year Ended 2019,Q1 2020,6mo 2020,9mo 2020,Year Ended 2020,Q1 2021,6mo 2021,9mo 2021,Year Ended 2021,Q1 2022,6mo 2022,9mo 2022,Year Ended 2022
78,Cash flows from operating activities,net income,676,848,164,461,727,1003,227,395,535,351,-932,-994,-899,-665,166,424,272,256
0,Adjustments to reconcile net income to net cas...,depreciation and amortization,593,559,140,280,425,578,138,277,417,557,130,256,381,507,120,244,372,504
1,Adjustments to reconcile net income to net cas...,share-based compensation,76,87,21,48,72,91,24,47,64,68,18,35,55,77,36,72,97,139
2,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",,0,,,,0,,0,1,239,360,361,361,,5,6,6,
3,Adjustments to reconcile net income to net cas...,other asset impairment charges,107,28,,,,14,,3,9,98,124,127,127,,0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,Supplemental disclosure of cash flow information,"interest paid, excluding capitalized interest,...",,,,38,77,,,38,75,,38,39,41,,2,102,178,
35,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",488,570,19,61,73,143,18,90,117,176,37,53,8,20,20,147,181,215
45,Supplemental disclosure of cash flow information,cash paid for interest during the period,82,76,38,,,76,38,,,76,,,,145,,,,180
46,Supplemental disclosure of cash flow information,"operating lease, payments",,0,0,0,0,0,301,,,1244,,,,,,,,
