In [304]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

In [221]:
class FinancialReport():
    def __init__(self,file):
        self.file = file
        self.readFile()
        self.findBalanceSheet()
        self.findCashFlowSheet()
    def readFile(self):
        path = os.path.join("financial statements",self.file)
        self.statements = pd.read_excel(path,sheet_name=None)
    def getSheetNames(self):
        return self.statements.keys()
    def findBalanceSheet(self):
        balanceSheetName = [name for name in self.getSheetNames() if "BALANCE" in name][0]
        self.balance = self.statements[balanceSheetName]
        self.balance.columns = ["label"]+list(self.balance.columns[1:])
    def findCashFlowSheet(self):
        cashflowSheetName = [name for name in self.getSheetNames() if "CASH" in name]
        if len(cashflowSheetName) > 0:
            self.cashflow = self.statements[cashflowSheetName[0]]
        else:
            cashflowSheetName = [name for name in self.getSheetNames() if "CONDENSED" in name][-1]
            self.cashflow = self.statements[cashflowSheetName]
        self.cashflow.columns = self.cashflow.iloc[0]
        self.cashflow = self.cashflow.drop(0)
        self.cashflow.columns = ["label"]+list(self.cashflow.columns[1:])
    def getBalanceSheet(self):
        return self.balance
    def getCashFlowSheet(self):
        return self.cashflow
    

In [401]:
class CashFlow():
    def __init__(self):
        self.finalCashFlow = pd.DataFrame(columns=["title","num_title","label"])
        self.setFinancialStatements()
        self.addDataToCashFlowSheet()
        self.setQuarters()
        self.rearrangeFinalCashFlowSheet()
        
        
    def getFiles(self):
        files = os.listdir("financial statements")
        files.remove(".DS_Store")
        files = [file for file in files if file[0] != "~"]
        return files

    def setFinancialStatements(self):
        self.files = self.getFiles()
        self.FinancialReports = [FinancialReport(file) for file in self.files]
        
    def addDataToCashFlowSheet(self):
        
        for FinancialStatements in self.FinancialReports:
            print(FinancialStatements.file)
            cashflow = FinancialStatements.getCashFlowSheet().copy()
            
            cols = ["label"]+[col for col in cashflow.columns[1:] if col not in self.finalCashFlow.columns] ## remove columns of data already in main balance sheet
            cashflow = cashflow[cols]
            
            cashflow = self.parseCashFlowStatement(cashflow)
            
            self.finalCashFlow = pd.merge(self.finalCashFlow, cashflow, how="outer",on=["label","title"],suffixes=('', '_new'))
            #self.finalCashFlow.title = [self.finalCashFlow.title[i] if not pd.isnull(self.finalCashFlow.title[i]) else self.finalCashFlow.title_new[i] for i in range(len(self.finalCashFlow))] #merge titles
            self.finalCashFlow.num_title = [self.merge_number(self.finalCashFlow.num_title[i],self.finalCashFlow.num_title_new[i]) for i in range(len(self.finalCashFlow))] #merge the number for the titles. use greatest num bc bigger num means there is section before it
            self.finalCashFlow = self.finalCashFlow.drop(["num_title_new"],axis=1)

    def rearrangeFinalCashFlowSheet(self):
        self.finalCashFlow = self.finalCashFlow.replace(" ",np.nan)

        # put Nets at the bottom of each section
        Nets = self.finalCashFlow.loc[self.finalCashFlow.label.str.startswith("net")].reset_index(drop=True) 
        self.finalCashFlow = self.finalCashFlow.loc[~self.finalCashFlow.label.str.startswith("net")].reset_index(drop=True)
        self.finalCashFlow = pd.concat([self.finalCashFlow,Nets]) 
        self.finalCashFlow = self.finalCashFlow.reset_index(drop=True).reset_index().rename(columns={"index":"num_label"})
        self.finalCashFlow = self.finalCashFlow.sort_values(["num_title","num_label"]) # order by title and then location
        self.finalCashFlow = self.finalCashFlow.drop(["num_label"],axis=1).reset_index(drop=True).reset_index().rename(columns={"index":"num_label"})

    def parseCashFlowStatement(self,balance):
        num = 0
        index = []
        title = False
        for label in balance.label:
            if ":" in label:
                title = re.sub("\([\w\W]+\)","",label[:-1]).replace("  "," ").strip()
                num += 1
                continue
            if title:
                label = label.lower()
                index.append([title,num,label])
            
            if "net" == label[:3]:
                title = np.nan
                num += 1

        balance = balance.loc[balance.label.isin(np.array(index)[:,2])].copy()
        balance.loc[:,["title","num_title","label"]] = list(index)
        balance = balance.set_index(["title","num_title","label"]).reset_index()
        return balance
    
    def clean_up_string(self,string):
        string = string.strip()
        space_positions = [x.start() for x in re.finditer('\ ',  string)]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string
    
    def merge_number(self,a,b):
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)
    
    def setQuarters(self):
        dates = list(self.finalCashFlow.columns[3:])
        dates.sort(key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'))
        self.finalCashFlow = self.finalCashFlow[["title","label","num_title"]+dates]
        self.finalCashFlow.columns = ["title","label","num_title"]+[self.parseQuarter(date) for date in self.finalCashFlow.columns[3:]]
    def parseQuarter(self,date):
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["May"]:
            return "Q1 "+str(int(year)+1)
        
        if mo in ["Jul","Aug"]:
            return "Q2 "+str(int(year)+1)
        
        if mo in ["Oct","Nov"]:
            return "Q3 "+str(int(year)+1)
        
        if mo in ["Jan","Feb"]:
            return "Q4 "+str(int(year))
    

In [402]:
cashflow = CashFlow()
finalCashFlow = cashflow.finalCashFlow
finalCashFlow

q3 2021.xlsx
    num_title                                              title  \
0           1               Cash flows from operating activities   
1           3  Adjustments to reconcile net income to net cas...   
2           3  Adjustments to reconcile net income to net cas...   
3           3  Adjustments to reconcile net income to net cas...   
4           3  Adjustments to reconcile net income to net cas...   
5           3  Adjustments to reconcile net income to net cas...   
6           3  Adjustments to reconcile net income to net cas...   
7           3  Adjustments to reconcile net income to net cas...   
8           3  Adjustments to reconcile net income to net cas...   
9           3  Adjustments to reconcile net income to net cas...   
10          4        Changes in operating assets and liabilities   
11          4        Changes in operating assets and liabilities   
12          4        Changes in operating assets and liabilities   
13          4        Changes in ope

Unnamed: 0,num_label,title,label,num_title,Q4 2017,Q4 2018,Q1 2019,Q2 2019,Q3 2019,Q4 2019,...,Q3 2020,Q4 2020,Q1 2021,Q2 2021,Q3 2021,Q4 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022
0,0,Cash flows from operating activities,net income (loss),1.0,,,,,,,...,,,,,,,,,,
1,1,Cash flows from operating activities,net income,1.0,,,,,,,...,,,,,,,,,,
2,2,Adjustments to reconcile net income to net cas...,depreciation and amortization,3.0,,,,,,,...,,,,,,,,,,
3,3,Adjustments to reconcile net income to net cas...,share-based compensation,3.0,,,,,,,...,,,,,,,,,,
4,4,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",3.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,98,Supplemental disclosure of cash flow information,income taxes paid,11.0,,,,,,,...,,,,,,,,,,
99,99,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",12.0,,,,,,,...,,,,,,,,,,
100,100,Supplemental disclosure of cash flow information,cash paid for interest during the period,12.0,,,,,,,...,,,,,,,,,,
101,101,Supplemental disclosure of cash flow information,"operating lease, payments",12.0,,,,,,,...,,,,,,,,,,


In [397]:
cols = finalCashFlow.columns[4:]
groupby = {x:"sum" for x in cols}
groupby.update({"label":"first","num_label":"min","num_title":"min"})


In [398]:
finalCashFlow[cols] = finalCashFlow[cols].fillna(0).astype('int32')

In [399]:
labels = [x.replace("Gain","Loss") for x in finalCashFlow.label]
labels = [re.sub("\([\w\W]+\)","",x).replace("  "," ").strip() for x in labels]

finalCashFlow["new_label"] = labels
finalCashFlow = finalCashFlow.groupby(["title","new_label"]).aggregate(groupby).reset_index()
finalCashFlow = finalCashFlow.sort_values(["num_title","num_label"])


finalCashFlow

Unnamed: 0,title,new_label,Q4 2017,Q4 2018,Q1 2019,Q2 2019,Q3 2019,Q4 2019,Q1 2020,Q2 2020,...,Q2 2021,Q3 2021,Q4 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022,label,num_label,num_title
63,Cash flows from operating activities,net income,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,net income (loss),0,1.0
4,Adjustments to reconcile net income to net cas...,depreciation and amortization,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,depreciation and amortization,2,3.0
22,Adjustments to reconcile net income to net cas...,share-based compensation,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,share-based compensation,3,3.0
20,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"operating lease, impairment loss",4,3.0
21,Adjustments to reconcile net income to net cas...,other asset impairment charges,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,other asset impairment charges,5,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Supplemental disclosure of cash flow information,income taxes paid,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,income taxes paid,98,11.0
76,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"cash paid for income taxes during the period, ...",99,12.0
77,Supplemental disclosure of cash flow information,cash paid for interest during the period,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,cash paid for interest during the period,100,12.0
82,Supplemental disclosure of cash flow information,"operating lease, payments",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"operating lease, payments",101,12.0


In [395]:
finalCashFlow.loc[~finalCashFlow[cols].isnull().all(axis=1)]

Unnamed: 0,title,new_label,Q4 2017,Q4 2018,Q1 2019,Q2 2019,Q3 2019,Q4 2019,Q1 2020,Q2 2020,...,Q2 2021,Q3 2021,Q4 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022,label,num_label,num_title
63,Cash flows from operating activities,net income,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,net income (loss),0,1.0
4,Adjustments to reconcile net income to net cas...,depreciation and amortization,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,depreciation and amortization,2,3.0
22,Adjustments to reconcile net income to net cas...,share-based compensation,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,share-based compensation,3,3.0
20,Adjustments to reconcile net income to net cas...,"operating lease, impairment loss",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"operating lease, impairment loss",4,3.0
21,Adjustments to reconcile net income to net cas...,other asset impairment charges,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,other asset impairment charges,5,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,Supplemental disclosure of cash flow information,income taxes paid,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,income taxes paid,98,11.0
76,Supplemental disclosure of cash flow information,"cash paid for income taxes during the period, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"cash paid for income taxes during the period, ...",99,12.0
77,Supplemental disclosure of cash flow information,cash paid for interest during the period,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,cash paid for interest during the period,100,12.0
82,Supplemental disclosure of cash flow information,"operating lease, payments",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"operating lease, payments",101,12.0


In [400]:
finalCashFlow.to_csv("finalCashFlow.csv")

In [370]:

labels = [pos_tag(x.split()) for x in labels]

dict_pos = {"J": wordnet.ADJ,
                "NN": wordnet.NOUN,
                "VBD": wordnet.VERB,
                "VBZ": wordnet.VERB,
                "VBG": wordnet.VERB,
                "VBN": wordnet.VERB,
                "RB": wordnet.ADV,
                'IN':False}
def pos(x):
    return x.lower()[0]
labels = map(lambda sent: list(map(lambda x: [x[0].lower(),dict_pos.get(x[1],wordnet.NOUN)], sent)),labels)
labels = [[lemmatizer.lemmatize(word[0],pos=word[1]) for word in sent if word[1]] for sent in labels ]
[" ".join(sent) for sent in labels]

['net income',
 'net income',
 'depreciation and amortization',
 'share-based compensation',
 'operate lease, impairment loss',
 'other asset impairment charge',
 'gain extinguishment debt',
 'amortization debt issuance cost',
 'non-cash and other item',
 'gain disposition business',
 'deferred income tax',
 'gain disposition property plant equipment',
 'amortization lease incentive',
 'amortization lease incentive',
 'gain disposition property plant equipment, exclude oil and gas property and timber property',
 'tax benefit exercise stock option and vesting stock unit',
 'excess tax benefit exercise stock option and vesting stock unit',
 'goodwill, impairment loss',
 'impairment operate lease asset',
 'impairment store asset',
 'impairment intangible asset',
 'loss extinguishment debt',
 'loss disposal property and equipment',
 'loss divestiture activity',
 'gain sale building',
 'goodwill and intangible asset impairment',
 'merchandise inventory',
 'other current asset and other long

In [195]:
list(labels)

[[['net', 'n'], ['income', 'n'], ['(loss)', 'n']],
 [['net', 'n'], ['income', 'n']],
 [['depreciation', 'n'], ['and', 'n'], ['amortization', 'n']],
 [['share-based', 'n'], ['compensation', 'n']],
 [['operating', 'v'], ['lease,', 'n'], ['impairment', 'n'], ['loss', 'n']],
 [['other', 'n'], ['asset', 'n'], ['impairment', 'n'], ['charges', 'n']],
 [['loss', 'n'],
  ['on', 'n'],
  ['extinguishment', 'n'],
  ['of', 'n'],
  ['debt', 'n']],
 [['amortization', 'n'],
  ['of', 'n'],
  ['debt', 'n'],
  ['issuance', 'n'],
  ['costs', 'n']],
 [['non-cash', 'n'], ['and', 'n'], ['other', 'n'], ['items', 'n']],
 [['loss', 'n'],
  ['on', 'n'],
  ['disposition', 'n'],
  ['of', 'n'],
  ['business', 'n']],
 [['deferred', 'n'], ['income', 'n'], ['taxes', 'n']],
 [['loss', 'n'],
  ['on', 'n'],
  ['disposition', 'n'],
  ['of', 'n'],
  ['property', 'n'],
  ['plant', 'n'],
  ['equipment', 'n']],
 [['amortization', 'n'], ['of', 'n'], ['lease', 'n'], ['incentives', 'n']],
 [['amortization', 'n'], ['lease', 'n'],

[('Hi', 'NN')]