In [2]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import tabulate
lemmatizer = WordNetLemmatizer()

In [471]:
class FinancialReport():
    """One workbook of financial statements split into balance, cash-flow and income sheets.

    The workbook is read from the ``financial statements`` directory with every
    worksheet loaded into ``self.statements`` (sheet name -> DataFrame).  The three
    statements are located by searching each sheet's first column header for a
    keyword, and are stored as *copies* so that ``self.statements`` keeps the raw
    sheets exactly as they were read.
    """

    def __init__(self,file):
        """Load ``file`` and locate the balance, cash-flow and income sheets.

        Args:
            file: workbook file name inside the ``financial statements`` directory.

        Raises:
            IndexError: if one of the three statements cannot be found.
        """
        self.file = file
        self.readFile()
        self.findBalanceSheet()
        self.findCashFlowSheet()
        self.findIncomeSheet()

    def readFile(self):
        """Read every worksheet of the workbook into ``self.statements``."""
        path = os.path.join("financial statements",self.file)
        # sheet_name=None makes pandas return a dict of all sheets.
        self.statements = pd.read_excel(path,sheet_name=None)

    def getSheetNames(self):
        """Return the names of the loaded worksheets."""
        return self.statements.keys()

    def getSheetWithTitle(self,words):
        """Return the names of sheets whose first column header contains any of ``words``.

        Sheets without any column are skipped instead of raising.
        """
        matches = []
        for name in self.getSheetNames():
            columns = self.statements[name].columns
            if len(columns) == 0:
                continue
            title = str(columns[0])
            if any(word in title for word in words):
                matches.append(name)
        return matches

    def _firstSheetWithTitle(self, words):
        """Return a copy of the first sheet matching ``words``.

        Raises:
            IndexError: when no sheet matches (same exception type the bare
                ``[0]`` lookup used to raise, but with an explanatory message).
        """
        matches = self.getSheetWithTitle(words)
        if not matches:
            raise IndexError(
                "no sheet in %r has a title containing any of %r"
                % (getattr(self, "file", None), list(words))
            )
        # Copy so that renaming columns below never alters self.statements;
        # otherwise later title lookups would see the renamed header.
        return self.statements[matches[0]].copy()

    def setFirstRowAsHeader(self, sheet):
        """Return a copy of ``sheet`` whose first row has been promoted to the header.

        The input frame is left untouched.  The row labelled ``0`` is dropped
        from the result.
        """
        promoted = sheet.copy()
        promoted.columns = promoted.iloc[0]
        return promoted.drop(0)

    def findBalanceSheet(self):
        """Locate the balance sheet and name its first column ``label``."""
        balance = self._firstSheetWithTitle(["BALANCE"])
        balance.columns = ["label"]+list(balance.columns[1:])
        self.balance = balance

    def findIncomeSheet(self):
        """Locate the income statement, promote its first row to the header,
        name the first column ``label`` and drop the last two columns."""
        income = self.setFirstRowAsHeader(self._firstSheetWithTitle(["OPERATIONS","INCOME"]))
        income.columns = ["label"]+list(income.columns[1:])
        self.income = income.iloc[:,:-2]

    def findCashFlowSheet(self):
        """Locate the cash-flow statement, promote its first row to the header
        and name the first column ``label``."""
        cashflow = self.setFirstRowAsHeader(self._firstSheetWithTitle(["CASH"]))
        cashflow.columns = ["label"]+list(cashflow.columns[1:])
        self.cashflow = cashflow

    def getBalanceSheet(self):
        """Return the balance sheet frame."""
        return self.balance

    def getCashFlowSheet(self):
        """Return the cash-flow frame."""
        return self.cashflow

    def getIncomeSheet(self):
        """Return the income statement frame."""
        return self.income
    

In [472]:
# Smoke test: constructing a report reads the workbook and locates the
# balance, cash-flow and income sheets, so a missing sheet fails here.
FinancialReport("q3 2021.xlsx")

<__main__.FinancialReport at 0x1238ea770>

In [473]:
class HelperFunctions:
    """Small utilities shared by the statement classes (string clean-up,
    number merging and discovery of the workbook files)."""

    def clean_up_string(self,string):
        """Strip surrounding whitespace and any trailing ``of`` / ``and`` words.

        Walks the space positions from right to left; whenever the text after
        a space is exactly ``"of"`` or ``"and"`` the string is cut at that
        space, so several dangling connectors in a row are all removed.
        """
        string = string.strip()
        space_positions = [pos for pos, char in enumerate(string) if char == " "]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string

    def merge_number(self,a,b):
        """Return the non-null value of ``a`` and ``b``, or the larger when both are set."""
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)

    def getFiles(self):
        """List the workbook files in the ``financial statements`` directory.

        ``.DS_Store`` and names starting with ``~`` (Office lock/temp files)
        are skipped.  Filtering by name, rather than ``list.remove``, means a
        directory without a ``.DS_Store`` entry no longer raises ``ValueError``.
        """
        files = os.listdir("financial statements")
        return [file for file in files if file != ".DS_Store" and not file.startswith("~")]

    def getFinancialStatements(self):
        """Build one ``FinancialReport`` per file returned by ``getFiles``."""
        files = self.getFiles()
        FinancialReports = [FinancialReport(file) for file in files]
        return FinancialReports
        

In [490]:
class ProcessQuarters:
    """Mixin that sorts, renames and re-orders the period columns of
    ``self.finalStatement``.

    The host class supplies ``self.finalStatement`` and a ``parseQuarter(date)``
    method that turns a date header into a period name.
    """

    def getQuarterHeaders(self):
        """Return every column name other than ``title``, ``label`` and ``title_order``."""
        reserved = ("title", "label", "title_order")
        return [name for name in self.finalStatement.columns if name not in reserved]

    def includesYear(self, string):
        """Tell whether the last four characters of ``string`` are all digits."""
        suffix = string[-4:]
        return suffix.isdigit()

    def getLabelHeaders(self):
        """Return the column names that do not end in a four-digit year."""
        label_columns = []
        for name in self.finalStatement.columns:
            if self.includesYear(name):
                continue
            label_columns.append(name)
        return label_columns

    def setQuarters(self):
        """Order the date columns chronologically, then rename them via ``parseQuarter``."""
        date_columns = self.getQuarterHeaders()
        label_columns = self.getLabelHeaders()

        def as_datetime(raw):
            # Headers look like "Oct. 30, 2021"; the dot is dropped before parsing.
            return datetime.strptime(raw.replace(".",""), '%b %d, %Y')

        chronological = sorted(date_columns, key=as_datetime)
        self.finalStatement = self.finalStatement[label_columns + chronological]
        # Re-read the headers so the new names follow the sorted column order.
        renamed = [self.parseQuarter(raw) for raw in self.getQuarterHeaders()]
        self.finalStatement.columns = label_columns + renamed

    def reorderQuarters(self):
        """Keep the label columns plus, per year, Q1..Q4 and ``Year Ended`` in that order.

        Period columns with any other prefix are dropped from the frame.
        """
        years = self.getYears()
        label_columns = self.getLabelHeaders()
        available = self.getQuarterHeaders()
        ordered = []
        for year in years:
            for prefix in ["Q1","Q2","Q3","Q4","Year Ended"]:
                candidate = prefix + " " + year
                if candidate in available:
                    ordered.append(candidate)
        self.finalStatement = self.finalStatement[label_columns + ordered]

    def getYears(self):
        """Return the sorted, de-duplicated last word of every period header."""
        distinct = {header.split(" ")[-1] for header in self.getQuarterHeaders()}
        return sorted(distinct)
    
    
   

In [491]:
class FinancialDataMerge:
    """Mixin that parses statement sheets and merges them into ``self.finalStatement``.

    The host class is expected to provide:

    * ``self.finalStatement`` - DataFrame with at least the columns
      ``title``, ``title_order`` and ``label``;
    * ``self.total_keyword`` - first word that marks a section-closing row;
    * ``clean_up_string``, ``merge_number`` and ``getQuarterHeaders`` methods.
    """

    def addData(self, FinancialReports, limit=1):
        """Parse and outer-merge report sheets into ``self.finalStatement``.

        Args:
            FinancialReports: list of ``[sheet, file]`` pairs.
            limit: how many leading reports to merge.  Defaults to 1, which
                keeps the previous behaviour of using only the first report;
                pass ``None`` to merge every report.
        """
        for [sheet,file] in FinancialReports[:limit]:
            print(file)
            # Skip period columns that the merged statement already contains.
            cols = ["label"]+[col for col in sheet.columns[1:] if col not in self.finalStatement.columns]
            print(cols)
            sheet = sheet[cols]

            sheet = self.parseStatement(sheet)

            self.finalStatement = pd.merge(self.finalStatement, sheet, how="outer",on=["label","title"],suffixes=('', '_x'))

            # Use the greatest number: a bigger one means a section sits before it.
            self._mergeTitleOrder()

    def _mergeTitleOrder(self):
        """Fold the temporary ``title_order_x`` column into ``title_order`` and drop it."""
        merged = [
            self.merge_number(own, other)
            for own, other in zip(self.finalStatement["title_order"], self.finalStatement["title_order_x"])
        ]
        self.finalStatement["title_order"] = merged
        self.finalStatement = self.finalStatement.drop(["title_order_x"],axis=1)

    def rearrangeFinalStatement(self):
        """Drop rows without any period value, sort rows by section and position,
        remove the ordering columns and replace remaining nulls with 0."""
        self.finalStatement = self.finalStatement.replace(" ",np.nan)
        self.finalStatement = self.finalStatement.loc[~self.finalStatement[self.getQuarterHeaders()].isnull().all(axis=1)]

        # Make sure each title has the same (maximum) order number.
        self.reconcileTitleOrder()

        # Put totals at the bottom of each section.
        self.reconcileLabelOrderWithTotals()

        self.finalStatement[["title_order","label_order"]] = self.finalStatement[["title_order","label_order"]].astype("int")
        self.finalStatement = self.finalStatement.sort_values(["title_order","label_order"]) # order by title and then location

        self.finalStatement = self.finalStatement.drop(["title_order","label_order"],axis=1).reset_index(drop=True)

        self.finalStatement = self.finalStatement.fillna(0)

    def parseStatement(self,sheet):
        """Attach section titles and normalised labels to the rows of ``sheet``.

        A label containing ``":"`` opens a section: it becomes the current
        title and is not kept as a row.  Any other label is lower-cased, has
        "gain" replaced by "loss", parenthesised text, `` $`` and digits
        removed, and is passed through ``clean_up_string``.  A row whose first
        word equals ``self.total_keyword`` closes the current section.  Rows
        seen while no title is active are dropped.

        Rows are selected by position, so a label that occurs both inside and
        outside a section no longer breaks the column assignment.  Non-string
        labels are skipped, and a sheet with no kept row yields an empty
        frame.  ``title_order`` is stored as integers so that later ``max``
        comparisons are numeric rather than lexicographic.

        Returns:
            A new DataFrame whose first columns are ``title``, ``title_order``
            and ``label``, followed by the remaining columns of ``sheet``.
            The input is also stored on ``self.sheet`` and the result on
            ``self.sheet1``.
        """
        num = 0
        positions, titles, orders, labels = [], [], [], []
        title = False
        for position, label in enumerate(sheet.label):
            if not isinstance(label, str):
                continue
            if ":" in label:
                title = re.sub(r"\([\w\W]+\)","",label[:-1]).replace("  "," ")
                num += 1
                continue
            new_label = label.lower()
            new_label = new_label.replace("gain","loss")
            new_label = re.sub(r"\([\w\W]+?\)","",new_label).replace("  "," ").strip()
            new_label = re.sub(r" \$|\s*\d+\,*", "", new_label)
            new_label = self.clean_up_string(new_label)
            is_total = self.total_keyword == new_label.split(" ")[0]
            if is_total and not title:
                # A total without an open section still gets kept, under a NaN title.
                title = np.nan

            if title:
                positions.append(position)
                # str() keeps the historical representation of a missing title
                # ("nan"), which reconcileTitleOrder filters on.
                titles.append(str(title))
                orders.append(num)
                labels.append(new_label)

            if is_total:
                title = np.nan
                num += 1

        self.sheet = sheet
        sheet = sheet.iloc[positions].copy()
        sheet["title"] = titles
        sheet["title_order"] = orders
        sheet["label"] = labels
        sheet = sheet.set_index(["title","title_order","label"]).reset_index()
        self.sheet1 = sheet
        return sheet

    def reconcileTitleOrder(self):
        """Give every row of a titled section the largest ``title_order`` seen for that title."""
        title_groupby = self.finalStatement.groupby("title")["title_order"].max().reset_index().dropna(axis=0, how='any')
        title_groupby = title_groupby.loc[title_groupby.title != "nan"]
        self.finalStatement = self.finalStatement.merge(title_groupby, on="title", how="left", suffixes=('', '_x') )
        self._mergeTitleOrder()

    def reconcileLabelOrderWithTotals(self):
        """Move rows whose label starts with ``self.total_keyword`` to the end
        and record the resulting row position in a ``label_order`` column."""
        is_total = self.finalStatement.label.str.startswith(self.total_keyword)
        totals = self.finalStatement.loc[is_total].reset_index(drop=True)
        others = self.finalStatement.loc[~is_total].reset_index(drop=True)
        self.finalStatement = pd.concat([others,totals])
        self.finalStatement = self.finalStatement.reset_index(drop=True).reset_index().rename(columns={"index":"label_order"})
        
            

In [492]:
class Income(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    """Merged income statement built from the workbooks in ``financial statements``.

    Constructing an instance loads the reports, merges their income sheets into
    ``self.finalStatement``, and renames and re-orders the period columns.
    """

    # Month abbreviation -> (period prefix, offset added to the calendar year).
    _PERIOD_BY_MONTH = {
        "May": ("Q1", 1),
        "Jul": ("Q2", 1),
        "Aug": ("Q2", 1),
        "Oct": ("Q3", 1),
        "Nov": ("Q3", 1),
        "Jan": ("Year Ended", 0),
        "Feb": ("Year Ended", 0),
    }

    def __init__(self):
        # Rows whose first word is "net" close a section of the statement.
        self.total_keyword = "net"
        self.finalStatement = pd.DataFrame(columns=["title","title_order","label"])

        self.IncomeStatements = self.getIncomeStatements()
        self.addData(self.IncomeStatements)

        self.rearrangeFinalStatement()

        self.setQuarters()

        self.reorderQuarters()

    def getIncomeStatements(self):
        """Return ``[income sheet copy, file name]`` for every available report."""
        FinancialReports = self.getFinancialStatements()
        IncomeStatements = [[report.getIncomeSheet().copy(), report.file] for report in FinancialReports]
        return IncomeStatements

    def parseQuarter(self,date):
        """Translate a date header such as ``"Oct. 30, 2021"`` into a period name.

        May maps to Q1, Jul/Aug to Q2 and Oct/Nov to Q3 of the *following*
        year; Jan/Feb map to ``"Year Ended <year>"``.

        Raises:
            ValueError: if the month is not one of the above.  Previously the
                method returned ``None``, which became a column name and only
                failed later, far from the cause.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo not in self._PERIOD_BY_MONTH:
            raise ValueError("cannot map date header %r to a fiscal period (month %r)" % (date, mo))
        prefix, offset = self._PERIOD_BY_MONTH[mo]
        return prefix + " " + str(int(year) + offset)

    def performCashFlowMath(self):
        """Placeholder: visits the years that have a Q1 column but computes nothing."""
        years = self.getYears()
        for year in years:
            if "Q1 "+year not in self.getQuarterHeaders():
                continue

            

In [493]:
# Build the merged income statement: the constructor loads the reports,
# merges the income sheets and renames/re-orders the period columns.
income = Income()
finalIncome = income.finalStatement
# Last expression of the cell, so the merged frame is rendered as output.
finalIncome


q3 2021.xlsx
['label', 'Oct. 30, 2021', 'Oct. 31, 2020']


Unnamed: 0,title,label,Q3 2021,Q3 2022
0,,net sales,3994.0,3943.0
1,,cost of goods sold and occupancy expenses,2374.0,2282.0
2,,gross profit,1620.0,1661.0
3,,operating expenses,1445.0,1508.0
4,,operating income,175.0,153.0
5,,loss on extinguishment of debt,0.0,325.0
6,,interest expense,55.0,44.0
7,,interest income,-1.0,-1.0
8,,income before income taxes,121.0,-215.0
9,,income taxes,26.0,-63.0


In [389]:
# Debug: grab the frame that parseStatement stored on the instance (sheet1).
sheet = income.sheet1
# NOTE(review): no `index` attribute is assigned on the instance in the class
# definitions visible in this notebook (parseStatement keeps `index` as a
# local) - presumably left over from an earlier version; confirm or remove.
index = income.index

In [412]:
# Debug: render the captured frame.
sheet

Unnamed: 0,title,title_order,label,"Oct. 30, 2021","Oct. 30, 2021.1","Oct. 31, 2020","Oct. 31, 2020.1","Oct. 30, 2021.2","Oct. 30, 2021.3","Oct. 31, 2020.2","Oct. 31, 2020.3"
0,,0,net sales,3943.0,12145.0,3994.0,9376.0,3943.0,12145.0,3994.0,9376.0
1,,1,cost of goods sold and occupancy expenses,2282.0,7031.0,2374.0,6339.0,2282.0,7031.0,2374.0,6339.0
2,,1,gross profit,1661.0,5114.0,1620.0,3037.0,1661.0,5114.0,1620.0,3037.0
3,,1,operating expenses,1508.0,4312.0,1445.0,4033.0,1508.0,4312.0,1445.0,4033.0
4,,1,operating income,153.0,802.0,175.0,-996.0,153.0,802.0,175.0,-996.0
5,,1,loss on extinguishment of debt,325.0,325.0,0.0,58.0,325.0,325.0,0.0,58.0
6,,1,interest expense,44.0,149.0,55.0,132.0,44.0,149.0,55.0,132.0
7,,1,interest income,-1.0,-3.0,-1.0,-7.0,-1.0,-3.0,-1.0,-7.0
8,,1,income before income taxes,-215.0,331.0,121.0,-1179.0,-215.0,331.0,121.0,-1179.0
9,,1,income taxes,-63.0,59.0,26.0,-280.0,-63.0,59.0,26.0,-280.0


In [295]:
# Rebind `sheet` to an independent copy of itself.
# NOTE(review): relies on `sheet` already existing in the kernel - confirm the cell order.
sheet = sheet.copy()

In [296]:
# Debug: render the copied frame.
sheet

Unnamed: 0,label,"Oct. 30, 2021","Oct. 30, 2021.1","Oct. 31, 2020","Oct. 31, 2020.1","Oct. 30, 2021.2","Oct. 30, 2021.3","Oct. 31, 2020.2","Oct. 31, 2020.3"
3,"Other comprehensive income , net of tax",4,2,5,-14,4,2,5,-14
4,"Other comprehensive income , net of tax",1,-6,-2,9,1,-6,-2,9
5,"Other comprehensive income , net of tax",3,11,-1,-11,3,11,-1,-11
6,"Other comprehensive income , net of tax",8,7,2,-16,8,7,2,-16
7,"Other comprehensive income , net of tax",-144,279,97,-915,-144,279,97,-915


In [194]:
class Balance(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    """Merged balance sheet built from the workbooks in ``financial statements``.

    Constructing an instance loads the reports, merges their balance sheets
    into ``self.finalStatement``, renames the period columns, adds Q4 columns
    and re-orders the periods.
    """

    # Month abbreviation -> (period prefix, offset added to the calendar year).
    _PERIOD_BY_MONTH = {
        "May": ("Q1", 1),
        "Jul": ("Q2", 1),
        "Aug": ("Q2", 1),
        "Oct": ("Q3", 1),
        "Nov": ("Q3", 1),
        "Jan": ("Year Ended", 0),
        "Feb": ("Year Ended", 0),
    }

    def __init__(self):
        # Rows whose first word is "total" close a section of the statement.
        self.total_keyword = "total"
        self.finalStatement = pd.DataFrame(columns=["title","title_order","label"])

        self.BalanceStatements = self.getBalanceStatements()
        self.addData(self.BalanceStatements)

        self.rearrangeFinalStatement()

        self.setQuarters()

        self.performBalanceMath()

        self.reorderQuarters()

    def getBalanceStatements(self):
        """Return ``[balance sheet copy, file name]`` for every available report."""
        FinancialReports = self.getFinancialStatements()
        BalanceStatements = [[report.getBalanceSheet().copy(), report.file] for report in FinancialReports]
        return BalanceStatements

    def parseQuarter(self,date):
        """Translate a date header such as ``"Oct. 30, 2021"`` into a period name.

        May maps to Q1, Jul/Aug to Q2 and Oct/Nov to Q3 of the *following*
        year; Jan/Feb map to ``"Year Ended <year>"``.

        Raises:
            ValueError: if the month is not one of the above.  Previously the
                method returned ``None``, which became a column name and only
                failed later, far from the cause.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo not in self._PERIOD_BY_MONTH:
            raise ValueError("cannot map date header %r to a fiscal period (month %r)" % (date, mo))
        prefix, offset = self._PERIOD_BY_MONTH[mo]
        return prefix + " " + str(int(year) + offset)

    def performBalanceMath(self):
        """Add a ``Q4 <year>`` column equal to ``Year Ended <year>``.

        Only years that have a ``Q1`` column are processed.  A year whose
        ``Year Ended`` column is missing is skipped instead of raising
        ``KeyError``.
        """
        years = self.getYears()
        available = self.getQuarterHeaders()
        for year in years:
            if "Q1 "+year not in available:
                continue
            year_end = "Year Ended "+year
            if year_end not in available:
                continue
            self.finalStatement["Q4 "+year] = self.finalStatement[year_end]
        

In [195]:
# balance = Balance()
# finalBalance = balance.finalStatement
# finalBalance


In [196]:
class CashFlow(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    """Merged cash-flow statement built from the workbooks in ``financial statements``.

    The period columns are cumulative (Q1, 6mo, 9mo, Year Ended), so after
    merging and renaming them the single-quarter figures are derived by
    subtraction before the columns are re-ordered.
    """

    # Month abbreviation -> (period prefix, offset added to the calendar year).
    _PERIOD_BY_MONTH = {
        "May": ("Q1", 1),
        "Jul": ("6mo", 1),
        "Aug": ("6mo", 1),
        "Oct": ("9mo", 1),
        "Nov": ("9mo", 1),
        "Jan": ("Year Ended", 0),
        "Feb": ("Year Ended", 0),
    }

    def __init__(self):
        # Rows whose first word is "net" close a section of the statement.
        self.total_keyword = "net"
        self.finalStatement = pd.DataFrame(columns=["title","title_order","label"])

        self.CashFlowStatements = self.getCashFlowStatements()
        self.addData(self.CashFlowStatements)

        self.rearrangeFinalStatement()

        self.setQuarters()

        self.performCashFlowMath()
        self.reorderQuarters()

    def getCashFlowStatements(self):
        """Return ``[cash-flow sheet copy, file name]`` for every available report."""
        FinancialReports = self.getFinancialStatements()
        CashFlowStatements = [[report.getCashFlowSheet().copy(), report.file] for report in FinancialReports]
        return CashFlowStatements

    def parseQuarter(self,date):
        """Translate a date header such as ``"Oct. 30, 2021"`` into a period name.

        May maps to Q1, Jul/Aug to ``6mo`` and Oct/Nov to ``9mo`` of the
        *following* year; Jan/Feb map to ``"Year Ended <year>"``.

        Raises:
            ValueError: if the month is not one of the above.  Previously the
                method returned ``None``, which became a column name and only
                failed later, far from the cause.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo not in self._PERIOD_BY_MONTH:
            raise ValueError("cannot map date header %r to a fiscal period (month %r)" % (date, mo))
        prefix, offset = self._PERIOD_BY_MONTH[mo]
        return prefix + " " + str(int(year) + offset)

    def performCashFlowMath(self):
        """Derive single-quarter columns from the cumulative ones.

        For every year that has a ``Q1`` column:
        ``Q2 = 6mo - Q1``, ``Q3 = 9mo - 6mo``, ``Q4 = Year Ended - 9mo``.
        Each quarter is computed only when both of its inputs exist, so a
        partly reported year no longer raises ``KeyError``.
        """
        years = self.getYears()
        available = set(self.getQuarterHeaders())
        for year in years:
            first = "Q1 "+year
            half = "6mo "+year
            nine = "9mo "+year
            full = "Year Ended "+year
            if first not in available:
                continue
            # find Q2
            if half in available:
                self.finalStatement["Q2 "+year] = self.finalStatement[half] - self.finalStatement[first]

            # find Q3
            if half in available and nine in available:
                self.finalStatement["Q3 "+year] = self.finalStatement[nine] - self.finalStatement[half]

            # find Q4
            if nine in available and full in available:
                self.finalStatement["Q4 "+year] = self.finalStatement[full] - self.finalStatement[nine]
    
            
        

In [197]:
# cashflow = CashFlow()
# finalCashflow = cashflow.finalStatement
# finalCashflow

In [27]:
# Export a merged statement to CSV.
# NOTE(review): `finalStatement` is not assigned at module level anywhere in
# the visible notebook, so this fails on a fresh kernel - presumably it should
# be one of the frames built above; confirm which one before re-running.
finalStatement.to_csv("balance 2.csv")