In [2]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import tabulate
# NOTE(review): `lemmatizer` is not referenced by any of the cells visible in
# this notebook; confirm whether it (and the nltk imports) are still needed.
lemmatizer = WordNetLemmatizer()

In [3]:
class FinancialReport():
    """One Excel workbook from the "financial statements" folder.

    On construction the workbook is read (every sheet), and the balance
    sheet and cash-flow sheet are located by sheet name and stored on
    ``self.balance`` / ``self.cashflow`` with their first column renamed
    to ``"label"``.
    """

    def __init__(self, file):
        """Load ``file`` and locate its balance and cash-flow sheets."""
        self.file = file
        self.readFile()
        self.findBalanceSheet()
        self.findCashFlowSheet()

    def readFile(self):
        """Read every sheet of the workbook into ``self.statements``."""
        workbook_path = os.path.join("financial statements", self.file)
        # sheet_name=None -> mapping of sheet name to DataFrame.
        self.statements = pd.read_excel(workbook_path, sheet_name=None)

    def getSheetNames(self):
        """Return the sheet names of the loaded workbook."""
        return self.statements.keys()

    def findBalanceSheet(self):
        """Select the first sheet whose name contains "BALANCE".

        Raises IndexError when no sheet name matches.
        """
        matching = [sheet_name for sheet_name in self.getSheetNames()
                    if "BALANCE" in sheet_name]
        self.balance = self.statements[matching[0]]
        remaining_columns = list(self.balance.columns[1:])
        self.balance.columns = ["label"] + remaining_columns

    def findCashFlowSheet(self):
        """Select the cash-flow sheet and promote its first row to header.

        The first sheet whose name contains "CASH" is used; when there is
        none, the last sheet whose name contains "CONDENSED" is used
        instead (IndexError if that list is empty too).
        """
        sheet_names = list(self.getSheetNames())
        cash_sheets = [sheet_name for sheet_name in sheet_names
                       if "CASH" in sheet_name]
        if len(cash_sheets) > 0:
            chosen = cash_sheets[0]
        else:
            chosen = [sheet_name for sheet_name in sheet_names
                      if "CONDENSED" in sheet_name][-1]
        self.cashflow = self.statements[chosen]

        # Row 0 carries the real column headers; use it, then discard it.
        self.cashflow.columns = self.cashflow.iloc[0]
        self.cashflow = self.cashflow.drop(0)
        remaining_columns = list(self.cashflow.columns[1:])
        self.cashflow.columns = ["label"] + remaining_columns

    def getBalanceSheet(self):
        """Return the balance-sheet DataFrame."""
        return self.balance

    def getCashFlowSheet(self):
        """Return the cash-flow DataFrame."""
        return self.cashflow
    

In [4]:
class HelperFunctions:
    """Small utilities shared by the statement-processing classes."""

    def clean_up_string(self, string):
        """Strip whitespace and drop trailing "of" / "and" words.

        After stripping, the last space-separated word is removed for as
        long as it is exactly "of" or "and" (e.g. "net of and" -> "net").
        Only the single space before the dropped word is removed.
        """
        string = string.strip()
        while True:
            head, sep, tail = string.rpartition(" ")
            # Stop when there is no space left or the last word is a keeper.
            if not sep or tail not in ("of", "and"):
                return string
            string = head

    def merge_number(self, a, b):
        """Return the non-null one of ``a`` / ``b``, or the larger if both are set."""
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a, b)

    def getFiles(self):
        """List the workbook file names in the "financial statements" folder.

        The macOS ".DS_Store" entry and names starting with "~" are left
        out. Unlike ``list.remove``, filtering does not fail when
        ".DS_Store" is absent from the directory.
        """
        return [
            name
            for name in os.listdir("financial statements")
            if name != ".DS_Store" and not name.startswith("~")
        ]

    def getFinancialStatements(self):
        """Build one ``FinancialReport`` per file returned by ``getFiles``."""
        files = self.getFiles()
        FinancialReports = [FinancialReport(file) for file in files]
        return FinancialReports
        

In [120]:
class ProcessQuarters:
    """Column handling for the period (quarter / year-end) columns of ``self.finalStatement``.

    Mixin: the concrete class must provide ``self.finalStatement`` (a
    DataFrame) and, for ``setQuarters``, a ``parseQuarter(date)`` method.
    """

    def getQuarterHeaders(self):
        """Return every column except "title", "label" and "title_order"."""
        return [col for col in self.finalStatement.columns
                if col not in ("title", "label", "title_order")]

    def includesYear(self, string):
        """Return True when the last four characters of ``string`` are digits."""
        return string[-4:].isdigit()

    def getLabelHeaders(self):
        """Return the columns whose name does not end in a four-digit year."""
        return [col for col in self.finalStatement.columns
                if not self.includesYear(col)]

    def setQuarters(self):
        """Sort the date columns chronologically and rename them via ``parseQuarter``.

        Date headers are parsed as '%b %d, %Y' after removing any "."
        (e.g. "Oct. 31, 2020").
        """
        dates = self.getQuarterHeaders()
        headers = self.getLabelHeaders()
        dates.sort(key=lambda date: datetime.strptime(date.replace(".", ""), '%b %d, %Y'))
        self.finalStatement = self.finalStatement[headers + dates]
        # `dates` is already the post-reorder column order, so rename from it directly.
        self.finalStatement.columns = headers + [self.parseQuarter(date) for date in dates]

    def reorderQuarters(self):
        """Order period columns as Q1..Q4, "Year Ended" within each year.

        Only years that have a "Q1 <year>" column are kept. Within a kept
        year, periods that have no column are skipped rather than raising
        KeyError, so a year whose later quarters are not loaded yet still
        works.
        """
        available = set(self.getQuarterHeaders())
        headers = self.getLabelHeaders()
        years = sorted(year for year in self.getYears() if "Q1 " + year in available)

        cols = [
            period + " " + year
            for year in years
            for period in ("Q1", "Q2", "Q3", "Q4", "Year Ended")
            if period + " " + year in available
        ]
        self.finalStatement = self.finalStatement[headers + cols]

    def getYears(self):
        """Return the distinct trailing (space-separated) tokens of the period columns."""
        return list(set([x.split(" ")[-1] for x in self.getQuarterHeaders()]))
    
    
   

In [124]:
class FinancialDataMerge:
    """Merges the statement sheets of several workbooks into ``self.finalStatement``.

    Mixin: the concrete class must provide ``self.finalStatement``,
    ``self.total_keyword``, ``merge_number``, ``clean_up_string`` and
    ``getQuarterHeaders``.
    """

    # Title given to rows that come after a section's total line and before
    # the next section header. Kept as the literal string "nan" because
    # reconcileTitleOrder filters on exactly that value.
    _NO_TITLE = "nan"
    # Parenthesised text, matched greedily from the first "(" to the last ")".
    _PAREN_RE = re.compile(r"\([\w\W]+\)")
    # " $" markers and digit runs (with preceding whitespace / trailing commas).
    _AMOUNT_RE = re.compile(r" \$|\s*\d+\,*")

    def addData(self, FinancialReports):
        """Parse each ``[sheet, file]`` pair and outer-merge it into ``finalStatement``.

        Only columns not already present in ``finalStatement`` are taken
        from each sheet. Rows are matched on ("label", "title").
        """
        for sheet, file in FinancialReports:
            print(file)
            # Skip data columns already present in the main sheet.
            new_cols = [col for col in sheet.columns[1:] if col not in self.finalStatement.columns]
            sheet = self.parseStatement(sheet[["label"] + new_cols])

            merged = pd.merge(self.finalStatement, sheet, how="outer",
                              on=["label", "title"], suffixes=('', '_x'))
            # Keep the greatest section number: a bigger number means some
            # workbook has an extra section before this one.
            merged["title_order"] = self._mergeTitleOrder(merged)
            self.finalStatement = merged.drop(["title_order_x"], axis=1)

    def _mergeTitleOrder(self, frame):
        """Combine ``title_order`` and ``title_order_x`` row by row with ``merge_number``."""
        return [self.merge_number(left, right)
                for left, right in zip(frame["title_order"], frame["title_order_x"])]

    def rearrangeFinalStatement(self):
        """Drop rows with no period data, then sort rows by section and position."""
        self.finalStatement = self.finalStatement.replace(" ", np.nan)
        has_data = ~self.finalStatement[self.getQuarterHeaders()].isnull().all(axis=1)
        self.finalStatement = self.finalStatement.loc[has_data]

        # Make sure each title has the same (max) section number.
        self.reconcileTitleOrder()

        # Put totals at the bottom of each section.
        self.reconcileLabelOrderWithTotals()

        order_cols = ["title_order", "label_order"]
        self.finalStatement[order_cols] = self.finalStatement[order_cols].astype("int")
        self.finalStatement = self.finalStatement.sort_values(order_cols)

        self.finalStatement = self.finalStatement.drop(order_cols, axis=1).reset_index(drop=True)

    def parseStatement(self, sheet):
        """Attach a section title and section number to each data row of ``sheet``.

        Walks ``sheet["label"]`` top to bottom:

        * a label containing ":" starts a new section; its text minus the
          last character, with parenthesised text removed, becomes the title;
        * rows before the first section header are dropped;
        * other labels are normalised (lower-cased, "gain" -> "loss",
          parenthesised text / " $" / digits removed, then
          ``clean_up_string``);
        * a row whose first word equals ``self.total_keyword`` closes the
          section, so following rows get the title "nan" until the next
          header.

        Non-string labels (e.g. NaN from blank rows) are skipped. Rows are
        selected by position, so a dropped row that shares its label with
        a kept row cannot be picked up by mistake. ``title_order`` is kept
        as an integer so that later ``max`` comparisons are numeric rather
        than lexicographic ("9" > "10").

        Returns a new DataFrame with columns "title", "title_order",
        "label" followed by the remaining columns of ``sheet``.
        """
        section_no = 0
        title = False
        kept_rows, titles, orders, labels = [], [], [], []

        for position, label in enumerate(sheet["label"]):
            if not isinstance(label, str):
                continue
            if ":" in label:
                title = self._PAREN_RE.sub("", label[:-1]).replace("  ", " ")
                section_no += 1
                continue
            if not title:
                continue

            new_label = label.lower().replace("gain", "loss")
            new_label = self._PAREN_RE.sub("", new_label).replace("  ", " ").strip()
            new_label = self._AMOUNT_RE.sub("", new_label)
            new_label = self.clean_up_string(new_label)

            kept_rows.append(position)
            titles.append(title)
            orders.append(section_no)
            labels.append(new_label)

            if self.total_keyword == new_label.split(" ")[0]:
                # A total line ends the section.
                title = self._NO_TITLE
                section_no += 1

        parsed = sheet.iloc[kept_rows].copy()
        parsed["label"] = labels
        parsed["title"] = titles
        parsed["title_order"] = orders
        # Moves the three descriptor columns to the front with a fresh index.
        return parsed.set_index(["title", "title_order", "label"]).reset_index()

    def reconcileTitleOrder(self):
        """Give every row of a titled section that section's maximum ``title_order``."""
        title_groupby = (self.finalStatement.groupby("title")["title_order"].max()
                         .reset_index().dropna(axis=0, how='any'))
        # Untitled rows keep their own number.
        title_groupby = title_groupby.loc[title_groupby.title != "nan"]
        merged = self.finalStatement.merge(title_groupby, on="title", how="left", suffixes=('', '_x'))
        merged["title_order"] = self._mergeTitleOrder(merged)
        self.finalStatement = merged.drop(["title_order_x"], axis=1)

    def reconcileLabelOrderWithTotals(self):
        """Move rows whose label starts with ``total_keyword`` to the end and number all rows.

        The running row number is stored in a new "label_order" column.
        """
        is_total = self.finalStatement.label.str.startswith(self.total_keyword)
        totals = self.finalStatement.loc[is_total].reset_index(drop=True)
        others = self.finalStatement.loc[~is_total].reset_index(drop=True)
        combined = pd.concat([others, totals])
        self.finalStatement = (combined.reset_index(drop=True).reset_index()
                               .rename(columns={"index": "label_order"}))
        
            

In [125]:
class Balance(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    """Merged balance sheet across every workbook, one column per fiscal period.

    Construction runs the whole pipeline; the result is ``self.finalStatement``.
    """

    # Month abbreviation of a period-end date -> (period name, offset added to
    # the calendar year to get the labelled year).
    _PERIOD_BY_MONTH = {
        "May": ("Q1", 1),
        "Jul": ("Q2", 1),
        "Aug": ("Q2", 1),
        "Oct": ("Q3", 1),
        "Nov": ("Q3", 1),
        "Jan": ("Year Ended", 0),
        "Feb": ("Year Ended", 0),
    }

    def __init__(self):
        """Load, merge, sort and relabel the balance sheets."""
        self.total_keyword = "total"
        self.finalStatement = pd.DataFrame(columns=["title", "title_order", "label"])

        self.BalanceStatements = self.getBalanceStatements()
        self.addData(self.BalanceStatements)

        self.rearrangeFinalStatement()

        self.setQuarters()

        self.performBalanceMath()

        self.reorderQuarters()

    def getBalanceStatements(self):
        """Return ``[balance_sheet_copy, file_name]`` for every financial report."""
        FinancialReports = self.getFinancialStatements()
        BalanceStatements = [[report.getBalanceSheet().copy(), report.file] for report in FinancialReports]
        return BalanceStatements

    def parseQuarter(self, date):
        """Translate a date header such as "Oct. 31, 2020" into a period label.

        May -> "Q1 <year+1>", Jul/Aug -> "Q2 <year+1>", Oct/Nov ->
        "Q3 <year+1>", Jan/Feb -> "Year Ended <year>".

        Raises ValueError for any other month. Returning None here would
        only surface later as a TypeError in ``performBalanceMath``.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        try:
            period, year_offset = self._PERIOD_BY_MONTH[mo]
        except KeyError:
            raise ValueError(
                "Cannot map period-end date %r to a fiscal period: unhandled month %r" % (date, mo)
            )
        return period + " " + str(int(year) + year_offset)

    def performBalanceMath(self):
        """Copy each "Year Ended <year>" column into a "Q4 <year>" column."""
        year_end_data = [col for col in self.getQuarterHeaders() if col[:-5] == "Year Ended"]
        q4 = ["Q4 " + col.split(" ")[-1] for col in year_end_data]
        print(q4)
        self.finalStatement[q4] = self.finalStatement[year_end_data]
        

In [126]:
# Run the full balance-sheet pipeline (Balance.__init__ loads and merges every
# workbook) and show the merged table as this cell's output.
balance = Balance()
finalStatement = balance.finalStatement
finalStatement


q3 2021.xlsx
q3 2020.xlsx
q1 2019.xlsx
q2 2021.xlsx
q2 2020.xlsx
10k 2020 (2019).xlsx
10k 2019 (2020).xlsx
q1 2021.xlsx
q2 2019.xlsx
q1 2020.xlsx
10k 2022 (2021).xlsx
q3 2019.xlsx
10k 2021 (2022).xlsx
['Q4 2018', 'Q4 2019', 'Q4 2020', 'Q4 2021', 'Q4 2022']


Unnamed: 0,title,label,Q1 2019,Q2 2019,Q3 2019,Q4 2019,Year Ended 2019,Q1 2020,Q2 2020,Q3 2020,...,Q1 2021,Q2 2021,Q3 2021,Q4 2021,Year Ended 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022,Year Ended 2022
0,Current assets,cash and cash equivalents,1210.0,1322.0,958.0,1081.0,1081.0,941.0,1177.0,788.0,...,1028.0,2188.0,2471.0,1988.0,1988.0,2066.0,2375.0,801.0,877.0,877.0
1,Current assets,short-term investments,,,,,,,,,...,,,178.0,410.0,410.0,,,275.0,0.0,0.0
2,Current assets,merchandise inventory,2035.0,2202.0,2668.0,2131.0,2131.0,2242.0,2326.0,2720.0,...,2217.0,2242.0,2747.0,2451.0,2451.0,2370.0,2281.0,2721.0,3018.0,3018.0
3,Current assets,other current assets,778.0,780.0,792.0,751.0,751.0,757.0,770.0,770.0,...,920.0,882.0,966.0,1159.0,1159.0,1091.0,1201.0,1410.0,1270.0,1270.0
4,Current assets,"available-for-sale securities, current",164.0,286.0,296.0,288.0,288.0,272.0,294.0,294.0,...,51.0,25.0,,,,475.0,337.0,,,
5,Current assets,total current assets,4187.0,4590.0,4714.0,4251.0,4251.0,4212.0,4567.0,4572.0,...,4216.0,5337.0,6362.0,6008.0,6008.0,6002.0,6194.0,5207.0,5165.0,5165.0
6,,"property and equipment, net of accumulated dep...",2791.0,2832.0,2887.0,2912.0,2912.0,3129.0,3141.0,3225.0,...,2945.0,2895.0,2846.0,2841.0,2841.0,2839.0,2897.0,2924.0,3037.0,3037.0
7,,"operating lease, right-of-use asset",0.0,0.0,0.0,0.0,0.0,5732.0,5807.0,5796.0,...,4851.0,4689.0,4460.0,4217.0,4217.0,4060.0,3975.0,3788.0,,
8,,other long-term assets,607.0,588.0,572.0,886.0,886.0,547.0,528.0,525.0,...,698.0,795.0,705.0,703.0,703.0,703.0,693.0,861.0,884.0,884.0
9,,"accumulated depreciation, depletion and amorti...",,,,,,,,,...,,,,,,,,,5071.0,5071.0


In [None]:
class Income(FinancialDataMerge, ProcessQuarters, HelperFunctions):
    """Merged income statement across every workbook.

    Construction runs the pipeline; the result is ``self.finalStatement``.

    NOTE(review): ``getIncomeStatements`` calls ``report.getIncomeSheet()``,
    but the ``FinancialReport`` class visible in this notebook defines no
    such method, so constructing ``Income`` is expected to raise
    AttributeError until one is added.
    """

    # Month abbreviation of a period-end date -> (period name, offset added to
    # the calendar year to get the labelled year).
    _PERIOD_BY_MONTH = {
        "May": ("Q1", 1),
        "Jul": ("Q2", 1),
        "Aug": ("Q2", 1),
        "Oct": ("Q3", 1),
        "Nov": ("Q3", 1),
        "Jan": ("Year Ended", 0),
        "Feb": ("Year Ended", 0),
    }

    def __init__(self):
        """Load, merge, sort and relabel the income statements."""
        self.total_keyword = "total"
        self.finalStatement = pd.DataFrame(columns=["title", "title_order", "label"])

        self.IncomeStatements = self.getIncomeStatements()
        self.addData(self.IncomeStatements)

        self.rearrangeFinalStatement()

        self.setQuarters()

        #self.reorderQuarters()

    def getIncomeStatements(self):
        """Return ``[income_sheet_copy, file_name]`` for every financial report."""
        FinancialReports = self.getFinancialStatements()
        IncomeStatements = [[report.getIncomeSheet().copy(), report.file] for report in FinancialReports]
        return IncomeStatements

    def parseQuarter(self, date):
        """Translate a date header such as "Oct. 31, 2020" into a period label.

        May -> "Q1 <year+1>", Jul/Aug -> "Q2 <year+1>", Oct/Nov ->
        "Q3 <year+1>", Jan/Feb -> "Year Ended <year>".

        Raises ValueError for any other month instead of silently returning
        None, which would become a column named None.
        """
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        try:
            period, year_offset = self._PERIOD_BY_MONTH[mo]
        except KeyError:
            raise ValueError(
                "Cannot map period-end date %r to a fiscal period: unhandled month %r" % (date, mo)
            )
        return period + " " + str(int(year) + year_offset)
        

In [None]:
# Run the income-statement pipeline and show the merged table.
# Income stores its result on `finalStatement`; it never sets `finalIncome`.
income = Income()
finalIncome = income.finalStatement
finalIncome


In [27]:
# Persist the merged balance sheet (built in the Balance cell) as CSV, using a
# path relative to the working directory.
finalStatement.to_csv("balance 2.csv")