In [1]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re

In [2]:
class FinancialReport():
    def __init__(self,file):
        self.file = file
        self.readFile()
        self.findBalanceSheet()
    def readFile(self):
        path = os.path.join("financial statements",self.file)
        self.statements = pd.read_excel(path,sheet_name=None)
    def getSheetNames(self):
        return self.statements.keys()
    def findBalanceSheet(self):
        balanceSheetName = [name for name in self.getSheetNames() if "BALANCE" in name][0]
        self.balance = self.statements[balanceSheetName]
        self.balance.columns = ["labels"]+list(self.balance.columns[1:])
    def getBalanceSheet(self):
        return self.balance
    

In [34]:
class Balance():
    def __init__(self):
        self.finalBalance = pd.DataFrame(columns=["title","num_title","labels"])
        self.setFinancialStatements()
        self.addDataToBalanceSheet()
        #self.rearrangeFinalBalanceSheet()
        
    def getFiles(self):
        files = os.listdir("financial statements")
        files.remove(".DS_Store")
        files = [file for file in files if file[0] != "~"]
        return files

    def setFinancialStatements(self):
        self.files = self.getFiles()
        self.FinancialReports = [FinancialReport(file) for file in self.files]
        self.addDataToBalanceSheet()
        self.rearrangeFinalBalanceSheet()
        
    def addDataToBalanceSheet(self):
        
        for FinancialStatements in self.FinancialReports[:1]:
            balance = FinancialStatements.getBalanceSheet().copy()
            
            cols = ["labels"]+[col for col in balance.columns[1:] if col not in self.finalBalance.columns] ## remove columns of data already in main balance sheet
            balance = balance[cols]
            
            balance = self.parse_attributes(balance)
            
            balance.loc[:,"labels"] = [re.sub(" \$|\s*\d+\,*", "", label)  if np.any([char.isdigit() for char in label]) else label for label in balance["labels"]  ]
            balance.labels = balance.labels.apply(self.clean_up_string)
            
            
            self.finalBalance = pd.merge(self.finalBalance, balance, how="outer",on=["labels"],suffixes=('', '_new'))

            self.finalBalance.title = [self.finalBalance.title[i] if not pd.isnull(self.finalBalance.title[i]) else self.finalBalance.title_new[i] for i in range(len(self.finalBalance))] #merge titles
            self.finalBalance.num_title = [self.merge_number(self.finalBalance.num_title[i],self.finalBalance.num_title_new[i]) for i in range(len(self.finalBalance))] #merge the number for the titles. use greatest num bc bigger num means there is section before it
            self.finalBalance = self.finalBalance.drop(["title_new","num_title_new"],axis=1)

    def rearrangeFinalBalanceSheet(self):
        self.finalBalance = self.finalBalance.replace(" ",np.nan)

        # put totals at the bottom of each section
        totals = self.finalBalance.loc[self.finalBalance.labels.str.startswith("Total")].reset_index(drop=True) 
        self.finalBalance = self.finalBalance.loc[~self.finalBalance.labels.str.startswith("Total")].reset_index(drop=True)
        self.finalBalance = pd.concat([self.finalBalance,totals]) 
        self.finalBalance = self.finalBalance.reset_index(drop=True).reset_index()
        self.finalBalance = self.finalBalance.sort_values(["num_title","index"])
        

    def parse_attributes(self,balance):
        num = 0
        index = []
        for label in balance.labels:
            if ":" in label:
                title = label[:-1]
                num += 1
                continue
            index.append([title,num,label])
            if "Total" in label:
                title = np.nan
                num += 1

        balance = balance.loc[balance.labels.isin(np.array(index)[:,2])].copy()
        balance.loc[:,["title","num_title","labels"]] = list(index)
        balance = balance.set_index(["title","num_title","labels"]).reset_index()
        return balance
    
    def clean_up_string(self,string):
        string = string.strip()
        space_positions = [x.start() for x in re.finditer('\ ',  string)]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string
    
    def merge_number(self,a,b):
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)
    

In [35]:
balance = Balance()
balance.finalBalance

MergeError: Can only pass argument "on" OR "left_index" and "right_index", not a combination of both.

In [655]:
#cols = finalBalance.columns[2:]

#finalBalance = finalBalance.loc[~finalBalance[cols].isnull().all(axis=1)]

In [656]:
finalBalance = finalBalance.replace(" ",np.nan)

# put totals at the bottom of each section
totals = finalBalance.loc[finalBalance.labels.str.startswith("Total")].reset_index(drop=True) 
finalBalance = finalBalance.loc[~finalBalance.labels.str.startswith("Total")].reset_index(drop=True)
finalBalance = pd.concat([finalBalance,totals]) 
finalBalance = finalBalance.reset_index(drop=True).reset_index()
finalBalance = finalBalance.sort_values(["num_title","index"])


In [657]:
finalBalance2 = finalBalance.drop(["index","num_title"],axis=1)

In [658]:
finalBalance2.to_csv("finalBalance.csv")

In [620]:
finalBalance

Unnamed: 0,index,title,num_title,labels,"Oct. 30, 2021","Jan. 30, 2021","Oct. 31, 2020","Feb. 01, 2020","Nov. 02, 2019","May 04, 2019",...,"May 05, 2018","Jul. 31, 2021","Aug. 01, 2020","Aug. 03, 2019","Feb. 03, 2018","May 01, 2021","May 02, 2020","Aug. 04, 2018","Jan. 29, 2022","Nov. 03, 2018"
0,0,Current assets,0.0,Cash and cash equivalents,801.0,1988.0,2471.0,1364.0,788.0,941.0,...,1210.0,2375.0,2188.0,1177.0,1783.0,2066.0,1028.0,1322.0,877.0,958.0
1,1,Current assets,0.0,Short-term investments,275.0,410.0,178.0,,,,...,,,,,,,,,0.0,
2,2,Current assets,0.0,Merchandise inventory,2721.0,2451.0,2747.0,2156.0,2720.0,2242.0,...,2035.0,2281.0,2242.0,2326.0,1997.0,2370.0,2217.0,2202.0,3018.0,2668.0
3,3,Current assets,0.0,Other current assets,1410.0,1159.0,966.0,706.0,770.0,757.0,...,778.0,1201.0,882.0,770.0,788.0,1091.0,920.0,780.0,1270.0,792.0
23,23,Current assets,0.0,"Available-for-sale Securities, Current",,,,290.0,294.0,272.0,...,164.0,337.0,25.0,294.0,,475.0,51.0,286.0,,296.0
25,25,Current assets,0.0,Short-term Investments,,,,,,,...,,,,,0.0,,,,,
37,37,Current assets,0.0,"Cash and Cash Equivalents, at Carrying Value",,,,,,,...,,,,,,,,,,
38,38,Current assets,0.0,"Other Assets, Current",,,,,,,...,,,,,,,,,,
40,40,Current assets,0.0,Total current assets,5207.0,6008.0,6362.0,4516.0,4572.0,4212.0,...,4187.0,6194.0,5337.0,4567.0,4568.0,6002.0,4216.0,4590.0,5165.0,4714.0
4,4,,1.0,"Property and equipment, net of accumulated dep...",2924.0,2841.0,2846.0,3122.0,3225.0,3129.0,...,2791.0,2897.0,2895.0,3141.0,2805.0,2839.0,2945.0,2832.0,3037.0,2887.0
