In [11]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime

In [15]:
class FinancialReport():
    def __init__(self,file):
        self.file = file
        self.readFile()
        self.findBalanceSheet()
    def readFile(self):
        path = os.path.join("financial statements",self.file)
        self.statements = pd.read_excel(path,sheet_name=None)
    def getSheetNames(self):
        return self.statements.keys()
    def findBalanceSheet(self):
        balanceSheetName = [name for name in self.getSheetNames() if "BALANCE" in name][0]
        self.balance = self.statements[balanceSheetName]
        self.balance.columns = ["label"]+list(self.balance.columns[1:])
    def getBalanceSheet(self):
        return self.balance
    

In [37]:
class Balance():
    def __init__(self):
        self.finalBalance = pd.DataFrame(columns=["title","title_order","label"])
        self.setFinancialStatements()
        self.addDataToBalanceSheet()
#         self.rearrangeFinalBalanceSheet()
#         self.setQuarters()
        
    def getFiles(self):
        files = os.listdir("financial statements")
        files.remove(".DS_Store")
        files = [file for file in files if file[0] != "~"]
        return files

    def setFinancialStatements(self):
        self.files = self.getFiles()
        self.FinancialReports = [FinancialReport(file) for file in self.files]
        
    def addDataToBalanceSheet(self):
        
        for FinancialStatements in self.FinancialReports:
            balance = FinancialStatements.getBalanceSheet().copy()
            
            cols = ["label"]+[col for col in balance.columns[1:] if col not in self.finalBalance.columns] ## remove columns of data already in main balance sheet
            balance = balance[cols]
            
            balance = self.parseStatement(balance)
            
            self.finalBalance = pd.merge(self.finalBalance, balance, how="outer",on=["label","title"],suffixes=('', '_x'))
            
            self.finalBalance.title_order = [self.merge_number(self.finalBalance.title_order[i],self.finalBalance.title_order_x[i]) for i in range(len(self.finalBalance))] #merge the number for the titles. use greatest num bc bigger num means there is section before it
            self.finalBalance = self.finalBalance.drop(["title_order_x"],axis=1)

    def rearrangeFinalBalanceSheet(self):
        self.finalBalance = self.finalBalance.replace(" ",np.nan)

        # put totals at the bottom of each section
        totals = self.finalBalance.loc[self.finalBalance.label.str.startswith("total")].reset_index(drop=True) 
        self.finalBalance = self.finalBalance.loc[~self.finalBalance.label.str.startswith("total")].reset_index(drop=True)
        self.finalBalance = pd.concat([self.finalBalance,totals]) 
        self.finalBalance = self.finalBalance.reset_index(drop=True).reset_index()
        self.finalBalance = self.finalBalance.sort_values(["title_order","index"]) # order by title and then location
        self.finalBalance = self.finalBalance.drop(["index","title_order"],axis=1)

    def parseStatement(self,sheet):
        num = 0
        index = []
        title = False
        for label in sheet.label:
            if ":" in label:
                title = re.sub("\([\w\W]+\)","",label[:-1]).replace("  "," ")
                num += 1
                continue
            if title:
                new_label = label.lower()
                new_label = new_label.replace("gain","loss")
                new_label = re.sub("\([\w\W]+\)","",new_label).replace("  "," ").strip()
                new_label = re.sub(" \$|\s*\d+\,*", "", new_label) 
                new_label = self.clean_up_string(new_label)
                
                print([title,num,label,new_label])
                index.append([title,num,label,new_label])
            
            if "total" == new_label[:5]:
                title = np.nan
                num += 1

        sheet = sheet.loc[sheet.label.isin(np.array(index)[:,2])].copy()
        sheet.loc[:,["title","title_order","label"]] = list(np.array(index)[:,[0,1,3]])
        sheet = sheet.set_index(["title","title_order","label"]).reset_index()
        return sheet
    
    def clean_up_string(self,string):
        string = string.strip()
        space_positions = [x.start() for x in re.finditer('\ ',  string)]
        for i in reversed(space_positions):
            if string[i+1:] in ["of", "and"]:
                string = string[:i]
        return string
    
    def merge_number(self,a,b):
        if pd.isnull(a):
            return b
        if pd.isnull(b):
            return a
        return max(a,b)
    def setQuarters(self):
        dates = list(self.finalBalance.columns[2:])
        dates.sort(key=lambda date: datetime.strptime(date.replace(".",""), '%b %d, %Y'))
        self.finalBalance = self.finalBalance[["title","label"]+dates]
        self.finalBalance.columns = ["title","label"]+[self.parseQuarter(date) for date in self.finalBalance.columns[2:]]
    def parseQuarter(self,date):
        mo = date.split(" ")[0].strip(".")
        year = date.split(", ")[-1]
        if mo in ["May"]:
            return "Q1 "+str(int(year)+1)
        
        if mo in ["Jul","Aug"]:
            return "Q2 "+str(int(year)+1)
        
        if mo in ["Oct","Nov"]:
            return "Q3 "+str(int(year)+1)
        
        if mo in ["Jan","Feb"]:
            return "Q4 "+str(int(year))
        
        

In [38]:
balance = Balance()
balance.finalBalance

['Current assets', 1, 'Cash and cash equivalents', 'cash and cash equivalents']
['Current assets', 1, 'Short-term investments', 'short-term investments']
['Current assets', 1, 'Merchandise inventory', 'merchandise inventory']
['Current assets', 1, 'Other current assets', 'other current assets']
['Current assets', 1, 'Total current assets', 'total current assets']
[nan, 2, 'Property and equipment, net of accumulated depreciation', 'property and equipment, net of accumulated depreciation']
[nan, 2, 'Operating Lease, Right-of-Use Asset', 'operating lease, right-of-use asset']
[nan, 2, 'Other long-term assets', 'other long-term assets']
[nan, 2, 'Total assets', 'total assets']
['Current liabilities', 4, 'Accounts payable', 'accounts payable']
['Current liabilities', 4, 'Accrued expenses and other current liabilities', 'accrued expenses and other current liabilities']
['Current liabilities', 4, 'Operating Lease, Liability, Current', 'operating lease, liability, current']
['Current liabiliti

Unnamed: 0,title_order,title,label,"Oct. 30, 2021","Jan. 30, 2021","Oct. 31, 2020","Feb. 01, 2020","Nov. 02, 2019","May 04, 2019","Feb. 02, 2019","May 05, 2018","Jul. 31, 2021","Aug. 01, 2020","Aug. 03, 2019","Feb. 03, 2018","May 01, 2021","May 02, 2020","Aug. 04, 2018","Jan. 29, 2022","Nov. 03, 2018"
0,1,Current assets,cash and cash equivalents,801.0,1988.0,2471.0,1364.0,788.0,941.0,1081.0,1210.0,2375.0,2188.0,1177.0,1783.0,2066.0,1028.0,1322.0,877.0,958.0
1,1,Current assets,short-term investments,275.0,410.0,178.0,,,,,,,,,0.0,,,,0.0,
2,1,Current assets,merchandise inventory,2721.0,2451.0,2747.0,2156.0,2720.0,2242.0,2131.0,2035.0,2281.0,2242.0,2326.0,1997.0,2370.0,2217.0,2202.0,3018.0,2668.0
3,1,Current assets,other current assets,1410.0,1159.0,966.0,706.0,770.0,757.0,751.0,778.0,1201.0,882.0,770.0,788.0,1091.0,920.0,780.0,1270.0,792.0
4,1,Current assets,total current assets,5207.0,6008.0,6362.0,4516.0,4572.0,4212.0,4251.0,4187.0,6194.0,5337.0,4567.0,4568.0,6002.0,4216.0,4590.0,5165.0,4714.0
5,2,,"property and equipment, net of accumulated dep...",2924.0,2841.0,2846.0,3122.0,3225.0,3129.0,2912.0,2791.0,2897.0,2895.0,3141.0,2805.0,2839.0,2945.0,2832.0,3037.0,2887.0
6,2,,"operating lease, right-of-use asset",3788.0,4217.0,4460.0,5402.0,5796.0,5732.0,0.0,0.0,3975.0,4689.0,5807.0,,4060.0,4851.0,0.0,,0.0
7,2,,other long-term assets,861.0,703.0,705.0,639.0,525.0,547.0,886.0,607.0,693.0,795.0,528.0,616.0,703.0,698.0,588.0,884.0,572.0
8,2,,total assets,12780.0,13769.0,14373.0,13679.0,14118.0,13620.0,8049.0,7585.0,13759.0,13716.0,14043.0,7989.0,13604.0,12710.0,8010.0,12761.0,8173.0
9,4,Current liabilities,accounts payable,1630.0,1743.0,2284.0,1174.0,1241.0,994.0,1126.0,1072.0,1583.0,1629.0,1246.0,1181.0,1530.0,971.0,1297.0,1951.0,1299.0


In [24]:
balance.finalBalance.to_csv("finalBalance2.csv")

In [79]:
'Oct. 30, 2021'.split(", ")[-1]

'2021'