In [100]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests

In [173]:
class HelperFunctions():
    def getYears(self,quarters): #go through each file name and find the year of filing
        return [int(re.findall("\d{4}",quarter)[0]) for quarter in quarters]
    
    def getUniqueYears(self,quarters):
        return set(self.getYears(quarters))
    
    def getPossibleFilingsFromYears(self, years):
        return [q + " " + str(year) for year in years for q in ["Q1","Q2","Q3","Q4","Year Ended"]]
    
    def getExcelSheetsPostDate(self,Excel):
        years = range(self.fromDate,self.endDate+1)
        names = self.getPossibleFilingsFromYears(years)
        names = ["Year Ended "+str(self.fromDate-1)]+names
        files = [filing for name in names for filing in Excel.keys() if name in filing] 
        Excel = {file:Excel[file] for file in files}
        return Excel
    
    def reorderQuarters(self,compiledStatement):
        quarters = self.getQuarters(compiledStatement)
        years = self.getUniqueYears(quarters)
        cols = self.getPossibleFilingsFromYears(years)
        cols = [col for col in cols if col in quarters]
        compiledStatement = compiledStatement[["label"] + cols]
        return compiledStatement
        
    def getQuarters(self, compiledStatement):
        return list(compiledStatement.columns[1:])
    
    
    def cleanup_label(self, label):
        new_label = label.lower()
        new_label = new_label.replace("gain","loss").replace("decrease","increase").replace("  "," ").replace("—"," - ").strip()
        new_label = re.sub("\s\([\w\W]+?\)","",new_label)
        new_label = re.sub(" \$|\s*\d+\,*", "", new_label) 
        return new_label
    
    
    def addData(self, Excel):
        compiledStatement = pd.DataFrame(columns=["label"])
        
        for quarter,sheet in list(Excel.items()):
            quarter =  re.sub("\s\([\w\W]+?\)","",quarter) #remove the parenthesis ie. (2021) from the quarter
            sheet.columns = ["label",quarter]
            sheet.label = sheet.label.apply(self.cleanup_label)
            
            #sheet = sheet.dropna(axis=0, how="any")
            sheet = sheet[(~sheet[quarter].astype("string").str.contains("[a-zA-Z]",regex=True))|sheet[quarter].isnull()]

            sheeti = statementi = 0
            newStatement = pd.DataFrame(columns=["label"])
            #merge the current finalstatement and new sheet into one statement 
            #go through each row of the two statements and keep track of the indexes of each statement
            #continue until one of either the compiled statement or sheet ends
            while sheeti < len(sheet) and statementi < len(compiledStatement):
              
                sheet_label = sheet.iloc[sheeti].label
                statement_label = compiledStatement.iloc[statementi].label
                newRowStatement = compiledStatement.iloc[statementi:statementi+1].reset_index(drop=True) #grab the row and drop the index so that they merge
                newRowSheet = sheet.iloc[sheeti:sheeti+1].reset_index(drop=True)
                
                #check if the labels are the same 
                if statement_label == sheet_label:
                    #if labels are the same then create a new row by concating the rows
                    newRowSheet = newRowSheet.drop(columns="label")
                    newRow = pd.concat([newRowStatement,newRowSheet],axis=1)
                    
                    #concat the newRow to the newstatement
                    newStatement = pd.concat([newStatement, newRow])
                    #move on to the next row
                    sheeti+=1
                    statementi+=1
                    
                else:
                    maxi = min(sheeti+4,len(sheet))
                    
                    #next, before I append the final statement label, I want to check if the statement label corresponds to the next label for the sheet
                    #if it corresponds, then I will move the sheet up instead
                    if statement_label in list(sheet.iloc[sheeti:maxi].label): 
                        newStatement = pd.concat([newStatement,newRowSheet]) #concat the sheet row to the new statement
                        sheeti += 1
                        continue
                    
                    #if the two labels don't match, default is to add the row of the final statment
                    newStatement = pd.concat([newStatement,newRowStatement])
                    statementi += 1
                    
            #if either the finalstatement or the new sheet hasn't finished, then add the end of the statement to the new one
            newStatement = pd.concat([newStatement,sheet.iloc[sheeti:len(sheet)]])
            newStatement = pd.concat([newStatement,compiledStatement.iloc[statementi:len(compiledStatement)]])
            
            compiledStatement = newStatement
            
        compiledStatement = compiledStatement.reset_index(drop=True)
        return compiledStatement
    

In [174]:
class CompileIncomeStatement(HelperFunctions):
    def __init__(self,ticker,fromDate, endDate=2022):
        self.ticker = ticker
        self.fromDate = fromDate
        self.endDate = endDate
        
        self.path = os.path.join("input",f"Financial Statement {self.ticker}")
        self.input = os.path.join(self.path, f"Income Statements All-{self.ticker}.xlsx")
        self.Excel = self.readExcel()
        self.Excel = self.getExcelSheetsPostDate(self.Excel)
        
        self.compiledStatement = self.addData(self.Excel)
        self.compiledStatement = self.performIncomeMath(self.compiledStatement)
        self.compiledStatement = self.reorderQuarters(self.compiledStatement)
    
    def readExcel(self):
        Excel = pd.read_excel(self.input,sheet_name=None)
        return Excel
    
    
    
    def performIncomeMath(self,compiledStatement):
        quarters = self.getQuarters(compiledStatement)
        years = self.getUniqueYears(quarters)
        
        for year in years:
            year = str(year)
            if "Year Ended "+year in quarters and "Q3 "+year in quarters: 
                compiledStatement["Q4 "+year] = compiledStatement["Year Ended "+year] - compiledStatement["Q3 "+year]
                compiledStatement.loc[compiledStatement.label.str.startswith("weighted-average"),"Q4 "+year] = compiledStatement.loc[compiledStatement.label.str.startswith("weighted-average"),"Year Ended "+year]
        return compiledStatement

            

In [175]:
class Income:
    def __init__(self,ticker,fromDate):
        self.ticker = ticker
        self.fromDate = fromDate
        
        self.path = os.path.join("input",f"Financial Statement {self.ticker}")
        self.output = os.path.join(self.path, f"Compiled Income Statement-{self.ticker}.xlsx")
        
        self.createSeparateStatements()
        self.createStatement()
        
        self.writeExcel()
        
    def createSeparateStatements(self):
        self.separateCompiledStatement = pd.DataFrame()
        for year in range(self.fromDate,2023):
            statement = CompileIncomeStatement(self.ticker, year,year)
            self.separateCompiledStatement = pd.concat([self.separateCompiledStatement,statement.compiledStatement],axis=1)
    
    def createStatement(self):
        self.compiledStatement = CompileIncomeStatement(self.ticker, self.fromDate).compiledStatement
        
    def writeExcel(self):
        Excel = pd.ExcelWriter(self.output)
        self.compiledStatement.to_excel(Excel, index=False, sheet_name="Compiled Statement")
        self.separateCompiledStatement.to_excel(Excel, index=False, sheet_name="Separately Compiled Statement")
        
        Excel.save()
         
    

In [178]:
income = Income("MSFT",2019)
income.compiledStatement



Unnamed: 0,label,Year Ended 2018,Q1 2019,Q2 2019,Q3 2019,Q4 2019,Year Ended 2019,Q1 2020,Q2 2020,Q3 2020,...,Q1 2021,Q2 2021,Q3 2021,Q4 2021,Year Ended 2021,Q1 2022,Q2 2022,Q3 2022,Q4 2022,Year Ended 2022
0,revenue,110360.0,29084.0,32471.0,30571.0,95272.0,125843.0,33055.0,36906.0,35021.0,...,37154.0,43076.0,41706.0,126382.0,168088.0,45317.0,51728.0,49360.0,148910.0,198270.0
1,cost of revenue,38353.0,9905.0,12423.0,10170.0,32740.0,42910.0,10406.0,12358.0,10975.0,...,11002.0,14194.0,13045.0,39187.0,52232.0,13646.0,16960.0,15615.0,47035.0,62650.0
2,gross margin,72007.0,19179.0,20048.0,20401.0,62532.0,82933.0,22649.0,24548.0,24046.0,...,26152.0,28882.0,28661.0,87195.0,115856.0,31671.0,34768.0,33745.0,101875.0,135620.0
3,research and development,14726.0,3977.0,4070.0,4316.0,12560.0,16876.0,4565.0,4603.0,4887.0,...,4926.0,4899.0,5204.0,15512.0,20716.0,5599.0,5758.0,6306.0,18206.0,24512.0
4,sales and marketing,17469.0,4098.0,4588.0,4565.0,13648.0,18213.0,4337.0,4933.0,4911.0,...,4231.0,4947.0,5082.0,15035.0,20117.0,4547.0,5379.0,5595.0,16230.0,21825.0
5,general and administrative,4754.0,1149.0,1132.0,1179.0,3706.0,4885.0,1061.0,1121.0,1273.0,...,1119.0,1139.0,1327.0,3780.0,5107.0,1287.0,1384.0,1480.0,4420.0,5900.0
6,impairment and restructuring,0.0,,,,,,,,,...,,,,,,,,,,
7,restructuring,,,,,,0.0,,,,...,,,,,,,,,,
8,operating income,35058.0,9955.0,10258.0,10341.0,32618.0,42959.0,12686.0,13891.0,12975.0,...,15876.0,17897.0,17048.0,52868.0,69916.0,20238.0,22247.0,20364.0,63019.0,83383.0
9,"other income, net",1416.0,266.0,127.0,145.0,584.0,729.0,0.0,194.0,-132.0,...,248.0,440.0,188.0,998.0,1186.0,286.0,268.0,-174.0,507.0,333.0


In [88]:
class CompileCashflowStatement(HelperFunctions):
    def __init__(self,ticker,fromDate,self.endDate):
        self.ticker = ticker
        self.fromDate = fromDate
        self.endDate = endDate
        
        self.path = os.path.join("input",f"Financial Statement {self.ticker}")
        self.input = os.path.join(self.path, f"Cashflow Statements All-{self.ticker}.xlsx")
        self.output = os.path.join(self.path, f"Compiled Cashflow Statement-{self.ticker}.xlsx")
        
        
        self.Excel = self.readExcel()
        self.Excel = self.getExcelSheetsPostDate(self.Excel)
        
        self.compiledStatement = self.addData(self.Excel)

        self.compiledStatement = self.reorderQuarters(self.compiledStatement)

        
    def readExcel(self):
        Excel = pd.read_excel(self.input,sheet_name=None)
        return Excel
    
    
   
            

In [94]:
cashflow = Cashflow("WMT",2020)
cashflow.compiledStatement



Unnamed: 0,label,Year Ended 2020,Q1 2021,Q2 2021,Q3 2021,Year Ended 2021,Q1 2022,Q2 2022,Q3 2022,Year Ended 2022,Q1 2023
0,cash flows from operating activities:,,,,,,,,,,
1,consolidated net income,15201.0,4074.0,10513.0,15714.0,13706.0,2811.0,7175.0,10307.0,13940.0,2103.0
2,adjustments to reconcile income from continuin...,,,,,,,,,,
3,adjustments to reconcile consolidated net inco...,,,,,,,,,,
4,depreciation and amortization,10987.0,2791.0,5562.0,8333.0,11152.0,2661.0,5302.0,7952.0,10658.0,2680.0
5,unrealized and losses,-1886.0,-783.0,-4006.0,-6883.0,,,,,,
6,(losss) and losses for disposal of business op...,15.0,,,,,,,,,
7,net unrealized and realized and losses,,,,,-8589.0,2077.0,3019.0,1831.0,2440.0,1989.0
8,losses on disposal of business operations,,,,,8401.0,433.0,433.0,433.0,433.0,0.0
9,asda pension contribution,-1036.0,,,,0.0,,,,0.0,
