# Setup

## Check Python version

In [264]:
from platform import  python_version

In [265]:
python_version()

'3.10.5'

## Install/Download necessary packages

In [266]:
import sys

In [267]:
# # Uncomment these if any packages are not installed in your current jupyter env 
# # Installing a pip package in the current kernel
# # Pandas also installs the numpy package
# !{sys.executable} -m pip install pandas  
# !{sys.executable} -m pip install requests
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install scikit-learn
# !{sys.executable} -m pip install featuretools

In [268]:
# import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import sklearn
import os
import datetime as dt
import featuretools as ft
from featuretools.selection import selection
from IPython.core.interactiveshell import InteractiveShell 

# Importing and Preprocessing the Data

In [269]:
def view_files():
    """List the entries of the ``data`` directory under the current working directory.

    Returns:
        list[str]: The names reported by ``os.listdir`` for ``<cwd>/data``.

    Raises:
        FileNotFoundError: If ``<cwd>/data`` does not exist.
    """
    # os.path.join uses the platform's separator. The previous hard-coded
    # backslash only produced a valid path on Windows and relied on the
    # invalid escape sequence "\d".
    data_dir = os.path.join(os.getcwd(), "data")
    return os.listdir(data_dir)

view_files()

['absa.csv',
 'AllShares_growth.csv',
 'household_financial_assets-currency_and_deposits.csv',
 'investment_Qgrowth.csv',
 'inv_by_assets_intellectual.csv',
 'share_prices.csv']

## Importing and Cleaning

In [270]:
def check_file(file):
    """Build the path of ``file`` inside the ``data`` directory of the working directory.

    Despite the name, no existence check is performed; the path is only
    assembled.

    Args:
        file: File name (e.g. ``"absa.csv"``) located in ``<cwd>/data``.

    Returns:
        str: ``<cwd>/data/<file>`` joined with the platform's path separator.
    """
    # os.path.join replaces the hard-coded backslashes, which were only valid
    # on Windows and relied on invalid escape sequences ("\d", "\{").
    return os.path.join(os.getcwd(), "data", file)

In [271]:
check_file("absa.csv")

'C:\\GitHub\\DS_PROJ\\data\\absa.csv'

In [272]:
def import_data(path):
    """Load a CSV file into a DataFrame.

    The frame is returned exactly as ``pd.read_csv`` parses it; no columns
    are dropped or renamed here.

    Args:
        path: File path or file-like buffer accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV.
    """
    frame = pd.read_csv(path)
    return frame

In [273]:
absa = import_data(check_file("absa.csv"))
absa.head(1)

Unnamed: 0,Bank,Unit,Table,Time series code,D_M_1993M01: 1993M01,D_M_1993M02: 1993M02,D_M_1993M03: 1993M03,D_M_1993M04: 1993M04,D_M_1993M05: 1993M05,D_M_1993M06: 1993M06,...,D_M_2021M07: 2021M07,D_M_2021M08: 2021M08,D_M_2021M09: 2021M09,D_M_2021M10: 2021M10,D_M_2021M11: 2021M11,D_M_2021M12: 2021M12,D_M_2022M01: 2022M01,D_M_2022M02: 2022M02,D_M_2022M03: 2022M03,D_M_2022M04: 2022M04
0,B_34118: Absa Bank Ltd,U_RT: R'000 (thousands),T_T01R001: Item 1: DEPOSITS (total of items 2 ...,RBD-B_34118-T_T01R001-U_RT,,,,,,,...,,,,,,,,,,


The data is not stored in the format typically useful for the pandas framework.

First, the unnecessary variables/indicators such as `Unit` and `Time series code`

are removed, and the excessively long variable names are trimmed.

In [274]:
def clean_names1(df, bank_prefix="B_34118: "):
    """Drop the metadata columns and shorten the period and bank labels.

    Args:
        df: Raw frame containing at least the columns ``Unit``,
            ``Time series code`` and ``Bank``.
        bank_prefix: Regular expression removed from every ``Bank`` value.

    Returns:
        pandas.DataFrame: A new frame without ``Unit`` / ``Time series code``,
        with the ``D_M_yyyyMmm:`` prefix removed from the column labels and
        with spaces in ``Bank`` replaced by underscores.
    """
    trimmed = df.drop(columns=["Unit", "Time series code"])

    # Only the text up to and including the colon is removed, so the period
    # labels keep the space that followed it (e.g. " 1993M01").
    trimmed.columns = trimmed.columns.str.replace("D_M_[0-9]{4}M[0-9]{2}:", "", regex=True)

    bank = trimmed["Bank"].str.replace(bank_prefix, "", regex=True)
    trimmed["Bank"] = bank.str.replace(pat=" ", repl="_")
    return trimmed
    

In [275]:
absa = clean_names1(absa)

In [276]:
absa.head(1)

Unnamed: 0,Bank,Table,1993M01,1993M02,1993M03,1993M04,1993M05,1993M06,1993M07,1993M08,...,2021M07,2021M08,2021M09,2021M10,2021M11,2021M12,2022M01,2022M02,2022M03,2022M04
0,Absa_Bank_Ltd,T_T01R001: Item 1: DEPOSITS (total of items 2 ...,,,,,,,,,...,,,,,,,,,,


Below is a check to ensure that all the rows that have the label

`T_T[0-9]{2}R[0-9]{3}:` or `T_T[0-9]{2}R[0-9]{3}_A:` do not

contain any observations and can therefore be removed

In [277]:
InteractiveShell.ast_node_interactivity = "all"

In [278]:
# For each label pattern: select the rows whose "Table" value matches, count
# the NaN cells per row, and require that count to equal the number of columns
# minus two (presumably the always-populated "Bank" and "Table" columns).
# NOTE(review): all() over an empty selection is True, so a pattern that
# matches no rows at all also reports True here.
all(absa[absa["Table"].str.contains("T_T[0-9]{2}R[0-9]{3}:", 
                                    regex = True)].isna().sum(axis=1, 
                                                              skipna=False) == len(absa.columns) - 2);
# Same check for the "_A" variant of the title-row label.
all(absa[absa["Table"].str.contains("T_T[0-9]{2}R[0-9]{3}_A:", 
                                    regex = True)].isna().sum(axis=1, 
                                                              skipna=False) == len(absa.columns) - 2)

True

True

In [279]:
InteractiveShell.ast_node_interactivity = "last"

Now, we can remove the rows that contain the labels

mentioned above

In [280]:
def remove_empty(df):
    """Drop the title rows, i.e. rows whose ``Table`` label has no column code.

    Args:
        df: Frame with a string ``Table`` column.

    Returns:
        pandas.DataFrame: The rows of ``df`` whose ``Table`` value matches
        neither ``T_TnnRnnn:`` nor ``T_TnnRnnn_A:``.
    """
    title_patterns = ("T_T[0-9]{2}R[0-9]{3}:", "T_T[0-9]{2}R[0-9]{3}_A:")
    for pattern in title_patterns:
        is_title = df["Table"].str.contains(pattern, regex = True)
        df = df[~is_title]

    return df

In [281]:
absa = remove_empty(absa)

In [282]:
absa.head(1)

Unnamed: 0,Bank,Table,1993M01,1993M02,1993M03,1993M04,1993M05,1993M06,1993M07,1993M08,...,2021M07,2021M08,2021M09,2021M10,2021M11,2021M12,2022M01,2022M02,2022M03,2022M04
1,Absa_Bank_Ltd,T_T01R001C01: T01R001C01: DEPOSITS (total of i...,9588342.0,11392410.0,10668670.0,11432864.0,11014770.0,11042960.0,10561166.0,12471006.0,...,218471395.0,209031404.0,215453291.0,214590582.0,218067373.0,230444938.0,202177025.0,208657411.0,221079725.0,220710857.0


In [283]:
# absa.loc[absa["Table"].str.contains("T_T01R[0-9]{3}C[0-9]{2}: T01R[0-9]{3}[A]{0,1}C[0-9]{2}: ", 
#                                regex=True), :].iloc[:,1:3].set_index("Table").head(15)

absa.loc[absa["Table"].str.contains("T_T01R001|T_T01R002|T_T02R032", 
                               regex=True), :].iloc[:,1:3].set_index("Table")

Unnamed: 0_level_0,1993M01
Table,Unnamed: 1_level_1
T_T01R001C01: T01R001C01: DEPOSITS (total of items 2 and 32): Cheque (1),9588342.0
T_T01R001C02: T01R001C02: DEPOSITS (total of items 2 and 32): Savings (2),7800740.0
T_T01R001C03: T01R001C03: DEPOSITS (total of items 2 and 32): Up to 1 day (3),8946659.0
T_T01R001C04: T01R001C04: DEPOSITS (total of items 2 and 32): More than 1 day to 1 month (4),9417927.0
T_T01R001C05: T01R001C05: DEPOSITS (total of items 2 and 32): More than 1 month to 6 months (5),21301640.0
T_T01R001C06: T01R001C06: DEPOSITS (total of items 2 and 32): More than 6 months (6),8981624.0
T_T01R001C07: T01R001C07: DEPOSITS (total of items 2 and 32): TOTAL (7),66036932.0
T_T01R001C08: T01R001C08: DEPOSITS (total of items 2 and 32): NCDs/PNs i (included in col. 7) (8),
"T_T01R002C01: T01R002C01: DEPOSITS DENOMINATED IN RAND (total of items 3, 6, 12, 13 and 29): Cheque (1)",9588342.0
"T_T01R002C02: T01R002C02: DEPOSITS DENOMINATED IN RAND (total of items 3, 6, 12, 13 and 29): Savings (2)",7800740.0


In [284]:
# Hand check of the 1993M01 figures displayed above: rand deposits plus
# foreign-currency deposits should add up to total deposits.
66034650.0 + 2282.0 == 66036932.0

# i.e. what we have above is the same identity, read straight from the frame.
# Column position 2 is the first period (1993M01): positions 0 and 1 hold
# "Bank" and "Table". The previous position 3 silently checked 1993M02 instead.
r_001 = absa.loc[absa["Table"].str.contains("DEPOSITS [(]total of items 2 and 32[)]: TOTAL [(]7[)]"),
        :].iloc[:, 2]
r_002 = absa.loc[absa["Table"].str.contains("DEPOSITS DENOMINATED IN RAND [(]total of items 3, 6, 12, 13 and 29[)]: TOTAL [(]7[)]"),
          :].iloc[:, 2]
r_032 = absa.loc[absa["Table"].str.contains("DEPOSITS DENOMINATED IN FOREIGN CURRENCY [(]total of items 33 to 38[)]: TOTAL [(]7[)]"),
          :].iloc[:, 2]
# .item() raises unless exactly one row matched, and avoids the deprecated
# int(<single-element Series>) conversion.
int(r_001.item()) == int(r_002.item()) + int(r_032.item())


True

In [285]:
any(absa["Table"].str.contains("of which", case=False))
absa[absa["Table"].str.contains("Of which : in foreign currency", case=False)]["Table"]

374    T_T03R041C14: T03R041C14: OTHER BORROWED FUNDS...
380    T_T03R042C14: T03R042C14: Loans received under...
386    T_T03R043C14: T03R043C14: SA Reserve Bank and ...
392    T_T03R044C14: T03R044C14: SA banksb: Of which ...
398    T_T03R450C14: T03R045AC14: Insurers and pensio...
404    T_T03R045C14: T03R045C14: Insurersg: Of which ...
410    T_T03R046C14: T03R046C14: Pension funds: Of wh...
416    T_T03R470C14: T03R047AC14: Financial and non-f...
422    T_T03R047C14: T03R047C14: Other financial corp...
428    T_T03R048C14: T03R048C14: Non-financial corpor...
434    T_T03R049C14: T03R049C14: Foreign sector: Of w...
440    T_T03R050C14: T03R050C14: Other: Of which : in...
446    T_T03R051C14: T03R051C14: Collateralised borro...
452    T_T03R052C14: T03R052C14: SA banksb: Of which ...
458    T_T03R530C14: T03R053AC14: Financial corporate...
464    T_T03R053C14: T03R053C14: Financial corporate ...
470    T_T03R054C14: T03R054C14: Non-financial corpor...
476    T_T03R055C14: T03R055C14

And finally, we can remove the ugly naming

In [286]:
# def clean_names2(df):
#     # Label the different tables withing the df, i.e liablities, assets, etc.
#     df["Table"] = df["Table"].str.replace("T_T0[0-4]R[0-9]{3,4}C[0-9]{2}: T0[1-4]R[0-9]{3}[A]{0,1}C[0-9]{2}: ", 
#                                           regex = True, repl="L_")
#     df["Table"] = df["Table"].str.replace("T_T0[5]R[0-9]{3,4}C[0-9]{2}: T0[5]R[0-9]{3}[A]{0,1}C[0-9]{2}: ", 
#                                       regex = True, repl="E_")
#     df["Table"] = df["Table"].str.replace("T_T0[6-9]R[0-9]{3,4}C[0-9]{2}: T0[6-9]R[0-9]{3}[A]{0,1}C[0-9]{2}: ", 
#                                           regex = True, repl="A_")
#     df["Table"] = df["Table"].str.replace("T_T1[0-3]R[0-9]{3,4}C[0-9]{2}: T1[0-3]R[0-9]{3}[A]{0,1}C[0-9]{2}: ", 
#                                           regex = True, repl="A_")
#     #  remove bracket explanations
#     df["Table"] = df["Table"].str.replace("[(][0-9a-z\s,]{2,}[)][:] ", regex = True, repl="")
#     return(df)

In [287]:
def clean_names2(df):
    """Replace the verbose table/row/column codes in ``Table`` with short prefixes.

    A label such as ``"T_T01R001C07: T01R001C07: DEPOSITS ..."`` becomes
    ``"L_T1-2_R001_DEPOSITS ..."``. The leading letter marks the balance-sheet
    section the table group belongs to (``L`` liabilities, ``E`` equity,
    ``A`` assets) and the captured ``Rnnn`` row code is kept. All commas are
    then removed from the labels.

    The input frame is left untouched. Previously the function assigned into
    the caller's frame, which mutated it and raised ``SettingWithCopyWarning``
    when the caller passed a filtered slice.

    Args:
        df: Frame with a string ``Table`` column.

    Returns:
        pandas.DataFrame: A copy of ``df`` with the rewritten ``Table`` column.
    """
    # (regex fragment for the table number, replacement prefix) per table group.
    table_groups = (
        ("0[1-2]", r"L_T1-2_\1_"),
        ("0[3-4]", r"L_T3-4_\1_"),
        ("0[5]", r"E_T5_\1_"),
        ("0[6-9]", r"A_T6-13_\1_"),
        ("1[0-3]", r"A_T6-13_\1_"),
    )

    labels = df["Table"]
    for table_no, prefix in table_groups:
        # The code appears twice in the raw label; only the row code of the
        # first occurrence is captured and kept.
        pattern = (
            f"T_T{table_no}(R[0-9]{{3,4}})C[0-9]{{2}}: "
            f"T{table_no}R[0-9]{{3}}[A]{{0,1}}C[0-9]{{2}}: "
        )
        labels = labels.str.replace(pattern, prefix, regex=True)

    labels = labels.str.replace(",", "", regex=False)

    # assign() returns a new frame, so the caller's object is never modified.
    return df.assign(Table=labels)

In [288]:
absa = clean_names2(absa)

In [289]:
absa.head(1)

Unnamed: 0,Bank,Table,1993M01,1993M02,1993M03,1993M04,1993M05,1993M06,1993M07,1993M08,...,2021M07,2021M08,2021M09,2021M10,2021M11,2021M12,2022M01,2022M02,2022M03,2022M04
1,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,9588342.0,11392410.0,10668670.0,11432864.0,11014770.0,11042960.0,10561166.0,12471006.0,...,218471395.0,209031404.0,215453291.0,214590582.0,218067373.0,230444938.0,202177025.0,208657411.0,221079725.0,220710857.0


In [290]:
# test = pd.DataFrame(absa[absa["Table"].str.contains("Of which : in foreign currency", case=False)]["Table"])

In [291]:
# test = test.drop_duplicates()

In [292]:
# test2 = list(test["Table"].str.replace(" Of which : in foreign currency.*", 
#                                           regex = True, repl=""))
# doubles = list()
# for item in test2:
#     #print(str(item))
#     doubles.append(list(absa[absa["Table"].str.contains(item)]["Table"]))
# doubles

In [293]:
list(absa["Table"])

['L_T1-2_R001_DEPOSITS (total of items 2 and 32): Cheque (1)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): Savings (2)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): Up to 1 day (3)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): More than 1 day to 1 month (4)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): More than 1 month to 6 months (5)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): More than 6 months (6)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): TOTAL (7)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): NCDs/PNs i  (included in col. 7) (8)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): Cheque (1)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): Savings (2)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): Up to 1 day (3)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): More than 1 day to 1 month (4)',
 'L_T1-2_R002_DEPOSI

In [294]:
# totals = 
list(absa[absa["Table"].str.contains("total", case=False)]["Table"])

['L_T1-2_R001_DEPOSITS (total of items 2 and 32): Cheque (1)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): Savings (2)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): Up to 1 day (3)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): More than 1 day to 1 month (4)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): More than 1 month to 6 months (5)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): More than 6 months (6)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): TOTAL (7)',
 'L_T1-2_R001_DEPOSITS (total of items 2 and 32): NCDs/PNs i  (included in col. 7) (8)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): Cheque (1)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): Savings (2)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): Up to 1 day (3)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): More than 1 day to 1 month (4)',
 'L_T1-2_R002_DEPOSI

So we want to exclude several types of columns. That is, 

Those not containing the uppercase label `TOTAL`

First, to make things easier, the data is split into the three

major components of the Balance Sheet. That is, liabilities, 
assets, and equity

The dataframe must first be transposed to ensure
it is in the correct form for the model from the start

In [295]:
list(absa.loc[absa["Table"].str.contains("TOTAL"),:]["Table"])

['L_T1-2_R001_DEPOSITS (total of items 2 and 32): TOTAL (7)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): TOTAL (7)',
 'L_T1-2_R003_SA banksb (total of items 4 and 5): TOTAL (7)',
 'L_T1-2_R004_NCDs/PNsi: TOTAL (7)',
 'L_T1-2_R005_Other deposits: TOTAL (7)',
 'L_T1-2_R006_Central and provincial government sector depositsc (total of items 7 10 and 11): TOTAL (7)',
 'L_T1-2_R007_Central government of the Republic (total of items 8 and 9): TOTAL (7)',
 'L_T1-2_R008_Tax and loan account: TOTAL (7)',
 'L_T1-2_R009_Other: TOTAL (7)',
 'L_T1-2_R010_Provincial governments: TOTAL (7)',
 'L_T1-2_R011_Social security funds: TOTAL (7)',
 'L_T1-2_R012_Other monetary institutionsh: TOTAL (7)',
 'L_T1-2_R013_Other domestic parties (total of items 14 to 20 25 26 27 and 28): TOTAL (7)',
 'L_T1-2_R014_Local government: TOTAL (7)',
 'L_T1-2_R015_Public financial corporate sectord (such as IDC DBSA): TOTAL (7)',
 'L_T1-2_R016_Public Investment Corporation (PIC): TOTAL (7)

It is clear from the above that the columns containing enough

aggregated information are tagged with `(total of items ...)`

so this can be filtered

In [296]:
test = absa.loc[absa["Table"].str.contains("TOTAL"),:]
list(test.iloc[:,1])

['L_T1-2_R001_DEPOSITS (total of items 2 and 32): TOTAL (7)',
 'L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (total of items 3 6 12 13 and 29): TOTAL (7)',
 'L_T1-2_R003_SA banksb (total of items 4 and 5): TOTAL (7)',
 'L_T1-2_R004_NCDs/PNsi: TOTAL (7)',
 'L_T1-2_R005_Other deposits: TOTAL (7)',
 'L_T1-2_R006_Central and provincial government sector depositsc (total of items 7 10 and 11): TOTAL (7)',
 'L_T1-2_R007_Central government of the Republic (total of items 8 and 9): TOTAL (7)',
 'L_T1-2_R008_Tax and loan account: TOTAL (7)',
 'L_T1-2_R009_Other: TOTAL (7)',
 'L_T1-2_R010_Provincial governments: TOTAL (7)',
 'L_T1-2_R011_Social security funds: TOTAL (7)',
 'L_T1-2_R012_Other monetary institutionsh: TOTAL (7)',
 'L_T1-2_R013_Other domestic parties (total of items 14 to 20 25 26 27 and 28): TOTAL (7)',
 'L_T1-2_R014_Local government: TOTAL (7)',
 'L_T1-2_R015_Public financial corporate sectord (such as IDC DBSA): TOTAL (7)',
 'L_T1-2_R016_Public Investment Corporation (PIC): TOTAL (7)

In [297]:
test = test.loc[test["Table"].str.contains("[(]total of items ", regex=True), ["Table"]]
test.head()

Unnamed: 0,Table
7,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...
16,L_T1-2_R002_DEPOSITS DENOMINATED IN RAND (tota...
25,L_T1-2_R003_SA banksb (total of items 4 and 5)...
52,L_T1-2_R006_Central and provincial government ...
61,L_T1-2_R007_Central government of the Republic...


In [298]:
dups = test["Table"].str.replace(".*(R[0-9]{3}).*", regex=True, repl=r"\1").duplicated()
dups

7       False
16      False
25      False
52      False
61      False
        ...  
2069     True
2070     True
2071     True
2072     True
2073     True
Name: Table, Length: 85, dtype: bool

In [299]:
# These can all be removed since they are the totals of collections of previous
# totals in the dataframe
list(test.loc[dups,:]["Table"])

['L_T3-4_R078_TOTAL LIABILITIES TO THE PUBLIC (total of items 1 41 58 and 67): Medium-term (2)',
 'L_T3-4_R078_TOTAL LIABILITIES TO THE PUBLIC (total of items 1 41 58 and 67): Long-term (3)',
 'L_T3-4_R078_TOTAL LIABILITIES TO THE PUBLIC (total of items 1 41 58 and 67): TOTAL (4)',
 'L_T3-4_R078_TOTAL LIABILITIES TO THE PUBLIC (total of items 1 41 58 and 67): Of which :  in foreign currency (5)',
 'L_T3-4_R095_TOTAL LIABILITIES (total of items 78 to 80): Medium-term (2)',
 'L_T3-4_R095_TOTAL LIABILITIES (total of items 78 to 80): Long-term (3)',
 'L_T3-4_R095_TOTAL LIABILITIES (total of items 78 to 80): TOTAL (4)',
 'L_T3-4_R095_TOTAL LIABILITIES (total of items 78 to 80): Of which :  in foreign currency (5)',
 'E_T5_R096_TOTAL EQUITY (total of items 97 and 101): Of which: liabilities to the foreign sector (2)',
 'E_T5_R096_TOTAL EQUITY (total of items 97 and 101): Of which: in foreign currency (included in col 1 ) (3)',
 'E_T5_R102_TOTAL EQUITY AND LIABILITIES (total of items 95 and 9

In [300]:
df = absa

In [301]:
def filter_totals(df):
    """Keep one aggregated row per ``Rnnn`` row code.

    Rows are kept when their ``Table`` label contains the uppercase word
    ``TOTAL`` and the text ``(total of items ``. Among those, only the first
    row for each ``Rnnn`` code survives.

    Args:
        df: Frame with a string ``Table`` column.

    Returns:
        pandas.DataFrame: The filtered rows, with the original index labels.
    """
    totals = df.loc[df["Table"].str.contains("TOTAL"), :]
    aggregated = totals.loc[totals["Table"].str.contains("[(]total of items ", regex=True), :]

    # Reduce every label to its row code so that repeats can be detected.
    row_code = aggregated["Table"].str.replace(".*(R[0-9]{3}).*", regex=True, repl=r"\1")
    first_per_code = ~row_code.duplicated()
    return aggregated.loc[first_per_code, :]

In [302]:
df

Unnamed: 0,Bank,Table,1993M01,1993M02,1993M03,1993M04,1993M05,1993M06,1993M07,1993M08,...,2021M07,2021M08,2021M09,2021M10,2021M11,2021M12,2022M01,2022M02,2022M03,2022M04
1,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,9588342.0,11392410.0,10668670.0,11432864.0,11014770.0,11042960.0,10561166.0,12471006.0,...,218471395.0,209031404.0,215453291.0,214590582.0,218067373.0,230444938.0,202177025.0,208657411.0,221079725.0,220710857.0
2,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,7800740.0,7555104.0,7670861.0,7780098.0,7756258.0,7709445.0,7722106.0,7613571.0,...,219810888.0,219708150.0,219832044.0,220612457.0,224095757.0,225800105.0,223125550.0,216880586.0,216446346.0,218785374.0
3,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,8946659.0,6782139.0,9577124.0,8180529.0,7692343.0,8151043.0,8020508.0,8174811.0,...,140121157.0,160037358.0,164035947.0,167933686.0,181585523.0,150430271.0,171484466.0,174135086.0,172916558.0,164697147.0
4,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,9417927.0,9651219.0,9436300.0,9473765.0,9701689.0,9933172.0,9589893.0,9227178.0,...,78737825.0,59329833.0,60904840.0,66533680.0,60978989.0,81578456.0,68695763.0,65715791.0,63862009.0,85939653.0
5,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,21301640.0,19636868.0,18328838.0,17065947.0,16792250.0,16847682.0,17196235.0,16287624.0,...,115411994.0,114107539.0,115562156.0,115253808.0,124818553.0,122143067.0,118478176.0,119878691.0,115171793.0,113965361.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2069,Absa_Bank_Ltd,A_T6-13_R277_TOTAL ASSETS (total of items 103 ...,,,,,,,,,...,,,,,,,,,,
2070,Absa_Bank_Ltd,A_T6-13_R277_TOTAL ASSETS (total of items 103 ...,937881.0,839061.0,812196.0,504627.0,950088.0,814981.0,409169.0,370140.0,...,52670911.0,53912565.0,55315248.0,54637880.0,57076273.0,55828960.0,58033424.0,60998142.0,57034200.0,59875968.0
2071,Absa_Bank_Ltd,A_T6-13_R277_TOTAL ASSETS (total of items 103 ...,,,,,,,,,...,,,,,,,,,,
2072,Absa_Bank_Ltd,A_T6-13_R277_TOTAL ASSETS (total of items 103 ...,79593628.0,81404394.0,81034337.0,78063262.0,77755058.0,78371846.0,77567705.0,79331923.0,...,127733124.0,127216868.0,121372441.0,119401796.0,117699282.0,108863961.0,102269580.0,115017291.0,120936069.0,124521793.0


In [303]:
absa = filter_totals(absa)

In [304]:
InteractiveShell.ast_node_interactivity = "all"

In [305]:
absa.head(1)
absa.info()

Unnamed: 0,Bank,Table,1993M01,1993M02,1993M03,1993M04,1993M05,1993M06,1993M07,1993M08,...,2021M07,2021M08,2021M09,2021M10,2021M11,2021M12,2022M01,2022M02,2022M03,2022M04
7,Absa_Bank_Ltd,L_T1-2_R001_DEPOSITS (total of items 2 and 32)...,66036932.0,65948319.0,66446620.0,64813147.0,64317315.0,65148147.0,64450271.0,66006474.0,...,987428308.0,978982466.0,990882087.0,1002621000.0,1026975000.0,1025364000.0,997923770.0,1011085000.0,1022591000.0,1048495000.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 67 entries, 7 to 2068
Columns: 353 entries, Bank to  2022M04
dtypes: float64(351), object(2)
memory usage: 185.3+ KB


In [306]:
InteractiveShell.ast_node_interactivity = "last"

In [307]:
def reformat(df):
    """Transpose the frame so that dates are rows and ``Table`` labels are columns.

    Args:
        df: Frame with a ``Bank`` column, a ``Table`` column and one column per
            period labelled ``yyyyMmm`` (surrounding whitespace is tolerated).

    Returns:
        pandas.DataFrame: Numeric frame indexed by a ``DatetimeIndex`` named
        ``Date`` (first day of each month). Column labels have spaces replaced
        by underscores and ``total_of_items`` shortened to ``tot``.
    """
    wide = df.drop("Bank", axis=1).set_index("Table").T
    wide.columns.name = None

    # Strip the period labels before parsing. The old format string
    # (" %YM%m") only worked because the earlier cleaning step happens to
    # leave a leading space in front of every period label.
    wide.index = pd.to_datetime(wide.index.str.strip(), format="%YM%m")
    wide.index.name = "Date"

    wide.columns = wide.columns.str.replace(pat=" ", repl="_", regex=False)
    wide.columns = wide.columns.str.replace(pat="total_of_items", repl="tot", regex=False)

    return wide.apply(pd.to_numeric)

In [308]:
absa = reformat(absa)

In [309]:
absa.head(1)

Unnamed: 0_level_0,L_T1-2_R001_DEPOSITS_(tot_2_and_32):_TOTAL_(7),L_T1-2_R002_DEPOSITS_DENOMINATED_IN_RAND_(tot_3_6_12_13_and_29):_TOTAL_(7),L_T1-2_R003_SA_banksb_(tot_4_and_5):_TOTAL_(7),L_T1-2_R006_Central_and_provincial_government_sector_depositsc_(tot_7_10_and_11):_TOTAL_(7),L_T1-2_R007_Central_government_of_the_Republic_(tot_8_and_9):_TOTAL_(7),L_T1-2_R013_Other_domestic_parties_(tot_14_to_20_25_26_27_and_28):_TOTAL_(7),L_T1-2_R020_Private_financial_corporate_sectore_(tot_21_to_24):_TOTAL_(7),L_T1-2_R029_Foreign_sector_(tot_30_and_31):_TOTAL_(7),L_T1-2_R032_DEPOSITS_DENOMINATED_IN_FOREIGN_CURRENCY_(tot_33_to_38):_TOTAL_(7),L_T1-2_R038_Foreign_sector_(tot_39_and_40):_TOTAL_(7),...,A_T6-13_R246_Acceptances_commercial_paper_bills_promissory_notes_and_similar_acknowledgements_of_debt_discounted_or_purchased_(tot_247_250_to_254_and_257):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R247_Bankers'_acceptances_(tot_248_and_249):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R254_Land_Bank_bills_(tot_255_and_256):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R258_NON-FINANCIAL_ASSETS_(tot_259_and_264):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R259_Tangible_assets_(tot_260_to_263):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R264_Intangible_assets_(tot_265_and_266):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R267_OTHER_ASSETS_(tot_268_to_272_and_276):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R268_Clients'_liabilities_per_contra_(tot_280_to_283):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R272_Assets_acquired_or_bought_in_to_protect_an_advance_or_investment_(tot_273_to_275):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R277_TOTAL_ASSETS_(tot_103_110_195_258_and_267):_Domestic_assets_(1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-01,66036932.0,66034650.0,9340101.0,1238972.0,1023743.0,54180359.0,,1275218.0,2282.0,2282.0,...,122004.0,,,1541557.0,1173537.0,,5228902.0,3577948.0,,78655747.0


In [310]:
absa.iloc[:,absa.columns.str.startswith(("L_"))]

Unnamed: 0_level_0,L_T1-2_R001_DEPOSITS_(tot_2_and_32):_TOTAL_(7),L_T1-2_R002_DEPOSITS_DENOMINATED_IN_RAND_(tot_3_6_12_13_and_29):_TOTAL_(7),L_T1-2_R003_SA_banksb_(tot_4_and_5):_TOTAL_(7),L_T1-2_R006_Central_and_provincial_government_sector_depositsc_(tot_7_10_and_11):_TOTAL_(7),L_T1-2_R007_Central_government_of_the_Republic_(tot_8_and_9):_TOTAL_(7),L_T1-2_R013_Other_domestic_parties_(tot_14_to_20_25_26_27_and_28):_TOTAL_(7),L_T1-2_R020_Private_financial_corporate_sectore_(tot_21_to_24):_TOTAL_(7),L_T1-2_R029_Foreign_sector_(tot_30_and_31):_TOTAL_(7),L_T1-2_R032_DEPOSITS_DENOMINATED_IN_FOREIGN_CURRENCY_(tot_33_to_38):_TOTAL_(7),L_T1-2_R038_Foreign_sector_(tot_39_and_40):_TOTAL_(7),...,L_T3-4_R058_FOREIGN_CURRENCY_FUNDING_(tot_59_to_63_and_66):_TOTAL_(4),L_T3-4_R067_OTHER_LIABILITIES_TO_THE_PUBLIC_(tot_68_73_74_and_77):_TOTAL_(4),L_T3-4_R068_Debt_securities:_(tot_69_to_72):_TOTAL_(4),L_T3-4_R074_Other_(tot_75_and_76):_TOTAL_(4),L_T3-4_R078_TOTAL_LIABILITIES_TO_THE_PUBLIC_(tot_1_41_58_and_67):_Short-term_(1),L_T3-4_R080_OTHER_LIABILITIES_(tot_81_85_86_90_91_and_94):_TOTAL_(4),L_T3-4_R081_Liabilities_i.r.o_derivative_instruments:_to_domestic_sector_(tot_82_to_84):_TOTAL_(4),L_T3-4_R086_Other_trading_liabilities_(excluding_derivatives):_to_domestic_sector_(tot_87_to_89):_TOTAL_(4),L_T3-4_R091_Other_liabilities:_to_domestic_sector_(tot_92_and_93):_TOTAL_(4),L_T3-4_R095_TOTAL_LIABILITIES_(tot_78_to_80):_Short-term_(1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-01,6.603693e+07,66034650.0,9340101.0,1238972.0,1023743.0,54180359.0,,1275218.0,2282.0,2282.0,...,3914415.0,890830.0,,432645.0,38199384.0,48375.0,,,48375.0,41825706.0
1993-02-01,6.594832e+07,65946508.0,7342899.0,1153121.0,883687.0,56171183.0,,1279305.0,1811.0,1811.0,...,4011103.0,1163403.0,,421806.0,38708005.0,409408.0,,,409408.0,42428505.0
1993-03-01,6.644662e+07,66444850.0,7094763.0,2424744.0,2202394.0,55828437.0,,1096906.0,1770.0,1770.0,...,4324758.0,999009.0,,761229.0,40502925.0,40645.0,,,40645.0,43994752.0
1993-04-01,6.481315e+07,64811391.0,5858248.0,1222010.0,956580.0,56569531.0,,1161602.0,1756.0,1756.0,...,3858337.0,941248.0,,583016.0,39037928.0,39945.0,,,39945.0,42171610.0
1993-05-01,6.431732e+07,64315553.0,4455192.0,1452509.0,1140486.0,56186211.0,,1170514.0,1762.0,1762.0,...,4043798.0,1500329.0,,899079.0,38184397.0,47131.0,,,47131.0,41107367.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-01,1.025364e+09,969525149.0,74043961.0,84466156.0,72963540.0,777565198.0,154672857.0,13455772.0,55838418.0,19556815.0,...,86112.0,65220729.0,60874922.0,0.0,765032283.0,89087148.0,28579204.0,21146258.0,15053098.0,834200962.0
2022-01-01,9.979238e+08,942489021.0,76540882.0,60756325.0,49547141.0,766702422.0,157463170.0,15710556.0,55434749.0,17981285.0,...,83140.0,62932589.0,59891887.0,0.0,742181990.0,88157362.0,23261773.0,21147825.0,20445284.0,804531069.0
2022-02-01,1.011085e+09,955334991.0,83899831.0,58478887.0,47578108.0,776108179.0,160886115.0,12397194.0,55750362.0,19490661.0,...,129230.0,62710912.0,59864949.0,0.0,757025034.0,90975660.0,23959742.0,21209727.0,26563602.0,819278316.0
2022-03-01,1.022591e+09,970112283.0,82881639.0,60095060.0,50107072.0,791864419.0,169637210.0,12753889.0,52478855.0,18369705.0,...,120278.0,61256220.0,57653124.0,0.0,757564804.0,104419699.0,25560539.0,23046067.0,20686142.0,832970081.0


In [311]:
def liabilities(df):
    """Return the columns of ``df`` whose label starts with ``"L_"``."""
    is_liability = df.columns.str.startswith("L_")
    return df.loc[:, is_liability]

def equity(df):
    """Return the columns of ``df`` whose label starts with ``"E_"``."""
    is_equity = df.columns.str.startswith("E_")
    return df.loc[:, is_equity]

def assets(df):
    """Return the columns of ``df`` whose label starts with ``"A_"``."""
    is_asset = df.columns.str.startswith("A_")
    return df.loc[:, is_asset]

In [312]:
absa_l = liabilities(absa)
absa_e = equity(absa)
absa_a = assets(absa)

In [313]:
absa_l.head(1)

Unnamed: 0_level_0,L_T1-2_R001_DEPOSITS_(tot_2_and_32):_TOTAL_(7),L_T1-2_R002_DEPOSITS_DENOMINATED_IN_RAND_(tot_3_6_12_13_and_29):_TOTAL_(7),L_T1-2_R003_SA_banksb_(tot_4_and_5):_TOTAL_(7),L_T1-2_R006_Central_and_provincial_government_sector_depositsc_(tot_7_10_and_11):_TOTAL_(7),L_T1-2_R007_Central_government_of_the_Republic_(tot_8_and_9):_TOTAL_(7),L_T1-2_R013_Other_domestic_parties_(tot_14_to_20_25_26_27_and_28):_TOTAL_(7),L_T1-2_R020_Private_financial_corporate_sectore_(tot_21_to_24):_TOTAL_(7),L_T1-2_R029_Foreign_sector_(tot_30_and_31):_TOTAL_(7),L_T1-2_R032_DEPOSITS_DENOMINATED_IN_FOREIGN_CURRENCY_(tot_33_to_38):_TOTAL_(7),L_T1-2_R038_Foreign_sector_(tot_39_and_40):_TOTAL_(7),...,L_T3-4_R058_FOREIGN_CURRENCY_FUNDING_(tot_59_to_63_and_66):_TOTAL_(4),L_T3-4_R067_OTHER_LIABILITIES_TO_THE_PUBLIC_(tot_68_73_74_and_77):_TOTAL_(4),L_T3-4_R068_Debt_securities:_(tot_69_to_72):_TOTAL_(4),L_T3-4_R074_Other_(tot_75_and_76):_TOTAL_(4),L_T3-4_R078_TOTAL_LIABILITIES_TO_THE_PUBLIC_(tot_1_41_58_and_67):_Short-term_(1),L_T3-4_R080_OTHER_LIABILITIES_(tot_81_85_86_90_91_and_94):_TOTAL_(4),L_T3-4_R081_Liabilities_i.r.o_derivative_instruments:_to_domestic_sector_(tot_82_to_84):_TOTAL_(4),L_T3-4_R086_Other_trading_liabilities_(excluding_derivatives):_to_domestic_sector_(tot_87_to_89):_TOTAL_(4),L_T3-4_R091_Other_liabilities:_to_domestic_sector_(tot_92_and_93):_TOTAL_(4),L_T3-4_R095_TOTAL_LIABILITIES_(tot_78_to_80):_Short-term_(1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-01,66036932.0,66034650.0,9340101.0,1238972.0,1023743.0,54180359.0,,1275218.0,2282.0,2282.0,...,3914415.0,890830.0,,432645.0,38199384.0,48375.0,,,48375.0,41825706.0


In [314]:
list(absa_l.iloc[:,
                absa_l.columns.str.contains("L_T[13]-[24]_R[0-9]{3}_[A-Z]{3,}")].columns[1:-1])

['L_T1-2_R002_DEPOSITS_DENOMINATED_IN_RAND_(tot_3_6_12_13_and_29):_TOTAL_(7)',
 'L_T1-2_R032_DEPOSITS_DENOMINATED_IN_FOREIGN_CURRENCY_(tot_33_to_38):_TOTAL_(7)',
 'L_T3-4_R041_OTHER_BORROWED_FUNDS_(tot_42_51_and_57):_TOTAL_(4)',
 'L_T3-4_R058_FOREIGN_CURRENCY_FUNDING_(tot_59_to_63_and_66):_TOTAL_(4)',
 'L_T3-4_R067_OTHER_LIABILITIES_TO_THE_PUBLIC_(tot_68_73_74_and_77):_TOTAL_(4)',
 'L_T3-4_R078_TOTAL_LIABILITIES_TO_THE_PUBLIC_(tot_1_41_58_and_67):_Short-term_(1)',
 'L_T3-4_R080_OTHER_LIABILITIES_(tot_81_85_86_90_91_and_94):_TOTAL_(4)']

In [315]:
list(absa_e.columns[:-2])

['E_T5_R096_TOTAL_EQUITY_(tot_97_and_101):_TOTAL_(1)']

In [316]:
absa_a.iloc[:,absa_a.columns.str.contains("A_T6-13_R[0-9]{3}_[A-Z-]{4,}")].columns[:-1]

Index(['A_T6-13_R103_CENTRAL_BANK_MONEY_AND_GOLD_(tot_104_to_106):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)',
       'A_T6-13_R110_DEPOSITS_LOANS_AND_ADVANCES_(tot_111_117_118_126_135_139_150_166_171_and_180_less_item_194):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)',
       'A_T6-13_R195_INVESTMENTS_AND_BILLS_including_trading_portfolio_assets_(tot_196_207_213_217_221_225_229_233_237_241_and_246_less_item_245):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)',
       'A_T6-13_R258_NON-FINANCIAL_ASSETS_(tot_259_and_264):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)',
       'A_T6-13_R267_OTHER_ASSETS_(tot_268_to_272_and_276):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)'],
      dtype='object')

The columns referring to `RXXX` indexes prior to its own

will be removed, and the `tot_XXX` columns referred to in the

brackets above will be used to include more information


In [317]:
absa_a.iloc[:,absa_a.columns.str.contains("A_T6-13_R[0-9]{3}_[A-Z]{3,}_.*")]

Unnamed: 0_level_0,A_T6-13_R103_CENTRAL_BANK_MONEY_AND_GOLD_(tot_104_to_106):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R110_DEPOSITS_LOANS_AND_ADVANCES_(tot_111_117_118_126_135_139_150_166_171_and_180_less_item_194):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R195_INVESTMENTS_AND_BILLS_including_trading_portfolio_assets_(tot_196_207_213_217_221_225_229_233_237_241_and_246_less_item_245):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R267_OTHER_ASSETS_(tot_268_to_272_and_276):_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),A_T6-13_R277_TOTAL_ASSETS_(tot_103_110_195_258_and_267):_Domestic_assets_(1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993-01-01,1659838.0,64985349.0,6177982.0,5228902.0,7.865575e+07
1993-02-01,1475877.0,66786416.0,6056794.0,5541530.0,8.056533e+07
1993-03-01,2027696.0,65508514.0,5967352.0,5560926.0,8.022214e+07
1993-04-01,1394141.0,63811411.0,5565634.0,5273498.0,7.755864e+07
1993-05-01,1448786.0,64089480.0,5728603.0,4564490.0,7.680497e+07
...,...,...,...,...,...
2021-12-01,0.0,64701459.0,43229500.0,932988.0,1.244072e+09
2022-01-01,0.0,62134502.0,38480505.0,1654560.0,1.216991e+09
2022-02-01,0.0,76409017.0,38606905.0,1357.0,1.237919e+09
2022-03-01,0.0,75906076.0,45028622.0,1360.0,1.244144e+09


In [318]:
def clean_liab(df):
    """Select the headline liability columns of ``df`` and shorten their names.

    The columns are looked up on ``df`` itself.  (Previously the lookup went
    through the notebook-level ``absa_l``, so the argument was ignored for
    selection and the function depended on hidden kernel state.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels follow the
        ``L_T<1|3>-<2|4>_R<nnn>_<UPPERCASE NAME>...`` pattern.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the kept columns; the table/row prefix, the
        ``_(...):`` group and the trailing ``_(n)`` marker are stripped from
        every label.  ``df`` is left untouched.

    Raises
    ------
    IndexError
        If fewer than two columns are left after the first and last matches
        have been trimmed.
    """
    matches = df.columns[df.columns.str.contains("L_T[13]-[24]_R[0-9]{3}_[A-Z]{3,}")]
    # The first and last matching columns are discarded, then the
    # second-to-last of what remains.
    # NOTE(review): presumably these are aggregate rows already covered by the
    # kept columns - confirm against the source table.
    keep = list(matches[1:-1])
    del keep[-2]

    cleaned = df[keep].copy()
    cleaned.columns = (
        cleaned.columns
        .str.replace("T[13]-[24]_R[0-9]{3}_", "", regex=True)
        .str.replace("_[(].*[)]:", "", regex=True)
        .str.replace("_[(][0-9][)]", "", regex=True)
    )
    return cleaned

def clean_assets(df):
    """Select the headline asset columns of ``df`` and shorten their names.

    The columns are looked up on ``df`` itself.  (Previously the lookup went
    through the notebook-level ``absa_a``, so the argument was ignored for
    selection and the function depended on hidden kernel state.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels follow the
        ``A_T6-13_R<nnn>_<UPPERCASE-NAME>...`` pattern.

    Returns
    -------
    pandas.DataFrame
        A new frame with every matching column except the last one; the
        table/row prefix and all bracketed groups are stripped from the
        labels.  ``df`` is left untouched.
    """
    matches = df.columns[df.columns.str.contains("A_T6-13_R[0-9]{3}_[A-Z-]{4,}")]
    # The last matching column is dropped.
    # NOTE(review): in the displayed data this is the overall TOTAL_ASSETS
    # row - confirm that is always the case.
    keep = list(matches[:-1])

    cleaned = df[keep].copy()
    cleaned.columns = (
        cleaned.columns
        .str.replace("T6-13_R[0-9]{3}_", "", regex=True)
        .str.replace("_[(].*[)]:", "", regex=True)
        .str.replace("_[(][0-9][)]", "", regex=True)
        .str.replace("_[(].*[)]", "", regex=True)
    )
    return cleaned

def clean_equity(df):
    """Drop the last two columns of ``df`` and shorten the remaining names.

    The columns are taken from ``df`` itself.  (Previously they were read
    from the notebook-level ``absa_e``, so the argument was ignored for
    selection and the function depended on hidden kernel state.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels carry a ``T5_R<nnn>_`` prefix.

    Returns
    -------
    pandas.DataFrame
        A new frame without the final two columns; the ``T5_R<nnn>_`` prefix,
        the ``_(...):`` group and the trailing ``_(n)`` marker are stripped
        from every label.  ``df`` is left untouched.
    """
    # NOTE(review): why the final two columns are excluded is not visible
    # here - confirm against the source table.
    keep = list(df.columns[:-2])

    cleaned = df[keep].copy()
    cleaned.columns = (
        cleaned.columns
        .str.replace("T5_R[0-9]{3}_", "", regex=True)
        .str.replace("_[(].*[)]:", "", regex=True)
        .str.replace("_[(][0-9][)]", "", regex=True)
    )
    return cleaned

In [319]:
# Replace each raw sub-frame with its cleaned version (selected columns,
# shortened names).
# NOTE(review): the names are rebound to the cleaned frames, so this cell is
# not safe to re-run without first re-creating the raw absa_l / absa_e /
# absa_a frames.
absa_l = clean_liab(absa_l)
absa_e = clean_equity(absa_e)
absa_a = clean_assets(absa_a)

In [320]:
absa_a.head(1)

Unnamed: 0_level_0,A_CENTRAL_BANK_MONEY_AND_GOLD_TOTAL_ASSETS,A_DEPOSITS_LOANS_AND_ADVANCES_TOTAL_ASSETS,A_INVESTMENTS_AND_BILLS_including_trading_portfolio_assets_TOTAL_ASSETS,A_NON-FINANCIAL_ASSETS_TOTAL_ASSETS,A_OTHER_ASSETS_TOTAL_ASSETS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1993-01-01,1659838.0,64985349.0,6177982.0,1541557.0,5228902.0


In [321]:
# Place liabilities, assets and equity side by side (column-wise); rows are
# aligned on the frames' index.
absa = pd.concat([absa_l, absa_a, absa_e], axis = 1)

In [322]:
# Replace every missing value with 0.
absa = absa.fillna(0)

In [323]:
absa.head(2)

Unnamed: 0_level_0,L_DEPOSITS_DENOMINATED_IN_RAND_TOTAL,L_DEPOSITS_DENOMINATED_IN_FOREIGN_CURRENCY_TOTAL,L_OTHER_BORROWED_FUNDS_TOTAL,L_FOREIGN_CURRENCY_FUNDING_TOTAL,L_OTHER_LIABILITIES_TO_THE_PUBLIC_TOTAL,L_OTHER_LIABILITIES_TOTAL,A_CENTRAL_BANK_MONEY_AND_GOLD_TOTAL_ASSETS,A_DEPOSITS_LOANS_AND_ADVANCES_TOTAL_ASSETS,A_INVESTMENTS_AND_BILLS_including_trading_portfolio_assets_TOTAL_ASSETS,A_NON-FINANCIAL_ASSETS_TOTAL_ASSETS,A_OTHER_ASSETS_TOTAL_ASSETS,E_TOTAL_EQUITY_TOTAL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1993-01-01,66034650.0,2282.0,0.0,3914415.0,890830.0,48375.0,1659838.0,64985349.0,6177982.0,1541557.0,5228902.0,3506820.0
1993-02-01,65946508.0,1811.0,1393510.0,4011103.0,1163403.0,409408.0,1475877.0,66786416.0,6056794.0,1543777.0,5541530.0,3506820.0


In [325]:
def differenced(df, bank="ABSA"):
    """Return the first differences of ``df`` with a leading ``Bank`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to difference row by row.
    bank : str, default "ABSA"
        Label written into the ``Bank`` column of every row.

    Returns
    -------
    pandas.DataFrame
        ``df.diff()`` without its first row (which has no predecessor and is
        therefore all-NaN), with ``Bank`` inserted as the first column.
        ``df`` is left untouched.
    """
    # Copy the slice so the insert below acts on an independent frame.
    changes = df.diff().iloc[1:, :].copy()
    changes.insert(0, "Bank", bank)
    return changes

In [327]:
absa = differenced(absa)

In [328]:
absa.head(1)

Unnamed: 0_level_0,Bank,L_DEPOSITS_DENOMINATED_IN_RAND_TOTAL,L_DEPOSITS_DENOMINATED_IN_FOREIGN_CURRENCY_TOTAL,L_OTHER_BORROWED_FUNDS_TOTAL,L_FOREIGN_CURRENCY_FUNDING_TOTAL,L_OTHER_LIABILITIES_TO_THE_PUBLIC_TOTAL,L_OTHER_LIABILITIES_TOTAL,A_CENTRAL_BANK_MONEY_AND_GOLD_TOTAL_ASSETS,A_DEPOSITS_LOANS_AND_ADVANCES_TOTAL_ASSETS,A_INVESTMENTS_AND_BILLS_including_trading_portfolio_assets_TOTAL_ASSETS,A_NON-FINANCIAL_ASSETS_TOTAL_ASSETS,A_OTHER_ASSETS_TOTAL_ASSETS,E_TOTAL_EQUITY_TOTAL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1993-02-01,ABSA,-88142.0,-471.0,1393510.0,96688.0,272573.0,361033.0,-183961.0,1801067.0,-121188.0,2220.0,312628.0,0.0


In [None]:
# NOTE(review): the output path is an empty string, which looks like an
# unfilled placeholder - supply the destination CSV path before running.
absa.to_csv("")

In [None]:
%matplotlib inline
plt.figure(figsize=(12,4), num="NUM")
plt.xlabel("Year")
plt.ylabel("BS Component")
final_repo_rate["Repo Rate"].plot(grid=True, xlim=(pd.Timestamp("1993-01-31"), pd.Times
yticks=range(23));

Some of the highly overlapping columns are removed now, 
and others will be removed after the necessary
additional features have been calculated.

In [None]:
# This dataset has a particular problem with overlapping or highly
# correlated features that contain elements from other columns
# Thus, we remove these highly correlated features

def remove_corr(df, pct_corr_threshold=0.97):
    """Drop highly correlated columns from ``df`` using featuretools.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.  It is no longer modified in place: the index reset
        is done on a copy.
    pct_corr_threshold : float, default 0.97
        Threshold forwarded to
        ``ft.selection.remove_highly_correlated_features``.

    Returns
    -------
    pandas.DataFrame
        The feature matrix returned by featuretools after the correlation
        filter, indexed by its ``"index"`` column, with the index name
        cleared and the columns axis named ``"Date"``.
    """
    # reset_index() returns a new frame, so the caller's DataFrame keeps its
    # index.
    # NOTE(review): index="index" below matches the column created here only
    # when the incoming index is unnamed - confirm for the frame passed in.
    df = df.reset_index()
    es = ft.EntitySet(id="Absa_BS")
    ent_set = es.add_dataframe(dataframe_name="Absa_Group_Ltd",
                               dataframe=df,
                               already_sorted=False, index="index")

    # No primitives are requested; dfs is only used to obtain a feature
    # matrix that the selection helper accepts.
    fm, _ = ft.dfs(entityset=ent_set,
                   target_dataframe_name="Absa_Group_Ltd",
                   trans_primitives=[],
                   agg_primitives=[],
                   max_depth=1)
    # From experimentation, the 0.97 threshold seems to remove the
    # columns that are verbatim totals of others while not removing
    # other columns required for feature analysis.
    # A lower threshold will be used at a later stage after some
    # columns have been used in calculation of additional
    # features
    fm = ft.selection.remove_highly_correlated_features(
        fm, pct_corr_threshold=pct_corr_threshold)
    fm = fm.reset_index().set_index("index")
    fm.columns.name = "Date"
    fm.index.name = None

    return fm

absa = remove_corr(absa)

### Creating the Liquidity Ratio Variable

Next, we compute a loan-to-deposit ratio to serve as the bank's liquidity ratio variable.

In [None]:
# First-row values of the columns whose name starts with "L" and mentions
# "deposits" (case-insensitive).
absa.iloc[0,absa.columns.str.contains("^L.*deposits", case=False)]

In [None]:
# First-row values of every column whose name mentions "loans"
# (case-insensitive).
absa.iloc[0,absa.columns.str.contains("LOANS", case=False, regex=True)]

In [None]:
def create_total_deposits(df_global):
    """Collapse all liability deposit columns into one ``L_TOTAL_DEPOSITS``.

    Parameters
    ----------
    df_global : pandas.DataFrame
        Frame containing the deposit columns.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column matching ``^L.*deposits``
        (case-insensitive) is removed and replaced by their row-wise sum,
        appended as the last column ``L_TOTAL_DEPOSITS``.

    Notes
    -----
    The total is computed before the component columns are dropped and is
    attached afterwards.  A previously created ``L_TOTAL_DEPOSITS`` also
    matches the pattern, so it is folded into the new total instead of being
    dropped; applying the function to its own output returns the same frame.
    """
    is_deposit = df_global.columns.str.contains("^L.*deposits",
                                                case=False,
                                                regex=True)
    deposit_cols = list(df_global.columns[is_deposit])
    total = df_global.loc[:, is_deposit].sum(axis=1)
    return df_global.drop(columns=deposit_cols).assign(L_TOTAL_DEPOSITS=total)



In [None]:
absa = create_total_deposits(absa)

In [None]:
def create_total_loans(df_global):
    """Collapse all loan columns into a single ``A_TOTAL_LOANS`` column.

    Parameters
    ----------
    df_global : pandas.DataFrame
        Frame containing the loan columns.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column whose name contains ``Loans``
        (case-insensitive) is removed and replaced by their row-wise sum,
        appended as the last column ``A_TOTAL_LOANS``.

    Notes
    -----
    The total is computed before the component columns are dropped and is
    attached afterwards.  A previously created ``A_TOTAL_LOANS`` also matches
    the pattern, so it is folded into the new total instead of being dropped;
    applying the function to its own output returns the same frame.

    NOTE(review): the pattern is not anchored to the ``A_`` prefix, so a
    liability column mentioning "loans" would be summed as well - confirm
    that none exists in the input.
    """
    is_loan = df_global.columns.str.contains("Loans",
                                             case=False,
                                             regex=True)
    loan_cols = list(df_global.columns[is_loan])
    total = df_global.loc[:, is_loan].sum(axis=1)
    return df_global.drop(columns=loan_cols).assign(A_TOTAL_LOANS=total)


In [None]:
absa = create_total_loans(absa)

In [None]:
def liquidity_ratio(df):
    """Return the loan-to-deposit ratio of ``df`` as a liquidity measure.

    The function previously had no body, which made the cell a syntax error.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``A_TOTAL_LOANS`` and ``L_TOTAL_DEPOSITS``.
        It is not modified.

    Returns
    -------
    pandas.Series
        ``A_TOTAL_LOANS / L_TOTAL_DEPOSITS`` per row, named
        ``LIQUIDITY_RATIO``.  Rows whose deposits are zero yield ``NaN``
        instead of ``inf``.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    loans = df["A_TOTAL_LOANS"]
    deposits = df["L_TOTAL_DEPOSITS"]
    # Mask zero denominators first so the division never produces +/-inf.
    ratio = loans / deposits.where(deposits != 0)
    return ratio.rename("LIQUIDITY_RATIO")

absa[["A_TOTAL_LOANS", "L_TOTAL_DEPOSITS"]]

Some of these columns seem to appear 'twice' and will therefore be removed.

## Removing Highly Correlated Features

This dataset contains groups of variables that sum together to form larger aggregates. To ensure
that the model uses each piece of information only once, we remove a subset of the features that
are too highly correlated with others. Leaving these features in would also introduce
multicollinearity.

In [None]:
# This dataset has a particular problem with overlapping or highly
# correlated features that contain elements from other columns
# Thus, we remove these highly correlated features

def remove_corr(df, pct_corr_threshold=0.9825):
    """Drop highly correlated columns from ``df`` using featuretools.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.  It is no longer modified in place: the index reset
        is done on a copy.
    pct_corr_threshold : float, default 0.9825
        Threshold forwarded to
        ``ft.selection.remove_highly_correlated_features``.

    Returns
    -------
    pandas.DataFrame
        The feature matrix returned by featuretools after the correlation
        filter, indexed by its ``"index"`` column, with the index name
        cleared and the columns axis named ``"Date"``.
    """
    # reset_index() returns a new frame, so the caller's DataFrame keeps its
    # index.
    # NOTE(review): index="index" below matches the column created here only
    # when the incoming index is unnamed - confirm for the frame passed in.
    df = df.reset_index()
    es = ft.EntitySet(id="Absa_BS")
    ent_set = es.add_dataframe(dataframe_name="Absa_Group_Ltd",
                               dataframe=df,
                               already_sorted=False, index="index")

    # No primitives are requested; dfs is only used to obtain a feature
    # matrix that the selection helper accepts.
    fm, _ = ft.dfs(entityset=ent_set,
                   target_dataframe_name="Absa_Group_Ltd",
                   trans_primitives=[],
                   agg_primitives=[],
                   max_depth=1)
    # From experimentation, the 0.9825 threshold seems to remove the
    # columns that are verbatim totals of others
    # A lower threshold will be used at a later stage after some
    # columns have been used in calculation of additional
    # features
    fm = ft.selection.remove_highly_correlated_features(
        fm, pct_corr_threshold=pct_corr_threshold)
    fm = fm.reset_index().set_index("index")
    fm.columns.name = "Date"
    fm.index.name = None

    return fm

absa = remove_corr(absa)

In [None]:
absa.info()