# Setup

## Check Python version

In [1]:
from platform import  python_version

In [3]:
python_version()

'3.10.5'

## Install/Download necessary packages

In [4]:
import sys

In [221]:
# # Uncomment these if any packages are not installed in your current jupyter env 
# # Installing a pip package in the current kernel
# # Pandas also installs the numpy package
# !{sys.executable} -m pip install pandas  
# !{sys.executable} -m pip install requests
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install featuretools

Collecting featuretools
  Using cached featuretools-1.11.0-py3-none-any.whl (362 kB)
Collecting distributed>=2021.10.0
  Using cached distributed-2022.6.1-py3-none-any.whl (882 kB)
Collecting woodwork>=0.16.2
  Using cached woodwork-0.16.4-py3-none-any.whl (207 kB)
Collecting dask[dataframe]>=2021.10.0
  Using cached dask-2022.6.1-py3-none-any.whl (1.1 MB)
Collecting holidays>=0.13
  Using cached holidays-0.14.2-py3-none-any.whl (179 kB)
Installing collected packages: holidays, dask, woodwork, distributed, featuretools
Successfully installed dask-2022.6.1 distributed-2022.6.1 featuretools-1.11.0 holidays-0.14.2 woodwork-0.16.4


In [222]:
# import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import sklearn
import os
import datetime as dt
import featuretools as ft

# Importing the Data

In [7]:
def view_files(data_dir="data"):
    """Return the names of the entries in the data directory, sorted by name.

    Parameters
    ----------
    data_dir : str, default "data"
        Directory to list, resolved relative to the current working directory.

    Returns
    -------
    list of str
        Entry names (not full paths) in ascending order.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    """
    # os.path.join uses the platform's separator; a literal backslash only works on Windows.
    path = os.path.join(os.getcwd(), data_dir)
    # os.listdir returns entries in arbitrary order; sort so positional lookups are stable.
    return sorted(os.listdir(path))

# Keep the directory listing in a notebook-level variable; view_dataset() selects
# a file by its position in this list.
files = view_files()
files

['20220703_120137530608_RBD.csv',
 '20220703_120253586011_RBD.xlsx',
 '20220703_120455835661_RBD.csv',
 'main.csv',
 'mainexcel.xlsx',
 'test1.xlsx',
 'test1csv.csv']

In [403]:
def view_dataset(i, bank_prefix="B_34118: "):
    """Load one CSV from the data directory and reshape it into a numeric, date-indexed frame.

    Parameters
    ----------
    i : int
        Position of the file to load within the notebook-level ``files`` list.
    bank_prefix : str, default "B_34118: "
        Prefix removed from the "Bank" column. It is applied as a regular
        expression, so escape any regex metacharacters it contains.

    Returns
    -------
    tuple
        ``(df, message)`` where ``df`` is the cleaned DataFrame (one row per
        date, one column per "Table" entry) and ``message`` is a string naming
        the file that was read.
    """
    # Build the path with os.path.join so it is valid on every platform.
    path = os.path.join(os.getcwd(), "data", files[i])

    # Read the csv file as a dataframe and remove unnecessary columns.
    df = pd.read_csv(filepath_or_buffer=path)
    df = df.drop(["Unit", "Time series code"], axis="columns")

    # Strip the "D_M_<yyyy>M<mm>:" code from the date column headers.
    df.columns = df.columns.str.replace(pat=r"D_M_[0-9]{4}M[0-9]{2}:", repl="", regex=True)

    # Tidy the bank name: drop the prefix, then replace spaces with underscores.
    df["Bank"] = df["Bank"].str.replace(pat=bank_prefix, repl="", regex=True)
    df["Bank"] = df["Bank"].str.replace(pat=" ", repl="_", regex=False)

    # Discard rows whose "Table" value contains a bare row code ("T_TxxRyyy:" or
    # "T_TxxRyyy_A:"), i.e. one that is not followed by a column code.
    df = df[~df["Table"].str.contains(r"T_T[0-9]{2}R[0-9]{3}(?:_A)?:", regex=True)]

    # Remove the row/column code prefix and any parenthesised "(...): " fragment
    # from the remaining "Table" labels.
    df["Table"] = df["Table"].str.replace(
        r"T_T[0-9]{2}R[0-9]{3,4}C[0-9]{2}: T[0-9]{2}R[0-9]{3}[A]{0,1}C[0-9]{2}: ",
        regex=True, repl="")
    df["Table"] = df["Table"].str.replace(r"[(][0-9a-z\s,]{2,}[)][:] ", regex=True, repl="")

    # Transpose so each "Table" label becomes a column and each date a row.
    df = df.set_index("Table").T.drop("Bank")
    # The format string starts with a space, matching the header text left after
    # the code was stripped above.
    df.index = pd.to_datetime(df.index, format=" %YM%m")
    # Drop every column that has at least one missing value.
    df = df.dropna(axis=1)
    df.columns = df.columns.str.replace(pat=" ", repl="_", regex=False)
    df = df.apply(pd.to_numeric)

    # Feature pruning: mostly-null columns, constant columns, then repeated names.
    df = ft.selection.remove_highly_null_features(df)
    df = ft.selection.remove_single_value_features(df)
    df = df.loc[:, ~df.columns.duplicated()]

    return (df, f"file picked: {path}")

# Position 6 in `files` is 'test1csv.csv' per the listing shown earlier; element [0]
# of the returned tuple is the cleaned DataFrame.
absa = view_dataset(6)[0]
# Leftover debugging snippet (refers to a `df` that is not defined in the cells shown here):
# df[df.iloc[:,2:].isna().sum(axis=1) == len(df.iloc[0, 2:])]

In [404]:
# verify that the correct file was extracted
view_dataset(6)[1]

'file picked: C:\\GitHub\\DS_PROJ\\data\\test1csv.csv'

In [405]:
# This dataset has a particular problem with overlapping or highly
# correlated features that contain elements from other columns
# Thus, we remove these highly correlated features

def remove_corr(df, dataframe_name="Absa_Group_Ltd"):
    """Return a copy of ``df`` with highly correlated columns removed.

    The frame is registered in a fresh featuretools EntitySet, passed through
    ``ft.dfs`` with no transform or aggregation primitives, and then pruned
    with ``ft.selection.remove_highly_correlated_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Its index is moved into a column named "index" to serve
        as the featuretools index; the caller's frame is left unmodified.
        Assumes the index is unnamed, so that ``reset_index`` yields a column
        called "index".
    dataframe_name : str, default "Absa_Group_Ltd"
        Name under which the frame is registered in the EntitySet.

    Returns
    -------
    pandas.DataFrame
        The pruned feature matrix, indexed by the original index values, with
        the columns axis named "Date" and the index left unnamed.
    """
    # Non in-place reset: the caller's frame is not altered.
    indexed = df.reset_index()

    # Build the EntitySet locally instead of relying on a notebook-level `es`
    # variable, so the function also works on a fresh kernel.
    ent_set = ft.EntitySet(id=dataframe_name).add_dataframe(
        dataframe_name=dataframe_name,
        dataframe=indexed,
        already_sorted=False,
        index="index",
    )

    # Empty primitive lists mean no new features are synthesised.
    fm, _ = ft.dfs(
        entityset=ent_set,
        target_dataframe_name=dataframe_name,
        trans_primitives=[],
        agg_primitives=[],
        max_depth=2,
    )
    fm = ft.selection.remove_highly_correlated_features(fm)

    # Restore the original presentation: index on "index", axis names adjusted.
    fm = fm.reset_index().set_index("index")
    fm.columns.name = "Date"
    fm.index.name = None

    return fm

# Rebinds `absa` to the pruned result, so re-running this cell feeds the
# already-pruned frame back into remove_corr.
absa = remove_corr(absa)

In [406]:
absa

index,DEPOSITS_Cheque_(1),DEPOSITS_More_than_1_day_to_1_month_(4),DEPOSITS_More_than_1_month_to_6_months_(5),SA_banksb_Cheque_(1),SA_banksb_Savings_(2),SA_banksb_Up_to_1_day_(3),SA_banksb_More_than_1_day_to_1_month_(4),SA_banksb_More_than_1_month_to_6_months_(5),SA_banksb_More_than_6_months_(6),SA_banksb_TOTAL_(7),...,"Acceptances,_commercial_paper,_bills,_promissory_notes_and_similar_acknowledgements_of_debt_discounted_or_purchased_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)",Promissory_notes:_Domestic_assets_(1),Promissory_notes:_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),NON-FINANCIAL_ASSETS_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),Other_fixed_property:_Domestic_assets_(1),"Computer_equipment,_including_peripherals:_Domestic_assets_(1)",OTHER_ASSETS_Domestic_assets_(1),OTHER_ASSETS_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),Remittances_in_transit:_Domestic_assets_(1),Remittances_in_transit:_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)
1993-01-01,9588342.0,9417927.0,21301640.0,497129.0,2493.0,2427763.0,1299494.0,4050363.0,1062859.0,9340101.0,...,122004.0,59713.0,59713.0,1541557.0,10667.0,510004.0,5228902.0,5228902.0,519182.0,519182.0
1993-02-01,11392410.0,9651219.0,19636868.0,533695.0,3184.0,558999.0,1368054.0,3927600.0,951367.0,7342899.0,...,124212.0,60818.0,60818.0,1543777.0,10706.0,500910.0,5541530.0,5541530.0,892414.0,892414.0
1993-03-01,10668670.0,9436300.0,18328838.0,382512.0,3552.0,1005921.0,2193618.0,2545995.0,963165.0,7094763.0,...,143389.0,62171.0,62171.0,1969848.0,2790.0,564249.0,5560926.0,5560926.0,583282.0,583282.0
1993-04-01,11432864.0,9473765.0,17065947.0,656901.0,3409.0,821520.0,1398312.0,1902702.0,1075404.0,5858248.0,...,147137.0,63133.0,63133.0,2018578.0,2790.0,605950.0,5273498.0,5273498.0,1115825.0,1115825.0
1993-05-01,11014770.0,9701689.0,16792250.0,246988.0,3325.0,489638.0,494457.0,2099513.0,1121271.0,4455192.0,...,2313651.0,1136910.0,1136910.0,1923699.0,2790.0,631471.0,4564490.0,4564490.0,479624.0,479624.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-01,230444938.0,81578456.0,122143067.0,10698588.0,608869.0,29366537.0,4134711.0,4622448.0,24612808.0,74043961.0,...,11205152.0,0.0,0.0,14.0,0.0,1624951.0,18960997.0,932988.0,5745679.0,0.0
2022-01-01,202177025.0,68695763.0,118478176.0,10894463.0,359913.0,28509203.0,7520792.0,3753949.0,25502562.0,76540882.0,...,6524698.0,0.0,0.0,13.0,0.0,1598978.0,34728023.0,1654560.0,12264059.0,0.0
2022-02-01,208657411.0,65715791.0,119878691.0,10998058.0,360534.0,34326442.0,7940248.0,3146471.0,27128078.0,83899831.0,...,4925175.0,0.0,0.0,12.0,0.0,1573858.0,41053559.0,1357.0,6848575.0,0.0
2022-03-01,221079725.0,63862009.0,115171793.0,10902479.0,446287.0,31915990.0,8345073.0,3221035.0,28050775.0,82881639.0,...,4570468.0,0.0,0.0,11.0,0.0,1544518.0,30988902.0,1360.0,4788536.0,0.0


index,DEPOSITS_Cheque_(1),DEPOSITS_More_than_1_day_to_1_month_(4),DEPOSITS_More_than_1_month_to_6_months_(5),SA_banksb_Cheque_(1),SA_banksb_Savings_(2),SA_banksb_Up_to_1_day_(3),SA_banksb_More_than_1_day_to_1_month_(4),SA_banksb_More_than_1_month_to_6_months_(5),SA_banksb_More_than_6_months_(6),SA_banksb_TOTAL_(7),...,"Acceptances,_commercial_paper,_bills,_promissory_notes_and_similar_acknowledgements_of_debt_discounted_or_purchased_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)",Promissory_notes:_Domestic_assets_(1),Promissory_notes:_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),NON-FINANCIAL_ASSETS_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),Other_fixed_property:_Domestic_assets_(1),"Computer_equipment,_including_peripherals:_Domestic_assets_(1)",OTHER_ASSETS_Domestic_assets_(1),OTHER_ASSETS_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),Remittances_in_transit:_Domestic_assets_(1),Remittances_in_transit:_TOTAL_ASSETS_(Col_1_plus_col_3)_(5)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEPOSITS_Cheque_(1),1.000000,0.833213,0.920604,0.709034,0.555639,0.746258,0.579929,-0.180943,0.474060,0.775980,...,0.056748,-0.208216,-0.381260,-0.777765,-0.535305,0.713616,0.819474,-0.317595,0.644348,-0.627223
DEPOSITS_More_than_1_day_to_1_month_(4),0.833213,1.000000,0.857871,0.650522,0.497498,0.550134,0.737293,-0.076657,0.420344,0.688907,...,-0.000819,-0.169986,-0.364087,-0.622507,-0.517173,0.625144,0.744964,-0.181535,0.512017,-0.531766
DEPOSITS_More_than_1_month_to_6_months_(5),0.920604,0.857871,1.000000,0.568410,0.407119,0.670297,0.573591,-0.024914,0.385300,0.697345,...,-0.095995,-0.184413,-0.423319,-0.785525,-0.580051,0.689261,0.794643,-0.289592,0.587390,-0.631611
SA_banksb_Cheque_(1),0.709034,0.650522,0.568410,1.000000,0.744907,0.607205,0.604587,-0.167878,0.605083,0.805292,...,0.366704,-0.181969,-0.194837,-0.471439,-0.248346,0.558349,0.677490,-0.201024,0.400970,-0.337783
SA_banksb_Savings_(2),0.555639,0.497498,0.407119,0.744907,1.000000,0.565542,0.555056,-0.087463,0.552101,0.719309,...,0.481606,-0.053471,-0.099122,-0.323887,-0.194101,0.323795,0.409038,-0.140836,0.203199,-0.243529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Computer_equipment,_including_peripherals:_Domestic_assets_(1)",0.713616,0.625144,0.689261,0.558349,0.323795,0.452727,0.507527,-0.339734,0.129300,0.459327,...,-0.019505,-0.248774,-0.231344,-0.566019,-0.407908,1.000000,0.764066,-0.122842,0.718195,-0.423835
OTHER_ASSETS_Domestic_assets_(1),0.819474,0.744964,0.794643,0.677490,0.409038,0.608212,0.635750,-0.182191,0.460686,0.702416,...,-0.003379,-0.171482,-0.270842,-0.681445,-0.416717,0.764066,1.000000,-0.189481,0.734546,-0.498747
OTHER_ASSETS_TOTAL_ASSETS_(Col_1_plus_col_3)_(5),-0.317595,-0.181535,-0.289592,-0.201024,-0.140836,-0.317749,-0.041840,0.143874,-0.155457,-0.246294,...,-0.099655,-0.098397,0.002360,0.607590,0.265975,-0.122842,-0.189481,1.000000,-0.220981,0.496664
Remittances_in_transit:_Domestic_assets_(1),0.644348,0.512017,0.587390,0.400970,0.203199,0.422628,0.421286,-0.242022,0.123403,0.401806,...,-0.030751,-0.086016,-0.211031,-0.599974,-0.344182,0.718195,0.734546,-0.220981,1.000000,-0.375880
