In [29]:
import csv
from pathlib import Path

import pandas as pd

In [30]:
# Load the transaction routing table: maps each Transaction ID to its
# Account_To / Account_From account numbers.
transaction_path = pd.read_csv(filepath_or_buffer='./data/input/Transaction Path.csv')
# Preview the first five rows
transaction_path.head(n=5)

Unnamed: 0,Transaction ID,Account_To,Account_From
0,1957155,27356852,76206810
1,2147025,44242297,24826358
2,3065073,10295384,52104303
3,6622100,45519330,69315008
4,14877473,28680375,44586370


In [31]:
# Load the per-transaction details: date, value and the 'Cancelled?' flag,
# keyed by Transaction ID.
transaction_detail = pd.read_csv(filepath_or_buffer='data/input/Transaction Detail.csv')
# Preview the first five rows
transaction_detail.head(n=5)

Unnamed: 0,Transaction ID,Transaction Date,Value,Cancelled?
0,1957155,2023-02-01,128.78,N
1,28234510,2023-02-01,163.82,N
2,33688648,2023-02-01,54.71,N
3,41670299,2023-02-01,88.1,N
4,42825784,2023-02-01,217.22,Y


In [32]:
# Load the account holder table: name, date of birth, contact number and
# address, keyed by Account Holder ID.
account_holders = pd.read_csv(filepath_or_buffer='data/input/Account Holders.csv')
# Preview the first five rows
account_holders.head(n=5)

Unnamed: 0,Account Holder ID,Name,Date of Birth,Contact Number,First Line of Address
0,70390615,Mahmoud Hehnke,28/08/1995,7479286250,18535 Loftsgordon Park
1,20123998,Maynord Surgeoner,21/08/1997,7716107305,6422 Buena Vista Plaza
2,54374080,Giraldo Kimbley,22/03/1995,7489940612,93005 Summer Ridge Avenue
3,97027297,Blake Dudson,30/06/1955,7253587445,2 Huxley Hill
4,89920386,Ajay Douce,19/12/1930,7395580534,90176 Miller Alley


In [33]:
# Load the account table: account type, balance and holder ID(s) per
# Account Number. Joint accounts list several holder IDs in one
# comma-separated cell (e.g. "89920386, 97325900").
account_information = pd.read_csv(filepath_or_buffer='data/input/Account Information.csv')
# Preview the first five rows
account_information.head(n=5)

Unnamed: 0,Account Number,Account Type,Account Holder ID,Balance Date,Balance
0,10005367,Platinum,70390615,2023-01-31,728.25
1,10011977,Basic,20123998,2023-01-31,676.54
2,10024680,Platinum,54374080,2023-01-31,567.46
3,10031238,Basic,97027297,2023-01-31,576.52
4,10034341,Joint,"89920386, 97325900",2023-01-31,390.39


# Preprocessing

In [34]:
# Align the Transaction Path column names with the other tables, which use
# spaces rather than underscores (Account_To -> Account To, and so on).
transaction_path_prep = transaction_path.rename(
    columns={
        column_name: column_name.replace("_", " ")
        for column_name in transaction_path.columns
    }
)

# Preview the renamed columns
transaction_path_prep.head(n=5)

Unnamed: 0,Transaction ID,Account To,Account From
0,1957155,27356852,76206810
1,2147025,44242297,24826358
2,3065073,10295384,52104303
3,6622100,45519330,69315008
4,14877473,28680375,44586370


In [35]:
# Reshape Account Information so each row carries exactly one Account Holder ID
account_information_prep = (
    account_information
    # Rows without a holder ID cannot be linked to a person, so discard them
    .dropna(subset=['Account Holder ID'])
    # Swap the raw ID text for a list of IDs; dropping the column first and
    # re-adding it places 'Account Holder ID' as the last column
    .pipe(
        lambda accounts: accounts
        .drop(columns=['Account Holder ID'])
        .assign(**{'Account Holder ID': accounts['Account Holder ID'].str.split(', ')})
    )
    # One output row per (account, holder) pair: joint accounts fan out here
    .explode('Account Holder ID')
    # The split pieces are strings; store them as integers
    .astype({'Account Holder ID': 'int64'})
    # explode repeats index labels, so rebuild a clean 0..n-1 index
    .reset_index(drop=True)
)

account_information_prep.head()

Unnamed: 0,Account Number,Account Type,Balance Date,Balance,Account Holder ID
0,10005367,Platinum,2023-01-31,728.25,70390615
1,10011977,Basic,2023-01-31,676.54,20123998
2,10024680,Platinum,2023-01-31,567.46,54374080
3,10031238,Basic,2023-01-31,576.52,97027297
4,10034341,Joint,2023-01-31,390.39,89920386


In [36]:
# Normalise the contact numbers in the Account Holders table
account_holders_prep = account_holders.pipe(
    lambda holders: holders
    # Dropping the column first and re-adding it places 'Contact Number' last
    .drop(columns=['Contact Number'])
    .assign(**{
        # Convert each number to text and prefix a 0 onto any value that
        # begins with 7; values starting with anything else are left as they are
        'Contact Number': holders['Contact Number']
        .astype(str)
        .str.replace(r'^7', '07', regex=True)
    })
)

account_holders_prep.head(n=5)

Unnamed: 0,Account Holder ID,Name,Date of Birth,First Line of Address,Contact Number
0,70390615,Mahmoud Hehnke,28/08/1995,18535 Loftsgordon Park,7479286250
1,20123998,Maynord Surgeoner,21/08/1997,6422 Buena Vista Plaza,7716107305
2,54374080,Giraldo Kimbley,22/03/1995,93005 Summer Ridge Avenue,7489940612
3,97027297,Blake Dudson,30/06/1955,2 Huxley Hill,7253587445
4,89920386,Ajay Douce,19/12/1930,90176 Miller Alley,7395580534


In [62]:
# Build and export the non-cancelled transactions over 1000 that were sent
# from non-Platinum accounts, enriched with account and holder details.
OUTPUT_PATH = Path('./data/output/output_2023_07.csv')

flagged_transactions = ( transaction_detail
    # Attach the routing info; 'Account From' is renamed to 'Account Number'
    # so the sending account becomes the join key for the next merge
    .merge(
        transaction_path_prep.rename(columns={'Account From' : 'Account Number'}),
        on='Transaction ID',
        how='inner'
    )
    # Attach the sending account's details (one row per holder, so a
    # transaction from a joint account yields one row per holder)
    .merge(
        account_information_prep,
        on='Account Number',
        how='inner'
    )
    # Attach the holder's personal details
    .merge(
        account_holders_prep,
        on='Account Holder ID',
        how='inner'
    )
    # Filter out the Cancelled Transactions and remove the Cancelled? column
    .query('`Cancelled?` == "N"')
    .drop(columns=['Cancelled?'])
    # Filter by Transactions whose value is greater than 1000
    .query('Value > 1000')
    # Filter out Platinum Account Holders
    .query('`Account Type` != "Platinum"')
    .reset_index(drop=True)
)

# Create the output folder if it is missing so the export does not fail
# on a fresh checkout
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Output the data: semicolon-separated, every field wrapped in double quotes
flagged_transactions.to_csv(
    path_or_buf=OUTPUT_PATH,
    index=False,
    quoting=csv.QUOTE_ALL,  # named constant for the former literal 1
    quotechar='"',
    sep=';'
)