In [49]:
import pandas as pd

In [50]:
"""
- Input the data
- For the Transaction Path table:
    - Make sure field naming convention matches the other tables
        i.e. instead of Account_From it should be Account From
- For the Account Information table:
    - Make sure there are no null values in the Account Holder ID
    - Ensure there is one row per Account Holder ID
        Joint accounts will have 2 Account Holders, we want a row for each of them
- For the Account Holders table:
    - Make sure the phone numbers start with 07
- Bring the tables together
- Filter out cancelled transactions 
- Filter to transactions greater than £1,000 in value 
- Filter out Platinum accounts
Output the data
"""

'\n- Input the data\n- For the Transaction Path table:\n    - Make sure field naming convention matches the other tables\n        i.e. instead of Account_From it should be Account From\n- For the Account Information table:\n    - Make sure there are no null values in the Account Holder ID\n    - Ensure there is one row per Account Holder ID\n        Joint accounts will have 2 Account Holders, we want a row for each of them\n- For the Account Holders table:\n    - Make sure the phone numbers start with 07\n- Bring the tables together\n- Filter out cancelled transactions \n- Filter to transactions greater than £1,000 in value \n- Filter out Platinum accounts\nOutput the data\n'

In [51]:
trans_detail = pd.read_csv("Transaction Detail.csv")
trans_path = pd.read_csv("Transaction Path.csv")
accnt_holders = pd.read_csv("Account Holders.csv")
accnt_info = pd.read_csv("Account Information.csv")

In [52]:
#Rename trans_path columns
trans_path.rename({'Account_To': 'Account To', 'Account_From': 'Account From'}, axis =1, inplace= True)

In [53]:
accnt_info['Account Holder ID'].isnull().value_counts()

False    3000
Name: Account Holder ID, dtype: int64

In [54]:
accnt_info['Account Holder ID'].duplicated().value_counts()

False    3000
Name: Account Holder ID, dtype: int64

In [55]:
accnt_holders['Account Holder ID'].duplicated().value_counts()

False    3072
Name: Account Holder ID, dtype: int64

In [56]:
accnt_holders['Contact Number'].astype(str).str.startswith('7').value_counts()

True    3072
Name: Contact Number, dtype: int64

In [57]:
transactions = pd.merge(trans_path, right = trans_detail,on= 'Transaction ID', how='left')

In [58]:
transactions

Unnamed: 0,Transaction ID,Account To,Account From,Transaction Date,Value,Cancelled?
0,1957155,27356852,76206810,2023-02-01,128.78,N
1,2147025,44242297,24826358,2023-02-09,170.19,N
2,3065073,10295384,52104303,2023-02-06,87.57,N
3,6622100,45519330,69315008,2023-02-07,85.76,N
4,14877473,28680375,44586370,2023-02-02,84.65,N
...,...,...,...,...,...,...
8776,9996102963,17925406,40530538,2023-02-13,112.38,N
8777,9996177785,37678813,60789634,2023-02-13,78.14,N
8778,9997003500,54458410,17810734,2023-02-09,138.80,N
8779,9997164946,57426365,23333877,2023-02-01,123.23,N


In [59]:
transactions = transactions[transactions['Cancelled?'] != 'Y']
transactions = transactions[transactions['Value'] > 1000]
transactions['Account From'] = transactions['Account From'].astype('str')

In [60]:
transactions.value_counts()

Transaction ID  Account To  Account From  Transaction Date  Value   Cancelled?
45024251        15826579    53727603      2023-02-11        1137.9  N             1
7088915162      78501355    72971287      2023-02-10        1038.8  N             1
6977551879      13197335    51718829      2023-02-09        1043.8  N             1
6997835902      41919083    94247629      2023-02-07        1738.7  N             1
6998049056      66706067    59135815      2023-02-10        1202.5  N             1
                                                                                 ..
3481459262      58811985    54868017      2023-02-08        1024.6  N             1
3478144698      99481977    76169332      2023-02-02        1695.5  N             1
3472782188      62412215    50772829      2023-02-12        1278.9  N             1
3463915588      93192347    83128181      2023-02-09        1325.7  N             1
9909979543      10034341    53161068      2023-02-03        1566.6  N            

In [62]:
accnt_info['Account Holder ID'] = accnt_info['Account Holder ID'].astype('str')
accnt_holders['Account Holder ID'] = accnt_holders['Account Holder ID'].astype('str')

In [61]:
accnt_info['Account Holder ID'] = accnt_info['Account Holder ID'].str.split(',')
accnt_info = accnt_info.explode('Account Holder ID')

In [63]:
accounts = accnt_holders.merge(accnt_info, on='Account Holder ID')

In [64]:
transactions['Account From'] = transactions['Account From'].astype('str')
accounts['Account Number'] = accounts['Account Number'].astype('str')
df= transactions.merge(accounts, left_on='Account From', right_on='Account Number')

In [65]:
df = df[df['Account Type'] != 'Platinum']

In [66]:
df

Unnamed: 0,Transaction ID,Account To,Account From,Transaction Date,Value,Cancelled?,Account Holder ID,Name,Date of Birth,Contact Number,First Line of Address,Account Number,Account Type,Balance Date,Balance
2,46779876,21694967,82023979,2023-02-02,1019.5,N,76986856,Barb Eminson,20/07/1974,7748168711,3149 American Ash Hill,82023979,Gold,2023-01-31,744.37
3,57491925,19094015,65589565,2023-02-12,1761.1,N,37386889,Annabel Medd,17/02/1959,7175713923,24 Warner Junction,65589565,Gold,2023-01-31,321.99
4,108536548,86207903,10553619,2023-02-06,1085.4,N,58583983,Dud Gaunt,04/12/1983,7681115087,3 Canary Avenue,10553619,Basic,2023-01-31,756.20
5,2843750091,20090784,10553619,2023-02-04,1263.9,N,58583983,Dud Gaunt,04/12/1983,7681115087,3 Canary Avenue,10553619,Basic,2023-01-31,756.20
6,176916326,33632099,18185362,2023-02-12,1955.6,N,83289626,Agace Sporle,28/02/1999,7182359874,96 Emmet Pass,18185362,Basic,2023-01-31,609.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,9847562024,84778718,29358670,2023-02-12,1183.8,N,14221230,Joice Lufkin,01/08/1962,7943159313,6033 Dunning Center,29358670,Gold,2023-01-31,874.94
493,9849651040,19789505,88160468,2023-02-01,1041.9,N,21827357,Raff Pakeman,09/03/1997,7596092076,76 Ridge Oak Alley,88160468,Gold,2023-01-31,640.07
494,9859809917,25678043,24529850,2023-02-08,2283.2,N,74244392,Coralie Knellen,28/09/1968,7134906360,682 Morning Pass,24529850,Basic,2023-01-31,800.79
495,9881869013,41252281,53845615,2023-02-10,1052.9,N,53284971,Melita Grouvel,20/01/1991,7528594995,1 Victoria Point,53845615,Basic,2023-01-31,597.04


In [68]:
df.to_csv('output.csv')