## Feature Enrichment

### Historical data enrichment

Pick one client (Site, aka sender_BIC) to do the enrichment as every site will be the same process

In [1]:
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
site_name = "ZHSZUS33_Bank_1"

In [2]:
import os
import random
import string

import pandas as pd
history_file_name = os.path.join(site_input_dir, site_name,"history.csv" )
df_history = pd.read_csv(history_file_name)
df_history

Unnamed: 0,Time,Amount,Class,Sender_BIC,Receiver_BIC,UETR,Currency,Beneficiary_BIC,Currency_Country
0,700.0,93.20,0,ZHSZUS33,YSYCESMM,DFPTD6L2WVJU31ZSUV3LBU,GBP,YXRXGB22,United Kingdom
1,3400.0,18.95,0,ZHSZUS33,HCBHSGSG,I7H98XWUHINEXUDAE6JCDR,USD,XITXUS33,United States
2,3600.0,2.09,0,ZHSZUS33,FBSFCHZH,3YPINHZKL14PCDPGGDD39Q,CHF,FBSFCHZH,Switzerland
3,3900.0,120.96,0,ZHSZUS33,ZNZZAU3M,UC4S81W0YW2W4Z6UKFC0LO,CHF,FBSFCHZH,Switzerland
4,4700.0,3.63,0,ZHSZUS33,YSYCESMM,GS0VKRX7G8D8MR72UQOSMA,USD,XITXUS33,United States
...,...,...,...,...,...,...,...,...,...
62380,39325100.0,750.00,0,ZHSZUS33,ZHSZUS33,ONWTHOYPMBSWRDT3E0PVHK,AUD,ZNZZAU3M,Australia
62381,39325800.0,1.00,0,ZHSZUS33,ZHSZUS33,EZRGTK9MA2C18FO26BE6CL,GBP,YXRXGB22,United Kingdom
62382,39326500.0,17.77,0,ZHSZUS33,YXRXGB22,KP73RLLG472CDSHJTMRLMG,GBP,YXRXGB22,United Kingdom
62383,39327200.0,47.90,0,ZHSZUS33,YXRXGB22,HZXFL3U7EN90JJF9P8U0K3,GBP,YXRXGB22,United Kingdom


In [3]:


history_summary = df_history.groupby('Currency').agg(
                     hist_trans_volume=('UETR', 'count'),
                     hist_total_amount=('Amount', 'sum'),
                     hist_average_amount=('Amount', 'mean')
).reset_index()

history_summary

Unnamed: 0,Currency,hist_trans_volume,hist_total_amount,hist_average_amount
0,AUD,12408,1093528.06,88.130888
1,CHF,12398,1105027.09,89.129464
2,GBP,12416,1110904.27,89.473604
3,SGD,12532,1101539.24,87.89812
4,USD,12631,1101784.19,87.22858


# Enrich Feature with Currency

In [4]:
import pandas as pd
dataset_names = ["train", "test"]
results = {}

temp_ds_df = {}
temp_resampled_df = {}


for ds_name in dataset_names:
    file_name = os.path.join(site_input_dir, site_name , f"{ds_name}.csv" )
    ds_df  = pd.read_csv(file_name)
    ds_df['Time'] = pd.to_datetime(ds_df['Time'], unit='s')

    # Set the Time column as the index
    ds_df.set_index('Time', inplace=True)
    
    resampled_df = ds_df.resample('1H').agg(
                     trans_volume=('UETR', 'count'),
                     total_amount=('Amount', 'sum'),
                     average_amount=('Amount', 'mean')
                     ).reset_index()
    
    temp_ds_df[ds_name] = ds_df
    temp_resampled_df[ds_name] = resampled_df
    


In [5]:
for ds_name in dataset_names:
        
    ds_df = temp_ds_df[ds_name]
    resampled_df = temp_resampled_df[ds_name]
    
    c_df = ds_df[['Currency']].resample('1H').agg({'Currency': 'first'}).reset_index()
    # Add Currency_Country to the resampled data by joining with the original DataFrame
    resampled_df2 = pd.merge(resampled_df, 
                            c_df,
                            on='Time'
                            )
    resampled_df3 = pd.merge(resampled_df2, 
                             history_summary,
                             on='Currency'
                            )
    resampled_df4 = resampled_df3.copy()
    resampled_df4['x2_y1'] = resampled_df4['average_amount']/resampled_df4['hist_trans_volume']
    
    ds_df = ds_df.sort_values('Time')
    resampled_df4 = resampled_df4.sort_values('Time')
    merged_df = pd.merge_asof(ds_df, resampled_df4, on='Time' )
    
    merged_df = merged_df.drop(columns=['Currency_y']).rename(columns={'Currency_x': 'Currency'})

    
    results[ds_name] = merged_df
    
    
    

print(results)

{'train':                      Time  Class  Amount Sender_BIC Receiver_BIC  \
0     1971-04-01 04:35:00      0  740.66   ZHSZUS33     WPUWDEFF   
1     1971-04-01 04:36:40      0   88.61   ZHSZUS33     XITXUS33   
2     1971-04-01 04:36:40      0   15.00   ZHSZUS33     ZHSZUS33   
3     1971-04-01 05:16:40      0   31.96   ZHSZUS33     WPUWDEFF   
4     1971-04-01 05:25:00      0    9.00   ZHSZUS33     FBSFCHZH   
...                   ...    ...     ...        ...          ...   
39954 1972-03-10 22:20:00      0   80.00   ZHSZUS33     YSYCESMM   
39955 1972-03-10 22:30:00      0   60.50   ZHSZUS33     WPUWDEFF   
39956 1972-03-10 22:36:40      0   20.32   ZHSZUS33     SHSHKHH1   
39957 1972-03-10 22:51:40      0   79.99   ZHSZUS33     HCBHSGSG   
39958 1972-03-10 22:55:00      0    2.69   ZHSZUS33     ZHSZUS33   

                         UETR Currency Beneficiary_BIC Currency_Country  \
0      KLT6PBX4VCAQ4II9MBQJBP      USD        ZHSZUS33    United States   
1      67HH2PAKPZ3DWT7U

# Enrich feature for beneficiary country

In [6]:

history_summary2 = df_history.groupby('Beneficiary_BIC').agg(
                     hist_trans_volume=('UETR', 'count'),
                     hist_total_amount=('Amount', 'sum'),
                     hist_average_amount=('Amount', 'mean')
).reset_index()

history_summary2

Unnamed: 0,Beneficiary_BIC,hist_trans_volume,hist_total_amount,hist_average_amount
0,FBSFCHZH,12398,1105027.09,89.129464
1,HCBHSGSG,12532,1101539.24,87.89812
2,XITXUS33,6242,531754.18,85.189712
3,YXRXGB22,12416,1110904.27,89.473604
4,ZHSZUS33,6389,570030.01,89.220537
5,ZNZZAU3M,12408,1093528.06,88.130888


In [7]:
import pandas as pd
dataset_names = ["train", "test"]
results2 = {}
for ds_name in dataset_names:
    ds_df = temp_ds_df[ds_name]
    resampled_df = temp_resampled_df[ds_name]
    
    c_df = ds_df[['Beneficiary_BIC']].resample('1H').agg({'Beneficiary_BIC': 'first'}).reset_index()
    
    # Add Beneficiary_BIC to the resampled data by joining with the original DataFrame
    resampled_df2 = pd.merge(resampled_df, 
                            c_df,
                            on='Time'
                            )
    
    resampled_df3 = pd.merge(resampled_df2, 
                             history_summary2,
                             on='Beneficiary_BIC'
                            )
    
    
    resampled_df4 = resampled_df3.copy()
    resampled_df4['x3_y2'] = resampled_df4['average_amount']/resampled_df4['hist_trans_volume']
   
    ds_df = ds_df.sort_values('Time')
    resampled_df4 = resampled_df4.sort_values('Time')

    merged_df2 = pd.merge_asof(ds_df, resampled_df4, on='Time' )
    merged_df2 = merged_df2.drop(columns=['Beneficiary_BIC_y']).rename(columns={'Beneficiary_BIC_x': 'Beneficiary_BIC'})
    
    
    results2[ds_name] = merged_df2

print(results2)

{'train':                      Time  Class  Amount Sender_BIC Receiver_BIC  \
0     1971-04-01 04:35:00      0  740.66   ZHSZUS33     WPUWDEFF   
1     1971-04-01 04:36:40      0   88.61   ZHSZUS33     XITXUS33   
2     1971-04-01 04:36:40      0   15.00   ZHSZUS33     ZHSZUS33   
3     1971-04-01 05:16:40      0   31.96   ZHSZUS33     WPUWDEFF   
4     1971-04-01 05:25:00      0    9.00   ZHSZUS33     FBSFCHZH   
...                   ...    ...     ...        ...          ...   
39954 1972-03-10 22:20:00      0   80.00   ZHSZUS33     YSYCESMM   
39955 1972-03-10 22:30:00      0   60.50   ZHSZUS33     WPUWDEFF   
39956 1972-03-10 22:36:40      0   20.32   ZHSZUS33     SHSHKHH1   
39957 1972-03-10 22:51:40      0   79.99   ZHSZUS33     HCBHSGSG   
39958 1972-03-10 22:55:00      0    2.69   ZHSZUS33     ZHSZUS33   

                         UETR Currency Beneficiary_BIC Currency_Country  \
0      KLT6PBX4VCAQ4II9MBQJBP      USD        ZHSZUS33    United States   
1      67HH2PAKPZ3DWT7U

In [8]:
final_results = {}
for name in results:
    df = results[name]
    df2 = results2[name]
    df3 = df2[["Time", "Beneficiary_BIC", "x3_y2"]].copy()
    df4 = pd.merge(df, df3, on=['Time', 'Beneficiary_BIC'])
    final_results[name] = df4

    
for name in final_results:
    site_dir = os.path.join(site_input_dir, site_name)
    os.makedirs(site_dir, exist_ok=True)
    enrich_file_name = os.path.join(site_dir, f"{name}_enrichment.csv")
    print(enrich_file_name)
    final_results[name].to_csv(enrich_file_name) 
    
final_results["train"]

/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_enrichment.csv
/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_enrichment.csv


Unnamed: 0,Time,Class,Amount,Sender_BIC,Receiver_BIC,UETR,Currency,Beneficiary_BIC,Currency_Country,trans_volume,total_amount,average_amount,hist_trans_volume,hist_total_amount,hist_average_amount,x2_y1,x3_y2
0,1971-04-01 04:35:00,0,740.66,ZHSZUS33,WPUWDEFF,KLT6PBX4VCAQ4II9MBQJBP,USD,ZHSZUS33,United States,3,844.27,281.423333,12631,1101784.19,87.22858,0.022280,0.044048
1,1971-04-01 04:36:40,0,88.61,ZHSZUS33,XITXUS33,67HH2PAKPZ3DWT7UOO4HRZ,GBP,YXRXGB22,United Kingdom,3,844.27,281.423333,12631,1101784.19,87.22858,0.022280,0.044048
2,1971-04-01 04:36:40,0,15.00,ZHSZUS33,ZHSZUS33,PKBHTDJCXNY3D150C408EZ,SGD,HCBHSGSG,Singapore,3,844.27,281.423333,12631,1101784.19,87.22858,0.022280,0.044048
3,1971-04-01 05:16:40,0,31.96,ZHSZUS33,WPUWDEFF,YPTUDPVINZMIF7UAHDUD18,USD,ZHSZUS33,United States,5,200.40,40.080000,12631,1101784.19,87.22858,0.003173,0.006273
4,1971-04-01 05:25:00,0,9.00,ZHSZUS33,FBSFCHZH,03PMB0KC5F5IVYE2EWE4GJ,CHF,FBSFCHZH,Switzerland,5,200.40,40.080000,12631,1101784.19,87.22858,0.003173,0.006273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41234,1972-03-10 22:20:00,0,80.00,ZHSZUS33,YSYCESMM,R1RBFXC13YQFIWV3RUZUT2,SGD,HCBHSGSG,Singapore,6,256.49,42.748333,12631,1101784.19,87.22858,0.003384,0.006691
41235,1972-03-10 22:30:00,0,60.50,ZHSZUS33,WPUWDEFF,ONFBWM90ZLOGAS48UPQ7C2,CHF,FBSFCHZH,Switzerland,6,256.49,42.748333,12631,1101784.19,87.22858,0.003384,0.006691
41236,1972-03-10 22:36:40,0,20.32,ZHSZUS33,SHSHKHH1,E8U7ZC7K8GUTW61AQ5KUN0,CHF,FBSFCHZH,Switzerland,6,256.49,42.748333,12631,1101784.19,87.22858,0.003384,0.006691
41237,1972-03-10 22:51:40,0,79.99,ZHSZUS33,HCBHSGSG,OEG3ECX64I3QYZT6Y5ACVK,CHF,FBSFCHZH,Switzerland,6,256.49,42.748333,12631,1101784.19,87.22858,0.003384,0.006691


In [9]:
! tree {site_input_dir}

[01;34m/tmp/dataset/horizontal_credit_fraud_data/[0m
├── [01;34mFBSFCHZH_Bank_6[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mHCBHSGSG_Bank_9[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── history.csv
├── [01;34mSHSHKHH1_Bank_2[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── test.csv
├── train.csv
├── [01;34mWPUWDEFF_Bank_4[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mXITXUS33_Bank_10[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYMNYFRPP_Bank_5[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYSYCESMM_Bank_7[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYXRXGB22_Bank_3[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mZHSZUS33_Bank_1[0m
│   ├── history.csv
│   ├── test.csv
│   ├── test_enrichment.csv
│   ├── train.csv
│   └── train_enrichment.csv
└── [01;34mZNZZAU3M_Bank_8[0m
    ├── history.csv
    ├── t

In [10]:
ls -al  /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/

total 19736
drwxrwxr-x  2 chester chester    4096 Aug 15 15:05 [0m[01;34m.[0m/
drwxrwxr-x 12 chester chester    4096 Aug 15 15:03 [01;34m..[0m/
-rw-rw-r--  1 chester chester 5267738 Aug 15 15:03 history.csv
-rw-rw-r--  1 chester chester  961133 Aug 15 15:03 test.csv
-rw-rw-r--  1 chester chester 2253439 Aug 15 15:05 test_enrichment.csv
-rw-rw-r--  1 chester chester 3384485 Aug 15 15:03 train.csv
-rw-rw-r--  1 chester chester 8323055 Aug 15 15:05 train_enrichment.csv


In [11]:
! find /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/ -exec wc -l {} \;

wc: /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/: Is a directory
0 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/
39960 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train.csv
11466 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_enrichment.csv
41240 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_enrichment.csv
11350 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test.csv
62386 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/history.csv


Let's go back to the [XGBoost Notebook](./xgboost.ipynb)