## Feature Enrichment

### Historical data enrichment

Pick one client (Site, aka sender_BIC) to do the enrichment as every site will be the same process

In [1]:
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
site_name = "ZHSZUS33_Bank_1"

In [2]:
import os
import random
import string

import pandas as pd
history_file_name = os.path.join(site_input_dir, site_name,"history.csv" )
df_history = pd.read_csv(history_file_name)
df_history

Unnamed: 0,Time,Amount,Class,Sender_BIC,Receiver_BIC,UETR,Currency,Beneficiary_BIC,Currency_Country
0,400.0,4.99,0,ZHSZUS33,XITXUS33,446BLSMWIB3PB82LRJZW4H,USD,XITXUS33,United States
1,1200.0,12.99,0,ZHSZUS33,ZNZZAU3M,BTK68YF15LCNPJSIWHM3BO,AUD,ZNZZAU3M,Australia
2,1300.0,0.89,0,ZHSZUS33,SHSHKHH1,41PUGII256YNHEINGXLR2V,SGD,HCBHSGSG,Singapore
3,1500.0,5.00,0,ZHSZUS33,YXRXGB22,USMEDNJ0DBN6PZJLMMRTRZ,USD,XITXUS33,United States
4,1700.0,34.09,0,ZHSZUS33,FBSFCHZH,G2W0ABVBRI6VTBTPHMXOGT,CHF,FBSFCHZH,Switzerland
...,...,...,...,...,...,...,...,...,...
62547,39325800.0,89.10,0,ZHSZUS33,HCBHSGSG,PSBVTE12ZOFOPB9RTL72I4,CHF,FBSFCHZH,Switzerland
62548,39326000.0,670.31,0,ZHSZUS33,ZNZZAU3M,Q33M1ICXXX04CNT7HX84EL,AUD,ZNZZAU3M,Australia
62549,39326100.0,0.49,0,ZHSZUS33,YXRXGB22,XUV3SY5BSJ6PG2E3ZTV4XZ,USD,ZHSZUS33,United States
62550,39326500.0,16.58,0,ZHSZUS33,SHSHKHH1,8X1VEYXHCL68O2A3UTI05H,USD,ZHSZUS33,United States


In [3]:


history_summary = df_history.groupby('Currency').agg(
                     hist_trans_volume=('UETR', 'count'),
                     hist_total_amount=('Amount', 'sum'),
                     hist_average_amount=('Amount', 'mean')
).reset_index()

history_summary

Unnamed: 0,Currency,hist_trans_volume,hist_total_amount,hist_average_amount
0,AUD,12350,1102057.69,89.23544
1,CHF,12604,1114845.23,88.4517
2,GBP,12500,1101974.09,88.157927
3,SGD,12341,1043060.55,84.519938
4,USD,12757,1105161.14,86.631743


# Enrich Feature with Currency

In [4]:
import pandas as pd
dataset_names = ["train", "test"]
results = {}

temp_ds_df = {}
temp_resampled_df = {}


for ds_name in dataset_names:
    file_name = os.path.join(site_input_dir, site_name , f"{ds_name}.csv" )
    ds_df  = pd.read_csv(file_name)
    ds_df['Time'] = pd.to_datetime(ds_df['Time'], unit='s')

    # Set the Time column as the index
    ds_df.set_index('Time', inplace=True)
    
    resampled_df = ds_df.resample('1H').agg(
                     trans_volume=('UETR', 'count'),
                     total_amount=('Amount', 'sum'),
                     average_amount=('Amount', 'mean')
                     ).reset_index()
    
    temp_ds_df[ds_name] = ds_df
    temp_resampled_df[ds_name] = resampled_df
    


In [5]:
for ds_name in dataset_names:
        
    ds_df = temp_ds_df[ds_name]
    resampled_df = temp_resampled_df[ds_name]
    
    c_df = ds_df[['Currency']].resample('1H').agg({'Currency': 'first'}).reset_index()
    # Add Currency_Country to the resampled data by joining with the original DataFrame
    resampled_df2 = pd.merge(resampled_df, 
                            c_df,
                            on='Time'
                            )
    resampled_df3 = pd.merge(resampled_df2, 
                             history_summary,
                             on='Currency'
                            )
    resampled_df4 = resampled_df3.copy()
    resampled_df4['x2_y1'] = resampled_df4['average_amount']/resampled_df4['hist_trans_volume']
    
    ds_df = ds_df.sort_values('Time')
    resampled_df4 = resampled_df4.sort_values('Time')
    merged_df = pd.merge_asof(ds_df, resampled_df4, on='Time' )
    
    merged_df = merged_df.drop(columns=['Currency_y']).rename(columns={'Currency_x': 'Currency'})

    
    results[ds_name] = merged_df
    
    
    

print(results)

{'train':                      Time  Class  Amount Sender_BIC Receiver_BIC  \
0     1971-04-01 04:25:00      0   55.98   ZHSZUS33     YXRXGB22   
1     1971-04-01 04:28:20      0   85.24   ZHSZUS33     SHSHKHH1   
2     1971-04-01 04:35:00      0  399.99   ZHSZUS33     ZNZZAU3M   
3     1971-04-01 04:35:00      0  150.00   ZHSZUS33     WPUWDEFF   
4     1971-04-01 04:56:40      0    1.29   ZHSZUS33     HCBHSGSG   
...                   ...    ...     ...        ...          ...   
39829 1972-03-10 20:53:20      0  138.18   ZHSZUS33     SHSHKHH1   
39830 1972-03-10 20:53:20      0   10.56   ZHSZUS33     YMNYFRPP   
39831 1972-03-10 21:31:40      0    9.42   ZHSZUS33     ZHSZUS33   
39832 1972-03-10 21:43:20      0    0.89   ZHSZUS33     SHSHKHH1   
39833 1972-03-10 22:36:40      0    3.99   ZHSZUS33     FBSFCHZH   

                         UETR Currency Beneficiary_BIC Currency_Country  \
0      H2HBC91SHS9P7P24ZWYTSC      USD        ZHSZUS33    United States   
1      OCRGX6R54U768WQC

# Enrich feature for beneficiary country

In [6]:

history_summary2 = df_history.groupby('Beneficiary_BIC').agg(
                     hist_trans_volume=('UETR', 'count'),
                     hist_total_amount=('Amount', 'sum'),
                     hist_average_amount=('Amount', 'mean')
).reset_index()

history_summary2

Unnamed: 0,Beneficiary_BIC,hist_trans_volume,hist_total_amount,hist_average_amount
0,FBSFCHZH,12604,1114845.23,88.4517
1,HCBHSGSG,12341,1043060.55,84.519938
2,XITXUS33,6350,539001.78,84.88217
3,YXRXGB22,12500,1101974.09,88.157927
4,ZHSZUS33,6407,566159.36,88.36575
5,ZNZZAU3M,12350,1102057.69,89.23544


In [7]:
import pandas as pd
dataset_names = ["train", "test"]
results2 = {}
for ds_name in dataset_names:
    ds_df = temp_ds_df[ds_name]
    resampled_df = temp_resampled_df[ds_name]
    
    c_df = ds_df[['Beneficiary_BIC']].resample('1H').agg({'Beneficiary_BIC': 'first'}).reset_index()
    
    # Add Beneficiary_BIC to the resampled data by joining with the original DataFrame
    resampled_df2 = pd.merge(resampled_df, 
                            c_df,
                            on='Time'
                            )
    
    resampled_df3 = pd.merge(resampled_df2, 
                             history_summary2,
                             on='Beneficiary_BIC'
                            )
    
    
    resampled_df4 = resampled_df3.copy()
    resampled_df4['x3_y2'] = resampled_df4['average_amount']/resampled_df4['hist_trans_volume']
   
    ds_df = ds_df.sort_values('Time')
    resampled_df4 = resampled_df4.sort_values('Time')

    merged_df2 = pd.merge_asof(ds_df, resampled_df4, on='Time' )
    merged_df2 = merged_df2.drop(columns=['Beneficiary_BIC_y']).rename(columns={'Beneficiary_BIC_x': 'Beneficiary_BIC'})
    
    
    results2[ds_name] = merged_df2

print(results2)

{'train':                      Time  Class  Amount Sender_BIC Receiver_BIC  \
0     1971-04-01 04:25:00      0   55.98   ZHSZUS33     YXRXGB22   
1     1971-04-01 04:28:20      0   85.24   ZHSZUS33     SHSHKHH1   
2     1971-04-01 04:35:00      0  399.99   ZHSZUS33     ZNZZAU3M   
3     1971-04-01 04:35:00      0  150.00   ZHSZUS33     WPUWDEFF   
4     1971-04-01 04:56:40      0    1.29   ZHSZUS33     HCBHSGSG   
...                   ...    ...     ...        ...          ...   
39829 1972-03-10 20:53:20      0  138.18   ZHSZUS33     SHSHKHH1   
39830 1972-03-10 20:53:20      0   10.56   ZHSZUS33     YMNYFRPP   
39831 1972-03-10 21:31:40      0    9.42   ZHSZUS33     ZHSZUS33   
39832 1972-03-10 21:43:20      0    0.89   ZHSZUS33     SHSHKHH1   
39833 1972-03-10 22:36:40      0    3.99   ZHSZUS33     FBSFCHZH   

                         UETR Currency Beneficiary_BIC Currency_Country  \
0      H2HBC91SHS9P7P24ZWYTSC      USD        ZHSZUS33    United States   
1      OCRGX6R54U768WQC

In [8]:
final_results = {}
for name in results:
    df = results[name]
    df2 = results2[name]
    df3 = df2[["Time", "Beneficiary_BIC", "x3_y2"]].copy()
    df4 = pd.merge(df, df3, on=['Time', 'Beneficiary_BIC'])
    final_results[name] = df4

    
for name in final_results:
    site_dir = os.path.join(site_input_dir, site_name)
    os.makedirs(site_dir, exist_ok=True)
    enrich_file_name = os.path.join(site_dir, f"{name}_enrichment.csv")
    print(enrich_file_name)
    final_results[name].to_csv(enrich_file_name) 
    
final_results["train"]

/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_enrichment.csv
/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_enrichment.csv


Unnamed: 0,Time,Class,Amount,Sender_BIC,Receiver_BIC,UETR,Currency,Beneficiary_BIC,Currency_Country,trans_volume,total_amount,average_amount,hist_trans_volume,hist_total_amount,hist_average_amount,x2_y1,x3_y2
0,1971-04-01 04:25:00,0,55.98,ZHSZUS33,YXRXGB22,H2HBC91SHS9P7P24ZWYTSC,USD,ZHSZUS33,United States,6,801.08,133.513333,12757,1105161.14,86.631743,0.010466,0.020839
1,1971-04-01 04:28:20,0,85.24,ZHSZUS33,SHSHKHH1,OCRGX6R54U768WQC48L9RS,USD,ZHSZUS33,United States,6,801.08,133.513333,12757,1105161.14,86.631743,0.010466,0.020839
2,1971-04-01 04:35:00,0,399.99,ZHSZUS33,ZNZZAU3M,J3JOWJ4RTQ12Z08MPLTEFH,USD,ZHSZUS33,United States,6,801.08,133.513333,12757,1105161.14,86.631743,0.010466,0.020839
3,1971-04-01 04:35:00,0,150.00,ZHSZUS33,WPUWDEFF,MTYATWRKHXFQ726XHEF9UH,CHF,FBSFCHZH,Switzerland,6,801.08,133.513333,12757,1105161.14,86.631743,0.010466,0.020839
4,1971-04-01 04:56:40,0,1.29,ZHSZUS33,HCBHSGSG,VTGX2RPS4UMP4WO88L87DN,SGD,HCBHSGSG,Singapore,6,801.08,133.513333,12757,1105161.14,86.631743,0.010466,0.020839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40979,1972-03-10 20:53:20,0,138.18,ZHSZUS33,SHSHKHH1,7GQUVWZFC6EKG8P81AVJ79,USD,XITXUS33,United States,3,238.64,79.546667,12757,1105161.14,86.631743,0.006236,0.012527
40980,1972-03-10 20:53:20,0,10.56,ZHSZUS33,YMNYFRPP,8YSJ6EWY2Q7K0EHWVHOCTT,GBP,YXRXGB22,United Kingdom,3,238.64,79.546667,12757,1105161.14,86.631743,0.006236,0.012527
40981,1972-03-10 21:31:40,0,9.42,ZHSZUS33,ZHSZUS33,BLWIOUCVS2QP4A5IFM9YDP,SGD,HCBHSGSG,Singapore,2,10.31,5.155000,12341,1043060.55,84.519938,0.000418,0.000418
40982,1972-03-10 21:43:20,0,0.89,ZHSZUS33,SHSHKHH1,GV9ITWOBAPPP7A0K4J3B1U,SGD,HCBHSGSG,Singapore,2,10.31,5.155000,12341,1043060.55,84.519938,0.000418,0.000418


In [9]:
! tree {site_input_dir}

[01;34m/tmp/dataset/horizontal_credit_fraud_data/[0m
├── [01;34mFBSFCHZH_Bank_6[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mHCBHSGSG_Bank_9[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── history.csv
├── [01;34mSHSHKHH1_Bank_2[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── test.csv
├── train.csv
├── [01;34mWPUWDEFF_Bank_4[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mXITXUS33_Bank_10[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYMNYFRPP_Bank_5[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYSYCESMM_Bank_7[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYXRXGB22_Bank_3[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mZHSZUS33_Bank_1[0m
│   ├── history.csv
│   ├── test.csv
│   ├── test_enrichment.csv
│   ├── train.csv
│   └── train_enrichment.csv
└── [01;34mZNZZAU3M_Bank_8[0m
    ├── history.csv
    ├── t

In [10]:
ls -al  /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/

total 19512
drwxrwxr-x  2 chester chester    4096 Aug 16 08:54 [0m[01;34m.[0m/
drwxrwxr-x 12 chester chester    4096 Aug 16 08:53 [01;34m..[0m/
-rw-rw-r--  1 chester chester 5283159 Aug 16 08:53 history.csv
-rw-rw-r--  1 chester chester  960477 Aug 16 08:53 test.csv
-rw-rw-r--  1 chester chester 2195468 Aug 16 08:54 test_enrichment.csv
-rw-rw-r--  1 chester chester 3373972 Aug 16 08:53 train.csv
-rw-rw-r--  1 chester chester 8149134 Aug 16 08:54 train_enrichment.csv


In [11]:
! find /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/ -exec wc -l {} \;

wc: /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/: Is a directory
0 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/
39835 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train.csv
11407 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_enrichment.csv
40985 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_enrichment.csv
11341 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test.csv
62553 /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/history.csv


Let's go back to the [XGBoost Notebook](./xgboost.ipynb)