## Feature Enrichment

### Historical data enrichment

We pick a single client (site, identified by its sender BIC) to demonstrate the enrichment; every other site follows exactly the same process.

In [None]:
# Base directory of the dataset; the cells below read and write files under
# <site_input_dir>/<site_name>/.
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
# Sub-directory (site) whose history/train/test CSV files are enriched below.
site_name = "ZHSZUS33_Bank_1"

In [None]:
import os
import random
import string

import pandas as pd
# Path of the selected site's historical transactions file.
history_file_name = os.path.join(site_input_dir, site_name, "history.csv")

# Load the history; the bare name on the last line renders it as the cell output.
df_history = pd.read_csv(filepath_or_buffer=history_file_name)
df_history

In [None]:
# Historical statistics per currency, expressed as
# output column -> (source column, aggregation):
#   hist_trans_volume   - count of non-null UETR values
#   hist_total_amount   - sum of Amount
#   hist_average_amount - mean of Amount
currency_history_aggs = {
    "hist_trans_volume": ("UETR", "count"),
    "hist_total_amount": ("Amount", "sum"),
    "hist_average_amount": ("Amount", "mean"),
}

history_summary = (
    df_history
    .groupby("Currency")
    .agg(**currency_history_aggs)
    .reset_index()  # turn the 'Currency' group key back into a regular column
)

history_summary

### Enrich features by currency

In [None]:
import pandas as pd
dataset_names = ["train", "test"]
results = {}

# Per-dataset caches that the enrichment cells below read from:
#   temp_ds_df[name]        -> transactions of <name>.csv, indexed by 'Time'
#   temp_resampled_df[name] -> hourly aggregates of those transactions
temp_ds_df = {}
temp_resampled_df = {}

for ds_name in dataset_names:
    file_name = os.path.join(site_input_dir, site_name, f"{ds_name}.csv")

    # 'Time' is interpreted as seconds since the Unix epoch and becomes the
    # index, because resample() needs a datetime-like index. The chain builds a
    # new frame instead of mutating one in place.
    ds_df = (
        pd.read_csv(file_name)
        .assign(Time=lambda frame: pd.to_datetime(frame["Time"], unit="s"))
        .set_index("Time")
    )

    # Hourly buckets: transaction count (non-null UETR), total and mean amount.
    # The lowercase "h" alias is used because the uppercase "H" alias is
    # deprecated since pandas 2.2; older pandas versions accept "h" as well.
    resampled_df = (
        ds_df
        .resample("1h")
        .agg(
            trans_volume=("UETR", "count"),
            total_amount=("Amount", "sum"),
            average_amount=("Amount", "mean"),
        )
        .reset_index()
    )

    temp_ds_df[ds_name] = ds_df
    temp_resampled_df[ds_name] = resampled_df


In [None]:
# Currency-based enrichment: for every dataset, attach the hourly aggregates
# and the historical per-currency statistics to each transaction, and derive
# the feature 'x2_y1'.
for ds_name in dataset_names:
    ds_df = temp_ds_df[ds_name]
    resampled_df = temp_resampled_df[ds_name]

    # Representative currency of each hourly bucket: the first one seen in it.
    # ("1h" instead of "1H": the uppercase alias is deprecated since pandas 2.2.)
    c_df = ds_df[["Currency"]].resample("1h").agg({"Currency": "first"}).reset_index()

    # Add 'Currency' to the hourly aggregates, then the historical statistics
    # of that currency. Both joins are inner joins, so an hourly bucket without
    # a matching currency in `history_summary` does not survive.
    hourly_features = (
        resampled_df
        .merge(c_df, on="Time")
        .merge(history_summary, on="Currency")
    )

    # x2_y1 = hourly average amount / historical transaction count of the currency.
    hourly_features["x2_y1"] = (
        hourly_features["average_amount"] / hourly_features["hist_trans_volume"]
    )

    # merge_asof requires both sides to be sorted by the join key.
    ds_df = ds_df.sort_values("Time")
    hourly_features = hourly_features.sort_values("Time")

    # Backward as-of join (the default): each transaction receives the features
    # of the latest hourly bucket whose 'Time' is <= the transaction's 'Time'.
    merged_df = pd.merge_asof(ds_df, hourly_features, on="Time")

    # Both sides carry 'Currency'; keep the transaction's own value.
    merged_df = merged_df.drop(columns=["Currency_y"]).rename(columns={"Currency_x": "Currency"})

    results[ds_name] = merged_df

print(results)

### Enrich features by beneficiary (`Beneficiary_BIC`)

In [None]:

# Historical statistics per beneficiary BIC, expressed as
# output column -> (source column, aggregation):
#   hist_trans_volume   - count of non-null UETR values
#   hist_total_amount   - sum of Amount
#   hist_average_amount - mean of Amount
beneficiary_history_aggs = {
    "hist_trans_volume": ("UETR", "count"),
    "hist_total_amount": ("Amount", "sum"),
    "hist_average_amount": ("Amount", "mean"),
}

history_summary2 = (
    df_history
    .groupby("Beneficiary_BIC")
    .agg(**beneficiary_history_aggs)
    .reset_index()  # turn the 'Beneficiary_BIC' group key back into a column
)

history_summary2

In [None]:
import pandas as pd
dataset_names = ["train", "test"]
results2 = {}

# Beneficiary-based enrichment: same procedure as the currency enrichment,
# keyed on 'Beneficiary_BIC' and producing the feature 'x3_y2'.
for ds_name in dataset_names:
    ds_df = temp_ds_df[ds_name]
    resampled_df = temp_resampled_df[ds_name]

    # Representative beneficiary of each hourly bucket: the first one seen in it.
    # ("1h" instead of "1H": the uppercase alias is deprecated since pandas 2.2.)
    c_df = (
        ds_df[["Beneficiary_BIC"]]
        .resample("1h")
        .agg({"Beneficiary_BIC": "first"})
        .reset_index()
    )

    # Add 'Beneficiary_BIC' to the hourly aggregates, then the historical
    # statistics of that beneficiary. Both joins are inner joins, so an hourly
    # bucket without a matching beneficiary in `history_summary2` is dropped.
    hourly_features = (
        resampled_df
        .merge(c_df, on="Time")
        .merge(history_summary2, on="Beneficiary_BIC")
    )

    # x3_y2 = hourly average amount / historical transaction count of the beneficiary.
    hourly_features["x3_y2"] = (
        hourly_features["average_amount"] / hourly_features["hist_trans_volume"]
    )

    # merge_asof requires both sides to be sorted by the join key.
    ds_df = ds_df.sort_values("Time")
    hourly_features = hourly_features.sort_values("Time")

    # Backward as-of join (the default): each transaction receives the features
    # of the latest hourly bucket whose 'Time' is <= the transaction's 'Time'.
    merged_df2 = pd.merge_asof(ds_df, hourly_features, on="Time")

    # Both sides carry 'Beneficiary_BIC'; keep the transaction's own value.
    merged_df2 = merged_df2.drop(columns=["Beneficiary_BIC_y"]).rename(
        columns={"Beneficiary_BIC_x": "Beneficiary_BIC"}
    )

    results2[ds_name] = merged_df2

print(results2)

In [None]:
# Combine the two enrichments: start from the currency-enriched frame and add
# the beneficiary feature 'x3_y2' to it.
join_keys = ["Time", "Beneficiary_BIC"]

final_results = {}
for name in results:
    currency_enriched = results[name]
    beneficiary_enriched = results2[name]

    # 'x3_y2' comes from an as-of lookup on 'Time' alone, so every row sharing
    # the same key carries the same value. Keep one row per key: without this,
    # k transactions with an identical (Time, Beneficiary_BIC) would be joined
    # many-to-many and come out as k * k rows.
    x3_y2_lookup = beneficiary_enriched[join_keys + ["x3_y2"]].drop_duplicates(subset=join_keys)

    # validate= makes pandas raise if the lookup keys are ever not unique.
    final_results[name] = pd.merge(
        currency_enriched,
        x3_y2_lookup,
        on=join_keys,
        validate="many_to_one",
    )

# Write one "<dataset>_enrichment.csv" per dataset next to the input files.
site_dir = os.path.join(site_input_dir, site_name)
os.makedirs(site_dir, exist_ok=True)
for name in final_results:
    enrich_file_name = os.path.join(site_dir, f"{name}_enrichment.csv")
    print(enrich_file_name)
    # NOTE(review): to_csv() defaults to index=True, so the row index is written
    # as a leading unnamed column - confirm that downstream readers expect it.
    final_results[name].to_csv(enrich_file_name)

final_results["train"]

In [None]:
# Show the directory tree under the dataset base directory; {site_input_dir}
# is expanded by IPython from the notebook variable of the same name.
! tree {site_input_dir}

In [None]:
ls -al  /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/

In [None]:
! find /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/ -exec wc -l {} \;

Let's go back to the [XGBoost Notebook](../xgboost.ipynb)