# Preprocessing Step

* Encode categorical data
* Normalize the numerical features


### Load Data

In [7]:
# Directory under which the site folder is looked up.
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
# Name of the site folder (inside site_input_dir) whose CSV files are read and written below.
site_name = "ZHSZUS33_Bank_1"

In [8]:
import os
import random
import string

import pandas as pd
# Names of the splits to load, and the mapping from split name to its DataFrame.
dataset_names = ["train", "test"]
datasets = {}

for ds_name in dataset_names:
    # Each split is read from <site_input_dir>/<site_name>/<split>_enrichment.csv.
    csv_name = f"{ds_name}_enrichment.csv"
    file_name = os.path.join(site_input_dir, site_name, csv_name)
    # `df` stays bound after the loop; later cells read it.
    df = datasets[ds_name] = pd.read_csv(file_name)
    print(df)

       Unnamed: 0                 Time  Class  Amount Sender_BIC Receiver_BIC  \
0               0  1971-04-01 04:35:00      0  740.66   ZHSZUS33     WPUWDEFF   
1               1  1971-04-01 04:36:40      0   88.61   ZHSZUS33     XITXUS33   
2               2  1971-04-01 04:36:40      0   15.00   ZHSZUS33     ZHSZUS33   
3               3  1971-04-01 05:16:40      0   31.96   ZHSZUS33     WPUWDEFF   
4               4  1971-04-01 05:25:00      0    9.00   ZHSZUS33     FBSFCHZH   
...           ...                  ...    ...     ...        ...          ...   
41234       41234  1972-03-10 22:20:00      0   80.00   ZHSZUS33     YSYCESMM   
41235       41235  1972-03-10 22:30:00      0   60.50   ZHSZUS33     WPUWDEFF   
41236       41236  1972-03-10 22:36:40      0   20.32   ZHSZUS33     SHSHKHH1   
41237       41237  1972-03-10 22:51:40      0   79.99   ZHSZUS33     HCBHSGSG   
41238       41238  1972-03-10 22:55:00      0    2.69   ZHSZUS33     ZHSZUS33   

                         UE

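### Categorical encoding

The cell below previews the one-hot encoded form of each split; the encoded frames are only printed and are not used by the later cells.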

In [9]:
# Columns treated as categorical; this list is also read by the normalization cell.
category_columns = [
    'Currency_Country',
    'Beneficiary_BIC',
    'Currency',
    'UETR',
    'Receiver_BIC',
    'Sender_BIC',
]

# Preview only: the one-hot encoded frame is printed and not stored in `datasets`.
for ds_name in dataset_names:
    df = datasets[ds_name]
    df_encoded = pd.get_dummies(
        df,
        columns=category_columns,
    )
    print(df_encoded)


       Unnamed: 0                 Time  Class  Amount  trans_volume  \
0               0  1971-04-01 04:35:00      0  740.66             3   
1               1  1971-04-01 04:36:40      0   88.61             3   
2               2  1971-04-01 04:36:40      0   15.00             3   
3               3  1971-04-01 05:16:40      0   31.96             5   
4               4  1971-04-01 05:25:00      0    9.00             5   
...           ...                  ...    ...     ...           ...   
41234       41234  1972-03-10 22:20:00      0   80.00             6   
41235       41235  1972-03-10 22:30:00      0   60.50             6   
41236       41236  1972-03-10 22:36:40      0   20.32             6   
41237       41237  1972-03-10 22:51:40      0   79.99             6   
41238       41238  1972-03-10 22:55:00      0    2.69             6   

       total_amount  average_amount  hist_trans_volume  hist_total_amount  \
0            844.27      281.423333              12631         1101784

### Normalization

In [4]:
# Inspect the column labels of `df`, the frame left bound by an earlier cell's loop.
df.columns

Index(['Unnamed: 0', 'Time', 'Class', 'Amount', 'Sender_BIC', 'Receiver_BIC',
       'UETR', 'Currency', 'Beneficiary_BIC', 'Currency_Country',
       'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2'],
      dtype='object')

In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


processed_dfs = {}

# Columns that are min-max scaled. 'Timestamp' is derived from 'Time' below.
# NOTE(review): 'Class' looks like the target label; it is kept in this list so the
# output columns are unchanged -- confirm whether the label should be scaled at all.
numerical_columns = ['Timestamp', 'Class', 'Amount', 'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2']


def _add_epoch_timestamp(frame):
    """Return a copy of `frame` with 'Time' parsed to datetime and a float
    'Timestamp' column (seconds since 1970-01-01) added.

    The input frame is not modified, so re-running the cell gives the same result.
    """
    parsed_time = pd.to_datetime(frame['Time'])
    # Dividing a timedelta by one second gives float seconds regardless of the
    # datetime resolution or the platform's default integer width.
    epoch_seconds = (parsed_time - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
    return frame.assign(Time=parsed_time, Timestamp=epoch_seconds)


prepared_dfs = {ds_name: _add_epoch_timestamp(datasets[ds_name]) for ds_name in dataset_names}

# Fit the scaler once, on the training split, and reuse it for every split so that
# all splits share the same scaling. Falls back to the first split if there is no
# "train" entry.
scaler_fit_split = "train" if "train" in dataset_names else dataset_names[0]
scaler = MinMaxScaler()
scaler.fit(prepared_dfs[scaler_fit_split][numerical_columns])

for ds_name in dataset_names:
    df = prepared_dfs[ds_name]

    # Separate numerical and categorical features
    numerical_features = df[numerical_columns]
    categorical_features = df[category_columns]

    # Apply the already-fitted scaler. Values of splits other than the fit split
    # may fall outside [0, 1] when they exceed the range seen during fitting.
    # The original index is kept so the concat below aligns row by row.
    numerical_normalized = pd.DataFrame(
        scaler.transform(numerical_features),
        columns=numerical_features.columns,
        index=numerical_features.index,
    )

    # Combine the categorical features (passed through unchanged; one-hot encoding
    # is not applied here) with the normalized numerical features.
    df_combined = pd.concat([categorical_features, numerical_normalized], axis=1)

    print("Combined DataFrame with Normalized Numerical Features:")
    print(df_combined)

    processed_dfs[ds_name] = df_combined
    

Combined DataFrame with Normalized Numerical Features:
      Currency_Country Beneficiary_BIC Currency                    UETR  \
0        United States        ZHSZUS33      USD  KLT6PBX4VCAQ4II9MBQJBP   
1       United Kingdom        YXRXGB22      GBP  67HH2PAKPZ3DWT7UOO4HRZ   
2            Singapore        HCBHSGSG      SGD  PKBHTDJCXNY3D150C408EZ   
3        United States        ZHSZUS33      USD  YPTUDPVINZMIF7UAHDUD18   
4          Switzerland        FBSFCHZH      CHF  03PMB0KC5F5IVYE2EWE4GJ   
...                ...             ...      ...                     ...   
41234        Singapore        HCBHSGSG      SGD  R1RBFXC13YQFIWV3RUZUT2   
41235      Switzerland        FBSFCHZH      CHF  ONFBWM90ZLOGAS48UPQ7C2   
41236      Switzerland        FBSFCHZH      CHF  E8U7ZC7K8GUTW61AQ5KUN0   
41237      Switzerland        FBSFCHZH      CHF  OEG3ECX64I3QYZT6Y5ACVK   
41238      Switzerland        FBSFCHZH      CHF  6GJ2LIIO7J3BHJTFYCZ9IR   

      Receiver_BIC Sender_BIC  Timestamp  Cl

In [13]:
    
# Write each processed split into the site folder as "<split>_normalized.csv".
for name, frame in processed_dfs.items():
    site_dir = os.path.join(site_input_dir, site_name)
    os.makedirs(site_dir, exist_ok=True)  # no error if the folder already exists
    pre_processed_file_name = os.path.join(site_dir, f"{name}_normalized.csv")
    print(pre_processed_file_name)
    # to_csv is called with its defaults, so the DataFrame index is written as well.
    frame.to_csv(pre_processed_file_name)


/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_normalized.csv
/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_normalized.csv


In [14]:
# List the contents of the site folder via the external `tree` command.
! tree /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

[01;34m/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1[0m
├── history.csv
├── test.csv
├── test_enrichment.csv
├── test_normalized.csv
├── train.csv
├── train_enrichment.csv
└── train_normalized.csv

0 directories, 7 files


Let's go back to the [XGBoost Notebook](./xgboost.ipynb)