# Preprocessing Step

* Encode categorical data
* Normalize the numerical features


### Load Data

In [6]:
# Name of the site whose data this notebook processes; it is also the name of the
# sub-folder that holds that site's CSV files.
site_name = "ZHSZUS33_Bank_1"
# Root folder that contains one sub-folder per site.
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"

In [7]:
import os
import random
import string

import pandas as pd
# Splits to load. Each one is read from
# <site_input_dir>/<site_name>/<split>_enrichment.csv.
dataset_names = ["train", "test"]

# Folder holding this site's CSV files.
enrichment_dir = os.path.join(site_input_dir, site_name)

# Split name -> DataFrame as read from disk; later cells look the frames up here.
datasets = {}
for ds_name in dataset_names:
    csv_path = os.path.join(enrichment_dir, f"{ds_name}_enrichment.csv")
    df = pd.read_csv(csv_path)
    datasets[ds_name] = df
    # Show the freshly loaded split (pandas truncates the printed frame).
    print(df)

       Unnamed: 0                 Time  Class  Amount Sender_BIC Receiver_BIC  \
0               0  1971-04-01 04:25:00      0   55.98   ZHSZUS33     YXRXGB22   
1               1  1971-04-01 04:28:20      0   85.24   ZHSZUS33     SHSHKHH1   
2               2  1971-04-01 04:35:00      0  399.99   ZHSZUS33     ZNZZAU3M   
3               3  1971-04-01 04:35:00      0  150.00   ZHSZUS33     WPUWDEFF   
4               4  1971-04-01 04:56:40      0    1.29   ZHSZUS33     HCBHSGSG   
...           ...                  ...    ...     ...        ...          ...   
40979       40979  1972-03-10 20:53:20      0  138.18   ZHSZUS33     SHSHKHH1   
40980       40980  1972-03-10 20:53:20      0   10.56   ZHSZUS33     YMNYFRPP   
40981       40981  1972-03-10 21:31:40      0    9.42   ZHSZUS33     ZHSZUS33   
40982       40982  1972-03-10 21:43:20      0    0.89   ZHSZUS33     SHSHKHH1   
40983       40983  1972-03-10 22:36:40      0    3.99   ZHSZUS33     FBSFCHZH   

                         UE

### Categorical encoding

In [8]:
# Columns treated as categorical. The normalization cell further down reuses this
# list to pick the categorical columns it carries into the output.
category_columns = [
    'Currency_Country',
    'Beneficiary_BIC',
    'Currency',
    'UETR',
    'Receiver_BIC',
    'Sender_BIC',
]

# Preview the one-hot encoded form of every split. The encoded frame is only
# printed; it is not stored back into `datasets`.
# NOTE(review): 'UETR' looks like a per-transaction identifier in the displayed
# data, so encoding it may create a very large number of columns -- confirm
# whether it belongs in this preview.
for ds_name in dataset_names:
    df = datasets[ds_name]
    df_encoded = pd.get_dummies(data=df, columns=category_columns)
    print(df_encoded)


       Unnamed: 0                 Time  Class  Amount  trans_volume  \
0               0  1971-04-01 04:25:00      0   55.98             6   
1               1  1971-04-01 04:28:20      0   85.24             6   
2               2  1971-04-01 04:35:00      0  399.99             6   
3               3  1971-04-01 04:35:00      0  150.00             6   
4               4  1971-04-01 04:56:40      0    1.29             6   
...           ...                  ...    ...     ...           ...   
40979       40979  1972-03-10 20:53:20      0  138.18             3   
40980       40980  1972-03-10 20:53:20      0   10.56             3   
40981       40981  1972-03-10 21:31:40      0    9.42             2   
40982       40982  1972-03-10 21:43:20      0    0.89             2   
40983       40983  1972-03-10 22:36:40      0    3.99             1   

       total_amount  average_amount  hist_trans_volume  hist_total_amount  \
0            801.08      133.513333              12757         1105161

### Normalization

In [9]:
# List the column names of `df`. `df` is whatever the preceding loop left bound,
# i.e. the frame for the last entry of `dataset_names`.
df.columns

Index(['Unnamed: 0', 'Time', 'Class', 'Amount', 'Sender_BIC', 'Receiver_BIC',
       'UETR', 'Currency', 'Beneficiary_BIC', 'Currency_Country',
       'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2'],
      dtype='object')

In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Split name -> normalized output frame. A later cell writes these to disk.
processed_dfs = {}

# Columns passed through the scaler. 'Timestamp' is not in the raw data; it is
# derived from 'Time' below.
# NOTE(review): 'Class' looks like the target label. It stays in this list so the
# saved column layout is unchanged -- confirm that scaling it is intended.
numerical_columns = ['Timestamp', 'Class', 'Amount', 'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2']

# Step 1: derive the numeric 'Timestamp' column on every split before any scaling,
# so that the scaler can be fitted on a single reference split.
unix_epoch = pd.Timestamp("1970-01-01")
for ds_name in dataset_names:
    df = datasets[ds_name]

    # Parse 'Time' into datetimes (already-parsed values pass through unchanged,
    # so re-running the cell is safe).
    df['Time'] = pd.to_datetime(df['Time'])
    # Seconds since the Unix epoch, as floats. Dividing a timedelta by one second
    # does not depend on the datetime resolution or on the platform's default
    # integer width, unlike `.astype(int) / 10**9`.
    df['Timestamp'] = (df['Time'] - unix_epoch) / pd.Timedelta(seconds=1)

# Step 2: fit the scaling parameters once and reuse them for every split.
# Fitting a separate scaler per split would map train and test onto different
# scales, making the same raw value mean different things in each file.
scaler = MinMaxScaler()
if dataset_names:
    # Prefer the training split as the reference; fall back to the first split.
    fit_split = "train" if "train" in dataset_names else dataset_names[0]
    scaler.fit(datasets[fit_split][numerical_columns])

# Step 3: scale each split and stitch the categorical columns back on.
for ds_name in dataset_names:
    df = datasets[ds_name]

    # Separate numerical and categorical features
    numerical_features = df[numerical_columns]
    categorical_features = df[category_columns]

    # Apply the already-fitted scaler. Values outside the range seen during
    # fitting land outside [0, 1]. Reusing df's index keeps the rows aligned in
    # the concat below even when the index is not 0..n-1.
    numerical_normalized = pd.DataFrame(
        scaler.transform(numerical_features),
        columns=numerical_features.columns,
        index=df.index,
    )

    # Categorical columns are carried through as-is (no one-hot encoding is
    # applied here), followed by the scaled numerical columns.
    df_combined = pd.concat([categorical_features, numerical_normalized], axis=1)

    print("Combined DataFrame with Normalized Numerical Features:")
    print(df_combined)

    processed_dfs[ds_name] = df_combined
    

Combined DataFrame with Normalized Numerical Features:
      Currency_Country Beneficiary_BIC Currency                    UETR  \
0        United States        ZHSZUS33      USD  H2HBC91SHS9P7P24ZWYTSC   
1        United States        ZHSZUS33      USD  OCRGX6R54U768WQC48L9RS   
2        United States        ZHSZUS33      USD  J3JOWJ4RTQ12Z08MPLTEFH   
3          Switzerland        FBSFCHZH      CHF  MTYATWRKHXFQ726XHEF9UH   
4            Singapore        HCBHSGSG      SGD  VTGX2RPS4UMP4WO88L87DN   
...                ...             ...      ...                     ...   
40979    United States        XITXUS33      USD  7GQUVWZFC6EKG8P81AVJ79   
40980   United Kingdom        YXRXGB22      GBP  8YSJ6EWY2Q7K0EHWVHOCTT   
40981        Singapore        HCBHSGSG      SGD  BLWIOUCVS2QP4A5IFM9YDP   
40982        Singapore        HCBHSGSG      SGD  GV9ITWOBAPPP7A0K4J3B1U   
40983    United States        ZHSZUS33      USD  2LIZB2NB0IR52KVFC6MSN4   

      Receiver_BIC Sender_BIC  Timestamp  Cl

In [11]:
    
# Write every normalized split to <site_input_dir>/<site_name>/<split>_normalized.csv.
for name, normalized_df in processed_dfs.items():
    site_dir = os.path.join(site_input_dir, site_name)
    # Create the site folder if it is missing; an existing folder is fine.
    os.makedirs(site_dir, exist_ok=True)

    pre_processed_file_name = os.path.join(site_dir, f"{name}_normalized.csv")
    print(pre_processed_file_name)

    # NOTE(review): to_csv is called with its defaults, so the frame's index is
    # written as an extra leading column -- confirm the downstream reader expects it.
    normalized_df.to_csv(pre_processed_file_name)


/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_normalized.csv
/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_normalized.csv


In [12]:
! tree /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

[01;34m/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1[0m
├── history.csv
├── test.csv
├── test_enrichment.csv
├── test_normalized.csv
├── train.csv
├── train_enrichment.csv
└── train_normalized.csv

0 directories, 7 files


Let's go back to the [XGBoost Notebook](./xgboost.ipynb)