# Preprocessing Step

* Encode categorical data
* Normalize the numerical features


### Load Data

In [2]:
# Directory that contains the site's folder; it is joined with `site_name`
# to build every input and output path used in this notebook.
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
# Name of the site folder (under `site_input_dir`) whose CSV files are processed.
site_name = "ZHSZUS33_Bank_1"

In [3]:
import os
import random
import string

import pandas as pd
# Splits to load; each one is read from "<split>_enrichment.csv" in the site folder.
dataset_names = ["train", "test"]

# Maps split name -> DataFrame exactly as read from disk.
datasets = {}

for ds_name in dataset_names:
    file_name = os.path.join(site_input_dir, site_name, ds_name + "_enrichment.csv")
    datasets[ds_name] = df = pd.read_csv(file_name)
    print(df)

       Unnamed: 0                 Time  Class  Amount Sender_BIC Receiver_BIC  \
0               0  1971-04-01 04:30:00      0  348.06   ZHSZUS33     YXRXGB22   
1               1  1971-04-01 04:35:00      0    2.69   ZHSZUS33     YMNYFRPP   
2               2  1971-04-01 04:40:00      0   16.63   ZHSZUS33     XITXUS33   
3               3  1971-04-01 04:51:40      0   54.80   ZHSZUS33     XITXUS33   
4               4  1971-04-01 05:16:40      0   31.96   ZHSZUS33     ZHSZUS33   
...           ...                  ...    ...     ...        ...          ...   
40804       40804  1972-03-10 19:01:40      0   12.99   ZHSZUS33     WPUWDEFF   
40805       40805  1972-03-10 21:30:00      0   52.34   ZHSZUS33     YXRXGB22   
40806       40806  1972-03-10 21:36:40      0  220.28   ZHSZUS33     YSYCESMM   
40807       40807  1972-03-10 22:30:00      0   60.50   ZHSZUS33     YXRXGB22   
40808       40808  1972-03-10 22:58:20      0   24.79   ZHSZUS33     ZHSZUS33   

                         UE

### Categorical encoding

In [4]:
# Columns treated as categorical (string-valued) features.
category_columns = ['Currency_Country', 'Beneficiary_BIC', 'Currency', 'UETR', 'Receiver_BIC', 'Sender_BIC']

# Preview of one-hot encoding: the encoded frame is only printed, it is not
# stored back into `datasets`, so later cells still see the raw columns.
for ds_name, df in ((split, datasets[split]) for split in dataset_names):
    df_encoded = df.pipe(pd.get_dummies, columns=category_columns)
    print(df_encoded)


       Unnamed: 0                 Time  Class  Amount  trans_volume  \
0               0  1971-04-01 04:30:00      0  348.06             4   
1               1  1971-04-01 04:35:00      0    2.69             4   
2               2  1971-04-01 04:40:00      0   16.63             4   
3               3  1971-04-01 04:51:40      0   54.80             4   
4               4  1971-04-01 05:16:40      0   31.96             4   
...           ...                  ...    ...     ...           ...   
40804       40804  1972-03-10 19:01:40      0   12.99             1   
40805       40805  1972-03-10 21:30:00      0   52.34             2   
40806       40806  1972-03-10 21:36:40      0  220.28             2   
40807       40807  1972-03-10 22:30:00      0   60.50             2   
40808       40808  1972-03-10 22:58:20      0   24.79             2   

       total_amount  average_amount  hist_trans_volume  hist_total_amount  \
0            422.18         105.545              12541         1124650

### Normalization

In [5]:
# Column names of the last loaded split (the same frame the previous loop left in `df`).
datasets[dataset_names[-1]].columns

Index(['Unnamed: 0', 'Time', 'Class', 'Amount', 'Sender_BIC', 'Receiver_BIC',
       'UETR', 'Currency', 'Beneficiary_BIC', 'Currency_Country',
       'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2'],
      dtype='object')

In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Maps split name -> DataFrame holding the raw categorical columns followed by
# the min-max scaled numerical columns.
processed_dfs = {}

# NOTE(review): 'Class' looks like the target label. Min-max scaling leaves a
# 0/1 column unchanged only when both values occur in the split the scaler is
# fitted on -- consider excluding it from scaling.
numerical_columns = ['Timestamp', 'Class', 'Amount', 'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2']

# A single scaler is shared by all splits. It is fitted on the training split
# only and then re-used, so the same raw value is mapped to the same scaled
# value in every split. (Re-fitting per split would scale train and test with
# different min/max and make their features incomparable.)
scaler = MinMaxScaler()

# Process "train" first so the scaler is fitted on it; the stable sort keeps
# the relative order of the remaining splits. If there is no "train" split the
# first listed split is used for fitting.
fit_order = sorted(dataset_names, key=lambda split: split != "train")

for ds_name in fit_order:
    df = datasets[ds_name]

    # Parse 'Time' and derive seconds since the Unix epoch. Epoch arithmetic is
    # used instead of `astype(int) / 10**9` because it does not depend on the
    # platform's default integer width or on the datetime storage resolution.
    df['Time'] = pd.to_datetime(df['Time'])
    df['Timestamp'] = (df['Time'] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

    # Separate numerical and categorical features
    numerical_features = df[numerical_columns]
    categorical_features = df[category_columns]

    # Fit on the first (training) split, only transform the others. Values in
    # the other splits may therefore fall outside [0, 1]; that is expected.
    if ds_name == fit_order[0]:
        scaled_values = scaler.fit_transform(numerical_features)
    else:
        scaled_values = scaler.transform(numerical_features)

    # Keep the source index so the column-wise concat below aligns row by row
    # even if the input frame does not have a default RangeIndex.
    numerical_normalized = pd.DataFrame(
        scaled_values,
        columns=numerical_features.columns,
        index=df.index,
    )

    # Combine the normalized numerical features with the categorical features.
    # The categorical columns are kept as raw values here (no one-hot encoding).
    df_combined = pd.concat([categorical_features, numerical_normalized], axis=1)

    print("Combined DataFrame with Normalized Numerical Features:")
    print(df_combined)

    processed_dfs[ds_name] = df_combined
    

Combined DataFrame with Normalized Numerical Features:
      Currency_Country Beneficiary_BIC Currency                    UETR  \
0        United States        XITXUS33      USD  MV2B0B0S1NUTY8OCOEQ2QE   
1            Singapore        HCBHSGSG      SGD  CQD9INGI7GJATKWRK0D44Z   
2          Switzerland        FBSFCHZH      CHF  IJXYXLV8SF72RU3MRSJ542   
3          Switzerland        FBSFCHZH      CHF  B1850ZUIHTMT61N7HMIZYM   
4       United Kingdom        YXRXGB22      GBP  4BBLS9B31LWHZFF17RODX1   
...                ...             ...      ...                     ...   
40804    United States        XITXUS33      USD  EBY8SA8UZOWNNJ2X7OUBZ2   
40805   United Kingdom        YXRXGB22      GBP  3D4772259A6PY7Q7XVJ302   
40806    United States        ZHSZUS33      USD  Z5VK0S69KASH3B82M6W5XV   
40807    United States        XITXUS33      USD  HA4WJAB98YR8M9FIE0C2A1   
40808   United Kingdom        YXRXGB22      GBP  9SJQ6WVX8CGS0P1DYYGQ45   

      Receiver_BIC Sender_BIC  Timestamp  Cl

In [7]:
    
# Write every processed split next to its input as "<split>_normalized.csv".
for name in processed_dfs.keys():
    # Target folder for this site; created on demand.
    site_dir = os.path.join(site_input_dir, site_name)
    os.makedirs(site_dir, exist_ok=True)

    pre_processed_file_name = os.path.join(site_dir, name + "_normalized.csv")
    print(pre_processed_file_name)

    processed_dfs[name].to_csv(pre_processed_file_name)


/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/train_normalized.csv
/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/test_normalized.csv


In [8]:
# List the site folder to confirm the *_normalized.csv files were written.
! tree /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

[01;34m/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1[0m
├── history.csv
├── test.csv
├── test_enrichment.csv
├── test_normalized.csv
├── train.csv
├── train_enrichment.csv
└── train_normalized.csv

0 directories, 7 files


Let's go back to the [XGBoost Notebook](./xgboost.ipynb)