# PreProcess Step

* Encode categorical data
* Normalize the numerical features


### Load Data

In [2]:
# Root directory under which each site has its own sub-folder of CSV files.
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"
# Name of the site sub-folder whose files this notebook reads and writes.
site_name = "ZHSZUS33_Bank_1"

In [None]:
import os
import random
import string

import pandas as pd
# Splits handled by this notebook; the later cells iterate over this same list.
dataset_names = ["train", "test"]
# Split name -> DataFrame read from "<split>_enrichment.csv".
datasets = {}

for ds_name in dataset_names:
    # The enriched CSVs live under <site_input_dir>/<site_name>/.
    file_name = os.path.join(
        os.path.join(site_input_dir, site_name),
        f"{ds_name}_enrichment.csv",
    )
    # Keep the frame for the following cells and echo it for a quick look.
    datasets[ds_name] = df = pd.read_csv(file_name)
    print(df)

### Categorical encoding

In [None]:
# Columns treated as categorical; the normalization cell reuses this list.
category_columns = [
    'Currency_Country',
    'Beneficiary_BIC',
    'Currency',
    'UETR',
    'Receiver_BIC',
    'Sender_BIC',
]

# Preview of one-hot encoding: the encoded frame is only printed here, it is
# not written back into `datasets`.
for ds_name in dataset_names:
    df = datasets[ds_name]
    df_encoded = pd.get_dummies(data=df, columns=category_columns)
    print(df_encoded)


### Normalization

In [5]:
# Show the column names of `df`, i.e. whichever frame the most recently
# executed cell left bound to that name.
df.columns

Index(['Unnamed: 0', 'Time', 'Class', 'Amount', 'Sender_BIC', 'Receiver_BIC',
       'UETR', 'Currency', 'Beneficiary_BIC', 'Currency_Country',
       'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2'],
      dtype='object')

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Split name -> DataFrame of pass-through categorical columns plus the
# min-max normalized numerical columns. Consumed by the save cell below.
processed_dfs = {}

# Columns that get rescaled. 'Timestamp' is not in the raw data; it is derived
# from 'Time' in the first loop below.
# NOTE(review): 'Class' looks like the target label; it is kept in this list so
# the output columns stay the same as before -- confirm it should be scaled.
numerical_columns = ['Timestamp', 'Class', 'Amount', 'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2']

# Pass 1: add the numeric 'Timestamp' feature to every split so the scaler can
# be fitted before any split is transformed.
for ds_name in dataset_names:
    df = datasets[ds_name]

    # Convert 'Time' column to datetime
    df['Time'] = pd.to_datetime(df['Time'])
    # Convert datetime to Unix timestamp in seconds. An explicit 'int64' avoids
    # the platform-dependent width of plain `int`.
    # NOTE(review): dividing by 10**9 assumes nanosecond resolution -- confirm
    # for the pandas version in use.
    df['Timestamp'] = df['Time'].astype('int64') / 10**9

# Fit the scaler once, on the training split only, and reuse it for every
# split. Fitting a separate scaler per split would map train and test onto
# different scales, so the same raw value would become a different feature
# value in each of them.
fit_split = 'train' if 'train' in dataset_names else dataset_names[0]

# Initialize the MinMaxScaler (StandardScaler is imported as an alternative)
scaler = MinMaxScaler()
scaler.fit(datasets[fit_split][numerical_columns])

# Pass 2: scale the numerical columns and re-attach the categorical ones.
for ds_name in dataset_names:
    df = datasets[ds_name]

    # Separate numerical and categorical features
    numerical_features = df[numerical_columns]
    categorical_features = df[category_columns]

    # Apply the train-fitted scaling. Values in other splits may fall slightly
    # outside [0, 1] when they exceed the range seen in the training split.
    # Reusing df.index keeps the rows aligned in the concat below.
    numerical_normalized = pd.DataFrame(
        scaler.transform(numerical_features),
        columns=numerical_features.columns,
        index=df.index,
    )

    # Combine the normalized numerical features with the categorical features
    df_combined = pd.concat([categorical_features, numerical_normalized], axis=1)

    # NOTE: the categorical columns are passed through as-is; one-hot encoding
    # (pd.get_dummies with columns=category_columns) is intentionally not
    # applied here.

    print("Combined DataFrame with Normalized Numerical Features:")
    print(df_combined)

    processed_dfs[ds_name] = df_combined
    

In [None]:
    
# Write each processed split into the site folder as "<split>_normalized.csv".
for name, frame in processed_dfs.items():
    site_dir = os.path.join(site_input_dir, site_name)
    # exist_ok=True makes this a no-op when the folder is already there.
    os.makedirs(site_dir, exist_ok=True)
    pre_processed_file_name = os.path.join(site_dir, f"{name}_normalized.csv")
    print(pre_processed_file_name)
    # to_csv is called with its defaults, so the DataFrame index is written
    # as the first column of the file.
    frame.to_csv(pre_processed_file_name)


In [8]:
# List the site folder to check that the *_normalized.csv files are present.
! tree /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

/tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1
├── history.csv
├── test.csv
├── test_enrichment.csv
├── test_normalized.csv
├── train.csv
├── train_enrichment.csv
└── train_normalized.csv

0 directories, 7 files


Let's go back to the [XGBoost Notebook](./xgboost.ipynb).