# Graph Construction Step

* Construct the graph for each site's transaction data

Each node represents a transaction, and each edge represents a relationship between two transactions. Because all transactions at a site share the same Sender_BIC, we use the following rules to define the graph edges:

1. The two transactions have the same Receiver_BIC.
2. The two transactions have the same Currency.
3. The time difference between the two transactions is smaller than 6000.


### Load Data

In [None]:
# Site processed by this notebook, and the root folder under which that
# site's sub-directory of CSV files is looked up.
site_name = "ZHSZUS33_Bank_1"
site_input_dir = "/tmp/dataset/horizontal_credit_fraud_data/"

In [None]:
import os
import random
import string

import pandas as pd
# Splits to load for the site; each one is read from
# <site_input_dir>/<site_name>/<split>_enrichment.csv.
dataset_names = ["train", "test"]
datasets = {}

for ds_name in dataset_names:
    csv_path = os.path.join(site_input_dir, site_name, f"{ds_name}_enrichment.csv")
    # Keep the frame under its split name; `df` stays bound to the most
    # recently loaded split once the loop finishes.
    df = datasets[ds_name] = pd.read_csv(csv_path)
    print(df)

### Related Columns

In [None]:
# Display the column labels of `df`, i.e. of whichever dataset was bound to
# `df` most recently.
df.columns

In [None]:
import pandas as pd

def _min_max_normalize(frame):
    """Rescale every column of *frame* linearly to the range [0, 1].

    Each column is mapped with ``(x - min) / (max - min)``, the transform
    that scikit-learn's ``MinMaxScaler`` applies with its default
    ``feature_range=(0, 1)``. A constant column has a zero range; its divisor
    is replaced by 1 so the column becomes all zeros instead of NaN.
    """
    col_min = frame.min()
    col_range = frame.max() - col_min
    col_range = col_range.mask(col_range == 0, 1)
    return (frame - col_min) / col_range


# Normalized frame for each split, keyed by split name.
processed_dfs = {}

# Columns that are min-max scaled; 'Timestamp' is derived from 'Time' below.
numerical_columns = ['Timestamp', 'Class', 'Amount', 'trans_volume', 'total_amount', 'average_amount', 'hist_trans_volume',
       'hist_total_amount', 'hist_average_amount', 'x2_y1', 'x3_y2']

# NOTE(review): every split is scaled with its own min/max, so the train and
# test outputs are not on a common scale, and the 'Class' column is scaled
# together with the features -- confirm this is intended.
for ds_name in dataset_names:
    df = datasets[ds_name]

    # Convert 'Time' column to datetime
    df['Time'] = pd.to_datetime(df['Time'])
    # Convert datetime to a Unix timestamp. 'int64' is spelled out because a
    # plain `int` maps to a 32-bit integer on some platforms.
    # NOTE(review): dividing by 10**9 assumes nanosecond resolution; the
    # min-max scaling below is insensitive to the unit either way.
    df['Timestamp'] = df['Time'].astype('int64') / 10**9  # convert to seconds

    # NOTE(review): no explicit list of categorical columns is defined in this
    # notebook, so every column that is not normalized is passed through
    # unchanged. Replace this with an explicit list if only a subset is wanted.
    category_columns = [col for col in df.columns if col not in numerical_columns]

    # Separate numerical and categorical features
    numerical_features = df[numerical_columns]
    categorical_features = df[category_columns]

    # Scale the numerical data to [0, 1]; the result keeps df's index, so the
    # column-wise concat below lines rows up with the categorical features.
    numerical_normalized = _min_max_normalize(numerical_features)

    # Combine the normalized numerical features with the categorical features
    df_combined = pd.concat([categorical_features, numerical_normalized], axis=1)

#     # one-hot encoding
#     df_combined = pd.get_dummies(df_combined, columns=category_columns)

    print("Combined DataFrame with Normalized Numerical Features:")
    print(df_combined)

    processed_dfs[ds_name] = df_combined
    

In [None]:
# Write every processed split into the site's folder as <split>_normalized.csv.
for name, frame in processed_dfs.items():
    out_dir = os.path.join(site_input_dir, site_name)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{name}_normalized.csv")
    print(out_path)
    # Default to_csv settings are used, so the frame's index is written too.
    frame.to_csv(out_path)

In [None]:
# List the files now present in the site's data folder (shell escape; needs
# the `tree` command to be installed).
! tree /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1

Let's go back to the [XGBoost Notebook](./xgboost.ipynb)