# Credit Card Fraud End-to-End Example

## Prepare Data
### Based on the original data, add random synthetic data to make the full dataset
* expand time in seconds by 100x, then repeat the stretched timeline 4 times back-to-back, to cover about 26 months
* quadruple the data record size (4 time-shifted copies of the original records)
* add other categorical features: Sender_BIC, Receiver_BIC, Beneficiary_BIC, UETR, Currency, Currency_Country
* currency country and beneficiary_bic country are the same country



In [1]:
import os
import shutil

# Input CSV and the folder that receives every file generated by this notebook.
data_path = "creditcard.csv"
out_folder = "/tmp/dataset/horizontal_credit_fraud_data"

# Start from a clean slate: remove any output left behind by a previous run.
if os.path.exists(out_folder):
    shutil.rmtree(out_folder)


In [2]:
# Count the lines of the raw input CSV as a quick sanity check on its size.
! wc -l {data_path}

284808 creditcard.csv


In [3]:
# %load_ext cudf.pandas
import argparse
import os
import random
import string

import pandas as pd
from sklearn.model_selection import train_test_split

# Stretch the original ~2-day capture into a 2-plus-year synthetic timeline.

origin_df = pd.read_csv(data_path)
old_max_time = origin_df['Time'].max()
old_max_days = old_max_time / 3600 / 24  # seconds -> days
print(f"{old_max_days=}")

# Number of back-to-back copies of the stretched timeline.
N = 4

# Keep only the columns carried into the synthetic dataset and scale the clock.
df_temp = origin_df[['Time', 'Amount', 'Class']].copy()
df_temp['Time'] = df_temp['Time'] * 400 / 4

# Length of one stretched copy; copy i is shifted forward by i such lengths.
max_time = df_temp['Time'].max()

# Copy 0 is the stretched data itself (a shift of zero). Build all N shifted
# copies first and stack them with a single concat.
df = pd.concat(
    [df_temp.assign(Time=df_temp['Time'] + max_time * i) for i in range(N)],
    ignore_index=True,
)

min_time, max_time = df['Time'].min(), df['Time'].max()

# Express the covered range in 30-day months.
min_months = min_time / 3600 / 24 / 30
max_months = max_time / 3600 / 24 / 30

# Try to generate a 2-plus year data
print(f"{min_months=}, {max_months=}")

# Example BIC -> country table for demonstration. The BICs and bank names are
# randomly created fakes. Insertion order matters: the bank numbers in
# bic_to_bank below are assigned in this order.
bic_list = {
    "ZHSZUS33": "United States",
    "SHSHKHH1": "Hong Kong",
    "YXRXGB22": "United Kingdom",
    "WPUWDEFF": "Germany",
    "YMNYFRPP": "France",
    "FBSFCHZH": "Switzerland",
    "YSYCESMM": "Spain",
    "ZNZZAU3M": "Australia",
    "HCBHSGSG": "Singapore",
    "XITXUS33": "United States",
}

# Currency code -> country (or region) it is associated with.
currencies = {
    "USD": "United States",
    "EUR": "Eurozone",
    "GBP": "United Kingdom",
    "JPY": "Japan",
    "AUD": "Australia",
    "CHF": "Switzerland",
    "SGD": "Singapore",
}

# BIC -> bank name: the n-th BIC listed above is named "Bank_n" (1-based).
bic_to_bank = {
    bic: f"Bank_{position}" for position, bic in enumerate(bic_list, start=1)
}

# Function to generate random UETR
def generate_random_uetr(length=22):
    """Return a random identifier of ``length`` characters.

    Each character is drawn independently (with replacement) from the
    uppercase ASCII letters and the decimal digits, using the module-level
    ``random`` generator.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)


# Function to generate random BICs and currency details
def generate_random_details(df):
    """Add synthetic payment-routing columns to ``df`` and return it.

    The frame is modified in place; the same object is returned. One value
    per row is generated for each of these columns:

    * ``Sender_BIC`` / ``Receiver_BIC`` - drawn uniformly from the keys of
      ``bic_list``.
    * ``UETR`` - a fresh ``generate_random_uetr()`` value.
    * ``Currency`` / ``Beneficiary_BIC`` - a currency from ``currencies``
      paired with a BIC whose country in ``bic_list`` equals the currency's
      country. A currency whose country has no BIC is re-drawn, so it never
      appears in the output.
    * ``Currency_Country`` - the country mapped from ``Currency``.

    Raises:
        ValueError: if no currency country matches any BIC country, because
            a (currency, beneficiary BIC) pair could then never be drawn.
    """
    # Loop-invariant lookups, built once instead of once per row.
    bic_codes = list(bic_list.keys())
    currency_codes = list(currencies.keys())
    bics_by_currency = {
        currency: [bic for bic, bic_country in bic_list.items() if bic_country == country]
        for currency, country in currencies.items()
    }

    # Without at least one matching pair the rejection loop below would never end.
    if not any(bics_by_currency.values()):
        raise ValueError(
            "no currency country matches any BIC country; "
            "cannot pair Currency with Beneficiary_BIC"
        )

    # Ensure the currency and beneficiary BIC match
    def match_currency_and_bic():
        # Rejection sampling: draw a currency and retry while it has no BIC.
        # The order of random draws is kept so seeded runs stay unchanged.
        while True:
            currency = random.choice(currency_codes)
            matching_bics = bics_by_currency[currency]
            if matching_bics:
                return currency, random.choice(matching_bics)

    row_count = len(df)

    df['Sender_BIC'] = [random.choice(bic_codes) for _ in range(row_count)]
    df['Receiver_BIC'] = [random.choice(bic_codes) for _ in range(row_count)]
    df['UETR'] = [generate_random_uetr() for _ in range(row_count)]

    # Build the two columns from the pair list explicitly; unpacking zip(*pairs)
    # fails when the frame has no rows.
    pairs = [match_currency_and_bic() for _ in range(row_count)]
    df['Currency'] = [currency for currency, _ in pairs]
    df['Beneficiary_BIC'] = [bic for _, bic in pairs]
    df['Currency_Country'] = df['Currency'].map(currencies)

    return df

# Seed Python's RNG so the randomly generated BIC / currency / UETR columns are
# reproducible across runs (42 matches the random_state used for the
# train/test split in this notebook).
random.seed(42)

# Add random BIC and currency details to the DataFrame
df = generate_random_details(df)



old_max_days=1.9999074074074075
min_months=0.0, max_months=26.665432098765432


In [4]:
# Display the expanded DataFrame, now including the generated columns.
df

Unnamed: 0,Time,Amount,Class,Sender_BIC,Receiver_BIC,UETR,Currency,Beneficiary_BIC,Currency_Country
0,0.0,149.62,0,FBSFCHZH,WPUWDEFF,V4ID8QTCIROHAP683AOX78,AUD,ZNZZAU3M,Australia
1,0.0,2.69,0,ZHSZUS33,YSYCESMM,R7PCTKF9R1PVGXRXU9AB3J,AUD,ZNZZAU3M,Australia
2,100.0,378.66,0,HCBHSGSG,FBSFCHZH,RP1SBN0Q5U58XBS8LQNE0J,USD,ZHSZUS33,United States
3,100.0,123.50,0,YXRXGB22,YMNYFRPP,MAPFA8RU98VZP4MD6VFN1J,USD,ZHSZUS33,United States
4,200.0,69.99,0,XITXUS33,FBSFCHZH,3WX5XAGWK7F3CXRX6RZZK3,USD,ZHSZUS33,United States
...,...,...,...,...,...,...,...,...,...
1139223,69116200.0,0.77,0,XITXUS33,SHSHKHH1,BEEX2F5NEHDU3YV8G17005,GBP,YXRXGB22,United Kingdom
1139224,69116300.0,24.79,0,ZHSZUS33,ZHSZUS33,9SJQ6WVX8CGS0P1DYYGQ45,GBP,YXRXGB22,United Kingdom
1139225,69116400.0,67.88,0,YXRXGB22,WPUWDEFF,CGUZH7AV1YPIQCLCQMAWV6,AUD,ZNZZAU3M,Australia
1139226,69116400.0,10.00,0,WPUWDEFF,WPUWDEFF,9FZFL7WK3AA7K5C0Q6X5W3,SGD,HCBHSGSG,Singapore


## Split Historical Train, Test Data

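We are going to split the data into historical, train and test data by the following rules (the historical part is the earliest 55% of records by time; the rest is randomly split into train and test):
* history : 55%
* train : 35%
* test : 10%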





In [5]:

# Put all transactions in chronological order so that the historical slice
# taken below holds the earliest records.
df = df.sort_values(by='Time').reset_index(drop=True)

# Row budget per split: 55% history, 35% train, and whatever is left for test.
total_size = len(df)
historical_size = int(total_size * 0.55)
train_size = int(total_size * 0.35)
test_size = total_size - historical_size - train_size

# The earliest rows become history; everything after feeds train/test.
df_history = df.iloc[:historical_size]
remaining_df = df.iloc[historical_size:]

# Separate the label from the features for the random train/test split.
y = remaining_df["Class"]
ds = remaining_df.drop(columns="Class")

# test_size is given as a fraction of the remaining (train + test) rows.
test_fraction = test_size / (train_size + test_size)
x_train, x_test, y_train, y_test = train_test_split(
    ds,
    y,
    test_size=test_fraction,
    random_state=42,
)

# Re-attach the label as the first column of each split.
df_train = pd.concat([y_train, x_train], axis=1)
df_test = pd.concat([y_test, x_test], axis=1)

# Display sizes of each dataset
for label, frame in (
    ("Historical", df_history),
    ("Training", df_train),
    ("Testing", df_test),
):
    print(f"{label} DataFrame size: {len(frame)}")




Historical DataFrame size: 626575
Training DataFrame size: 398729
Testing DataFrame size: 113924


In [6]:
# Write the three splits as CSV files (without the index) into the output folder.
os.makedirs(out_folder, exist_ok=True)

for split_name, split_frame in (
    ("train", df_train),
    ("test", df_test),
    ("history", df_history),
):
    target_path = os.path.join(out_folder, f"{split_name}.csv")
    split_frame.to_csv(path_or_buf=target_path, index=False)


In [7]:
# Show where the generated files were written.
out_folder

'/tmp/dataset/horizontal_credit_fraud_data'

In [None]:
# List the contents of the output folder.
!ls -al {out_folder}


In [None]:
# Print the line count of every path under the output folder.
# NOTE: the path is hard-coded here rather than taken from out_folder; keep the two in sync.
! find /tmp/dataset/horizontal_credit_fraud_data -exec wc -l {} \;

## Split Data for different Client sites

Now, split the train, test, and history data by `Sender_BIC`, so that each of the 10 sender banks becomes its own training site (client)



In [None]:

# Partition every split by sending bank: each Sender_BIC becomes one client
# site folder holding that bank's history/train/test CSV files.
files = ["history", "train", "test"]
client_names = set()

for split_name in files:
    split_path = os.path.join(out_folder, split_name + ".csv")
    # Use a dedicated name here: rebinding `df` would replace the full dataset
    # held by earlier cells with whichever CSV was read last.
    split_df = pd.read_csv(split_path)

    # One group of rows per sender bank.
    for sender_bic, site_rows in split_df.groupby('Sender_BIC'):
        bank_name = bic_to_bank[sender_bic].replace(" ", "_")
        client_name = f"{sender_bic}_{bank_name}"
        client_names.add(client_name)

        site_dir = os.path.join(out_folder, client_name)
        os.makedirs(site_dir, exist_ok=True)

        filename = os.path.join(site_dir, f"{split_name}.csv")
        site_rows.to_csv(filename, index=False)
        print(f"Saved {sender_bic} {split_name} transactions to {filename}")

print(client_names)
    


    


In [None]:
# Print the line count of every path under the output folder, now including the per-client files.
# NOTE: the path is hard-coded here rather than taken from out_folder; keep the two in sync.
! find /tmp/dataset/horizontal_credit_fraud_data -exec wc -l {} \;

In [None]:
# List the files written for one client site (ZHSZUS33 / Bank_1).
ls -al  /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/

In [None]:
# Print the line count of every path under one client site folder (ZHSZUS33 / Bank_1).
! find /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/ -exec wc -l {} \;

In [14]:
# Show the final directory layout of the output folder.
!tree  /tmp/dataset/horizontal_credit_fraud_data/

[01;34m/tmp/dataset/horizontal_credit_fraud_data/[0m
├── [01;34mFBSFCHZH_Bank_6[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mHCBHSGSG_Bank_9[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── history.csv
├── [01;34mSHSHKHH1_Bank_2[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── test.csv
├── train.csv
├── [01;34mWPUWDEFF_Bank_4[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mXITXUS33_Bank_10[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYMNYFRPP_Bank_5[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYSYCESMM_Bank_7[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mYXRXGB22_Bank_3[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
├── [01;34mZHSZUS33_Bank_1[0m
│   ├── history.csv
│   ├── test.csv
│   └── train.csv
└── [01;34mZNZZAU3M_Bank_8[0m
    ├── history.csv
    ├── test.csv
    └── train.csv

10 directories, 33 files


Let's go back to the [XGBoost Notebook](./xgboost.ipynb)