In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KernelDensity
from scipy.signal import find_peaks

import warnings
# NOTE(review): this silences every warning category for the rest of the notebook,
# including NumPy runtime warnings (e.g. log of zero) and library deprecation notices.
warnings.simplefilter(action='ignore')

In [None]:
# Load the raw transaction and account tables from their CSV files
trans, acc = (pd.read_csv(csv_path) for csv_path in ('transactions.csv', 'accounts.csv'))

In [None]:
# Summarize the transactions table: column names, dtypes and non-null counts
trans.info()

In [None]:
# Summarize the accounts table: column names, dtypes and non-null counts
acc.info()

In [None]:
# Report the number of missing values per column, transactions first and accounts second
for frame in (trans, acc):
    print(frame.isnull().sum())

In [None]:
# Count the distinct values of columns suspected to be constant
# (a count of 1 means the column carries no information)
for frame, column in ((acc, 'COUNTRY'), (acc, 'ACCOUNT_TYPE'), (trans, 'TX_TYPE')):
    print(frame[column].nunique())

In [None]:
# Drop TX_TYPE and ALERT_ID from the transactions table.
# The name is rebound instead of dropping in place, and errors='ignore' is used,
# so re-running this cell does not raise KeyError once the columns are already gone.
trans = trans.drop(columns=['TX_TYPE', 'ALERT_ID'], errors='ignore')
# Drop CUSTOMER_ID, COUNTRY and ACCOUNT_TYPE from the accounts table (same re-run safety)
acc = acc.drop(columns=['CUSTOMER_ID', 'COUNTRY', 'ACCOUNT_TYPE'], errors='ignore')

In [None]:
# Attach each account's behavior ID and initial balance to the transactions,
# once for the sender and once for the receiver.
tx_features = (
    trans
    # Sender side: the account columns arrive un-prefixed and are renamed right away
    .merge(acc, left_on='SENDER_ACCOUNT_ID', right_on='ACCOUNT_ID', how='left')
    .rename(columns={'INIT_BALANCE': 'SENDER_INIT_BALANCE',
                     'TX_BEHAVIOR_ID': 'SENDER_BEHAVIOR_ID'})
    # Receiver side: columns clashing with the left frame get the '_RECEIVER' suffix
    .merge(acc, left_on='RECEIVER_ACCOUNT_ID', right_on='ACCOUNT_ID',
           how='left', suffixes=('', '_RECEIVER'))
    .rename(columns={'INIT_BALANCE': 'RECEIVER_INIT_BALANCE',
                     'TX_BEHAVIOR_ID': 'RECEIVER_BEHAVIOR_ID'})
    # Keep only the fraud flag that came from the transactions table (IS_FRAUD_x)
    .drop(columns=['IS_FRAUD_y', 'IS_FRAUD', 'ACCOUNT_ID_RECEIVER', 'ACCOUNT_ID'])
    .rename(columns={'IS_FRAUD_x': 'fraud'})
)

In [None]:
# Express each transaction amount relative to the sender's and the receiver's initial balance
tx_amount = tx_features['TX_AMOUNT']
tx_features['amount_to_sender_balance_ratio'] = tx_amount / tx_features['SENDER_INIT_BALANCE']
tx_features['amount_to_receiver_balance_ratio'] = tx_amount / tx_features['RECEIVER_INIT_BALANCE']

In [None]:
# Plot the continuous variables (log scale), one KDE per panel
columns = [
    ('TX_AMOUNT', 'transaction amount'),
    ('amount_to_sender_balance_ratio', 'transaction amount to sender balance ratio'),
    ('amount_to_receiver_balance_ratio', 'transaction amount to receiver balance ratio'),
]

# Three panels side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Pair every (column, title) entry with its own axis
for panel, (col, title) in zip(axes, columns):
    log_values = np.log(tx_features[col])
    sns.kdeplot(log_values, ax=panel)
    panel.set_title(f'Distribution of {title}')

plt.tight_layout()
plt.show()

In [None]:
# Plot the categorical variables as pie charts of their value counts
columns = [
    ('fraud', 'fraud'),
    ('SENDER_BEHAVIOR_ID', 'sender behavior'),
    ('RECEIVER_BEHAVIOR_ID', 'receiver behavior'),
]

# Three panels side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Pair every (column, title) entry with its own axis
for panel, (col, title) in zip(axes, columns):
    values = tx_features[col].value_counts()
    # Slices are labelled with the category value and its share of the rows
    panel.pie(values, labels=values.index, autopct='%1.1f%%', startangle=140)
    panel.set_title(f'Distribution of {title}')

plt.tight_layout()
plt.show()

In [None]:
# Define a function that identifies the valleys and peaks of a distribution for data discretization
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde
def discretize_tx(data, column, bandwidth=0.1, grid_size=1000, plot=True):
    """Find discretization thresholds from the valleys of a column's log-scale density.

    The column is transformed with ``log1p``, a Gaussian kernel density estimate is
    fitted to the transformed values, and the local minima (valleys) of that density
    are converted back to the original scale with ``expm1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to analyse.
    column : str
        Name of the numeric column.
    bandwidth : float, default 0.1
        Kernel bandwidth passed to ``KernelDensity``.
    grid_size : int, default 1000
        Number of evenly spaced points (between the smallest and largest
        log-transformed value) at which the density is evaluated.
    plot : bool, default True
        Whether to draw the density with its peaks and valleys marked.

    Returns
    -------
    numpy.ndarray
        1-D array of thresholds in the original scale of ``column``, in ascending order.

    Raises
    ------
    ValueError
        If no finite value remains after the ``log1p`` transform.
    """
    # log1p keeps zeros finite (log1p(0) == 0), unlike a plain log
    with np.errstate(divide='ignore', invalid='ignore'):
        log_values = np.log1p(np.asarray(data[column], dtype=float))

    # KernelDensity rejects NaN/inf input, so keep only the finite values
    # (NaN/inf can come from missing balances, zero balances or values <= -1)
    log_values = log_values[np.isfinite(log_values)]
    if log_values.size == 0:
        raise ValueError(f"column {column!r} has no finite values after the log1p transform")

    # KDE to estimate the density of the log-transformed data
    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(log_values.reshape(-1, 1))

    # Evaluate the density on an evenly spaced grid; score_samples returns log-density
    log_x = np.linspace(log_values.min(), log_values.max(), grid_size)
    log_density = np.exp(kde.score_samples(log_x.reshape(-1, 1)))

    # Peaks are the modes; valleys (peaks of the negated density) are the thresholds
    peaks, _ = find_peaks(log_density)
    valleys, _ = find_peaks(-log_density)
    log_modes = log_x[peaks]
    log_thresholds = log_x[valleys]

    # Convert the log-scale thresholds back to the original scale
    thresholds = np.expm1(log_thresholds)

    if plot:
        # Plot the log density with peaks and valleys
        plt.figure(figsize=(10, 6))
        plt.plot(log_x, log_density, label='Log Density')
        plt.plot(log_modes, log_density[peaks], 'ro', label='Log Modes')
        plt.plot(log_thresholds, log_density[valleys], 'go', label='Log Valleys (Thresholds)')
        plt.xlabel(f'Log of {column}')
        plt.ylabel('Density')
        plt.title('Kernel Density Estimation (Log Scale)')
        plt.legend()
        plt.show()

    return thresholds

In [None]:
# Plot the log-scale density of the transaction amount and display its valley thresholds
discretize_tx(tx_features, 'TX_AMOUNT')

In [None]:
# Plot the log-scale density of the amount-to-sender-balance ratio and display its valley thresholds
discretize_tx(tx_features, 'amount_to_sender_balance_ratio')

In [None]:
# Plot the log-scale density of the amount-to-receiver-balance ratio and display its valley thresholds
discretize_tx(tx_features, 'amount_to_receiver_balance_ratio')

In [None]:
# Bin edges chosen from the density plots
# (presumably copied from the discretize_tx outputs - verify if the data changes)
tx_bins = [9.65538034, 74.6803678, 267.823111, 906.675552]
sender_bins = [0.478727948, 2.23344028]
receive_bins = [0.356853867, 8.33559758]

In [None]:
# Data discretization: ten equal-width bins for the timestamp,
# pre-computed bin edges for the amount and the two balance ratios
tx_features['timestamp_bin'] = pd.cut(tx_features['TIMESTAMP'], bins=10, labels=False)

digitize_plan = {
    'discretized_tx': ('TX_AMOUNT', tx_bins),
    'discretize_sender_ratio': ('amount_to_sender_balance_ratio', sender_bins),
    'discretize_receiver_ratio': ('amount_to_receiver_balance_ratio', receive_bins),
}
for target_col, (source_col, edges) in digitize_plan.items():
    tx_features[target_col] = np.digitize(tx_features[source_col], edges)

In [None]:
# Finalize the dataset for modeling: keep only the discretized features, the behavior IDs and the label
raw_columns = [
    'TX_ID', 'SENDER_ACCOUNT_ID', 'RECEIVER_ACCOUNT_ID', 'TX_AMOUNT', 'TIMESTAMP',
    'SENDER_INIT_BALANCE', 'RECEIVER_INIT_BALANCE',
    'amount_to_sender_balance_ratio', 'amount_to_receiver_balance_ratio',
]
tx_final = (
    tx_features
    .drop(columns=raw_columns)
    .rename(columns={'SENDER_BEHAVIOR_ID': 'sender_behavior',
                     'RECEIVER_BEHAVIOR_ID': 'receiver_behavior'})
)
# Encode the fraud label as integer category codes
tx_final['fraud'] = pd.Categorical(tx_final['fraud']).codes

In [None]:
# Preview the first rows of the modeling dataset
tx_final.head()

In [None]:
# Export to a CSV file (without the index column) to be used for BBN modeling in GeNIe
tx_final.to_csv('txdata.csv', index = False)

In [None]:
# Visualize the inference results from GeNIe:
# each sender behavior state maps to [P(non-fraud), P(fraud)]
sender_behavior = {'1': [0.998, 0.002],
                   '2': [0.97, 0.03],
                   '3': [0.72, 0.28],
                   '4': [0.78, 0.22],
                   '5': [0.99, 0.01]}

# Split the pairs into one series per bar group
labels = list(sender_behavior)
non_fraud = [probs[0] for probs in sender_behavior.values()]
fraud = [probs[1] for probs in sender_behavior.values()]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

# Grouped bars: non-fraud to the left of each tick, fraud to the right
fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, non_fraud, width, label='Non-Fraud', color='lightblue')
rects2 = ax.bar(x + width/2, fraud, width, label='Fraud', color='blue')

# Labels and title; the title names sender behavior, which is what the x-axis shows
# (it previously said "transaction amount")
ax.set_xlabel('Sender behavior')
ax.set_ylabel('Probability')
ax.set_title('Distribution of Fraud under Each State of Sender Behavior')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()

In [None]:
# Visualize inference results from GeNIe as a heatmap:
# rows are transaction-amount bins, columns are sender behavior states
tx = ['below 9.6', '9.6 to 74.7', '74.7 to 267.8', '267.8 to 906.7', 'over 906.7']
fraud = {
    '1': [0, 0, 0.04, 0.5, 0.5],
    '2': [0.87, 0.85, 0.25, 0.21, 0],
    '3': [0.95, 0.67, 0.5, 0, 0.5],
    '4': [0.96, 0.66, 0, 0.5, 0.5],
    '5': [0.97, 0.91, 0, 0, 0.5],
}

# One column per sender behavior; label the rows with the amount bins
query = pd.DataFrame(fraud)
query.index = tx

heat_ax = sns.heatmap(query, annot=True, fmt="g", cmap='Blues')  # Use 'Blues' colormap
heat_ax.set_ylabel('Transaction amount')
heat_ax.set_xlabel('Sender behavior')
plt.show()