In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Load the raw transaction log and the account table from the working directory
trans, acc = (
    pd.read_csv(csv_name) for csv_name in ('transactions.csv', 'accounts.csv')
)

In [None]:
# Discard the columns that the feature engineering below does not use
trans = trans.drop(columns=['TX_TYPE', 'ALERT_ID'])
acc = acc.drop(columns=['CUSTOMER_ID', 'COUNTRY', 'ACCOUNT_TYPE'])

In [None]:
# Def function to create avg_transaction_amount
def create_transaction_features(transactions, accounts):
    """Build per-account transaction counts, amounts and the average amount.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``TX_ID``, ``TX_AMOUNT``, ``SENDER_ACCOUNT_ID`` and
        ``RECEIVER_ACCOUNT_ID``.
    accounts : pandas.DataFrame
        Must contain ``ACCOUNT_ID``; every other column is carried through
        unchanged.

    Returns
    -------
    pandas.DataFrame
        ``accounts`` indexed by ``ACCOUNT_ID`` with the added columns
        ``num_sent``, ``amount_sent``, ``num_received``, ``amount_received``,
        ``total_transactions``, ``total_amount`` and
        ``avg_transaction_amount``.

    Notes
    -----
    Accounts with no transactions get 0 for the count/amount columns and NaN
    for ``avg_transaction_amount`` (0 / 0), since the average is undefined.
    """
    def _side_totals(account_key, count_name, amount_name):
        # Count and sum the transactions in which the account plays one role.
        return (
            transactions.groupby(account_key)
            .agg({'TX_ID': 'count', 'TX_AMOUNT': 'sum'})
            .rename(columns={'TX_ID': count_name, 'TX_AMOUNT': amount_name})
        )

    # Group by sender and receiver separately
    sent = _side_totals('SENDER_ACCOUNT_ID', 'num_sent', 'amount_sent')
    received = _side_totals('RECEIVER_ACCOUNT_ID', 'num_received', 'amount_received')

    # Merge the results
    account_features = (
        accounts.set_index('ACCOUNT_ID')
        .join(sent, how='left')
        .join(received, how='left')
    )

    # Only the aggregates are zero-filled: a missing aggregate means "no such
    # transactions". A frame-wide fillna(0) would also overwrite genuinely
    # missing account attributes (e.g. a missing balance or label) with 0.
    activity_columns = ['num_sent', 'amount_sent', 'num_received', 'amount_received']
    account_features[activity_columns] = account_features[activity_columns].fillna(0)

    # Calculate avg_transaction_amount
    account_features['total_transactions'] = (
        account_features['num_sent'] + account_features['num_received']
    )
    account_features['total_amount'] = (
        account_features['amount_sent'] + account_features['amount_received']
    )
    account_features['avg_transaction_amount'] = (
        account_features['total_amount'] / account_features['total_transactions']
    )
    return account_features

# def function to create degree centrality
def create_network_features(transactions, account_features):
    """Add a ``degree_centrality`` column to ``account_features``.

    The value is the number of distinct receivers the account sent to plus the
    number of distinct senders it received from. It is a raw count (not
    normalised), and a counterparty that appears in both directions is counted
    twice.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``SENDER_ACCOUNT_ID`` and ``RECEIVER_ACCOUNT_ID``.
    account_features : pandas.DataFrame
        Indexed by account id. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``account_features`` object with the new column.
    """
    # Degree centrality
    sent_connections = transactions.groupby('SENDER_ACCOUNT_ID')['RECEIVER_ACCOUNT_ID'].nunique()
    received_connections = transactions.groupby('RECEIVER_ACCOUNT_ID')['SENDER_ACCOUNT_ID'].nunique()
    degree = sent_connections.add(received_connections, fill_value=0)

    # Accounts that never appear in a transaction have degree 0; plain
    # index-aligned assignment would leave them as NaN instead.
    account_features['degree_centrality'] = degree.reindex(
        account_features.index, fill_value=0
    )
    return account_features

# def function to create transaction frequency
def create_time_features(transactions, account_features):
    """Add timing features derived from each account's *sent* transactions.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``SENDER_ACCOUNT_ID`` and ``TIMESTAMP``.
        ``TIMESTAMP`` is assumed to be numeric (``+ 1`` is added to a
        difference of timestamps below) -- TODO confirm the time unit.
    account_features : pandas.DataFrame
        Indexed by account id and containing ``total_transactions``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``first_tx_time``, ``last_tx_time``,
        ``avg_time_between_tx``, ``std_time_between_tx`` and ``tx_frequency``
        added. Accounts that never sent a transaction get NaN in all of them.
    """
    # Series.diff() follows row order, so the gaps are only meaningful when
    # each sender's rows are in chronological order. Sort explicitly instead
    # of relying on the file order (stable sort keeps ties in input order).
    ordered = transactions.sort_values('TIMESTAMP', kind='mergesort')

    time_features = ordered.groupby('SENDER_ACCOUNT_ID').agg({
        'TIMESTAMP': ['min', 'max', lambda x: x.diff().mean(), lambda x: x.diff().std()]
    })
    time_features.columns = ['first_tx_time', 'last_tx_time', 'avg_time_between_tx', 'std_time_between_tx']
    account_features = account_features.join(time_features, how='left')

    # Calculate transaction frequency (assuming the timestamp represents some form of time units)
    # NOTE(review): the active window comes from sent transactions only, while
    # total_transactions counts sent + received -- confirm this is intended.
    active_window = account_features['last_tx_time'] - account_features['first_tx_time'] + 1  # Add 1 to avoid division by zero
    account_features['tx_frequency'] = account_features['total_transactions'] / active_window

    return account_features

# def function to calculate balance volatility
def calculate_balance_volatility(transactions, accounts):
    """Compute a per-account balance volatility score.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain ``SENDER_ACCOUNT_ID``, ``RECEIVER_ACCOUNT_ID``,
        ``TX_AMOUNT`` and ``TIMESTAMP``.
    accounts : pandas.DataFrame
        Must contain ``ACCOUNT_ID`` and ``INIT_BALANCE``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ACCOUNT_ID`` and ``balance_volatility``, one row per account
        that takes part in at least one retained transaction, sorted by
        ``ACCOUNT_ID`` and carrying a fresh 0..n-1 index. Join it back on
        ``ACCOUNT_ID``, not by row position.

    Notes
    -----
    Both merges are inner joins, so a transaction is dropped when its sender
    or its receiver is missing from ``accounts``.

    NOTE(review): each "balance after transaction" is computed from
    ``INIT_BALANCE`` alone (initial balance -/+ that one amount), not from a
    running balance. ``balance_change`` is therefore the difference between
    consecutive signed amounts. Kept as-is to preserve the existing feature
    values -- confirm this is the intended definition.
    """
    init_balances = accounts[['ACCOUNT_ID', 'INIT_BALANCE']]

    # Merge transactions with initial balances
    with_balances = transactions.merge(
        init_balances,
        left_on='SENDER_ACCOUNT_ID',
        right_on='ACCOUNT_ID',
        suffixes=('', '_SENDER')
    )
    with_balances = with_balances.merge(
        init_balances,
        left_on='RECEIVER_ACCOUNT_ID',
        right_on='ACCOUNT_ID',
        suffixes=('', '_RECEIVER')
    )

    # One row per (transaction, account role), built column-wise. This replaces
    # a melt + self-merge + row-wise apply and yields the same rows in the same
    # order (all sender rows first, then all receiver rows).
    sender_side = pd.DataFrame({
        'ACCOUNT': with_balances['SENDER_ACCOUNT_ID'],
        'TIMESTAMP': with_balances['TIMESTAMP'],
        'balance_after_tx': with_balances['INIT_BALANCE'] - with_balances['TX_AMOUNT'],
    })
    receiver_side = pd.DataFrame({
        'ACCOUNT': with_balances['RECEIVER_ACCOUNT_ID'],
        'TIMESTAMP': with_balances['TIMESTAMP'],
        'balance_after_tx': with_balances['INIT_BALANCE_RECEIVER'] + with_balances['TX_AMOUNT'],
    })
    balance_changes = pd.concat([sender_side, receiver_side], ignore_index=True)

    # Calculate balance changes for each account
    balance_changes = balance_changes.sort_values(by=['ACCOUNT', 'TIMESTAMP'])
    balance_changes['balance_change'] = balance_changes.groupby('ACCOUNT')['balance_after_tx'].diff()

    # Calculate balance volatility (standard deviation of balance changes) for each account
    balance_volatility = balance_changes.groupby('ACCOUNT')['balance_change'].std().reset_index()
    balance_volatility.columns = ['ACCOUNT_ID', 'balance_volatility']

    return balance_volatility

In [None]:
# Feature engineering for account data
account_features = create_transaction_features(trans, acc)
account_features = create_time_features(trans, account_features)
account_features = create_network_features(trans, account_features)

# Drop account id 0 as it does not have transaction
account_features = account_features.drop(index=0).reset_index()

# Attach the volatility by ACCOUNT_ID. The volatility table has its own
# 0..n-1 row index and only lists accounts that took part in a transaction,
# so assigning its column directly would align rows by position and could
# attach values to the wrong accounts.
volatility_by_account = calculate_balance_volatility(trans, account_features)
account_features = account_features.merge(volatility_by_account, on='ACCOUNT_ID', how='left')

In [None]:
# Finalize account data for modeling, data discretization will be performed in GeNIe
# Identifier and intermediate helper columns are removed; the label and
# behaviour columns get shorter names.
acc_final = (
    account_features
    .drop(columns=['ACCOUNT_ID', 'INIT_BALANCE', 'total_transactions',
                   'total_amount', 'first_tx_time', 'last_tx_time'])
    .rename(columns={'IS_FRAUD': 'fraud', 'TX_BEHAVIOR_ID': 'tx_behavior'})
)

In [None]:
# Preview the first rows of the finalized modeling table
acc_final.head()

In [None]:
# Visualize continuous data
first_columns = ['num_sent', 'amount_sent', 'num_received', 'amount_received', 'avg_transaction_amount', 'balance_volatility']

# Create a figure with 3 rows and 2 columns
fig, axes = plt.subplots(3, 2, figsize=(18, 12))

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Iterate through each column and plot on the corresponding axis
for i, col in enumerate(first_columns):
    # sns.distplot is deprecated; histplot with a density-scaled histogram and
    # a KDE overlay is its documented replacement (requires seaborn >= 0.11).
    sns.histplot(acc_final[col], kde=True, stat='density', ax=axes[i])
    axes[i].set_title(f'Distribution of {col}')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Visualize continuous data
second_columns = ['avg_time_between_tx', 'std_time_between_tx', 'tx_frequency', 'degree_centrality']
# Create a figure with 2 rows and 2 columns
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Iterate through each column and plot on the corresponding axis
for i, col in enumerate(second_columns):
    # sns.distplot is deprecated; histplot with a density-scaled histogram and
    # a KDE overlay is its documented replacement (requires seaborn >= 0.11).
    sns.histplot(acc_final[col], kde=True, stat='density', ax=axes[i])
    axes[i].set_title(f'Distribution of {col}')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Visualize categorical data
columns = ['fraud', 'tx_behavior']

# One pie chart per categorical column, laid out side by side (1 row, 2 columns)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# Pair every column with its own axis and draw the share of each category
for pie_ax, col in zip(axes, columns):
    values = acc_final[col].value_counts()
    pie_ax.pie(values, labels=values.index, autopct='%1.1f%%', startangle=140)
    pie_ax.set_title(f'Distribution of {col}')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Export to csv file to be used in GeNIe
# index=False keeps the DataFrame's row index out of the exported file
acc_final.to_csv('account_data.csv', index = False)

In [None]:
# Visualize inference results from GeNIe
# Each degree-centrality state maps to [non-fraud probability, fraud probability]
degree_centrality = {
    'below 20': [0.86, 0.14],
    '20 to 60': [0.59, 0.41],
    'over 60': [0.45, 0.55],
}

# Split the pairs into one series per outcome, keeping the state order
labels = [*degree_centrality]
non_fraud, fraud = (list(outcome) for outcome in zip(*degree_centrality.values()))

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

# Grouped bars: non-fraud to the left of each tick, fraud to the right
fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width / 2, non_fraud, width, label='Non-Fraud', color='lightblue')
rects2 = ax.bar(x + width / 2, fraud, width, label='Fraud', color='blue')

# Axis labels, title, tick labels and legend
ax.set(
    xlabel='Degree of Centrality',
    ylabel='Probability',
    title='Distribution of Fraud under Each State of Degree of Centrality',
)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()

In [None]:
# Visualize inference results from GeNIe
# Rows are avg_tx_amount states, columns are degree_centrality states
avg_tx_amount = ['below 20,000,000', '20,000,000 to 40,000,000', 'over 40,000,000']
fraud = {
    'below 20': [0.14, 0.17, 0.24],
    '20 to 60': [0.41, 0.45, 0.46],
    'over 60': [0.55, 0.5, 0.5],
}

query1 = pd.DataFrame(data=fraud, index=avg_tx_amount)

# Annotated heatmap in the 'Blues' colormap; label the axes it is drawn on
sns.heatmap(query1, annot=True, fmt="g", cmap='Blues').set(
    ylabel='avg_tx_amount',
    xlabel='degree_centrality',
)
plt.show()

In [None]:
# Visualize inference results from GeNIe
# Rows are balance_volatility states, columns are degree_centrality states
balance_volatility = ['below 1,450,000', '1,450,000 to 2,700,000', '2,700,000 to 5,000,000', 'over 5,000,000']
fraud = {
    'below 20': [0.14, 0.14, 0.12, 0.23],
    '20 to 60': [0.41, 0.43, 0.37, 0.5],
    'over 60': [0.55, 0.5, 0.5, 0.5],
}

query2 = pd.DataFrame(data=fraud, index=balance_volatility)

# Annotated heatmap in the 'Greens' colormap; label the axes it is drawn on
sns.heatmap(query2, annot=True, fmt="g", cmap='Greens').set(
    ylabel='balance_volatility',
    xlabel='degree_centrality',
)
plt.show()