In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import logging
import pyarrow
import os
import warnings
warnings.filterwarnings('ignore')
import pickle


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import average_precision_score,precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.utils import class_weight
from src.model_functions import stratified_subsample, identify_repeat_perpetrator
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.decomposition import PCA



In [None]:
# Resolve the working directory once; figures and the saved model are written beneath it.
cwd = os.getcwd()
image_dir = os.path.join(cwd, 'image_dir')

# exist_ok=True makes the creation idempotent and avoids the check-then-create race.
os.makedirs(image_dir, exist_ok=True)

# Send INFO-and-above records to both a log file and the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('model_logfile.log'),
        logging.StreamHandler()
    ]
)

In [None]:
# Load the transaction log.
# The location can be overridden with the FRAUD_DATA_PATH environment variable so the
# notebook is not tied to one machine; the default keeps the original location.
data_path = os.environ.get(
    'FRAUD_DATA_PATH',
    r'C:\Users\chino\Downloads\fraud_model\transactions_data\PS_20174392719_1491204439457_log.parquet',
)
df = pd.read_parquet(data_path)

In [None]:
# remove the isFlaggedFraud column; it is not used anywhere in this notebook
df = df.drop(columns=['isFlaggedFraud'])

## 1.0 Exploratory Data Analysis

In [None]:
# count missing values per column
df.isna().sum()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# summary statistics (count, mean, std, quantiles) for the numeric columns
df.describe()

In [None]:
# check for correlation and identify potential feature multicollinearity
# Only numeric/boolean columns are correlated: the frame also holds string columns
# (e.g. 'type', 'nameOrig', 'nameDest'), which make a bare df.corr() raise on
# recent pandas versions.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# inspect column dtypes, non-null counts and memory usage
df.info()

In [None]:
# cast isFraud to int so the target is a plain 0/1 binary label
df['isFraud'] = df['isFraud'].astype(int)

In [None]:
# check proportion of classes in the target variable
# Sort by class label so that the bar order (0, then 1) always matches the
# 'Not Fraud' / 'Is Fraud' tick labels, regardless of which class is more frequent.
class_count = df['isFraud'].value_counts().sort_index()
x = class_count.index
y = class_count.values

plt.figure(figsize=(8, 6))

plt.bar(x, y)

# add value labels on top of each bar, anchored at the bar's own x position
for pos, v in zip(x, y):
    plt.text(pos, v + 500, str(v), ha='center', va='bottom')

plt.xticks(x, ['Not Fraud', 'Is Fraud'])
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Class Distribution in Dataset')
plt.show()
class_count


In [None]:
# number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# monetary columns analysed in the EDA plots below
numeric_features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

# cast all of them to float in a single step
df[numeric_features] = df[numeric_features].astype(float)

<br>
<br>

Explore multi feature - target variable relationships to find useful trends

In [None]:
# work on a 10% subsample (drawn by the project helper, keyed on the target) so plots stay responsive
df_subsample = stratified_subsample(df, frac = 0.1, target_col='isFraud')

# cast the target to str so its values match the string keys of the colour map
df_subsample['isFraud'] = df_subsample['isFraud'].astype(str)

fraud_colours = {'1': 'red', '0' : 'green'}
fig = px.scatter_matrix(
    df_subsample,
    dimensions=numeric_features,
    color='isFraud',
    height=1000,
    color_discrete_map=fraud_colours,
)
# hide the diagonal panels (each feature plotted against itself)
fig.update_traces(diagonal_visible=False)
fig.show()


Fraudulent Transactions by Transaction Type

In [None]:
# number of transactions for every (type, isFraud) combination in the subsample
fraud_counts = df_subsample.groupby('type')['isFraud'].value_counts()
fraud_counts_df = fraud_counts.reset_index(name='count')

# colours keyed by both the string and the integer form of the label;
# this mapping is reused by later plotting cells
palette = {'0': 'green', '1':'red', 0: 'green', 1:'red'}

plt.figure(figsize=(12, 6))
ax = sns.barplot(data=fraud_counts_df, x='type', y='count', hue='isFraud', palette=palette)

# write the count above every bar that has a positive height
for bar in ax.patches:
    bar_height = bar.get_height()
    if not bar_height > 0:
        continue
    label_xy = (bar.get_x() + bar.get_width() / 2, bar_height)
    ax.annotate(f'{int(bar_height)}', label_xy, ha='center', va='bottom', fontsize=9)

plt.title('Fraudulent Transactions by Transaction Type')
plt.xlabel('Transaction Type')
plt.ylabel('Number of Transactions')
plt.tight_layout()
plt.show()

Transaction Amount By Transaction Type (Fraud and Non-Fraud)

In [None]:
# Violin plots of each (log-scaled) numeric feature, split by fraud status,
# for the two transaction types in which fraud occurs.
os.makedirs(image_dir, exist_ok=True)  # loop-invariant: make sure the output folder exists once

for tx_type in ['CASH_OUT', 'TRANSFER']:
    # .copy() yields an independent frame, so the log columns added below are not
    # written into a filtered view of df_subsample (chained-assignment pitfall)
    df_type = df_subsample[df_subsample['type'] == tx_type].copy()

    fig, axes = plt.subplots(nrows=len(numeric_features), ncols=1, figsize=(10, 4 * len(numeric_features)))
    # plt.subplots returns a bare Axes for a single row; normalise to a 1-D array
    axes = np.atleast_1d(axes).ravel()

    for i, feature in enumerate(numeric_features):
        # +1 keeps zero amounts/balances finite under the log transform
        df_type[f'log_{feature}'] = np.log10(df_type[feature] + 1)

        # Create violin plot
        sns.violinplot(
            x='isFraud',
            y=f'log_{feature}',
            data=df_type,
            ax=axes[i],
            split=False,
            palette={'0': 'green', '1': 'red'}  # Green for non-fraud, red for fraud
        )
        axes[i].set_title(f'{feature} Distribution for {tx_type} by Fraud Status')
        axes[i].set_xlabel('Fraud Status')
        axes[i].set_ylabel(f'Log {feature} (log10)')

    plt.tight_layout()
    # save before show: the figure may no longer be current once it has been displayed
    plt.savefig(f'{image_dir}/violin_plots_{tx_type.lower()}.png')
    plt.show()
    plt.close(fig)  # close this specific figure to free memory


Mean/median of numeric features by fraud status

In [None]:


# Build a long-form table with the mean and median of every numeric feature per
# fraud class (one row per class / feature / statistic) for plotting.
stats = []
for col in numeric_features:
    group_stats = df.groupby('isFraud')[col].agg(['mean', 'median']).reset_index()
    group_stats['Column'] = col
    group_stats = group_stats.melt(id_vars=['isFraud', 'Column'],
                                   value_vars=['mean', 'median'],
                                   var_name='Statistic',
                                   value_name='Value')
    stats.append(group_stats)

stats_df = pd.concat(stats)

# two panels per row, one panel per feature
num_cols = len(numeric_features)
nrows = (num_cols + 1) // 2
fig, axes = plt.subplots(nrows=nrows, ncols=2, figsize=(16, 4 * nrows))
axes = axes.flatten()

for i, col in enumerate(numeric_features):
    col_data = stats_df[stats_df['Column'] == col]
    ax = axes[i]
    sns.barplot(x='isFraud', y='Value', hue='Statistic', data=col_data, ax=ax)
    ax.set_title(f'Mean and Median of {col} by Fraud Status')
    ax.set_ylabel('Amount ($)')

    # add value labels in millions
    for p in ax.patches:
        height = p.get_height()
        if np.isfinite(height):
            value_millions = height / 1_000_000
            ax.text(
                p.get_x() + p.get_width() / 2,
                height,
                f'${value_millions:.2f}M',
                ha='center',
                va='bottom',
                fontsize=10
            )

# remove the unused trailing panel(s); indexed from the feature count rather than
# from the loop variable so it does not depend on the loop having run
for j in range(num_cols, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
# Save BEFORE show: once the figure has been displayed, a later savefig can write
# an empty canvas instead of the plot.
plt.savefig(f'{image_dir}/stats_barplot.png')
plt.show()

Time-based fraud analysis

In [None]:
# hour of day derived from the step counter (step modulo 24)
df['hour'] = df['step'] % 24

# transactions per (hour, class)
hourly_counts = df.groupby(['hour', 'isFraud']).size().reset_index(name='count')

# compute total transactions per hour
total_per_hour = hourly_counts.groupby('hour')['count'].transform('sum')

# add proportion column: share of each class within its hour
hourly_counts['proportion'] = hourly_counts['count'] / total_per_hour

# plot proportion lineplot
plt.figure(figsize=(10, 6))
sns.lineplot(data=hourly_counts, x='hour', y='proportion', hue='isFraud', marker='o', palette= palette)
plt.title('Proportion of Fraudulent vs Non-Fraudulent Transactions by Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Proportion of Transactions')
plt.xticks(range(0, 24))
plt.legend(title='Is Fraud', labels=['Non-Fraud', 'Fraud'])
plt.grid(True)
plt.tight_layout()
# Save BEFORE show: once the figure has been displayed, a later savefig can write
# an empty canvas instead of the plot.
plt.savefig(f'{image_dir}/hour_vs_fraud_proportion.png')
plt.show()
plt.close()
logging.info("Proportion plot by hour saved.")


In [None]:
# Label each transaction "<first char of nameOrig>-<first char of nameDest>".
# Vectorised string slicing replaces the row-by-row apply(axis=1), which is very
# slow on a large frame; the resulting labels are the same.
# NOTE(review): the leading character presumably encodes the account kind
# (e.g. customer vs merchant) - confirm against the dataset documentation.
df['tx_type'] = df['nameOrig'].str[0] + '-' + df['nameDest'].str[0]
df.groupby('tx_type')['isFraud'].value_counts()

In [None]:
# check for repeat perpetrators and victims, looking at fraudulent rows only
fraud_df = df.loc[df['isFraud'] == 1]

# fraudulent transactions per destination account and per origin account
repeat_perpetrators = fraud_df['nameDest'].value_counts()
repeat_victims = fraud_df['nameOrig'].value_counts()

# accounts that appear in more than one fraudulent transaction
num_repeat_perpetrators = repeat_perpetrators.gt(1).sum()
num_repeat_victims = repeat_victims.gt(1).sum()

# value_counts sorts in descending order, so the first entry is the largest count
top_perpetrator_count = repeat_perpetrators.iloc[0]

print(f"Number of repeat perpetrators: {num_repeat_perpetrators}")
print(f"Most number of fraudulent transactions from a single perpetrator: {top_perpetrator_count}")
print(f"Number of repeat victims: {num_repeat_victims}")

## **Key observations from Exploratory Data Analysis**



- Severe class imbalance due to the rarity of fraudulent transactions
- Fraud only occurs in 'Cash Out' and 'Transfer' transactions, suggesting that transaction type is a critical predictor.
- Fraudulent transactions often involve large amounts and drained accounts as observed in the disparity in mean/median amount and balances between fraudulent and non-fraudulent transactions
- Fraudulent transactions are more prevalent from 2AM, peak around 4-5AM and then experience a decline.
- There are no repeat victims and 44 repeat perpetrators  
- No fraudulent transaction involves a merchant
- In fraudulent transactions, the transaction amount often equals the origin's balance when that balance is less than \$10m, in which case the balance becomes \$0. When the origin's balance is greater than \$10m, \$10m is transferred out of the account, leaving the surplus. This trend suggests that the maximum transaction limit without any higher-level authentication is \$10m. As a result, transactions that leave the origin balance at zero and those maxing out the limit are likely to be fraudulent.

<br>
<br>

## 2.0 Feature Engineering

#### There are 44 repeat perpetrators as observed in EDA. This is useful information, but must be used cautiously to avoid data leakage because it mirrors the target variable. For instance, if it is added directly to the dataset as a feature, and a nameDest commits fraud at step = 100, flagging it as a repeat perpetrator in a row at step = 50 would use future knowledge (isFraud = 1 at step = 100), which isn’t available in real-time fraud detection. This would cause generalisation issues to the model.

#### The ideal way to create a feature that flags repeat perpetrators without leakage is to use the step column so that only past fraud (before the current row's step) is considered. This involves creating a rolling history of fraudulent nameDest accounts up to each step. This is implemented in the `identify_repeat_perpetrator` function.

#### The same principle is applied to all newly created features, ensuring that model development and application mimic a real-world setting where future fraud is unknown. In addition, some newly created features were removed because they gave the model a near-deterministic 'fraud recognition' rule to follow, which hinders generalisation and leads to inflated evaluation results.


In [None]:
# Left out on purpose: these flags are almost a restatement of the target
# (see the EDA observations), which inflates evaluation results.
#df['is_balance_drained'] = df.apply(lambda x: 1 if x['oldbalanceOrg'] == x['amount'] else 0, axis=1)
#df['max_tx_amount'] =  df['amount'].apply(lambda x: 1 if x == 10000000.00 else 0)

# sort by step to ensure global temporal order
df = df.sort_values('step').reset_index(drop=True)

# log transformation (log1p keeps zero amounts finite)
df['log_amount'] = np.log1p(df['amount'])

# rolling transaction count per sender
# NOTE(review): the window covers the sender's last 6 ROWS (current one included),
# not the last 6 steps/hours - the feature name suggests a time window; confirm intent.
df['tx_count_6hr'] = (
    df.groupby('nameOrig')['step']
      .rolling(window=6, min_periods=1)
      .count()
      .reset_index(level=0, drop=True)
)

# cumulative (log) amount sent by sender, current transaction included
df['cum_amount_sent'] = df.groupby('nameOrig')['log_amount'].cumsum()

# sender mean, std, and z-score, built from the sender's PRIOR transactions only
df = df.sort_values(['nameOrig', 'step'])  # Sort for sender-specific features

# Shift inside each sender group first, then expand. Shifting the concatenated
# group-wise result instead would slide values across group boundaries and hand
# every sender's first row the statistics of a different sender.
prior_log_amount = df.groupby('nameOrig')['log_amount'].shift(1)
prior_by_sender = prior_log_amount.groupby(df['nameOrig'])

df['sender_mean'] = (
    prior_by_sender
      .expanding()
      .mean()
      .reset_index(level=0, drop=True)
)
df['sender_std'] = (
    prior_by_sender
      .expanding()
      .std()
      .reset_index(level=0, drop=True)
)

# No history yet (sender's first transaction): use the current amount as the
# baseline so the deviation is 0 and the row is not discarded by dropna() below.
df['sender_mean'] = df['sender_mean'].fillna(df['log_amount'])
# Undefined or zero spread -> 1, so the z-score division is always well defined.
df['sender_std'] = df['sender_std'].fillna(1).replace(0, 1)

# identify outlier transaction based on user's history
df['zscore_amount'] = (df['log_amount'] - df['sender_mean']) / df['sender_std']

# time since last transaction by the same sender
df['prev_step'] = df.groupby('nameOrig')['step'].shift(1)
df['time_since_last_tx'] = df['step'] - df['prev_step']
df['is_first_tx'] = df['prev_step'].isna().astype(int)
# senders with no earlier transaction get the median gap
df['time_since_last_tx'] = df['time_since_last_tx'].fillna(df['time_since_last_tx'].median())

df = df.drop(['prev_step'], axis=1)
df = df.dropna()


In [None]:
# Features fed to the model
numeric_features = [
    'hour', 'log_amount', 'tx_count_6hr',
    'cum_amount_sent', 'oldbalanceOrg', 'zscore_amount',
    'time_since_last_tx'
]

categorical_features = ['type', 'tx_type']  # one-hot encode separately

x = df.drop(['isFraud'], axis=1)
y = df['isFraud']

# stratified 70/30 split keeps the fraud rate equal in both parts
X_train, X_test, y_train, y_test  = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify=y)

# weight the positive class by the negative/positive ratio to counter the imbalance
fraud_ratio = y_train.mean()
scale_pos_weight = (1 - fraud_ratio) / fraud_ratio
logging.info(f"Scale_pos_weight: {scale_pos_weight:.2f}")

# Columns not listed here are dropped by the transformer.
# sparse_threshold=0 forces a dense output array; this replaces the encoder's
# `sparse=False` argument, whose name differs between scikit-learn versions.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features),
    ],
    sparse_threshold=0,
)

# Keep X_train / X_test as the raw DataFrames and store the transformed arrays under
# separate names: the hyper-parameter search cell further down needs the raw columns.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

model = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=scale_pos_weight,
    learning_rate=0.1,
    max_depth=10,
    use_label_encoder=False,
    eval_metric='logloss')

model.fit(X_train_prepared, y_train)

y_prob = model.predict_proba(X_test_prepared)[:, 1]  # probabilities for class 1

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# plot precision and recall vs. threshold
# (precision/recall carry one more element than thresholds, hence the [:-1])
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision', color='b')
plt.plot(thresholds, recall[:-1], label='Recall', color='r')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs Threshold')
plt.legend()
plt.grid(True)
plt.show()

## 4.0 Model Evaluation

In [None]:
# precision and recall contain one more element than thresholds (the final point has
# no associated threshold), so drop it before searching; otherwise the arg-max could
# point one past the end of `thresholds`.
f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-6)  # 1e-6 avoids 0/0
best_index = int(np.argmax(f1_scores))
best_threshold = thresholds[best_index]

print(f"Best Threshold: {best_threshold:.4f}")
print(f"Precision: {precision[best_index]:.4f}, Recall: {recall[best_index]:.4f}, F1: {f1_scores[best_index]:.4f}")


In [None]:
# turn probabilities into 0/1 predictions at the chosen threshold
y_pred_thresh = np.where(y_prob >= best_threshold, 1, 0)

# compute the confusion matrix once and reuse it for the text output and the heatmap
conf_matrix = confusion_matrix(y_test, y_pred_thresh)

print("Confusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_thresh))

# AUC is threshold-independent, so it is computed from the raw probabilities
print("AUC-ROC:", roc_auc_score(y_test, y_prob))

ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='flare')

plt.title('Confusion Matrix', fontsize=11)
plt.xlabel('Predicted Class', fontsize=11)
plt.ylabel('True Class', fontsize = 11)
plt.show()


##### Result Summary 

Precision (0.81 for class 1): Among all transactions flagged as fraudulent by the model, 81% were actually fraudulent. This suggests the model has a decent ability to identify true frauds.

Recall (0.76 for class 1): The model identified 76% of all actual fraudulent transactions and missed only 24% of fraudulent transactions.

F1-Score (0.78 for class 1): The F1-score is a harmonic mean of precision and recall. At 0.78, this indicates that the model strikes a decent balance between detecting fraud and avoiding false positives.

<br>
<br>

#### We can apply RandomizedSearchCV to search for potentially better hyperparameter values and retrain the model for improved performance. GridSearchCV would be preferable if computational resources and memory were not constraints.

In [None]:


# The pipeline below contains the ColumnTransformer, which selects columns by name,
# so it must be fed the RAW feature DataFrame rather than an already-transformed array.
# Re-create the raw training split with the same arguments as the original split
# (same random_state, so the same rows).
X_train_raw, _, y_train_raw, _ = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)

# tune on a stratified 10% of the training rows to limit run time and memory
X_tune, _, y_tune, _ = train_test_split(X_train_raw, y_train_raw, train_size=0.1, random_state=42, stratify=y_train_raw)
print("Tuning subset shape:", X_tune.shape, y_tune.shape)

# Informational only. Separate names are used so the feature lists that the
# preprocessor was built from are not rebound.
tune_numeric_cols = X_tune.select_dtypes(include=['int64', 'float64']).columns.tolist()
tune_categorical_cols = X_tune.select_dtypes(include=['object', 'category']).columns.tolist()
print("Numeric Features:", tune_numeric_cols)
print("Categorical Features:", tune_categorical_cols)

# compute scale_pos_weight for sampled data
neg, pos = np.bincount(y_tune)
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)

xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42, eval_metric='logloss')
# keys are prefixed with the step name that make_pipeline derives from the estimator class
param_grid = {
    'xgbclassifier__learning_rate': [0.01, 0.1, 0.3],
    'xgbclassifier__max_depth': [3, 5, 7],
    'xgbclassifier__n_estimators': [50, 100],
    'xgbclassifier__subsample': [0.8, 1.0],
    'xgbclassifier__colsample_bytree': [0.8, 1.0]
}

# create a pipeline and perform randomised search CV;
# the preprocessor is re-fitted inside every fold, so no fold sees statistics from its validation part
pipeline = make_pipeline(preprocessor, xgb_model)
kf = KFold(n_splits=3, shuffle=True, random_state=42)
# random_state makes the sampled parameter combinations reproducible across runs
grid_search = RandomizedSearchCV(pipeline, param_grid, cv=kf, scoring='recall', n_jobs=-1, verbose=1, random_state=42)
grid_search.fit(X_tune, y_tune)

print("Best Parameters:", grid_search.best_params_)
print("Best Recall Score:", grid_search.best_score_)




### Results from best parameters are not as good as our initial results so no need to retrain the model on the output hyperparameter values


In [None]:


# destination of the serialised model, next to the notebook's working directory
file_path = os.path.join(cwd,'fraud_xgbmodel.pkl')

# NOTE(review): only the classifier is written; the fitted preprocessor is not part
# of this file - confirm that whoever loads the model can rebuild the same features.
with open(file_path, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"Model saved successfully to {file_path}")