<h1 style="font-size:2.0em; font-weight:bold; text-align:center;">Model Training </h1>

This project is a **Proof of Concept (PoC)** aimed at detecting potential financial crimes using **anomaly scoring** techniques. It reflects my growing interest in financial crime prevention, particularly in the context of evolving geopolitical risks.

The dataset spans from **April 2, 2025** to **April 4, 2025**, based on the assumption that in mid-2025, trade tensions lead to increased abnormal or suspicious transaction activities.

- Feature Scaling
- Model Training: Isolation Forest, Autoencoder  
- Evaluation: Precision-Recall, ROC-AUC, confusion matrix, feature importance  

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px
%matplotlib inline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures,MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/nordic_transactions_with_fraud.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
customer_id,CUST00861,CUST00140,CUST00155,CUST00650,CUST00039
transaction_id,9297eaac-fcc2-4d18-92ae-f13c963bd564,624b31d7-2e5c-48ad-97c6-eb3ccb5a1c0e,534234cf-1974-4b71-a00f-7ed09248c8c8,64cfd9b9-050e-4961-b363-d272d0181ce6,89fab5cc-95b0-4f8a-964f-10b584d6b400
currency,DKK,EUR,EUR,EUR,NOK
transaction_datetime,2025-04-03 21:35:23,2025-04-03 03:23:40,2025-04-02 13:37:28,2025-04-03 13:35:46,2025-04-02 19:09:23
value_datetime,2025-04-03 21:35:23,2025-04-03 03:23:40,2025-04-04 13:37:28,2025-04-05 13:35:46,2025-04-04 19:09:23
...,...,...,...,...,...
last_transaction_datetime_weekday,3,3,3,3,3
last_transaction_datetime_hour,22,20,23,19,23
age,23,52,18,73,60
is_sequential,0,0,0,0,0


**Train-Test Split**

As `is_fraud` is imbalanced, so we use `StratifiedShuffleSplit()` to maintain the same ratio of predictor classes.

In [None]:
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']



strat_shuf_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

for train_idx, test_idx in strat_shuf_split.split(X, y):
  X_train = df.loc[train_idx, X]
  y_train = df.loc[train_idx, 'is_fraud']

  X_test  = df.loc[test_idx, X]
  y_test  = df.loc[test_idx, 'is_fraud']

KeyError: "None of [Index([                                                                                                              ('c', 'u', 's', 't', 'o', 'm', 'e', 'r', '_', 'i', 'd'),\n                                                                                                      ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'i', 'd'),\n                                                                                                                                    ('c', 'u', 'r', 'r', 'e', 'n', 'c', 'y'),\n                                                                        ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e'),\n                                                                                                      ('v', 'a', 'l', 'u', 'e', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e'),\n                                                                                            ('p', 'a', 'y', 'm', 'e', 'n', 't', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e'),\n                                                                                            ('b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e'),\n                                                                                            ('t', 'y', 'p', 'e', '_', 'd', 'e', 's', 'c', 'r', 'i', 'p', 't', 'i', 'o', 'n'),\n                                                                                            ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 't', 'y', 'p', 'e'),\n                                                                                                                               ('n', 'a', 'r', 'r', 'a', 't', 'i', 'v', 'e'),\n                                                                                                                                    ('s', 't', 'a', 't', 'u', 's', '_', 'x'),\n                                                                                       ('c', 'o', 'u', 'n', 't', 'e', 'r', 'p', 'a', 'r', 't', 'y', '_', 'n', 'a', 'm', 'e'),\n                                                                                                                     ('c', 'a', 'r', 'd', '_', 'n', 'u', 'm', 'b', 'e', 'r'),\n                                                                                                                                         ('m', 'e', 's', 's', 'a', 'g', 'e'),\n                                                                                                                     ('o', 'w', 'n', '_', 'm', 'e', 's', 's', 'a', 'g', 'e'),\n                                                                                                                                         ('c', 'o', 'u', 'n', 't', 'r', 'y'),\n                                                                                                 ('a', 'c', 'c', 'o', 'u', 'n', 't', '_', 'n', 'u', 'm', 'b', 'e', 'r', 's'),\n                                                                                                                ('a', 'c', 'c', 'o', 'u', 'n', 't', '_', 'n', 'a', 'm', 'e'),\n                                                                                                                                         ('p', 'r', 'o', 'd', 'u', 'c', 't'),\n                                                                                                                ('a', 'c', 'c', 'o', 'u', 'n', 't', '_', 't', 'y', 'p', 'e'),\n                                                                                                                                    ('s', 't', 'a', 't', 'u', 's', '_', 'y'),\n                                                                   ('l', 'a', 's', 't', '_', 'b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e'),\n                                                                             ('r', 'e', 'g', 'i', 's', 't', 'r', 'a', 't', 'i', 'o', 'n', '_', 'n', 'u', 'm', 'b', 'e', 'r'),\n                                                                                                                                                             ('d', 'o', 'b'),\n                                                                                       ('r', 'e', 's', 'i', 'd', 'e', 'n', 'c', 'e', '_', 'c', 'o', 'u', 'n', 't', 'r', 'y'),\n                                                                                                                          ('o', 'c', 'c', 'u', 'p', 'a', 't', 'i', 'o', 'n'),\n                                                                                                                                              ('g', 'e', 'n', 'd', 'e', 'r'),\n                                                                                                                               ('b', 'a', 'n', 'k', '.', 'n', 'a', 'm', 'e'),\n                                                                                                                                    ('b', 'a', 'n', 'k', '.', 'b', 'i', 'c'),\n                                                                                                                ('b', 'a', 'n', 'k', '.', 'c', 'o', 'u', 'n', 't', 'r', 'y'),\n                                                                                       ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'c', 'o', 'u', 'n', 't'),\n                                               ('l', 'a', 's', 't', '_', 't', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e'),\n                                                                                                                          ('a', 'm', 'o', 'u', 'n', 't', '_', 'e', 'u', 'r'),\n                                                                   ('a', 'v', 'a', 'i', 'l', 'a', 'b', 'l', 'e', '_', 'b', 'a', 'l', 'a', 'n', 'c', 'e', '_', 'e', 'u', 'r'),\n                                                         ('v', 'a', 'l', 'u', 'e', '_', 'd', 'a', 't', 'e', 'd', '_', 'b', 'a', 'l', 'a', 'n', 'c', 'e', '_', 'e', 'u', 'r'),\n                                                                                            ('c', 'r', 'e', 'd', 'i', 't', '_', 'l', 'i', 'm', 'i', 't', '_', 'e', 'u', 'r'),\n                                                                                            ('t', 'o', 't', 'a', 'l', '_', 'v', 'o', 'l', 'u', 'm', 'e', '_', 'e', 'u', 'r'),\n                                                                                                      ('a', 'v', 'g', '_', 'a', 'm', 'o', 'u', 'n', 't', '_', 'e', 'u', 'r'),\n                                                                                                      ('m', 'a', 'x', '_', 'a', 'm', 'o', 'u', 'n', 't', '_', 'e', 'u', 'r'),\n                                                                                                      ('m', 'i', 'n', '_', 'a', 'm', 'o', 'u', 'n', 't', '_', 'e', 'u', 'r'),\n                                                                                                                ('i', 's', '_', 't', 'x', 'n', '_', 'a', 'b', 'o', 'v', 'e'),\n                                                                                  ('i', 's', '_', 't', 'o', 't', 'a', 'l', '_', 't', 'x', 'n', '_', 'a', 'b', 'o', 'v', 'e'),\n                                                                                                 ('b', 'a', 'l', 'a', 'n', 'c', 'e', '_', 'u', 's', 'e', '_', 'p', 'c', 't'),\n                                                                                                           ('l', 'i', 'm', 'i', 't', '_', 'u', 's', 'e', '_', 'p', 'c', 't'),\n                                                                                                                ('i', 's', '_', 'a', 'b', 'o', 'v', 'e', '_', 'm', 'a', 'x'),\n                                                                                                                ('i', 's', '_', 'b', 'e', 'l', 'o', 'w', '_', 'm', 'i', 'n'),\n                                                                                                                ('i', 's', '_', 'a', 'b', 'o', 'v', 'e', '_', 'a', 'v', 'g'),\n                                                    ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'd', 'a', 'y'),\n                                ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'w', 'e', 'e', 'k', 'd', 'a', 'y'),\n                                               ('t', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'h', 'o', 'u', 'r'),\n                                                                                  ('v', 'a', 'l', 'u', 'e', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'd', 'a', 'y'),\n                                                              ('v', 'a', 'l', 'u', 'e', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'w', 'e', 'e', 'k', 'd', 'a', 'y'),\n                                                                             ('v', 'a', 'l', 'u', 'e', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'h', 'o', 'u', 'r'),\n                                                                        ('p', 'a', 'y', 'm', 'e', 'n', 't', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'd', 'a', 'y'),\n                                                    ('p', 'a', 'y', 'm', 'e', 'n', 't', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'w', 'e', 'e', 'k', 'd', 'a', 'y'),\n                                                                   ('p', 'a', 'y', 'm', 'e', 'n', 't', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'h', 'o', 'u', 'r'),\n                                                                        ('b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'd', 'a', 'y'),\n                                                    ('b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'w', 'e', 'e', 'k', 'd', 'a', 'y'),\n                                                                   ('b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'h', 'o', 'u', 'r'),\n                                               ('l', 'a', 's', 't', '_', 'b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'd', 'a', 'y'),\n                           ('l', 'a', 's', 't', '_', 'b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'w', 'e', 'e', 'k', 'd', 'a', 'y'),\n                                          ('l', 'a', 's', 't', '_', 'b', 'o', 'o', 'k', 'i', 'n', 'g', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'h', 'o', 'u', 'r'),\n                           ('l', 'a', 's', 't', '_', 't', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'd', 'a', 'y'),\n       ('l', 'a', 's', 't', '_', 't', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'w', 'e', 'e', 'k', 'd', 'a', 'y'),\n                      ('l', 'a', 's', 't', '_', 't', 'r', 'a', 'n', 's', 'a', 'c', 't', 'i', 'o', 'n', '_', 'd', 'a', 't', 'e', 't', 'i', 'm', 'e', '_', 'h', 'o', 'u', 'r'),\n                                                                                                                                                             ('a', 'g', 'e'),\n                                                                                                           ('i', 's', '_', 's', 'e', 'q', 'u', 'e', 'n', 't', 'i', 'a', 'l'),\n                                                                                       ('i', 's', '_', 'h', 'i', 'g', 'h', '_', 'f', 'r', 'e', 'q', 'u', 'e', 'n', 'c', 'y')],\n      dtype='object')] are in the [columns]"

**The correlations between the dependent variables.**

Create a histogram of the correlation values and identify those that are most correlated (either positively or negatively).

In [None]:
feature_cols = df.columns.drop('is_fraud')
corr_values = df[feature_cols].corr()

tril_index = np.tril_indices_from(corr_values)

for coord in zip(*tril_index):
    corr_values.iloc[coord[0], coord[1]] = np.nan
    
# Stack the data and convert to a data frame
corr_values = (corr_values
               .stack()
               .to_frame()
               .reset_index()
               .rename(columns={'level_0':'feature1',
                                'level_1':'feature2',
                                0:'correlation'}))

# Get the absolute values for sorting
corr_values['abs_correlation'] = corr_values.correlation.abs()

In [None]:
plt.figure(figsize=(12,10))
sns.histplot(corr_values['abs_correlation'], bins=20)
plt.xlabel('Absolute Correlation')
plt.ylabel('Frequency')
plt.show()

In [None]:
corr_values.sort_values('correlation', ascending=False).query('abs_correlation>0.8')

In [None]:
df.shape[1]

When we see that all independent variables are highly correlated, we may want to perform feature selection.

In [None]:
def remove_highly_correlated_features(df, threshold=0.99):
    corr_matrix = df[feature_cols].corr().abs()
    to_drop = set()
    for i in range(len(corr_matrix.columns)):
        col = corr_matrix.columns[i]
        if col in to_drop:
            continue
        for j in range(i + 1, len(corr_matrix.columns)):
            other_col = corr_matrix.columns[j]
            if corr_matrix.iloc[i, j] > threshold:
                to_drop.add(other_col)
    return df.drop(columns=list(to_drop))

data = remove_highly_correlated_features(df[feature_cols], threshold=1.0)

In [None]:
data.shape[1]

## <h2 style="font-size: 1.8em; font-weight: bold;"> Baseline Model Training </h2>

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

iso_forest = IsolationForest(n_estimators=100, contamination=0.02, max_features=1.0, random_state=42)
iso_forest.fit(X_train)
y_pred = iso_forest.predict(X_test)
# Convert -1 to 1 for fraud and 1 to 0 for non-fraud
y_pred = np.where(y_pred == -1, 1, 0) 
print("Classification Report for Isolation Forest:")
print(classification_report(y_test, y_pred, target_names=['Non-Fraud', 'Fraud']))   
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)  
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',    
            xticklabels=['Non-Fraud', 'Fraud'], 
            yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix for Isolation Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Isolation Forest: {accuracy:.2f}")
# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC Score of Isolation Forest: {roc_auc:.2f}")
# Plotting the ROC curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label='ROC Curve (AUC = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.title('ROC Curve for Isolation Forest') 
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()
# Plotting the feature importances
feature_importances = iso_forest.feature_importances_
feature_names = X_train.columns

plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importances, y=feature_names, palette='viridis')
plt.title('Feature Importances from Isolation Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()
# Plotting the decision function scores
decision_scores = iso_forest.decision_function(X_test)
plt.figure(figsize=(10, 6))
sns.histplot(decision_scores, bins=50, kde=True, color='blue')
plt.title('Decision Function Scores from Isolation Forest')
plt.xlabel('Decision Function Score')
plt.ylabel('Frequency')
plt.axvline(x=0, color='red', linestyle='--', label='Threshold (0)')
plt.legend()
plt.show()
# Plotting the distribution of decision function scores for fraud and non-fraud
plt.figure(figsize=(12, 6))
sns.histplot(decision_scores[y_test == 0], bins=50, kde=True,
                color='green', label='Non-Fraud', stat='density')
sns.histplot(decision_scores[y_test == 1], bins=50, kde=True,
                color='red', label='Fraud', stat='density')
plt.title('Distribution of Decision Function Scores for Fraud and Non-Fraud')
plt.xlabel('Decision Function Score')
plt.ylabel('Density')

plt.axvline(x=0, color='black', linestyle='--', label='Threshold (0)')
plt.legend()
plt.show()  
# Plotting the cumulative distribution function (CDF) of decision function scores
plt.figure(figsize=(12, 6))
sns.ecdfplot(decision_scores[y_test == 0], label='Non-Fraud',
                color='green', stat='density')
sns.ecdfplot(decision_scores[y_test == 1], label='Fraud',
                color='red', stat='density')
plt.title('Cumulative Distribution Function of Decision Function Scores')

plt.xlabel('Decision Function Score')
plt.ylabel('Cumulative Density')
plt.axvline(x=0, color='black', linestyle='--', label='Threshold (0)')
plt.legend()
plt.show()
    


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

def plot_conf_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_test_pre_lr))
print("Accuracy:", accuracy_score(y_test, y_test_pre_lr))
print(classification_report(y_test, y_test_pre_lr))
plot_conf_matrix(y_test, y_test_pre_lr, "Logistic Regression")


print("=== Random Forest  ===")
print("Accuracy:", accuracy_score(y_test, y_test_pre_rl))
print(classification_report(y_test, y_test_pre_rl))
plot_conf_matrix(y_test, y_test_pre_rl, "Random Forest")

# Convert to binary: 1 → 0 (normal), -1 → 1 (anomaly)
y_pred_iso_binary = [0 if x == 1 else 1 for x in y_test_pre_iso_f]
print("=== Isolation Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_iso_binary))
print(classification_report(y_test, y_pred_iso_binary))
plot_conf_matrix(y_test, y_pred_iso_binary, "Isolation Forest")


In [None]:
logreg_coefs = pd.DataFrame({
    'Feature': X_train.columns,
    'Value': lr.coef_[0],
    'Model': 'Logistic Regression'
})

rf_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Value': rf.feature_importances_,
    'Model': 'Random Forest'
})

try:
    iso_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Value': iso_f.feature_importances_,  # available in sklearn 1.1+
        'Model': 'Isolation Forest'
    })
except AttributeError:
    iso_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Value': [0]*len(X_train.columns),
        'Model': 'Isolation Forest (no feature_importances_)'
    })

In [None]:
rf_importance.sort_values('Value',ascending=False).query('Value > 0.1')

In [None]:
X_train_reset = X_train.reset_index(drop=True)
X_test_reset = X_test.reset_index(drop=True)

# Merge on all columns to find exact duplicate rows
overlap = pd.merge(X_train_reset, X_test_reset, how='inner')

print(f"Number of overlapping samples between train and test: {len(overlap)}")