In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)


In [2]:
# Load the feature matrix and the label file from the working directory.
X = pd.read_csv("X_features.csv")

# Flatten the label file to a 1-D vector. `.to_numpy()` is the pandas-recommended
# replacement for `.values`.
y = pd.read_csv("y_labels.csv").to_numpy().ravel()

# `ravel()` silently flattens a multi-column file (e.g. one written with a stray
# index column) into a longer vector, so fail fast if features and labels no
# longer line up row-for-row.
assert len(X) == len(y), f"row mismatch: {len(X)} feature rows vs {len(y)} labels"

X.shape, y.shape


((594643, 17), (594643,))

In [3]:
# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [4]:
# Unsupervised detector: the forest is fitted on the training features only;
# no labels are passed to `fit`.
forest_params = {
    "n_estimators": 200,
    "max_samples": "auto",
    # Expected share of outliers; 0.012 is close to the ~1.2% positive rate
    # visible in the evaluation report's support counts (1440 of 118929).
    "contamination": 0.012,
    "random_state": 42,
    "n_jobs": -1,
}
iso_forest = IsolationForest(**forest_params)

# `fit` returns the estimator, so leaving it as the last expression displays
# the fitted model's parameters.
iso_forest.fit(X_train)


0,1,2
,n_estimators,200
,max_samples,'auto'
,contamination,0.012
,max_features,1.0
,bootstrap,False
,n_jobs,-1
,random_state,42
,verbose,0
,warm_start,False


In [5]:
# IsolationForest.predict marks outliers with -1 and inliers with +1.
y_pred_raw = iso_forest.predict(X_test)

# Re-encode as fraud labels for the metric functions:
# anything not flagged as an outlier -> 0, flagged outliers -> 1.
y_pred = np.where(y_pred_raw != -1, 0, 1)


In [6]:
# Score the binary predictions against the held-out labels, one metric at a time
# (order: precision, recall, F1).
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

precision, recall, f1


(0.6250861474844934, 0.6298611111111111, 0.62746454514009)

In [7]:
# decision_function is lower for more abnormal samples, so flip the sign:
# a larger `anomaly_scores` value then means "more likely fraud", which is the
# orientation roc_auc_score needs for the positive class (label 1).
raw_decision = iso_forest.decision_function(X_test)
anomaly_scores = -raw_decision

roc_auc = roc_auc_score(y_true=y_test, y_score=anomaly_scores)
roc_auc


0.9833064513841012

In [8]:
# Rows are the true classes and columns the predicted classes, both ordered 0 then 1.
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[116945,    544],
       [   533,    907]])

In [9]:
# Per-class precision / recall / F1 table, formatted with four decimal places.
# The report is a plain string, so it is printed rather than displayed.
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)


              precision    recall  f1-score   support

           0     0.9955    0.9954    0.9954    117489
           1     0.6251    0.6299    0.6275      1440

    accuracy                         0.9909    118929
   macro avg     0.8103    0.8126    0.8114    118929
weighted avg     0.9910    0.9909    0.9910    118929



Isolation Forest successfully identifies anomalous transactions in a highly imbalanced dataset.

The model achieves moderate recall (≈0.63): it captures roughly 63% of the fraudulent transactions in the test set (907 of 1,440) and misses the rest.

Precision is also moderate (≈0.63): 544 of the 1,451 flagged transactions are false positives. Some false positives are expected in anomaly detection systems, where they are generally preferred over false negatives.

The ROC-AUC score (≈0.98) confirms that the anomaly scores meaningfully separate fraud from legitimate transactions.

Due to its low latency, scalability, and interpretability, Isolation Forest is a strong candidate for baseline production deployment.

Isolation Forest is well-suited for fraud detection because it isolates rare and irregular observations rather than modeling normal behavior explicitly.
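Unlike supervised classifiers, it is trained without labels (here they are used only for the stratified split and for evaluation), so it does not rely on balanced labels and remains robust under extreme class imbalance.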
Additionally, its fast inference time makes it suitable for real-time fraud screening systems.

In [10]:
import joblib

# Persist the fitted detector to the working directory. joblib.dump returns the
# list of file names it wrote, which is what the cell displays.
# NOTE(review): the file is pickle-based — only load it from a trusted source,
# and preferably with the same scikit-learn version that produced it.
joblib.dump(value=iso_forest, filename="isolation_forest_model.pkl")


['isolation_forest_model.pkl']