In [22]:
import joblib
import pandas as pd
# Read the processed bank-transaction table that the later cells work from.
processed_csv_path = '../data/processed_bank_transaction_data.csv'
df = pd.read_csv(processed_csv_path)

In [23]:
# Restore the preprocessing object stored under ../artifacts.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifact
# files that come from a trusted source.
preprocessor_path = '../artifacts/preprocessor.joblib'
preprocessor = joblib.load(preprocessor_path)

In [24]:
# Copy of the transaction table without the transaction ID and the two date columns.
id_and_date_columns = ['TransactionID', 'TransactionDate', 'PreviousTransactionDate']
x_new = df.drop(columns=id_and_date_columns).copy()

In [31]:
# ============================================
# STAGE 1: UNSUPERVISED ANOMALY DETECTION (ISOLATION FOREST)
# ============================================

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from category_encoders import HashingEncoder
# -------------------------
# 1. Build the feature matrix from the processed dataset
# -------------------------
# `df` is the processed transaction table loaded at the top of the notebook.

# Columns that must not reach the model: the transaction ID, the two date
# columns, and the two result columns that the Isolation Forest cell writes
# back onto `df`. Excluding AnomalyFlag / AnomalyScore keeps this cell
# idempotent: without that, re-running it after the scoring cell would leave
# the model's own outputs in X.
non_feature_cols = [
    'TransactionID', 'TransactionDate', 'PreviousTransactionDate',
    'AnomalyFlag', 'AnomalyScore',
]
# errors='ignore' tolerates columns that are not present (e.g. on a first run
# the two result columns do not exist yet).
X = df.drop(columns=non_feature_cols, errors='ignore')

# Hash-encode the account / device / IP columns into 16 components.
hash_features = ['AccountID','DeviceID','IP Address']
hash_encode = HashingEncoder(cols=hash_features, n_components=16)
x_hashed = hash_encode.fit_transform(X)

# NOTE(review): fit_transform re-fits the preprocessor that was loaded from
# ../artifacts/preprocessor.joblib, so the in-memory object may no longer match
# the file on disk. If the stored artifact was already fitted on hash-encoded
# data, `preprocessor.transform(x_hashed)` is probably what is wanted -- confirm
# against the notebook that created the artifact.
x = preprocessor.fit_transform(x_hashed)

In [32]:
# Inspect the transformed feature matrix returned by preprocessor.fit_transform.
x

array([[-0.97127547,  1.42371826, -0.55244326, ...,  1.        ,
        59.        , 32.        ],
       [ 0.26943961,  1.31128706,  0.30531437, ...,  1.        ,
        63.        , 27.        ],
       [-0.58688162, -1.44327736, -0.90984227, ...,  1.        ,
        61.        , 30.        ],
       ...,
       [-0.92146186,  0.63669986,  0.37679417, ...,  1.        ,
        59.        , 25.        ],
       [-0.38241973, -1.21841495, -1.43879281, ...,  1.        ,
        62.        , 20.        ],
       [-0.18676257, -1.16219935, -0.38089174, ...,  0.        ,
        60.        , 20.        ]], shape=(2512, 39))

In [None]:

# -------------------------
# 2. Initialize Isolation Forest
# -------------------------
iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.03,        # expected fraction of anomalies (tune this)
    max_samples='auto',
    random_state=42,           # fixed seed so repeated runs give the same forest
    n_jobs=-1,
    verbose=1
)

# -------------------------
# 3. Train the model
# -------------------------
iso_forest.fit(x)

# -------------------------
# 4. Get predictions and anomaly scores
# -------------------------
# predictions: -1 = anomaly, 1 = normal
preds = iso_forest.predict(x)
scores = iso_forest.decision_function(x)  # higher score = more normal

# The results are written back onto `df` by position, so the feature matrix
# must have exactly one row per transaction.
assert len(preds) == len(df), "feature matrix and df have different row counts"

# Convert to readable form
df['AnomalyFlag'] = np.where(preds == -1, 1, 0)  # 1 = anomaly/fraud
df['AnomalyScore'] = -scores  # invert so higher means more anomalous

# -------------------------
# 5. Analyze results
# -------------------------
print("Anomalies detected:", df['AnomalyFlag'].sum(), "out of", len(df))

# Distribution of scores (explicit figure/axes instead of the pyplot state machine)
fig, ax = plt.subplots(figsize=(8,5))
sns.histplot(df['AnomalyScore'], bins=50, kde=True, color='orange', ax=ax)
ax.set_title("Distribution of Anomaly Scores")
ax.set_xlabel("Anomaly Score")
ax.set_ylabel("Frequency")
plt.show()

# Quick overview of anomaly transactions
display(df[df['AnomalyFlag'] == 1].head(10))

# -------------------------
# 6. Save model artifacts
# -------------------------
joblib.dump(iso_forest, "../artifacts/isolation_forest_model.joblib")
# The forest was trained on the output of `hash_encode` followed by the
# `preprocessor` as fitted in this session. Persist both next to the model so
# new data can be transformed the same way before scoring. Separate file names
# are used so the original preprocessor.joblib is not overwritten.
joblib.dump(hash_encode, "../artifacts/isolation_forest_hash_encoder.joblib")
joblib.dump(preprocessor, "../artifacts/isolation_forest_preprocessor.joblib")
print("✅ Isolation Forest model saved successfully.")