In [42]:
import warnings

import joblib
import pandas as pd

# Read the processed bank-transaction table; later cells refer to it as `df`.
df = pd.read_csv('../data/processed_bank_transaction_data.csv')

# From this point on, suppress every warning category for the whole session.
warnings.filterwarnings('ignore')

In [43]:
# Load the preprocessing object persisted under ../artifacts.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): a later cell calls fit_transform on this object, which re-fits
# it on the current data — confirm that re-fitting a loaded artifact is intended.
preprocessor = joblib.load('../artifacts/preprocessor.joblib')

In [None]:
# ============================================
# STAGE 1: UNSUPERVISED ANOMALY DETECTION (ISOLATION FOREST)
# ============================================

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from category_encoders import HashingEncoder
# -------------------------
# 1. Load your processed dataset
# -------------------------
# The processed dataset was already loaded into `df` in an earlier cell.
# Identifier-like columns (AccountID, DeviceID, IP Address) are still present
# at this point; the next cell hash-encodes them before preprocessing.


In [None]:
# Build the feature matrix `x` used by the Isolation Forest and the classifiers.
# Rebind `df` to a copy of itself before the later cells add columns to it.
# NOTE(review): the Isolation Forest cell writes AnomalyFlag / AnomalyScore onto
# `df`; if this cell is re-run afterwards those columns are fed into the encoder
# and preprocessor as well — confirm the preprocessor drops them.
df = df.copy()

# Define features
# Identifier-like columns are hashed into 16 components by HashingEncoder.
hash_features = ['AccountID','DeviceID','IP Address']
hash_encode = HashingEncoder(cols=hash_features, n_components=16)
x_hashed = hash_encode.fit_transform(df)

# NOTE(review): fit_transform re-fits the preprocessor loaded from disk, and
# neither the re-fitted preprocessor nor `hash_encode` is saved in the visible
# cells — confirm the same transforms are available at inference time.
x = preprocessor.fit_transform(x_hashed)

In [None]:

# -------------------------
# 2. Configure the Isolation Forest
# -------------------------
# contamination is the expected fraction of anomalies and should be tuned.
iso_forest = IsolationForest(
    n_estimators=200,
    max_samples='auto',
    contamination=0.03,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# -------------------------
# 3. Fit on the preprocessed feature matrix
# -------------------------
iso_forest.fit(x)

# -------------------------
# 4. Score every row
# -------------------------
# predict()           -> -1 for anomalies, 1 for normal rows
# decision_function() -> larger values mean "more normal"
preds = iso_forest.predict(x)
scores = iso_forest.decision_function(x)

# Attach the results to the frame: the flag is 1 for anomalies, and the score
# is negated so that larger values mean "more anomalous".
df['AnomalyFlag'] = (preds == -1).astype(int)
df['AnomalyScore'] = np.negative(scores)

# -------------------------
# 5. Inspect the results
# -------------------------
n_flagged = df['AnomalyFlag'].sum()
print("Anomalies detected:", n_flagged, "out of", len(df))

# Histogram (with KDE overlay) of the inverted anomaly scores
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['AnomalyScore'], bins=50, kde=True, color='orange', ax=ax)
ax.set_title("Distribution of Anomaly Scores")
ax.set_xlabel("Anomaly Score")
ax.set_ylabel("Frequency")
plt.show()

# First ten transactions that were flagged as anomalous
flagged_rows = df.loc[df['AnomalyFlag'] == 1]
display(flagged_rows.head(10))

# -------------------------
# 6. Persist the fitted model
# -------------------------
joblib.dump(iso_forest, "../artifacts/isolation_forest_model.joblib")
print("✅ Isolation Forest model saved successfully.")

In [None]:
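# NOTE: this whole cell is a disabled PCA + KMeans exploration. It references
# `feature_names`, which is not defined in the visible cells; define it before
# re-enabling the code below.
# from sklearn.decomposition import PCA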
# from sklearn.cluster import KMeans
# x_df = pd.DataFrame(x, columns=feature_names)
# x_df['numeric__std'].fillna(0,inplace=True)
# # PCA reduction to 10D
# pca = PCA(n_components=min(10,x_df.shape[1]), random_state=30)
# x_pca = pca.fit_transform(x_df)

# df['PC1'] , df['PC2'] = x_pca[:,0] , x_pca[:,1]
# print(f'Explained variance by PCA components : {pca.explained_variance_ratio_.sum():.2f}')
# # apply k-means clustering on pca components
# k_means = KMeans(n_clusters=5, random_state=42)
# df['Cluster'] = k_means.fit_predict(x_pca)
# # visualise pca vs isolationforest anomalies
# plt.figure(figsize=(10,6))
# sns.scatterplot(x='PC1', y='PC2', hue='AnomalyFlag', data=df, palette={1: 'red', 0: 'blue'}, alpha=0.6)
# plt.title('Isolation Forest Anomalies (PCA visualizations)')
# plt.show()
# # Visualize KMeans clusters
# plt.figure(figsize=(10, 6))
# sns.scatterplot(
#     x='PC1', y='PC2',
#     hue='Cluster',
#     data=df,
#     palette='Set2',
#     alpha=0.6
# )
# plt.title("KMeans Clusters (PCA Visualization)")
# plt.show()

# cross_tab = pd.crosstab(df['Cluster'], df['AnomalyFlag'])
# sns.heatmap(cross_tab, annot=True, fmt='d', cmap='Blues')
# plt.title("KMeans Cluster vs Isolation Forest Anomaly Flag")
# plt.show()

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,classification_report
)

In [None]:
# ============================================
# STAGE 2: SUPERVISED CLASSIFIERS TRAINED ON THE ISOLATION FOREST FLAGS
# ============================================
# Target: the AnomalyFlag column written by the Isolation Forest cell
# (1 = flagged as anomaly, 0 = normal).
y = df['AnomalyFlag']

# Hold out 20% of the rows for evaluation. The split is stratified on the flag
# and seeded so that the reported metrics can be reproduced on a re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# One shared cross-validation scheme so both grid searches are tuned on the
# same kind of folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

# -------------------------
# Random Forest grid search
# -------------------------
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_grid = GridSearchCV(
    estimator=rf, param_grid=rf_params, cv=cv, n_jobs=-1, scoring='f1', verbose=1
)
print('Training RandomForest...(This May Take A While)')
rf_grid.fit(x_train, y_train)

# Best Random Forest found by the search (refit on the full training split)
rf_best_ = rf_grid.best_estimator_
print('-' * 70)

# -------------------------
# XGBoost grid search
# -------------------------
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# NOTE(review): the Random Forest uses class_weight='balanced' but XGBoost has
# no imbalance handling here — consider scale_pos_weight if recall matters.
xgb = XGBClassifier(objective='binary:logistic', random_state=30, eval_metric='logloss')
# cv=cv keeps the fold scheme identical to the Random Forest search.
xgb_grid = GridSearchCV(
    estimator=xgb, param_grid=xgb_params, cv=cv, scoring='f1', verbose=1, n_jobs=-1
)
print("Training XGBoost...(This May Take A While)")
xgb_grid.fit(x_train, y_train)

xgb_best_ = xgb_grid.best_estimator_

# -------------------------
# Evaluate both tuned models on the held-out split
# -------------------------
models = {'RandomForest' : rf_best_, 'Xgboost' : xgb_best_}
results = []

# Probability cut-off for the positive class; deliberately below 0.5.
threshold = 0.2

for name, model in models.items():
    y_probs = model.predict_proba(x_test)[:, 1]
    y_pred = (y_probs >= threshold).astype(int)
    metrics = {
        'Model' : name,
        'Accuracy' : accuracy_score(y_test, y_pred),
        'Precision' : precision_score(y_test, y_pred),
        'Recall' : recall_score(y_test, y_pred),
        'f1_score' : f1_score(y_test, y_pred),
        # ROC AUC is a ranking metric: it must be given the scores, not the
        # thresholded labels, otherwise it collapses to a single ROC point.
        'roc_auc_score' : roc_auc_score(y_test, y_probs)
    }
    results.append(metrics)
    print('-'*70)
    print(f'{name} Classification Report \n', classification_report(y_test, y_pred))
print('-'*70)
results_df = pd.DataFrame(results)
print('Model Comparison\n', results_df)

# -------------------------
# Persist the model with the highest F1 at the chosen threshold
# -------------------------
# NOTE(review): the other artifacts in this notebook are stored under
# ../artifacts — confirm ../models exists and is the intended location.
best_model_name = results_df.sort_values(by='f1_score', ascending=False).iloc[0]['Model']
best_model = models[best_model_name]
joblib.dump(best_model, f'../models/{best_model_name}_fraud_detector.pkl')
print('-'*70)
print(f"Best model '{best_model_name}' saved as '{best_model_name}_fraud_detector.pkl'")


Training RandomForest...(This May Take A While)
Fitting 5 folds for each of 24 candidates, totalling 120 fits
----------------------------------------------------------------------
Training XGBoost...(This May Take A While)
Fitting 5 folds for each of 32 candidates, totalling 160 fits
--------------------------------------------------
RandomForest Classification Report 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       488
           1       0.39      0.87      0.54        15

    accuracy                           0.96       503
   macro avg       0.69      0.91      0.76       503
weighted avg       0.98      0.96      0.96       503

--------------------------------------------------
Xgboost Classification Report 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       488
           1       0.43      0.40      0.41        15

    accuracy                           0.97       50