# Data Synthesis

## 0. Setup

In [72]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

rng = np.random.default_rng(seed=42)

# -------------------------
# Config knobs you can tweak
# -------------------------
N_EMPLOYERS = 300          # number of companies applying for work passes
N_WORKERS = 2000           # unique foreign workers in pool
N_APPLICATIONS = 20000      # total work pass applications (rows)
FRAUD_RATE = 0.04          # ~4% labelled suspicious
START_DATE = datetime(2025, 1, 1)
DAYS_SPAN = 60             # simulate 2 months of activity

## 1. Generate entity profiles

In [73]:
# Employers: assign each a sector and "typical salary band"
sectors = ["Construction", "Marine", "F&B", "Manufacturing", "Logistics", "Cleaning"]
employer_ids = np.arange(10000, 10000 + N_EMPLOYERS)

employer_df = pd.DataFrame({
    "employer_id": employer_ids,
    "sector": rng.choice(sectors, size=N_EMPLOYERS, replace=True),
    # base_salary_band is like their usual declared monthly salary for foreign hires
    "base_salary_band": rng.normal(loc=2800, scale=600, size=N_EMPLOYERS).clip(1600, 6000),
    # normal application volume per 30 days
    "baseline_applications_per_month": rng.poisson(lam=4, size=N_EMPLOYERS) + 1
})

# Workers (foreign hires) - assign nationality and "expected wage tier"
countries = ["CN","IN","BD","PH","MM","TH","ID","MY","LK","VN"]
worker_ids = np.arange(500000, 500000 + N_WORKERS)

worker_df = pd.DataFrame({
    "worker_id": worker_ids,
    "nationality": rng.choice(countries, size=N_WORKERS, replace=True),
    # skill_tier 1-3 affects expected salary (e.g. Tier 3 = higher skill, higher pay)
    "skill_tier": rng.integers(low=1, high=4, size=N_WORKERS)
})

# helper: expected salary by skill_tier
tier_salary_map = {
    1: (2200, 400),   # mean, std
    2: (3000, 500),
    3: (4200, 800),
}

## 2. Generate work pass applications (transactions)

In [74]:
# =========================
# 2. Generate work pass applications (transactions)
# =========================
# We'll sample applications: (employer, worker, timestamp, declared salary, etc.)

def random_application(i):
    # pick employer
    emp = employer_df.sample(1, weights=None, random_state=rng.integers(0, 10**9)).iloc[0]
    emp_id = emp.employer_id

    # pick worker
    wrk = worker_df.sample(1, random_state=rng.integers(0, 10**9)).iloc[0]
    wrk_id = wrk.worker_id

    # timestamp within window
    ts_offset_days = rng.integers(0, DAYS_SPAN)
    ts_offset_hours = rng.integers(0, 24)
    ts = START_DATE + timedelta(days=int(ts_offset_days), hours=int(ts_offset_hours))

    # "clean" (non-fraud) salary proposal normally tied to worker tier & employer band
    tier_mean, tier_std = tier_salary_map[wrk.skill_tier]
    base = (tier_mean + emp.base_salary_band) / 2   # average both expectations
    declared_salary_clean = rng.normal(loc=base, scale=(tier_std+200)/2)

    # Slight noise
    declared_salary_clean = np.clip(declared_salary_clean, 1600, 8000)

    # number of prior applications by this employer in last 30 days
    # (we approximate using employer baseline + Poisson jitter)
    recent_load_est = emp.baseline_applications_per_month + rng.poisson(2)

    return {
        "application_id": i,
        "timestamp": ts,
        "employer_id": emp_id,
        "sector": emp.sector,
        "worker_id": wrk_id,
        "nationality": wrk.nationality,
        "skill_tier": wrk.skill_tier,
        "declared_salary": round(declared_salary_clean, 0),
        "est_monthly_load": int(recent_load_est)
    }

applications = [random_application(i) for i in range(N_APPLICATIONS)]
df = pd.DataFrame(applications)
df.sort_values("timestamp", inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,application_id,timestamp,employer_id,sector,worker_id,nationality,skill_tier,declared_salary,est_monthly_load
0,10530,2025-01-01,10163,F&B,500460,VN,1,2842.0,11
1,15452,2025-01-01,10264,Cleaning,500507,MM,2,2689.0,9
2,4927,2025-01-01,10287,Manufacturing,501511,BD,3,4413.0,5
3,15872,2025-01-01,10232,Construction,500748,MY,2,3096.0,8
4,4293,2025-01-01,10040,Construction,500646,MY,1,2230.0,9


## 3. Inject suspicious patterns (fraud logic)

In [75]:
# We'll mark some rows as suspicious_flag = 1 using rules that MOM-style fraud teams care about:
#   Pattern A: Salary too low for claimed skill_tier
#   Pattern B: Sudden application spike by employer
#   Pattern C: High churn: same worker_id appearing with multiple different employers quickly

df["suspicious_flag"] = 0

# Pattern A: underpayment relative to tier
def expected_min_pay(tier):
    # super rough "legal/market floor" concept
    if tier == 1: return 1800
    if tier == 2: return 2600
    return 3500  # tier 3
underpaid_mask = df.apply(
    lambda row: row.declared_salary < expected_min_pay(row.skill_tier) * 0.7,
    axis=1
)
# What is loc? It's a way to access a group of rows and columns by labels or a boolean array.
# df[underpaid_mask, "suspicious_flag"] = 1 # Does this code work? Answer: No, it should be df.loc[underpaid_mask, "suspicious_flag"] = 1
df.loc[underpaid_mask, "suspicious_flag"] = 1

# Pattern B: spike in est_monthly_load (possible quota abuse / pass farming)
spike_mask = df["est_monthly_load"] > (df["est_monthly_load"].median() + 3*df["est_monthly_load"].std())
df.loc[spike_mask, "suspicious_flag"] = 1

# Pattern C: worker hopping employers in <4 days
# first, get worker timeline
df["timestamp_unix"] = df["timestamp"].astype("int64") // 10**9  # seconds
df = df.sort_values(["worker_id","timestamp_unix"])
df["prev_employer"] = df.groupby("worker_id")["employer_id"].shift(1)
df["prev_time"] = df.groupby("worker_id")["timestamp_unix"].shift(1)
rapid_reapply_mask = (
    (df["prev_employer"].notna()) &
    (df["prev_employer"] != df["employer_id"]) &
    ((df["timestamp_unix"] - df["prev_time"]) < 4*24*3600)  # < 4 days
)
df.loc[rapid_reapply_mask, "suspicious_flag"] = 1

# Clean up helper cols for presentation
df.drop(columns=["timestamp_unix","prev_employer","prev_time"], inplace=True)

df["suspicious_flag"] = df["suspicious_flag"].astype(int)

fraud_rate_actual = df["suspicious_flag"].mean()
print("Final suspicious rate:", round(fraud_rate_actual,4))

## Downsample suspicious rows to target fraud rate
target_rate = 0.03  # e.g. 3%

# Count how many are currently fraud / not fraud
fraud_mask = df["suspicious_flag"] == 1
N_fraud_orig = fraud_mask.sum()
N_norm_orig = (~fraud_mask).sum()

# desired fraud rows to keep
k = int((target_rate / (1 - target_rate)) * N_norm_orig)

# when more fraudulent rows than desired, downsample
if N_fraud_orig > k:
    fraud_indices_to_keep = (
        df[fraud_mask]
        .sample(n=k, random_state=42)
        .index
    )

    df = pd.concat([
        df[~fraud_mask],            # keep all normal
        df.loc[fraud_indices_to_keep]  # keep only k fraud
    ]).sample(frac=1, random_state=42).reset_index(drop=True)

# sanity check
final_rate = df["suspicious_flag"].mean()
print("Final suspicious rate (after proper downsampling):", round(final_rate, 4))
print("Total applications after downsampling:", len(df))



Final suspicious rate: 0.4744
Final suspicious rate (after proper downsampling): 0.03
Total applications after downsampling: 10837


In [159]:
sum(underpaid_mask)
df["est_monthly_load"].median() + 3*df["est_monthly_load"].std()

14.403361862391694

## Save to disk

In [76]:
df.to_csv("synthetic_workpass_fraud.csv", index=False)

# Data Augmentation

## Load and Inspect Data

In [168]:
df = pd.read_csv("synthetic_workpass_fraud.csv")
print("\nData types:\n", df.dtypes)


Data types:
 application_id        int64
timestamp            object
employer_id           int64
sector               object
worker_id             int64
nationality          object
skill_tier            int64
declared_salary     float64
est_monthly_load      int64
suspicious_flag       int64
dtype: object


## Convert Data Types

In [169]:
# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])
print(df.dtypes)


application_id               int64
timestamp           datetime64[ns]
employer_id                  int64
sector                      object
worker_id                    int64
nationality                 object
skill_tier                   int64
declared_salary            float64
est_monthly_load             int64
suspicious_flag              int64
dtype: object


## Create New Columns

Based on our synthetic data generation rules for fraud patterns, we can create new columns to help identify suspicious applications.  
Pattern A: Underpaid applicants (declared_salary significantly below expected for skill_tier)  
Pattern B: Spike in est_monthly_load (possible quota abuse / pass farming)  
Pattern C: Frequent changes in employment history (job hopping)  

In [170]:
# Pattern A: underpayment relative to tier
# Find mean pay for each skill tier
df["tier_mean_salary"] = df.groupby("skill_tier")["declared_salary"].transform("mean")
# Calculate salary deviation from tier mean
df["salary_deviation"] = df["declared_salary"] - df["tier_mean_salary"]
# Create tier_salary_ratio feature, if low ratio indicates underpayment
df["tier_salary_ratio"] = df["declared_salary"] / df["tier_mean_salary"]

# Pattern B: spike in est_monthly_load (possible quota abuse / pass farming)
# Calculate z-score for est_monthly_load 
df["load_zscore"] = (df["est_monthly_load"] - df["est_monthly_load"].mean()) / df["est_monthly_load"].std()

# Pattern C: worker hopping employers in <4 days
# Find previous application time for each worker
df = df.sort_values(["worker_id","timestamp"])
df["prev_application_time"] = df.groupby("worker_id")["timestamp"].shift(1)
# Calculate time difference in days since last application
df["days_since_last_application"] = (df["timestamp"] - df["prev_application_time"]).dt.days
# If first application, fill NaN with large number
df["days_since_last_application"].fillna(999, inplace=True)

# Sanity check for first applications
df[df["days_since_last_application"]==999].head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["days_since_last_application"].fillna(999, inplace=True)


Unnamed: 0,application_id,timestamp,employer_id,sector,worker_id,nationality,skill_tier,declared_salary,est_monthly_load,suspicious_flag,tier_mean_salary,salary_deviation,tier_salary_ratio,load_zscore,prev_application_time,days_since_last_application
1545,10016,2025-01-07 12:00:00,10003,F&B,500000,VN,1,2202.0,7,0,2498.71631,-296.71631,0.881253,0.000785,NaT,999.0
10441,2180,2025-01-13 14:00:00,10170,Construction,500001,ID,3,4559.0,5,0,3556.79025,1002.20975,1.281774,-0.809657,NaT,999.0
10155,10640,2025-01-01 10:00:00,10218,F&B,500002,MM,2,1866.0,7,0,2904.632302,-1038.632302,0.642422,0.000785,NaT,999.0
5333,7001,2025-01-02 00:00:00,10072,F&B,500003,MM,3,3662.0,7,0,3556.79025,105.20975,1.02958,0.000785,NaT,999.0
9202,11675,2025-01-05 06:00:00,10192,Manufacturing,500004,ID,3,2479.0,6,0,3556.79025,-1077.79025,0.696977,-0.404436,NaT,999.0


In [172]:
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=["sector", "nationality"])
df_encoded.head()

Unnamed: 0,application_id,timestamp,employer_id,worker_id,skill_tier,declared_salary,est_monthly_load,suspicious_flag,tier_mean_salary,salary_deviation,...,nationality_BD,nationality_CN,nationality_ID,nationality_IN,nationality_LK,nationality_MM,nationality_MY,nationality_PH,nationality_TH,nationality_VN
1545,10016,2025-01-07 12:00:00,10003,500000,1,2202.0,7,0,2498.71631,-296.71631,...,False,False,False,False,False,False,False,False,False,True
9235,8218,2025-01-15 13:00:00,10186,500000,1,2019.0,5,0,2498.71631,-479.71631,...,False,False,False,False,False,False,False,False,False,True
6413,14445,2025-01-21 15:00:00,10274,500000,1,2640.0,5,0,2498.71631,141.28369,...,False,False,False,False,False,False,False,False,False,True
575,5479,2025-02-06 18:00:00,10298,500000,1,2882.0,6,0,2498.71631,383.28369,...,False,False,False,False,False,False,False,False,False,True
1934,8978,2025-02-08 02:00:00,10177,500000,1,2520.0,7,1,2498.71631,21.28369,...,False,False,False,False,False,False,False,False,False,True


# Modelling

## Logistic Regression

First we fit a general logistic regression model to see which predictors are most important in identifying fraudulent applications. We can expect the engineered features to be significant predictors of fraud.

In [197]:
# Fit a general logistic regression model to see which predictors are most important in identifying fraudulent applications. We can expect the engineered features to be significant predictors of fraud.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Prepare features and target
feature_cols = [
    "declared_salary",
    "est_monthly_load",
    "skill_tier",
    "salary_deviation",
    "tier_salary_ratio",
    "load_zscore",
    "days_since_last_application"
] + [col for col in df_encoded.columns if col.startswith("sector_") or col.startswith("nationality_")] 
X = df_encoded[feature_cols].astype(float)
y = df_encoded["suspicious_flag"].astype(int)

# fit logistic regression without standardization
logit_model = sm.Logit(y, sm.add_constant(X))
result = logit_model.fit()
print(result.summary()) 

Optimization terminated successfully.
         Current function value: 0.127170
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:        suspicious_flag   No. Observations:                10837
Model:                          Logit   Df Residuals:                    10816
Method:                           MLE   Df Model:                           20
Date:                Tue, 28 Oct 2025   Pseudo R-squ.:                 0.05595
Time:                        20:49:22   Log-Likelihood:                -1378.1
converged:                       True   LL-Null:                       -1459.8
Covariance Type:            nonrobust   LLR p-value:                 1.695e-24
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                           0.5172   3.07e+06   1.68e-07      1.000  

As predicted, nationality and sector dummies are not significant predictors of fraud, while our engineered features show strong significance. Due to perfect correlation between load_zscore and est_monthly_load, we drop load_zscore from the model.

In [248]:
feature_cols = [
    "declared_salary",
    "est_monthly_load",
    "skill_tier",
    "salary_deviation",
    "tier_salary_ratio",
    # "load_zscore",
    "days_since_last_application"
]
X = df_encoded[feature_cols].astype(float)
y = df_encoded["suspicious_flag"].astype(int)

# fit logistic regression without standardization
logit_model = sm.Logit(y, sm.add_constant(X))
result = logit_model.fit()
print(result.summary()) 

Optimization terminated successfully.
         Current function value: 0.128045
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:        suspicious_flag   No. Observations:                10837
Model:                          Logit   Df Residuals:                    10830
Method:                           MLE   Df Model:                            6
Date:                Tue, 28 Oct 2025   Pseudo R-squ.:                 0.04945
Time:                        22:07:37   Log-Likelihood:                -1387.6
converged:                       True   LL-Null:                       -1459.8
Covariance Type:            nonrobust   LLR p-value:                 1.186e-28
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const                           4.2356      3.205      1.322      0.186  

Type II Error (missed fraud): 0.142  
Type I Error (false alarms): 0.641   
ROC AUC: 0.665  

While the Type II error is acceptable, the high Type I error indicates many false alarms. Further model tuning and feature engineering may be needed to improve precision in fraud detection.

In [202]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

logit = LogisticRegression(class_weight='balanced', max_iter=1000)
logit.fit(X, y)
y_pred = logit.predict(X)

cm = confusion_matrix(y, y_pred)
tn, fp, fn, tp = cm.ravel()

recall = tp / (tp + fn)
type2_error = fn / (tp + fn)   # 1 - recall
type1_error = fp / (fp + tn)   # 1 - specificity

print(f"Recall: {recall:.3f}")
print(f"Type II Error (missed fraud): {type2_error:.3f}")
print(f"Type I Error (false alarms): {type1_error:.3f}")

# ROC AUC
from sklearn.metrics import roc_auc_score
y_prob = logit.predict_proba(X)[:, 1]
roc_auc = roc_auc_score(y, y_prob)
print(f"ROC AUC: {roc_auc:.3f}")

Recall: 0.858
Type II Error (missed fraud): 0.142
Type I Error (false alarms): 0.641
ROC AUC: 0.665


In [207]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

def evaluate_lr_cv(X, y, n_splits=5, threshold=0.5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    recalls = []
    type2_errors = []
    type1_errors = []
    aucs = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # train weighted logistic regression
        logit = LogisticRegression(class_weight='balanced', max_iter=1000)
        logit.fit(X_train, y_train)

        # predicted probs on test fold
        y_prob = logit.predict_proba(X_test)[:, 1]

        # convert to hard labels using threshold
        y_pred = (y_prob > threshold).astype(int)

        # confusion matrix on test fold
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        type2_error = fn / (tp + fn) if (tp + fn) > 0 else 0.0      # 1 - recall
        type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0.0      # FPR
        auc = roc_auc_score(y_test, y_prob)

        recalls.append(recall)
        type2_errors.append(type2_error)
        type1_errors.append(type1_error)
        aucs.append(auc)

    print(f"Recall: {np.mean(recalls):.3f}")
    print(f"Type II Error (missed fraud): {np.mean(type2_errors):.3f}")
    print(f"Type I Error (false alarms): {np.mean(type1_errors):.3f}")
    print(f"ROC AUC: {np.mean(aucs):.3f}")

# Evaluate for different thresholds
for thresh in [0.3, 0.5, 0.7]:
    print(f"\nEvaluating at threshold = {thresh}")
    evaluate_lr_cv(X, y, n_splits=5, threshold=thresh)
    



Evaluating at threshold = 0.3
Recall: 0.991
Type II Error (missed fraud): 0.009
Type I Error (false alarms): 0.810
ROC AUC: 0.657

Evaluating at threshold = 0.5
Recall: 0.837
Type II Error (missed fraud): 0.163
Type I Error (false alarms): 0.634
ROC AUC: 0.657

Evaluating at threshold = 0.7
Recall: 0.003
Type II Error (missed fraud): 0.997
Type I Error (false alarms): 0.000
ROC AUC: 0.657


## Decision Trees

Evaluating Decision Tree at threshold = 0.26  
Recall: 0.898  
Type II Error (missed fraud): 0.102  
Type I Error (false alarms): 0.285  
ROC AUC: 0.91  

Decision trees hold a high AUC value of 0.917, indicating strong discriminatory power between fraudulent and legitimate applications. Setting the threshold to 0.26 achieves a good balance between catching fraud (high recall) while keeping false alarms relatively low. Further tuning of tree parameters and thresholds can help optimize performance for specific operational needs in fraud detection.


In [234]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score

# Define model
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Define parameter grid to search
param_grid = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_split': [2, 10, 20, 30],
    'min_samples_leaf': [1, 5, 10, 20]
}

small_grid = {'max_depth':[4,6], 'min_samples_split':[2,10], 'min_samples_leaf':[1,5]}

# Define stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use ROC-AUC as the scoring metric
scorer = make_scorer(roc_auc_score)

# Set up grid search
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv
)

# Run search
grid_search.fit(X, y)


# Print best results
print("Best Parameters:", grid_search.best_params_)
print(f"Best ROC-AUC: {grid_search.best_score_:.3f}")



Best Parameters: {'max_depth': 6, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best ROC-AUC: 0.865


In [240]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

def evaluate_dt_cv(X, y, n_splits=5, threshold=0.5,
                   max_depth=None,
                   min_samples_split=2,
                   min_samples_leaf=1):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    recalls = []
    type2_errors = []
    type1_errors = []
    aucs = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # train weighted decision tree
        dt = DecisionTreeClassifier(
            class_weight='balanced',
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
        dt.fit(X_train, y_train)

        # predicted probs on test fold for the positive class (fraud = 1)
        y_prob = dt.predict_proba(X_test)[:, 1]

        # convert to hard labels using threshold
        y_pred = (y_prob > threshold).astype(int)

        # confusion matrix on test fold
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        type2_error = fn / (tp + fn) if (tp + fn) > 0 else 0.0      # miss rate
        type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0.0      # false alarm rate
        auc = roc_auc_score(y_test, y_prob)

        recalls.append(recall)
        type2_errors.append(type2_error)
        type1_errors.append(type1_error)
        aucs.append(auc)

    print(f"Recall: {np.mean(recalls):.3f}")
    print(f"Type II Error (missed fraud): {np.mean(type2_errors):.3f}")
    print(f"Type I Error (false alarms): {np.mean(type1_errors):.3f}")
    print(f"ROC AUC: {np.mean(aucs):.3f}")

# Evaluate tree at different decision thresholds
for thresh in [0.22, 0.24, 0.26, 0.28, 0.3]:
    print(f"\nEvaluating Decision Tree at threshold = {thresh}")
    evaluate_dt_cv(
        X, y,
        n_splits=5,
        threshold=thresh,
        max_depth=6,
        min_samples_split=2,
        min_samples_leaf=5
    )



Evaluating Decision Tree at threshold = 0.22
Recall: 0.923
Type II Error (missed fraud): 0.077
Type I Error (false alarms): 0.342
ROC AUC: 0.917

Evaluating Decision Tree at threshold = 0.24
Recall: 0.920
Type II Error (missed fraud): 0.080
Type I Error (false alarms): 0.324
ROC AUC: 0.917

Evaluating Decision Tree at threshold = 0.26
Recall: 0.898
Type II Error (missed fraud): 0.102
Type I Error (false alarms): 0.285
ROC AUC: 0.917

Evaluating Decision Tree at threshold = 0.28
Recall: 0.892
Type II Error (missed fraud): 0.108
Type I Error (false alarms): 0.278
ROC AUC: 0.917

Evaluating Decision Tree at threshold = 0.3
Recall: 0.892
Type II Error (missed fraud): 0.108
Type I Error (false alarms): 0.275
ROC AUC: 0.917


## Random Forests

Evaluating Random Forest at threshold = 0.32  
Recall: 0.858  
Type II Error (missed fraud): 0.142  
Type I Error (false alarms): 0.182  
ROC AUC: 0.931  

Random Forests further improve fraud detection performance, achieving a high AUC of 0.931. At a threshold of 0.32, the model maintains a low Type II error of 0.142 while significantly reducing Type I error to 0.182 compared to decision trees. This indicates fewer false alarms while still effectively identifying fraudulent applications.

In [241]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score

# Define model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Define parameter grid to search
param_grid = {
    'n_estimators': [100, 200],          # number of trees
    'max_depth': [4, 6, 8, 10],          # tree depth
    'min_samples_split': [2, 10, 20],    # min samples to split a node
    'min_samples_leaf': [1, 5, 10],      # min samples at a leaf
    'max_features': ['sqrt', 'log2']     # number of features per split
}

# Define stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use ROC-AUC as the scoring metric
scorer = make_scorer(roc_auc_score)

# Set up grid search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,       # parallelize
    verbose=2
)

# Run search
grid_search.fit(X, y)

# Print best results
print("Best Parameters:", grid_search.best_params_)
print(f"Best ROC-AUC: {grid_search.best_score_:.3f}")


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best ROC-AUC: 0.868


In [245]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

def evaluate_rf_cv(X, y,
                   n_splits=5,
                   threshold=0.5,
                   n_estimators=200,
                   max_depth=None,
                   min_samples_split=2,
                   min_samples_leaf=1,
                   max_features='sqrt',
                   random_state=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    recalls = []
    type2_errors = []
    type1_errors = []
    aucs = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # train weighted random forest
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            class_weight='balanced',
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=random_state,
            n_jobs=-1
        )
        rf.fit(X_train, y_train)

        # predicted probs on test fold for the positive class (fraud = 1)
        y_prob = rf.predict_proba(X_test)[:, 1]

        # convert to hard labels using chosen threshold
        y_pred = (y_prob > threshold).astype(int)

        # confusion matrix on test fold
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        type2_error = fn / (tp + fn) if (tp + fn) > 0 else 0.0      # miss rate
        type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0.0      # false alarm rate
        auc = roc_auc_score(y_test, y_prob)

        recalls.append(recall)
        type2_errors.append(type2_error)
        type1_errors.append(type1_error)
        aucs.append(auc)

    print(f"Recall: {np.mean(recalls):.3f}")
    print(f"Type II Error (missed fraud): {np.mean(type2_errors):.3f}")
    print(f"Type I Error (false alarms): {np.mean(type1_errors):.3f}")
    print(f"ROC AUC: {np.mean(aucs):.3f}")

# {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
# Sweep thresholds 
for thresh in [0.3, 0.32, 0.34, 0.36, 0.38]:
    print(f"\nEvaluating Random Forest at threshold = {thresh}")
    evaluate_rf_cv(
        X, y,
        n_splits=5,
        threshold=thresh,
        n_estimators=100,        
        max_depth=4,             
        min_samples_split=10,    
        min_samples_leaf=1,      
        max_features='sqrt',     
        random_state=42
    )



Evaluating Random Forest at threshold = 0.3
Recall: 0.902
Type II Error (missed fraud): 0.098
Type I Error (false alarms): 0.240
ROC AUC: 0.931

Evaluating Random Forest at threshold = 0.32
Recall: 0.858
Type II Error (missed fraud): 0.142
Type I Error (false alarms): 0.182
ROC AUC: 0.931

Evaluating Random Forest at threshold = 0.34
Recall: 0.825
Type II Error (missed fraud): 0.175
Type I Error (false alarms): 0.114
ROC AUC: 0.931

Evaluating Random Forest at threshold = 0.36
Recall: 0.766
Type II Error (missed fraud): 0.234
Type I Error (false alarms): 0.058
ROC AUC: 0.931

Evaluating Random Forest at threshold = 0.38
Recall: 0.754
Type II Error (missed fraud): 0.246
Type I Error (false alarms): 0.027
ROC AUC: 0.931


## CatBoost

Evaluating CatBoost at threshold = 0.34  
Recall: 0.855  
Type II Error (missed fraud): 0.145  
Type I Error (false alarms): 0.148  
ROC AUC: 0.932  
CatBoost achieves the highest AUC of 0.932 among the models tested, indicating excellent capability in distinguishing fraudulent from legitimate applications. At a threshold of 0.34, it maintains a low Type II error of 0.145 while further reducing Type I error to 0.148. This balance makes CatBoost a strong candidate for deployment in fraud detection systems, effectively minimizing both missed fraud and false alarms.

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

# custom scorer because older sklearn in your env doesn't support needs_proba in make_scorer
def auc_scorer(estimator, X_val, y_val):
    # predict_proba returns [:,1] = P(class 1)
    y_prob = estimator.predict_proba(X_val)[:, 1]
    # guard: if one-class or constant probs, fallback to 0.5
    if len(set(y_val)) < 2:
        return 0.5
    if (y_prob == y_prob[0]).all():
        return 0.5
    return roc_auc_score(y_val, y_prob)

# base model
# We'll keep verbose=0 so it doesn't spam during GridSearchCV
cat = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=0,
    random_state=42,
    # handle class imbalance:
    # we can set scale_pos_weight=y_neg/y_pos if you want stronger minority weighting.
    # we'll leave it out in the grid for now, can add later.
)

# parameter grid
# Note: CatBoost has many hyperparams; start small or it'll explode in runtime.
param_grid = {
    'depth': [4, 6, 8],          # tree depth (like max_depth)
    'learning_rate': [0.05, 0.1],
    'n_estimators': [200, 400],  # boosting rounds
    'l2_leaf_reg': [1, 5, 10],   # L2 regularization on leaves
    'border_count': [32, 64]     # number of splits for numeric features
}

# stratified CV same style
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=cat,
    param_grid=param_grid,
    scoring=auc_scorer,   # custom scorer that uses predict_proba
    cv=cv,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print(f"Best ROC-AUC: {grid_search.best_score_:.3f}")


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'border_count': 64, 'depth': 4, 'l2_leaf_reg': 1, 'learning_rate': 0.05, 'n_estimators': 200}
Best ROC-AUC: 0.937


In [253]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

def evaluate_catboost_cv(X, y,
                         n_splits=5,
                         threshold=0.5,
                         depth=6,
                         border_count=32,
                         learning_rate=0.1,
                         n_estimators=200,
                         l2_leaf_reg=3,
                         random_state=42,
                         scale_pos_weight=None):

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    recalls, type2_errors, type1_errors, aucs = [], [], [], []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # train CatBoost (silent)
        model = CatBoostClassifier(
            depth=depth,
            border_count=border_count,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            l2_leaf_reg=l2_leaf_reg,
            scale_pos_weight=scale_pos_weight,
            loss_function='Logloss',
            eval_metric='AUC',
            random_state=random_state,
            verbose=0
        )
        model.fit(X_train, y_train)

        # predicted probs on test fold
        y_prob = model.predict_proba(X_test)[:, 1]

        # convert to hard labels
        y_pred = (y_prob > threshold).astype(int)

        # confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        type2_error = fn / (tp + fn) if (tp + fn) > 0 else 0.0      # miss rate
        type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0.0      # false alarm rate
        auc = roc_auc_score(y_test, y_prob)

        recalls.append(recall)
        type2_errors.append(type2_error)
        type1_errors.append(type1_error)
        aucs.append(auc)

    print(f"Recall: {np.mean(recalls):.3f}")
    print(f"Type II Error (missed fraud): {np.mean(type2_errors):.3f}")
    print(f"Type I Error (false alarms): {np.mean(type1_errors):.3f}")
    print(f"ROC AUC: {np.mean(aucs):.3f}")


# ---- Run CV across thresholds ----
# Compute positive/negative ratio to handle imbalance
pos_weight = (y == 0).sum() / (y == 1).sum()
print(f"scale_pos_weight ≈ {pos_weight:.2f}")

# Best Parameters: {'border_count': 64, 'depth': 4, 'l2_leaf_reg': 1, 'learning_rate': 0.05, 'n_estimators': 200}

for thresh in [0.3, 0.32, 0.34, 0.36, 0.38]:
    print(f"\nEvaluating CatBoost at threshold = {thresh}")
    evaluate_catboost_cv(
        X, y,
        n_splits=5,
        threshold=thresh,
        depth=4,
        border_count=64,
        learning_rate=0.05,
        n_estimators=200,
        l2_leaf_reg=1,
        scale_pos_weight=pos_weight,  # handles imbalance
        random_state=42
    )


scale_pos_weight ≈ 32.34

Evaluating CatBoost at threshold = 0.3
Recall: 0.871
Type II Error (missed fraud): 0.129
Type I Error (false alarms): 0.190
ROC AUC: 0.932

Evaluating CatBoost at threshold = 0.32
Recall: 0.865
Type II Error (missed fraud): 0.135
Type I Error (false alarms): 0.168
ROC AUC: 0.932

Evaluating CatBoost at threshold = 0.34
Recall: 0.855
Type II Error (missed fraud): 0.145
Type I Error (false alarms): 0.148
ROC AUC: 0.932

Evaluating CatBoost at threshold = 0.36
Recall: 0.846
Type II Error (missed fraud): 0.154
Type I Error (false alarms): 0.129
ROC AUC: 0.932

Evaluating CatBoost at threshold = 0.38
Recall: 0.834
Type II Error (missed fraud): 0.166
Type I Error (false alarms): 0.110
ROC AUC: 0.932


## Unsupervised Approaches

KMeans achieve the highest ROC-AUC of 0.663, overall all unsupervised models perform worse than supervised models due to the lack of label information during training. However, they can still provide value in scenarios where labeled data is scarce or unavailable.

In [267]:
# SMOTEEN
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier

smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
print("Original dataset shape:", np.bincount(y))
print("Resampled dataset shape:", np.bincount(y_resampled))

Original dataset shape: [10512   325]
Resampled dataset shape: [7995 8815]


In [268]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
y_true = y_resampled.values


In [269]:
# Percentage of frauds
fraud_ratio = y_true.sum() / len(y_true)
print(f"Fraud ratio in dataset: {fraud_ratio:.4f}")

Fraud ratio in dataset: 0.5244


In [270]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, roc_auc_score

# train unsupervised model
iso = IsolationForest(
    n_estimators=200,
    contamination=0.03,   # expected fraction of frauds
    random_state=42
)
iso.fit(X_scaled)

# predict anomalies (-1 = anomaly, 1 = normal)
y_pred_unsup = iso.predict(X_scaled)

# convert to binary 0/1 (fraud=1)
y_pred_unsup = np.where(y_pred_unsup == -1, 1, 0)

# evaluate against true fraud labels
cm = confusion_matrix(y_true, y_pred_unsup)
tn, fp, fn, tp = cm.ravel()

recall = tp / (tp + fn)
precision = tp / (tp + fp)
f1 = 2 * (precision * recall) / (precision + recall)
# invert
auc = roc_auc_score(y_true, -y_pred_unsup)

print("=== Isolation Forest ===")
print(f"Recall (catch frauds): {recall:.3f}")
print(f"Precision (accuracy of fraud flags): {precision:.3f}")
print(f"F1 score: {f1:.3f}")
print(f"ROC AUC: {auc:.3f}")
print(f"Confusion matrix:\n{cm}")


=== Isolation Forest ===
Recall (catch frauds): 0.014
Precision (accuracy of fraud flags): 0.248
F1 score: 0.027
ROC AUC: 0.517
Confusion matrix:
[[7615  380]
 [8690  125]]


In [271]:
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score

kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scaled)

# compute distance to nearest cluster center
dist = np.min(kmeans.transform(X_scaled), axis=1)

# flag high-distance points as anomalies
threshold = np.percentile(dist, 97)  # top 3% farthest points
y_pred_kmeans = (dist > threshold).astype(int)

# invert
auc = roc_auc_score(y_true, -dist)
print(f"KMeans ROC-AUC: {auc:.3f}")


KMeans ROC-AUC: 0.663


In [282]:
from sklearn.svm import OneClassSVM

ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.05)
ocsvm.fit(X_scaled)
scores = -ocsvm.decision_function(X_scaled)
auc = roc_auc_score(y_true, -scores)
print(f"One-Class SVM ROC-AUC: {auc:.3f}")

One-Class SVM ROC-AUC: 0.537


In [281]:
from tensorflow.keras import layers, models

inp = layers.Input(shape=(X_scaled.shape[1],))
enc = layers.Dense(32, activation='relu')(inp)
enc = layers.Dense(16, activation='relu')(enc)
dec = layers.Dense(32, activation='relu')(enc)
out = layers.Dense(X_scaled.shape[1], activation='linear')(dec)
autoencoder = models.Model(inp, out)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_scaled, X_scaled, epochs=30, batch_size=64, verbose=0)

recon_error = np.mean((X_scaled - autoencoder.predict(X_scaled))**2, axis=1)
auc = roc_auc_score(y_true, -recon_error)
print(f"Autoencoder ROC-AUC: {auc:.3f}")

[1m526/526[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 714us/step
Autoencoder ROC-AUC: 0.635


In [278]:
from pyod.models.iforest import IForest
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.copod import COPOD

model = COPOD()    # completely unsupervised probabilistic detector
model.fit(X_scaled)
scores = model.decision_function(X_scaled)
roc_auc_score(y_true, -scores)


0.6113450515193665

In [279]:
scores = (
    -iso.decision_function(X_scaled) +
    -ocsvm.decision_function(X_scaled) +
    -ell.decision_function(X_scaled)
) / 3
auc = roc_auc_score(y_true, -scores)
print(f"Ensembled ROC-AUC: {auc:.3f}")

Ensembled ROC-AUC: 0.631
