### *Import Libraries*

In [1]:
import numpy as np
import pandas as pd
from mlapp.ml.classification import LightGBM, DecisionTree, SVC

import warnings
warnings.filterwarnings("ignore")

### *Mock Data*

In [2]:
# Synthetic AML dataset
def generate_aml_data(n_rows=10000, fraud_rate=0.005, random_state=42):
    rng = np.random.default_rng(random_state)

    df = pd.DataFrame({
        "txn_amount_avg_7d": rng.gamma(2.0, 300, n_rows),
        "txn_amount_max_30d": rng.gamma(2.5, 500, n_rows),
        "txn_count_24h": rng.poisson(2, n_rows),
        "txn_count_7d": rng.poisson(10, n_rows),
        "cash_txn_ratio": rng.uniform(0, 1, n_rows),
        "round_amount_ratio": rng.uniform(0, 1, n_rows),
        "cross_border_ratio": rng.uniform(0, 1, n_rows),
        "high_risk_country_ratio": rng.uniform(0, 0.4, n_rows),
        "account_age_days": rng.integers(30, 5000, n_rows),
        "kyc_risk_score": rng.integers(1, 6, n_rows)
    })

    # AML signal
    risk_score = (
        0.003 * df["txn_amount_max_30d"]
        + 0.5 * df["cash_txn_ratio"]
        + 0.8 * df["high_risk_country_ratio"]
        + 0.4 * (df["txn_count_24h"] > 5).astype(int)
        - 0.0002 * df["account_age_days"]
        + 0.3 * (df["kyc_risk_score"] >= 4).astype(int)
    )

    threshold = np.quantile(risk_score, 1 - fraud_rate)
    df["is_aml"] = (risk_score >= threshold).astype(int)

    return df

# Generate data
df = generate_aml_data(n_rows=50000, fraud_rate=0.1)
X = df.drop(columns=["is_aml"])
y = df["is_aml"]

## *Data Loading*

In [None]:
df = pd.read_csv('data/AMLNet_August 2025.csv')
df.head()

### *Exploration Data Analysis*

In [None]:
df.info()

In [None]:
df.drop(columns=["step", "isFraud", "fraud_probability", "metadata"], inplace=True)

In [None]:
df = df[:5000000]

In [None]:
X = df.drop(columns=["isMoneyLaundering"])
y = df["isMoneyLaundering"]

In [None]:
# Initialize LightGBM
lgb_model = LightGBM()
lgb_model.set_tuning_params(
    num_leaves=[32],
    max_depth=[6],
    learning_rate=[0.03],
    n_estimators=[1000],
    scale_pos_weight=[200]
)

In [None]:
# Initialize DecisionTree
dc_model = DecisionTree()
lgb_model = dc_model
lgb_model.set_tuning_params()

In [None]:
# Initialize SVC
svc_model = SVC()
lgb_model = svc_model
lgb_model.set_tuning_params()

In [None]:
# Reuse the same preprocessing pipeline
lgb_model.data_pipeline.schema(enforce=True) \
    .infer_columns() \
    .impute(add_missing_indicators=True) \
    .rare_categories(min_freq=2) \
    .outliers(method="quantile", low_q=0.01, high_q=0.9) \
    .multicollinearity_corr(threshold=0.8) \
    .build()

In [None]:
# Train
X_test_lgb, y_test_lgb = lgb_model.fit(
    X, y,
    search_method='optuna',
    tuning_params=lgb_model.tuning_params,
    n_iter=50
)

In [None]:
# Predict
y_pred_lgb = lgb_model.predict(X_test_lgb)
y_pred_proba_lgb = lgb_model.predict_proba(X_test_lgb)

In [None]:
lgb_model.build_metrics(
    y_test=y_test_lgb, 
    y_pred=y_pred_lgb,
    keys=[
        "accuracy", 
        "confusion_matrix", 
        "f1",
        "recall_sensitivity"])

### *Run All Models*

In [10]:
MODEL_CONFIGS = {
    "lightgbm": {
        "class": LightGBM,
        "params": {
            "num_leaves": [32],
            "max_depth": [6],
            "learning_rate": [0.03],
            "n_estimators": [1000],
            "scale_pos_weight": [200],
        },
    },

    "decision_tree": {
        "class": DecisionTree,
        "params": {
            "max_depth": [4, 6, 8],
            "min_samples_leaf": [50, 100],
        },
    },

    "svc": {
        "class": SVC,
        "params": {
            "kernel": ["rbf"],
            "C": [0.1, 1, 10],
            "gamma": ["scale"],
        },
    },
}

In [7]:
def build_pipeline(model):
    model.data_pipeline.schema(enforce=True) \
    .infer_columns() \
    .impute(add_missing_indicators=True) \
    .rare_categories(min_freq=2) \
    .outliers(method="quantile", low_q=0.01, high_q=0.9) \
    .multicollinearity_corr(threshold=0.8) \
    .build()

In [8]:
def run_model(model_cls, params, X, y, n_iter=50):
    model = model_cls()
    model.set_tuning_params(**params)

    build_pipeline(model)

    X_test, y_test = model.fit(
        X,
        y,
        search_method="optuna",
        tuning_params=model.tuning_params,
        n_iter=n_iter,
    )

    return model, X_test, y_test

In [19]:
def evaluate_model(model, X_test, y_test, keys=None):
    """
    Predict and compute metrics for any model.

    Args:
        model : trained model instance
        X_test: test features
        y_test: test labels
        keys  : list of metric keys to compute
    Returns:
        metrics dictionary
    """
    # Predict labels
    y_pred = model.predict(X_test)

    # Predict probabilities (optional, only if available)
    try:
        y_pred_proba = model.predict_proba(X_test)
    except AttributeError:
        y_pred_proba = None

    # Build metrics
    metrics = model.build_metrics(
        y_test=y_test,
        y_pred=y_pred,
        keys=keys or [
            "accuracy",
            "confusion_matrix",
            "f1",
            "recall_sensitivity"
        ]
    )

    return {
        "y_pred": y_pred,
        "y_pred_proba": y_pred_proba,
        "metrics": metrics
    }

In [11]:
results = {}

for name, cfg in MODEL_CONFIGS.items():
    print(f"\nðŸš€ Training {name.upper()}")

    model, X_test, y_test = run_model(
        model_cls=cfg["class"],
        params=cfg["params"],
        X=X,
        y=y,
        n_iter=10,
    )

    results[name] = {
        "model": model,
        "metrics": model.metrics,
    }

2026-01-30 16:36:36,945 | INFO | [SPLIT] Train-test split started
2026-01-30 16:36:36,955 | INFO | [PREPROCESS] Fitting data pipeline
2026-01-30 16:36:37,046 | INFO | [TRAIN] Training started (optuna)
2026-01-30 16:36:37,047 | INFO | [SEARCH] Optuna running
[32m[I 2026-01-30 16:36:37,048][0m A new study created in memory with name: no-name-0e8d4b01-ba82-4f58-8608-f50b530bd42a[0m



ðŸš€ Training LIGHTGBM


[32m[I 2026-01-30 16:36:51,192][0m Trial 0 finished with value: 0.9896 and parameters: {'num_leaves': 32, 'max_depth': 6, 'learning_rate': 0.03, 'n_estimators': 1000, 'min_child_samples': 100, 'min_split_gain': 0.1, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'scale_pos_weight': 200}. Best is trial 0 with value: 0.9896.[0m
[32m[I 2026-01-30 16:37:06,506][0m Trial 1 finished with value: 0.9896750000000001 and parameters: {'num_leaves': 32, 'max_depth': 6, 'learning_rate': 0.03, 'n_estimators': 1000, 'min_child_samples': 100, 'min_split_gain': 0.05, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'reg_alpha': 2.0, 'reg_lambda': 1.0, 'scale_pos_weight': 200}. Best is trial 1 with value: 0.9896750000000001.[0m
[32m[I 2026-01-30 16:37:22,391][0m Trial 2 finished with value: 0.9890500000000001 and parameters: {'num_leaves': 32, 'max_depth': 6, 'learning_rate': 0.03, 'n_estimators': 1000, 'min_child_sam

[LightGBM] [Info] Number of positive: 4006, number of negative: 35994
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1809
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100150 -> initscore=-2.195559
[LightGBM] [Info] Start training from score -2.195559


2026-01-30 16:38:23,616 | INFO | Training finished in 106.67s
2026-01-30 16:38:23,618 | INFO | [SPLIT] Train-test split started



ðŸš€ Training DECISION_TREE


2026-01-30 16:38:23,627 | INFO | [PREPROCESS] Fitting data pipeline
2026-01-30 16:38:23,765 | INFO | [TRAIN] Training started (optuna)
2026-01-30 16:38:23,767 | INFO | [SEARCH] Optuna running
[32m[I 2026-01-30 16:38:23,768][0m A new study created in memory with name: no-name-9789dea7-f5e7-45cd-b4ab-f08d7ec6069d[0m
[32m[I 2026-01-30 16:38:24,163][0m Trial 0 finished with value: 0.9736750000000001 and parameters: {'criterion': 'log_loss', 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 50, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9736750000000001.[0m
[32m[I 2026-01-30 16:38:24,549][0m Trial 1 finished with value: 0.97355 and parameters: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 50, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9736750000000001.[0m
[32m[I 2026-01-30 16:38:25,062][0m Trial 2 finished with value: 0.98805 and parameters: {'criterion': 'entropy', 'max_depth': 6, 'min_samples_split': 2, 'min


ðŸš€ Training SVC


[32m[I 2026-01-30 16:38:36,197][0m Trial 0 finished with value: 0.9709 and parameters: {'C': 0.1, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 2}. Best is trial 0 with value: 0.9709.[0m
[32m[I 2026-01-30 16:38:40,087][0m Trial 1 finished with value: 0.9848250000000001 and parameters: {'C': 10, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 4}. Best is trial 1 with value: 0.9848250000000001.[0m
[32m[I 2026-01-30 16:38:43,733][0m Trial 2 finished with value: 0.9848250000000001 and parameters: {'C': 10, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 4}. Best is trial 1 with value: 0.9848250000000001.[0m
[32m[I 2026-01-30 16:38:48,339][0m Trial 3 finished with value: 0.9812750000000001 and parameters: {'C': 1, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 4}. Best is trial 1 with value: 0.9848250000000001.[0m
[32m[I 2026-01-30 16:38:57,641][0m Trial 4 finished with value: 0.9709 and parameters: {'C': 0.1, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 4}. Best is trial 1 with value:

In [18]:
print("\nðŸ“Š MODEL COMPARISON")
for name, res in results.items():
    m = res["metrics"]
    print(
        f"{name.upper():15s} | "
        f"Recall: {m['recall_sensitivity']['binary']:.4f} | "
        f"F1: {m['f1']['binary']:.4f} | "
        f"Acc: {m['accuracy']:.4f}"
    )


ðŸ“Š MODEL COMPARISON


KeyError: 'recall_sensitivity'