In [4]:
import numpy as np
import pandas as pd
import random

In [5]:
np.random.seed(42)
random.seed(42)

def generate_synthetic_data(n_samples=20000):

    data = []

    for _ in range(n_samples):

        # ---- Latent Fraud Intensity (skewed mostly low) ----
        true_fraud_intensity = np.random.beta(4, 4)  # mostly low, some medium, few high

        # ---- Structured Features Derived From Latent Fraud ----

        order_velocity = np.random.normal(
            loc=true_fraud_intensity * 10,
            scale=1.5
        )

        card_velocity = np.random.normal(
            loc=true_fraud_intensity * 8,
            scale=1.2
        )

        risky_pattern_flag = np.random.binomial(
            1, true_fraud_intensity * 0.8
        )

        bad_gsi_flag = np.random.binomial(
            1, true_fraud_intensity * 0.7
        )

        unpaid_orders = np.random.poisson(
            lam=true_fraud_intensity * 3
        )

        prior_disputes = np.random.poisson(
            lam=true_fraud_intensity * 4
        )

        account_age_days = np.random.exponential(
            scale=400 * (1 - true_fraud_intensity + 0.1)
        )

        linked_accounts_count = np.random.poisson(
            lam=true_fraud_intensity * 2
        )

        # Ensure no negatives from normal distributions
        order_velocity = max(order_velocity, 0)
        card_velocity = max(card_velocity, 0)

        # ---- Annotation Derived From Signals ----
        annotation_parts = []

        if order_velocity > 6:
            annotation_parts.append("High order velocity observed.")

        if card_velocity > 5:
            annotation_parts.append("Card velocity spike detected.")

        if risky_pattern_flag:
            annotation_parts.append("Risky ordering pattern consistent with abuse.")

        if bad_gsi_flag:
            annotation_parts.append("Browser/timezone anomaly and recent password change.")

        if unpaid_orders > 1:
            annotation_parts.append("Multiple unpaid orders linked to account.")

        if not annotation_parts:
            annotation_parts.append("No strong fraud indicators identified.")

        annotation_text = " ".join(annotation_parts)

        # ---- Approval Logic (Investigator 90% Correct) ----
        approval_prob = true_fraud_intensity * 0.9 + np.random.normal(0, 0.05)
        approval_prob = min(max(approval_prob, 0), 1)

        approved = np.random.binomial(1, approval_prob)

        data.append([
            order_velocity,
            card_velocity,
            risky_pattern_flag,
            bad_gsi_flag,
            unpaid_orders,
            prior_disputes,
            account_age_days,
            linked_accounts_count,
            annotation_text,
            approved
        ])

    columns = [
        "order_velocity",
        "card_velocity",
        "risky_pattern_flag",
        "bad_gsi_flag",
        "unpaid_orders",
        "prior_disputes",
        "account_age_days",
        "linked_accounts_count",
        "annotation_text",
        "approved"
    ]

    return pd.DataFrame(data, columns=columns)


In [10]:
df_mixed = generate_synthetic_data(20000)

print("Mixed Approval Rate:", df_mixed["approved"].mean())

Mixed Approval Rate: 0.4537


In [11]:
df_mixed["approved"].value_counts(normalize=True)


approved
0    0.5463
1    0.4537
Name: proportion, dtype: float64

In [12]:
df_mixed.corr(numeric_only=True)["approved"].sort_values(ascending=False)


approved                 1.000000
order_velocity           0.234553
card_velocity            0.229069
prior_disputes           0.131230
unpaid_orders            0.115761
linked_accounts_count    0.115555
bad_gsi_flag             0.085675
risky_pattern_flag       0.077181
account_age_days        -0.077668
Name: approved, dtype: float64

XGBoost


In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Features (NO annotation_text)
X = df_mixed.drop(columns=["annotation_text", "approved"])
y = df_mixed["approved"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

xgb_model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_prob = xgb_model.predict_proba(X_test)[:, 1]

roc = roc_auc_score(y_test, y_prob)

print("Structured-only XGBoost ROC-AUC:", roc)


Structured-only XGBoost ROC-AUC: 0.6506341638954193


In [15]:
from sentence_transformers import SentenceTransformer

model_embed = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model_embed.encode(
    df_mixed["annotation_text"].tolist(),
    show_progress_bar=True
)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1078.39it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 625/625 [00:23<00:00, 26.69it/s]


In [16]:
import numpy as np

X_struct = df_mixed.drop(columns=["annotation_text", "approved"]).values

X_combined = np.hstack([X_struct, embeddings])
y = df_mixed["approved"].values


In [17]:
embeddings.shape


(20000, 384)

In [18]:
import numpy as np

# Structured features only
X_struct = df_mixed.drop(columns=["annotation_text", "approved"]).values

# Combine structured + embeddings
X_combined = np.hstack([X_struct, embeddings])

y = df_mixed["approved"].values


In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

xgb_combined = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_combined.fit(X_train, y_train)

y_prob_combined = xgb_combined.predict_proba(X_test)[:, 1]

roc_combined = roc_auc_score(y_test, y_prob_combined)

print("Structured + Embeddings XGBoost ROC-AUC:", roc_combined)


Structured + Embeddings XGBoost ROC-AUC: 0.6278704741370559
