In [1]:
import numpy as np
import pandas as pd
import random

# Seed both module-level generators used in this notebook: NumPy's legacy
# global RNG (drawn from by generate_core_data) and the stdlib `random`
# module (drawn from by add_varied_annotations via random.choice).
np.random.seed(42)
random.seed(42)

def generate_core_data(n_samples=20000, seed=None):
    """Simulate a queue of suspicious orders with an approval label.

    Every row is generated from three latent quantities:

    * ``BR`` ~ Beta(4, 4)            behavioural risk strength
    * ``RQ`` ~ Beta(3, 3)            investigator reasoning quality, drawn
                                     independently of ``BR``
    * ``CS`` ~ Bernoulli(0.6 * BR)   context signal

    Observable features are noisy functions of ``BR`` only:

    * ``order_velocity``      Normal(10 * BR, 1.5), floored at 0
    * ``device_changes_7d``   Poisson(3 * BR)
    * ``ip_changes_7d``       Poisson(2 * BR)
    * ``unpaid_ratio``        Beta(2 + 3 * BR, 5)
    * ``risky_category_flag`` Bernoulli(0.7 * BR)

    The label ``approved`` is Bernoulli(p) with
    ``p = clip(0.55*BR + 0.25*RQ + 0.20*CS + Normal(0, 0.05), 0, 1)``.

    Parameters
    ----------
    n_samples : int, default 20000
        Number of rows to generate. ``0`` yields an empty frame that still
        carries all columns.
    seed : int or None, default None
        ``None`` keeps the historical behaviour and draws from NumPy's global
        legacy RNG (whatever ``np.random.seed`` last set). An integer makes
        the call self-contained: draws come from a private
        ``np.random.RandomState(seed)`` and the global RNG is left untouched,
        so re-running the calling cell reproduces the same data.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns ``order_velocity``,
        ``device_changes_7d``, ``ip_changes_7d``, ``unpaid_ratio``,
        ``risky_category_flag``, ``BR``, ``RQ``, ``CS``, ``approved``.
    """
    columns = [
        "order_velocity",
        "device_changes_7d",
        "ip_changes_7d",
        "unpaid_ratio",
        "risky_category_flag",
        "BR",
        "RQ",
        "CS",
        "approved"
    ]

    # The np.random module and a RandomState instance expose the same legacy
    # draw methods, so the sampling code below is agnostic to which one it has.
    rng = np.random if seed is None else np.random.RandomState(seed)

    rows = []

    # NOTE: the order of the draws inside the loop defines the random stream.
    # Reordering or vectorising them changes the generated dataset.
    for _ in range(n_samples):

        # --- Behavioral Risk Strength (BR) ---
        BR = rng.beta(4, 4)  # Pre-filtered suspicious queue

        # Derived structured features
        order_velocity = max(rng.normal(BR * 10, 1.5), 0)  # no negative velocities
        device_changes_7d = rng.poisson(BR * 3)
        ip_changes_7d = rng.poisson(BR * 2)
        # Beta draws already lie in [0, 1]; the min() is a defensive clamp.
        unpaid_ratio = min(rng.beta(2 + BR*3, 5), 1)
        risky_category_flag = rng.binomial(1, BR * 0.7)

        # --- Investigator Reasoning Quality (RQ) ---
        RQ = rng.beta(3, 3)  # independent of BR

        # --- Context Signal (CS) ---
        CS = rng.binomial(1, BR * 0.6)

        # --- Approval Logic ---
        approval_score = (
            0.55 * BR +
            0.25 * RQ +
            0.20 * CS +
            rng.normal(0, 0.05)
        )

        # The noise term can push the score outside [0, 1]; clip it so it is a
        # valid Bernoulli probability.
        approval_score = min(max(approval_score, 0), 1)
        approved = rng.binomial(1, approval_score)

        rows.append([
            order_velocity,
            device_changes_7d,
            ip_changes_7d,
            unpaid_ratio,
            risky_category_flag,
            BR,
            RQ,
            CS,
            approved
        ])

    return pd.DataFrame(rows, columns=columns)


In [2]:
# Build the synthetic dataset once; later cells derive the annotated variants
# from copies of `df_core`.
df_core = generate_core_data(20000)
# `approved` is a 0/1 column, so its mean is the share of approved rows.
print("Approval rate:", df_core["approved"].mean())


Approval rate: 0.45655


In [3]:
def add_template_annotations(df):
    """Attach a fixed-wording investigator note to every row of ``df``.

    A note is the space-joined list of one canned sentence per triggered
    signal (always in the order of the rule table below), followed by a
    closing sentence whose strength is picked from the row's ``RQ`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``order_velocity``, ``device_changes_7d``,
        ``ip_changes_7d``, ``risky_category_flag``, ``unpaid_ratio``, ``CS``
        and ``RQ``.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in, now carrying an
        ``annotation_text`` column. The frame is modified in place, so hand
        in a copy if the original must stay untouched.
    """
    # (column, trigger test, sentence) -- evaluated top to bottom for each row.
    signal_rules = (
        ("order_velocity", lambda v: v > 6,
         "Elevated order velocity observed."),
        ("device_changes_7d", lambda v: v > 1,
         "Multiple new devices detected."),
        ("ip_changes_7d", lambda v: v > 1,
         "Frequent IP changes within short window."),
        ("risky_category_flag", lambda v: v == 1,
         "Orders fall under high-risk category."),
        ("unpaid_ratio", lambda v: v > 0.4,
         "High unpaid order ratio noted."),
        ("CS", lambda v: v == 1,
         "Linked to previously confirmed fraud cluster."),
    )

    def closing_sentence(reasoning_quality):
        # Higher reasoning quality -> more assertive closing statement.
        if reasoning_quality > 0.7:
            return "Strong evidence supports fraud classification."
        if reasoning_quality > 0.4:
            return "Indicators suggest elevated fraud risk."
        return "Some suspicious signals present."

    notes = []
    for record in df.to_dict("records"):
        sentences = [
            sentence
            for column, triggered, sentence in signal_rules
            if triggered(record[column])
        ]
        sentences.append(closing_sentence(record["RQ"]))
        notes.append(" ".join(sentences))

    df["annotation_text"] = notes
    return df


In [4]:
# add_template_annotations writes `annotation_text` into the frame it is given,
# so pass a copy to keep `df_core` free of the text column.
df_template = add_template_annotations(df_core.copy())


In [5]:
def add_varied_annotations(df):
    """Attach a randomly worded investigator note to every row of ``df``.

    Works like the fixed-template annotator, except that the velocity
    sentence, the context sentence and the closing sentence are each drawn
    with ``random.choice`` from a small pool of paraphrases. The text
    therefore depends on the state of the global ``random`` generator at
    call time.

    NOTE(review): unlike ``add_template_annotations`` this function emits no
    sentence for ``unpaid_ratio`` -- confirm whether that omission is
    intentional.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``order_velocity``, ``device_changes_7d``,
        ``ip_changes_7d``, ``risky_category_flag``, ``CS`` and ``RQ``.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in, now carrying an
        ``annotation_text`` column (the frame is modified in place).
    """
    velocity_pool = [
        "Significant spike in order frequency.",
        "Unusual surge in transaction velocity.",
        "Accelerated ordering behavior detected."
    ]
    context_pool = [
        "Matches known fraud network patterns.",
        "Associated with prior coordinated abuse cases.",
        "Device fingerprint overlap with confirmed fraud accounts."
    ]

    # (column, trigger test, wording). A list wording means "pick one at
    # random"; a plain string is used verbatim. Row order fixes both the
    # sentence order and the order in which random draws are consumed.
    signal_rules = (
        ("order_velocity", lambda v: v > 6, velocity_pool),
        ("device_changes_7d", lambda v: v > 1, "Multiple device additions observed."),
        ("ip_changes_7d", lambda v: v > 1, "Rapid IP switching behavior."),
        ("risky_category_flag", lambda v: v == 1, "High-risk merchandise profile."),
        ("CS", lambda v: v == 1, context_pool),
    )

    # Closing-sentence pools, checked from the highest RQ threshold down.
    tone_bands = (
        (0.7, [
            "Evidence strongly supports tag escalation.",
            "Fraud classification confidently recommended."
        ]),
        (0.4, [
            "Risk indicators present but moderate.",
            "Signals indicate potential abuse."
        ]),
    )
    weakest_tone = [
        "Limited evidence; review advised.",
        "Suspicion present but not conclusive."
    ]

    def closing_sentence(reasoning_quality):
        # Exactly one random draw per row, from the first band that matches.
        for threshold, pool in tone_bands:
            if reasoning_quality > threshold:
                return random.choice(pool)
        return random.choice(weakest_tone)

    notes = []
    for record in df.to_dict("records"):
        sentences = []
        for column, triggered, wording in signal_rules:
            if not triggered(record[column]):
                continue
            if isinstance(wording, str):
                sentences.append(wording)
            else:
                sentences.append(random.choice(wording))
        sentences.append(closing_sentence(record["RQ"]))
        notes.append(" ".join(sentences))

    df["annotation_text"] = notes
    return df


In [6]:
# Same core rows, annotated with randomly varied wording. The wording is drawn
# via random.choice, so it depends on the global `random` state when this runs.
df_varied = add_varied_annotations(df_core.copy())


In [7]:
# Modelling frames: drop the generator's latent variables (BR, RQ, CS) so only
# the structured features, the label and the annotation text remain.
df_template_model = df_template.drop(columns=["BR", "RQ", "CS"])
df_varied_model = df_varied.drop(columns=["BR", "RQ", "CS"])


In [8]:
# Correlation of each structured feature with the label (latent BR/RQ/CS
# excluded), sorted from strongest to weakest.
df_core.drop(columns=["BR", "RQ", "CS"]).corr(numeric_only=True)["approved"].sort_values(ascending=False)


approved               1.000000
order_velocity         0.165629
device_changes_7d      0.091263
ip_changes_7d          0.067043
unpaid_ratio           0.055010
risky_category_flag    0.045236
Name: approved, dtype: float64

In [10]:
# NOTE(review): same expression as the earlier `df_template_model` assignment;
# the result is unchanged -- confirm whether this duplicate cell is still needed.
df_template_model = df_template.drop(columns=["BR", "RQ", "CS"])


In [11]:
# Structured-only design matrix: every column except the free text and the label.
X_struct = df_template_model.drop(columns=["annotation_text", "approved"])
# Binary target.
y = df_template_model["approved"]


In [12]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; random_state is fixed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_struct,
    y,
    test_size=0.2,
    random_state=42
)


In [13]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Baseline: gradient-boosted trees on the structured features only.
xgb_struct = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_struct.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the positive class (approved == 1).
y_prob_struct = xgb_struct.predict_proba(X_test)[:, 1]

# ROC-AUC on the held-out 20%.
roc_struct = roc_auc_score(y_test, y_prob_struct)

print("Structured-only XGBoost ROC-AUC:", roc_struct)


Structured-only XGBoost ROC-AUC: 0.581763436376588


In [14]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name and encode every
# template annotation; rows of `embeddings` follow the row order of
# `df_template_model`.
model_embed = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = model_embed.encode(
    df_template_model["annotation_text"].tolist(),
    show_progress_bar=True
)


  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1275.29it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 625/625 [00:30<00:00, 20.56it/s]


In [15]:
# Sanity check: one embedding row per annotated sample.
embeddings.shape


(20000, 384)

In [16]:
import numpy as np

# NOTE: this rebinds `X_struct` and `y` (previously a DataFrame / Series) to
# plain NumPy arrays so they can be stacked with the embedding matrix.
X_struct = df_template_model.drop(columns=["annotation_text", "approved"]).values
y = df_template_model["approved"].values

# Column-wise concatenation: structured features first, embedding dimensions after.
X_combined = np.hstack([X_struct, embeddings])


In [17]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Same test_size and random_state as the structured-only split. This rebinds
# X_train / X_test / y_train / y_test from that earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42
)

# NOTE(review): max_depth is 5 here but 4 for the structured-only model, so the
# AUC difference also reflects a capacity change, not only the added
# embeddings -- confirm this is intended.
xgb_combined = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_combined.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the positive class (approved == 1).
y_prob_combined = xgb_combined.predict_proba(X_test)[:, 1]

roc_combined = roc_auc_score(y_test, y_prob_combined)

print("Structured + Embeddings ROC-AUC:", roc_combined)


Structured + Embeddings ROC-AUC: 0.6127369701511216


In [18]:
# NOTE(review): `df_varied` / `df_varied_model` were already built in earlier
# cells. Re-running the annotator consumes further draws from the global
# `random` generator, so the wording produced here may differ from the first
# run -- confirm which version the experiment is meant to use.
df_varied = add_varied_annotations(df_core.copy())
df_varied_model = df_varied.drop(columns=["BR", "RQ", "CS"])


In [19]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Structured features only.
# The structured columns come from the same `df_core` as in the template
# experiment (only the annotation text differs), and the recorded score below
# is identical to the earlier structured-only baseline.
X_struct = df_varied_model.drop(columns=["annotation_text", "approved"])
y = df_varied_model["approved"]

# 80/20 hold-out split with a fixed random_state; the `_s` suffix marks the
# structured-only partition.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_struct,
    y,
    test_size=0.2,
    random_state=42
)

xgb_struct = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_struct.fit(X_train_s, y_train_s)

# Column 1 of predict_proba is the score for the positive class (approved == 1).
y_prob_struct = xgb_struct.predict_proba(X_test_s)[:, 1]

roc_struct = roc_auc_score(y_test_s, y_prob_struct)

print("Structured-only ROC:", roc_struct)


Structured-only ROC: 0.581763436376588


In [20]:
from sentence_transformers import SentenceTransformer

# NOTE(review): loads the same model name a second time; the instance created
# in the template-embedding cell could presumably be reused -- confirm.
model_embed = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the varied-wording annotations. This overwrites the `embeddings`
# array computed for the template annotations.
embeddings = model_embed.encode(
    df_varied_model["annotation_text"].tolist(),
    show_progress_bar=True
)

print("Embedding shape:", embeddings.shape)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1042.48it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 625/625 [00:26<00:00, 23.75it/s]

Embedding shape: (20000, 384)





In [21]:
import numpy as np

# `.values` is called on X_struct, so it is expected to be the DataFrame bound
# in the varied structured-only cell; stack its columns with the varied
# embeddings.
X_combined = np.hstack([X_struct.values, embeddings])


In [22]:
# Relies on train_test_split, XGBClassifier and roc_auc_score imported in
# earlier cells. Same test_size and random_state as the structured-only split;
# the `_c` suffix marks the combined-feature partition.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42
)

# NOTE(review): max_depth is 5 here but 4 for the structured-only model, so the
# AUC difference also reflects a capacity change, not only the added
# embeddings -- confirm this is intended.
xgb_combined = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_combined.fit(X_train_c, y_train_c)

# Column 1 of predict_proba is the score for the positive class (approved == 1).
y_prob_combined = xgb_combined.predict_proba(X_test_c)[:, 1]

roc_combined = roc_auc_score(y_test_c, y_prob_combined)

print("Structured + Embeddings ROC:", roc_combined)


Structured + Embeddings ROC: 0.6119602995287099
