## Code for generating transactions dataset

In [None]:
import csv
import random
from datetime import datetime, timedelta

random.seed(42)

TOTAL_TX = 1000
START_TIME = datetime(2024, 1, 1)

accounts = [f"A{i:04d}" for i in range(1, 300)]
merchants = [f"M{i:03d}" for i in range(1, 6)]

rows = []
tx_id = 1

def add_tx(sender, receiver, amount, time):
    global tx_id
    rows.append([
        f"TX{tx_id:05d}",
        sender,
        receiver,
        round(amount, 2),
        time.isoformat()
    ])
    tx_id += 1

# 1Ô∏è‚É£ CLEAN NORMAL TRANSACTIONS (~500)
for _ in range(500):
    s, r = random.sample(accounts, 2)
    t = START_TIME + timedelta(minutes=random.randint(0, 10000))
    add_tx(s, r, random.uniform(50, 500), t)

# 2Ô∏è‚É£ CYCLE FRAUD (3‚Äì5 nodes) (~150)
cycles = [
    ["A0101", "A0102", "A0103"],
    ["A0201", "A0202", "A0203", "A0204"],
    ["A0301", "A0302", "A0303", "A0304", "A0305"]
]

for cycle in cycles:
    for _ in range(10):
        for i in range(len(cycle)):
            s = cycle[i]
            r = cycle[(i + 1) % len(cycle)]
            t = START_TIME + timedelta(hours=random.randint(1, 48))
            add_tx(s, r, random.uniform(900, 1200), t)

# 3Ô∏è‚É£ SMURFING FAN-IN (~120)
receiver = "A0400"
senders = [f"A04{i:02d}" for i in range(1, 15)]
base_time = START_TIME + timedelta(days=5)

for s in senders:
    for _ in range(3):
        add_tx(s, receiver, random.uniform(200, 300),
               base_time + timedelta(hours=random.randint(0, 48)))

# 4Ô∏è‚É£ SMURFING FAN-OUT (~120)
sender = "A0500"
receivers = [f"A05{i:02d}" for i in range(1, 15)]

for r in receivers:
    for _ in range(3):
        add_tx(sender, r, random.uniform(200, 300),
               base_time + timedelta(hours=random.randint(0, 48)))

# 5Ô∏è‚É£ LAYERED SHELL NETWORK (~60)
chains = [
    ["A0601", "A0602", "A0603", "A0604"],
    ["A0701", "A0702", "A0703"]
]

for chain in chains:
    for _ in range(10):
        for i in range(len(chain) - 1):
            add_tx(chain[i], chain[i+1],
                   random.uniform(700, 900),
                   START_TIME + timedelta(days=10 + i))

# 6Ô∏è‚É£ MERCHANT TRAP (~200)
for m in merchants:
    for _ in range(40):
        s = random.choice(accounts)
        t = START_TIME + timedelta(minutes=random.randint(0, 20000))
        add_tx(s, m, random.uniform(5, 2000), t)

# ‚úÇÔ∏è Trim to exactly 1000
rows = rows[:TOTAL_TX]

# üíæ WRITE CSV
with open("transactions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["transaction_id", "sender_id", "receiver_id", "amount", "timestamp"])
    writer.writerows(rows)

print("‚úÖ transactions.csv generated with", len(rows), "rows")


‚úÖ transactions.csv generated with 954 rows


## Code for generating account features dataset

In [27]:
import numpy as np
import pandas as pd

np.random.seed(42)
N = 100_000


In [28]:
account_type = np.random.choice(
    ["normal", "merchant", "fraud"],
    size=N,
    p=[0.65, 0.15, 0.20]  # realistic distribution
)


In [29]:
data = {
    "account_id": [f"A{i}" for i in range(N)],
    "total_transactions": [],
    "total_amount_sent": [],
    "avg_transaction_amount": [],
    "unique_receivers": [],
    "unique_senders": [],
    "max_transactions_per_hour": [],
    "cycle_count": [],
    "is_in_cycle": [],
    "smurfing_flag": [],
    "fan_in_count": [],
    "fan_out_count": [],
    "layering_depth": [],
    "ring_size": [],
    "merchant_flag": [],
    "label": []
}


In [30]:
for acc in account_type:

    if acc == "normal":
        tx = np.random.randint(5, 50)
        avg_amt = np.random.uniform(500, 5000)
        cycle = 0
        smurf = 0
        layer = 0
        ring = 0
        merchant = 0

    elif acc == "merchant":
        tx = np.random.randint(200, 1000)
        avg_amt = np.random.uniform(2000, 15000)
        cycle = 0
        smurf = 0
        layer = 0
        ring = 0
        merchant = 1

    else:  # fraud
        tx = np.random.randint(50, 300)
        avg_amt = np.random.uniform(1000, 8000)
        cycle = np.random.randint(1, 4)
        smurf = np.random.choice([0, 1], p=[0.2, 0.8])
        layer = np.random.randint(2, 6)
        ring = np.random.randint(3, 10)
        merchant = 0

    total_amt = tx * avg_amt

    data["total_transactions"].append(tx)
    data["total_amount_sent"].append(total_amt)
    data["avg_transaction_amount"].append(avg_amt)
    data["unique_receivers"].append(min(tx, np.random.randint(2, 50)))
    data["unique_senders"].append(np.random.randint(1, 20))
    data["max_transactions_per_hour"].append(np.random.randint(1, 20))

    data["cycle_count"].append(cycle)
    data["is_in_cycle"].append(1 if cycle > 0 else 0)
    data["smurfing_flag"].append(smurf)
    data["fan_in_count"].append(np.random.randint(1, 15))
    data["fan_out_count"].append(np.random.randint(1, 15))
    data["layering_depth"].append(layer)
    data["ring_size"].append(ring)
    data["merchant_flag"].append(merchant)

    # WEAK SUPERVISION LABEL
    if (
        cycle > 0
        or smurf == 1
        or layer >= 3
        or ring >= 5
    ):
        label = 1
    else:
        label = 0

    # protect merchants from false positives
    if merchant == 1:
        label = 0

    data["label"].append(label)


In [31]:
df = pd.DataFrame(data)
print(df.head())
print(df["label"].value_counts(normalize=True))


  account_id  total_transactions  total_amount_sent  avg_transaction_amount  \
0         A0                  38       1.348589e+05             3548.917363   
1         A1                 279       1.563940e+06             5605.519348   
2         A2                 931       4.361649e+06             4684.907636   
3         A3                  34       1.655959e+05             4870.468756   
4         A4                  47       1.541227e+05             3279.206277   

   unique_receivers  unique_senders  max_transactions_per_hour  cycle_count  \
0                16              17                          6            0   
1                39              19                          4            2   
2                44              19                         19            0   
3                33              14                         12            0   
4                23              17                         10            0   

   is_in_cycle  smurfing_flag  fan_in_count  fan_o

In [33]:
df.to_csv("account_features.csv", index=False)
print("Saved account_features.csv with", len(df), "rows")


Saved account_features.csv with 100000 rows


## ML Model

In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler


In [35]:
df = pd.read_csv("account_features.csv")

print(df.head())
print(df.shape)


  account_id  total_transactions  total_amount_sent  avg_transaction_amount  \
0         A0                  38       1.348589e+05             3548.917363   
1         A1                 279       1.563940e+06             5605.519348   
2         A2                 931       4.361649e+06             4684.907636   
3         A3                  34       1.655959e+05             4870.468756   
4         A4                  47       1.541227e+05             3279.206277   

   unique_receivers  unique_senders  max_transactions_per_hour  cycle_count  \
0                16              17                          6            0   
1                39              19                          4            2   
2                44              19                         19            0   
3                33              14                         12            0   
4                23              17                         10            0   

   is_in_cycle  smurfing_flag  fan_in_count  fan_o

In [36]:
# Drop account_id (not useful for ML)
df = df.drop(columns=["account_id"])

# Handle missing values
df = df.fillna(0)

print(df.isna().sum())


total_transactions           0
total_amount_sent            0
avg_transaction_amount       0
unique_receivers             0
unique_senders               0
max_transactions_per_hour    0
cycle_count                  0
is_in_cycle                  0
smurfing_flag                0
fan_in_count                 0
fan_out_count                0
layering_depth               0
ring_size                    0
merchant_flag                0
label                        0
dtype: int64


In [42]:
# ‚ùå Remove leaky rule-based features
leaky_features = [
    "is_in_cycle",
    "smurfing_flag",
    "merchant_flag"
]

X = df.drop(columns=["label"] + leaky_features)
y = df["label"]

print("Features used by model:")
print(X.columns.tolist())


print("Fraud ratio:", y.mean())


Features used by model:
['total_transactions', 'total_amount_sent', 'avg_transaction_amount', 'unique_receivers', 'unique_senders', 'max_transactions_per_hour', 'cycle_count', 'fan_in_count', 'fan_out_count', 'layering_depth', 'ring_size']
Fraud ratio: 0.19905


In [43]:
# Add label noise to simulate real-world uncertainty
noise_rate = 0.10  # 10% noisy labels

y_noisy = y.copy()
noise_mask = np.random.rand(len(y)) < noise_rate
y_noisy[noise_mask] = 1 - y_noisy[noise_mask]

print("Original fraud rate:", y.mean())
print("Noisy fraud rate:", y_noisy.mean())


Original fraud rate: 0.19905
Noisy fraud rate: 0.25761


In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_noisy,
    test_size=0.25,
    random_state=42,
    stratify=y_noisy
)


In [45]:
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=42
)

model.fit(X_train, y_train)


In [46]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nROC AUC:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
[[18088   472]
 [ 1953  4487]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.94     18560
           1       0.90      0.70      0.79      6440

    accuracy                           0.90     25000
   macro avg       0.90      0.84      0.86     25000
weighted avg       0.90      0.90      0.90     25000


ROC AUC: 0.8346192556623474


In [47]:
importances = pd.Series(
    model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print(importances)


ring_size                    0.306764
cycle_count                  0.259241
layering_depth               0.255436
total_transactions           0.068992
total_amount_sent            0.055784
avg_transaction_amount       0.024286
unique_receivers             0.008439
unique_senders               0.005680
max_transactions_per_hour    0.005503
fan_out_count                0.004974
fan_in_count                 0.004901
dtype: float64
