# Money Laundering Detection - Clean Implementation
## Feature Engineering without Data Leakage

## CELL 1 — CLEAN SETUP

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

## CELL 2 — LOAD & SORT (REQUIRED)

In [None]:
# Load the raw transaction table and parse the timestamp column so the rows
# can be ordered chronologically.
df = pd.read_csv("resources/HI-Small_Trans.csv")
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Use a stable sort: rows that share the same Timestamp keep their original
# file order. The default quicksort gives no such guarantee, and the later
# history loops and the positional train/val/test split all depend on row
# order, so an unstable sort would make the features non-reproducible.
df = df.sort_values('Timestamp', kind='mergesort').reset_index(drop=True)

# Minimal rename: only these four columns get short names.
df = df.rename(columns={
    'Account': 'sender',
    'Account.1': 'receiver',
    'Amount Paid': 'amount',
    'Is Laundering': 'label'
})

## CELL 3 — TEMPORAL FEATURES (SAFE)

In [None]:
# Calendar/clock features computed from each row's own Timestamp only.
stamp = df['Timestamp'].dt

df['hour'] = stamp.hour
df['day_of_week'] = stamp.weekday  # Monday=0 ... Sunday=6
# Saturday (5) and Sunday (6) count as weekend.
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# "Night" covers 22:00 through 05:59.
df['is_night'] = ((df['hour'] >= 22) | (df['hour'] <= 5)).astype(int)

## CELL 4 — HISTORY FEATURES (NO LEAKAGE)

In [None]:
# Seconds elapsed since the same account last appeared in the same role
# (sender as sender, receiver as receiver). Each row only looks at rows that
# precede it in df; the lookup tables are updated after the gap is recorded.
# The first appearance of an account in a role keeps NaN.
last_tx_sender = {}    # sender account   -> Timestamp of its latest row so far
last_tx_receiver = {}  # receiver account -> Timestamp of its latest row so far

n_rows = len(df)
sender_time_gap = np.full(n_rows, np.nan)
receiver_time_gap = np.full(n_rows, np.nan)

columns = zip(df['Timestamp'], df['sender'], df['receiver'])
for pos, (stamp, src, dst) in enumerate(columns):
    prev_sent = last_tx_sender.get(src)
    if prev_sent is not None:
        sender_time_gap[pos] = (stamp - prev_sent).total_seconds()
    last_tx_sender[src] = stamp

    prev_received = last_tx_receiver.get(dst)
    if prev_received is not None:
        receiver_time_gap[pos] = (stamp - prev_received).total_seconds()
    last_tx_receiver[dst] = stamp

df['sender_time_gap'] = sender_time_gap
df['receiver_time_gap'] = receiver_time_gap

## CELL 5 — FAN-IN / FAN-OUT (TEMPORAL, VALID)

In [None]:
# Running fan-out / fan-in degree per account, counted over earlier rows only:
#   fan_out_count = distinct receivers this sender has paid before this row
#   fan_in_count  = distinct senders that have paid this receiver before this row
# The current (sender, receiver) pair is registered after both counts are stored.
fan_out = defaultdict(set)  # sender   -> set of receivers seen so far
fan_in = defaultdict(set)   # receiver -> set of senders seen so far

fan_out_count = np.zeros(len(df))
fan_in_count = np.zeros(len(df))

for pos, (src, dst) in enumerate(zip(df['sender'], df['receiver'])):
    known_receivers = fan_out[src]
    known_senders = fan_in[dst]

    fan_out_count[pos] = len(known_receivers)
    fan_in_count[pos] = len(known_senders)

    known_receivers.add(dst)
    known_senders.add(src)

df['fan_out_count'] = fan_out_count
df['fan_in_count'] = fan_in_count

## CELL 6 — AMOUNT STAT (SENDER-SIDE ONLY)

In [None]:
# History-only amount statistics per sender: for each transaction, the mean
# and population standard deviation (ddof=0) of the amounts the same sender
# paid in EARLIER rows. The current amount is folded into the history only
# after its features are recorded, so a row never sees itself. A sender's
# first transaction keeps NaN for all three features.
#
# The statistics are maintained with Welford's online update instead of
# re-running np.mean/np.std over the sender's full history on every row,
# which costs O(k^2) for a sender with k transactions.
sender_amounts = defaultdict(list)  # sender -> amounts seen so far
sender_stats = {}                   # sender -> [count, running mean, M2]

avg_amt = np.full(len(df), np.nan)
var_amt = np.full(len(df), np.nan)

for i, (s, amt) in enumerate(zip(df['sender'], df['amount'])):
    stats = sender_stats.get(s)
    if stats is None:
        # First time this sender appears: no history, features stay NaN.
        stats = sender_stats[s] = [0, 0.0, 0.0]
    else:
        avg_amt[i] = stats[1]
        var_amt[i] = stats[2] / stats[0]  # population variance of the history

    # Welford update: fold the current amount into the running statistics.
    stats[0] += 1
    delta = amt - stats[1]
    stats[1] += delta / stats[0]
    stats[2] += delta * (amt - stats[1])

    sender_amounts[s].append(amt)

# Clip before the square root to guard against a tiny negative variance from
# floating-point rounding; NaN entries (no history) pass through unchanged.
std_amt = np.sqrt(np.maximum(var_amt, 0.0))

df['sender_avg_amount'] = avg_amt
df['sender_std_amount'] = std_amt
# NOTE(review): with a single prior amount the std is 0, so the z-score is
# scaled only by the 1e-6 epsilon and can become very large — confirm that
# this is the intended behavior for downstream models.
df['sender_amt_z'] = (df['amount'] - avg_amt) / (std_amt + 1e-6)

## CELL 7 — CHRONOLOGICAL SPLIT (REQUIRED)

In [None]:
# Positional 60% / 20% / 20% split in the current row order of df:
# the first 60% of rows are train, the next 20% validation, the rest test.
n_rows = len(df)
split_1 = int(n_rows * 0.6)
split_2 = int(n_rows * 0.8)

train_df, val_df, test_df = (
    df.iloc[start:stop]
    for start, stop in ((0, split_1), (split_1, split_2), (split_2, n_rows))
)

print(f"Train: {len(train_df)} rows")
print(f"Val:   {len(val_df)} rows")
print(f"Test:  {len(test_df)} rows")