# RAT Validation

This notebook debugs and validates that RAT injection was applied correctly to the datasets. Use it to create plots that visualize how the dataset is structured.

In [17]:
import os

import pandas as pd

# Location of the RAT-augmented transactions CSV that every cell below reads.
# Set the RAT_CSV_PATH environment variable to point the notebook at another
# copy of the file; when it is unset, the original machine-specific location
# is used so existing runs behave exactly as before.
PATH = os.environ.get(
    "RAT_CSV_PATH",
    r"C:\Users\yasmi\OneDrive\Desktop\Uni - Master's\Fall 2025\MLR 570\Motif-Aware-Temporal-GNNs-for-Anti-Money-Laundering-Detection\ibm_transcations_datasets\RAT\HI-Small_Trans_RAT_low.csv",
)

# Read just the header plus the first 5 data rows: enough to list the column
# names and eyeball a few values without parsing the whole file.
df_head = pd.read_csv(PATH, nrows=5)
print("\n=== HEADER CHECK ===")
print(df_head.columns.tolist())
print(df_head.head())



=== HEADER CHECK ===
['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1', 'Amount Received', 'Receiving Currency', 'Amount Paid', 'Payment Currency', 'Payment Format', 'Is Laundering', 'src_out_degree', 'dst_in_degree', 'src_amt_mean', 'src_amt_std', 'dst_amt_mean', 'dst_amt_std', 'src_first_seen', 'dst_first_seen', 'src_age_days', 'dst_age_days', 'date_only', 'src_day_tx_count', 'dst_day_tx_count', 'hour', 'weekday', 'RAT_is_off_hours', 'RAT_is_weekend', 'RAT_is_cross_bank', 'RAT_src_amount_z_pos', 'RAT_dst_amount_z_pos', 'RAT_src_out_deg_norm', 'RAT_dst_in_deg_norm', 'RAT_src_burst_norm', 'RAT_dst_burst_norm', 'RAT_combined_burst', 'srcacct_Bank Name', 'srcacct_Bank ID', 'srcacct_Entity ID', 'srcacct_Entity Name', 'dstacct_Bank Name', 'dstacct_Bank ID', 'dstacct_Entity ID', 'dstacct_Entity Name', 'src_entity_id', 'dst_entity_id', 'RAT_same_entity', 'RAT_src_entity_accounts', 'RAT_dst_entity_accounts', 'RAT_src_entity_acct_norm', 'RAT_dst_entity_acct_norm', 'RAT_src_pattern_

In [13]:
import pandas as pd


# Count the rows labelled as laundering, streaming the CSV in chunks so the
# full file never has to sit in memory at once.
launder_count = 0
chunksize = 250_000

# usecols limits parsing to the single label column; the other columns in
# this wide file are irrelevant to the count and only slow the scan down.
for chunk in pd.read_csv(PATH, usecols=["Is Laundering"], chunksize=chunksize):
    launder_count += int((chunk["Is Laundering"] == 1).sum())

print("Total laundering rows =", launder_count)


Total laundering rows = 5177


In [14]:
import pandas as pd
import numpy as np

# Tally how many rows have a RAT_score and how many are NaN.
nan_count = 0
valid_count = 0
chunksize = 200_000

# Column presence is a property of the file, not of each chunk, so check it
# once against the header (nrows=0 parses the column names and no data rows).
if "RAT_score" not in pd.read_csv(PATH, nrows=0).columns:
    # Report only the failure: printing zero counts here would look like a
    # real measurement rather than "nothing was measured".
    print("❌ RAT_score missing from file — SCRIPT DID NOT RUN PROPERLY")
else:
    # Only the score column is needed, so skip parsing every other column.
    for chunk in pd.read_csv(PATH, usecols=["RAT_score"], chunksize=chunksize):
        missing = int(chunk["RAT_score"].isna().sum())
        nan_count += missing
        # Every row is either NaN or not, so the remainder is the valid count.
        valid_count += len(chunk) - missing

    print("Valid RAT_score values:", valid_count)
    print("NaN RAT_score values:", nan_count)


Valid RAT_score values: 4742989
NaN RAT_score values: 335426


In [15]:
import pandas as pd
import numpy as np

# Collect the non-missing RAT_score of every laundering row and summarize
# the distribution. `scores` ends up as a plain list of floats.
scores = []
chunksize = 200_000

# Only the label and the score are read; the remaining columns of this wide
# file are not needed for the summary.
for chunk in pd.read_csv(
    PATH, usecols=["Is Laundering", "RAT_score"], chunksize=chunksize
):
    mask = chunk["Is Laundering"] == 1
    # Drop NaNs per chunk so they are never accumulated in the list.
    vals = chunk.loc[mask, "RAT_score"].astype(float).dropna()
    scores.extend(vals.tolist())

print("Laundering RAT_score count:", len(scores))
if scores:
    print(pd.Series(scores).describe())
else:
    print("❌ No RAT_score found for laundering rows.")


Laundering RAT_score count: 5007
count    5007.000000
mean        0.248731
std         0.190827
min         0.001184
25%         0.042855
50%         0.254112
75%         0.359814
max         1.000000
dtype: float64


In [18]:
import pandas as pd

# Sum the RAT_injected column across the whole file, one chunk at a time.
total_injected = 0

# Only the flag column is parsed; the other columns are irrelevant here.
for chunk in pd.read_csv(PATH, usecols=["RAT_injected"], chunksize=200_000):
    total_injected += chunk["RAT_injected"].sum()

# NOTE(review): a total of 0 means the column sums to zero over the entire
# file, i.e. presumably no row is flagged as injected — confirm whether that
# is expected for this dataset variant.
print("Injected rows:", total_injected)


Injected rows: 0
