In [1]:
# ============================================================
# 0. Import
# ============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
from torch_geometric.nn import GCN2Conv


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============================================================
# 1. Load the BLTE dataset, derive the label, tidy the columns
# ============================================================

# Point this at your local copy of the dataset
DATA_PATH = r"D:\elliptic\blte\Labeled-Transactions-based-Dataset-of-Ethereum-Network-master\FinalDataset.xlsx"
df = pd.read_excel(DATA_PATH)

print("Raw shape:", df.shape)

# Missing scam flags are treated as 0 (not flagged) and stored as integers.
for flag_col in ("from_scam", "to_scam"):
    df[flag_col] = df[flag_col].fillna(0).astype(int)

# A transaction is labelled 1 when either of its endpoints is flagged.
df["label"] = df[["from_scam", "to_scam"]].eq(1).any(axis=1).astype(int)
print("Label distribution:\n", df["label"].value_counts())

# The row position (after resetting the index) doubles as the node id.
df = df.reset_index(drop=True)
df["txId"] = np.arange(len(df), dtype=int)

# Remove both time columns up front so no later step can use them.
time_cols = [name for name in ("block_timestamp", "block_number") if name in df.columns]
df = df.drop(columns=time_cols)

print("Columns after dropping time cols:", df.columns.tolist())


Raw shape: (71250, 18)
Label distribution:
 label
0    57000
1    14250
Name: count, dtype: int64
Columns after dropping time cols: ['hash', 'nonce', 'transaction_index', 'from_address', 'to_address', 'value', 'gas', 'gas_price', 'input', 'receipt_cumulative_gas_used', 'receipt_gas_used', 'block_hash', 'from_scam', 'to_scam', 'from_category', 'to_category', 'label', 'txId']


In [3]:
# ============================================================
# 2. Feature selection: drop ids, addresses, labels, scam flags, categories
# ============================================================

# Identifier-like columns (hashes, addresses, raw input, node id).
id_and_addr_cols = ["hash", "from_address", "to_address", "block_hash", "input", "txId"]

# Columns that are the label itself or the flags/categories it is derived from.
label_related_cols = ["from_scam", "to_scam", "from_category", "to_category", "label"]

cols_to_exclude = {*id_and_addr_cols, *label_related_cols}

# Keep the remaining columns in their original DataFrame order.
feature_cols = df.columns[~df.columns.isin(cols_to_exclude)].tolist()
print("Feature columns:", feature_cols)

X_all = df[feature_cols].to_numpy(dtype=float)
y_all = df["label"].to_numpy()
txid_all = df["txId"].to_numpy()

# Standardise the features; the scaler here is fitted on every row of X_all.
scaler = StandardScaler().fit(X_all)
X_all_scaled = scaler.transform(X_all)

Feature columns: ['nonce', 'transaction_index', 'value', 'gas', 'gas_price', 'receipt_cumulative_gas_used', 'receipt_gas_used']


In [4]:
# ============================================================
# 3. Build transaction graph KHÔNG dùng thời gian
#    - Với mỗi địa chỉ, nối các tx có cùng địa chỉ theo thứ tự xuất hiện
# ============================================================

def build_tx_edges(df, addr_col, txid_col="txId"):
    """
    Link transactions that share an address into a chain of edges.

    Rows are grouped by ``addr_col``. Within each group the ``txid_col``
    values are taken in DataFrame row order and every consecutive pair
    becomes one edge ``(tx[i], tx[i+1])``. Groups with a single row
    contribute nothing. No timestamp information is used.

    Args:
        df: DataFrame containing ``addr_col`` and ``txid_col``.
        addr_col: Name of the address column to group by.
        txid_col: Name of the node-id column (default ``"txId"``).

    Returns:
        A list of ``(src, dst)`` tuples, ordered by group key.
    """
    edges = []
    for _, tx_ids in df.groupby(addr_col)[txid_col]:
        chain = tx_ids.to_numpy()
        if chain.size < 2:
            # A lone transaction has no neighbour to connect to.
            continue
        edges += list(zip(chain[:-1], chain[1:]))
    return edges

# Chain transactions per sender and per receiver, then pool both edge lists.
edges_from = build_tx_edges(df, "from_address")
edges_to   = build_tx_edges(df, "to_address")

all_edges = [*edges_from, *edges_to]
print("Num raw edges:", len(all_edges))

if not all_edges:
    edges_array = np.empty((0, 2), dtype=np.int64)
else:
    # Sort the two endpoints inside each row so (a, b) and (b, a) become
    # the same row, then keep one copy of each undirected edge.
    ordered_pairs = np.sort(np.array(all_edges, dtype=np.int64), axis=1)
    edges_array = np.unique(ordered_pairs, axis=0)

print("Num unique edges:", len(edges_array))

df_edges = pd.DataFrame({"txId1": edges_array[:, 0], "txId2": edges_array[:, 1]})

Num raw edges: 67812
Num unique edges: 65764


In [5]:
# ============================================================
# 4. Random stratified train / val / test split, 70 / 15 / 15
# ============================================================

RANDOM_STATE = 42
TEST_SIZE  = 0.15
VAL_SIZE   = 0.15
TRAIN_SIZE = 0.70

# Step 1: carve out TEST first.
# The raw (unscaled) features are split here so that the scaler below can be
# fitted on the training rows only; fitting it on all rows would let val/test
# statistics leak into preprocessing.
X_temp, X_test, y_temp, y_test, txid_temp, txid_test = train_test_split(
    X_all, y_all, txid_all,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_all
)

# Step 2: split the remainder into TRAIN and VAL.
#   val share expressed relative to (train + val)
val_ratio_in_temp = VAL_SIZE / (TRAIN_SIZE + VAL_SIZE)  # 0.15 / 0.85

X_train, X_val, y_train, y_val, txid_train, txid_val = train_test_split(
    X_temp, y_temp, txid_temp,
    test_size=val_ratio_in_temp,
    random_state=RANDOM_STATE,
    stratify=y_temp
)

# Step 3: standardise using statistics from TRAIN only, then apply the same
# transform to VAL and TEST. X_temp is left unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

# Every row must land in exactly one partition.
assert len(X_train) + len(X_val) + len(X_test) == len(X_all)

print("Train size:", len(X_train),
      "Val size:", len(X_val),
      "Test size:", len(X_test))

Train size: 49874 Val size: 10688 Test size: 10688


In [6]:
# ============================================================
# 5. Build edge_index cho từng subset (train/val/test)
# ============================================================

def build_edge_index(df_edges, valid_txids):
    """
    Build the edge_index of the subgraph induced by ``valid_txids``.

    Args:
        df_edges: DataFrame with columns ``txId1`` and ``txId2`` holding the
            original node ids of each edge.
        valid_txids: Array-like of node ids belonging to this subset
            (train / val / test). A node's position in this array becomes
            its local index.

    Returns:
        LongTensor of shape ``[2, num_edges]`` in local indices, passed
        through ``to_undirected``. When no edge has both endpoints in the
        subset, an empty ``[2, 0]`` LongTensor is returned.
    """
    node_ids = np.asarray(valid_txids, dtype=np.int64)
    local_index = dict(zip(node_ids, range(len(node_ids))))

    # Keep only edges whose two endpoints are both inside the subset.
    src_inside = df_edges["txId1"].isin(node_ids)
    dst_inside = df_edges["txId2"].isin(node_ids)
    kept = df_edges.loc[src_inside & dst_inside, ["txId1", "txId2"]]

    if kept.empty:
        return torch.empty((2, 0), dtype=torch.long)

    # Translate original ids to local positions, one row per endpoint.
    local_pairs = np.vstack(
        [kept[endpoint].map(local_index).values for endpoint in ("txId1", "txId2")]
    )
    return to_undirected(torch.tensor(local_pairs, dtype=torch.long))

# One induced subgraph per split; edges crossing two splits are dropped
# because build_edge_index keeps only edges with both endpoints in the subset.
edge_index_train, edge_index_val, edge_index_test = (
    build_edge_index(df_edges, split_ids)
    for split_ids in (txid_train, txid_val, txid_test)
)

# size(1) is the number of columns of the [2, num_edges] tensor.
print("Train edges:", edge_index_train.size(1),
      "Val edges:", edge_index_val.size(1),
      "Test edges:", edge_index_test.size(1))


Train edges: 64410 Val edges: 2892 Test edges: 2990


In [7]:
# ============================================================
# 6. Build the PyG Data objects
# ============================================================

# Node features as float tensors, one per split.
X_train_gcn, X_val_gcn, X_test_gcn = (
    torch.tensor(features, dtype=torch.float)
    for features in (X_train, X_val, X_test)
)

# Node labels as long tensors, one per split.
y_train_gcn, y_val_gcn, y_test_gcn = (
    torch.tensor(labels, dtype=torch.long)
    for labels in (y_train, y_val, y_test)
)

# node_ids keeps the original txId of every node (useful for debugging).
train_data = Data(
    x=X_train_gcn, edge_index=edge_index_train, y=y_train_gcn,
    node_ids=torch.tensor(txid_train, dtype=torch.long),
)
val_data = Data(
    x=X_val_gcn, edge_index=edge_index_val, y=y_val_gcn,
    node_ids=torch.tensor(txid_val, dtype=torch.long),
)
test_data = Data(
    x=X_test_gcn, edge_index=edge_index_test, y=y_test_gcn,
    node_ids=torch.tensor(txid_test, dtype=torch.long),
)

print(train_data)
print(val_data)
print(test_data)

# Sanity check: the training features must be finite.
print("Check NaN in train features:", torch.isnan(train_data.x).any().item())
print("Check Inf in train features:", torch.isinf(train_data.x).any().item())

Data(x=[49874, 7], edge_index=[2, 64410], y=[49874], node_ids=[49874])
Data(x=[10688, 7], edge_index=[2, 2892], y=[10688], node_ids=[10688])
Data(x=[10688, 7], edge_index=[2, 2990], y=[10688], node_ids=[10688])
Check NaN in train features: False
Check Inf in train features: False


In [8]:
# ============================================================
# 7. Device selection + class weights
# ============================================================

# Preference order: XPU, then CUDA, then CPU.
has_xpu = hasattr(torch, "xpu") and torch.xpu.is_available()
if has_xpu:
    device_name = "xpu"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

print("Device:", device)

# Inverse-frequency class weights computed from the training labels.
# eps guards against division by zero for a class with no samples.
eps = 1e-8
class_sample_count = torch.bincount(train_data.y, minlength=2).float()
inv_freq = 1.0 / (class_sample_count + eps)
# Dividing by the smallest inverse frequency gives the most frequent class weight 1.
norm_inv_freq = inv_freq / inv_freq.min()

print("Class counts (train):", class_sample_count.tolist())
print("Class weights (inv_freq normalized):", norm_inv_freq.tolist())

Device: xpu
Class counts (train): [39900.0, 9974.0]
Class weights (inv_freq normalized): [1.0, 4.000401020050049]


In [9]:
from torch_geometric.nn import SAGEConv
class GraphSAGE(nn.Module):
    """
    Stack of SAGEConv layers followed by a linear classification head.

    The stack always starts with one ``in_dim -> hid_dim`` layer and is
    followed by ``num_layers - 1`` layers of ``hid_dim -> hid_dim``. Every
    convolution is followed by ReLU and dropout; the head maps ``hid_dim``
    to ``out_dim`` and returns raw logits.
    """

    def __init__(self, in_dim, hid_dim, out_dim, num_layers=2, dropout=0.5):
        super().__init__()
        self.dropout = dropout

        # Input width of each convolution: the raw feature width first,
        # then the hidden width for every following layer.
        input_widths = [in_dim] + [hid_dim] * (num_layers - 1)
        self.convs = nn.ModuleList(
            [SAGEConv(width, hid_dim) for width in input_widths]
        )

        self.lin_out = nn.Linear(hid_dim, out_dim)

    def forward(self, x, edge_index):
        """Return per-node logits for features ``x`` on graph ``edge_index``."""
        hidden = x
        for layer in self.convs:
            activated = F.relu(layer(hidden, edge_index))
            # Dropout is only active in training mode.
            hidden = F.dropout(activated, p=self.dropout, training=self.training)
        return self.lin_out(hidden)


In [10]:


class FocalLoss(nn.Module):
    """
    Multi-class focal loss on raw logits.

    Per sample the loss is ``-(1 - p_t) ** gamma * log(p_t)`` where ``p_t``
    is the softmax probability of the target class. When ``weight`` is given,
    each sample's loss is multiplied by the weight of its target class.

    Args:
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        weight: Optional 1-D tensor of per-class weights, indexed by target.
        reduction: ``"mean"`` averages over samples, ``"sum"`` adds them up,
            any other value returns the per-sample loss vector.
    """

    def __init__(self, gamma=2.0, weight=None, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, logits, target):
        """Compute the focal loss for ``logits`` [N, C] and ``target`` [N]."""
        class_idx = target.view(-1)

        # Log-probability and probability of the true class for each sample.
        log_probs = F.log_softmax(logits, dim=1)
        logp_t = log_probs.gather(1, class_idx.view(-1, 1)).squeeze(1)
        p_t = logp_t.exp()

        per_sample = -((1 - p_t) ** self.gamma) * logp_t
        if self.weight is not None:
            per_sample = per_sample * self.weight[class_idx]

        if self.reduction == "mean":
            return per_sample.mean()
        if self.reduction == "sum":
            return per_sample.sum()
        return per_sample


def train_one_config(config, loss_type="ce", class_weights=None,
                     max_epochs=400, patience=30, verbose=False):
    """
    Train one GraphSAGE configuration with early stopping on validation macro-F1.

    Reads the module-level ``train_data``, ``val_data`` and ``device``.

    Args:
        config: Dict with keys ``hid_dim``, ``num_layers``, ``lr`` and
            ``weight_decay``; optional keys ``dropout`` (default 0.5),
            ``gamma`` (focal loss only, default 2.0) and ``name`` (used in
            verbose logging).
        loss_type: ``"ce"`` for cross-entropy or ``"focal"`` for FocalLoss.
        class_weights: Optional 1-D tensor of per-class weights for the loss.
        max_epochs: Maximum number of full-graph training epochs.
        patience: Number of consecutive epochs without a validation macro-F1
            improvement (of more than 1e-4) before training stops.
        verbose: When True, log progress on epoch 1 and every 20th epoch.

    Returns:
        ``(model, best_val_macro)`` where ``model`` carries the weights of the
        epoch with the best validation macro-F1.

    Raises:
        ValueError: If ``loss_type`` is neither ``"ce"`` nor ``"focal"``.
    """
    # Fail fast, before any model is allocated.
    if loss_type not in ("ce", "focal"):
        raise ValueError("loss_type must be 'ce' or 'focal'")

    in_dim  = train_data.x.size(1)
    out_dim = 2

    model = GraphSAGE(
        in_dim=in_dim,
        hid_dim=config["hid_dim"],
        out_dim=out_dim,
        num_layers=config["num_layers"],
        dropout=config.get("dropout", 0.5),
    ).to(device)

    cw = class_weights.to(device) if class_weights is not None else None

    if loss_type == "ce":
        crit = nn.CrossEntropyLoss(weight=cw)
    else:
        crit = FocalLoss(gamma=config.get("gamma", 2.0), weight=cw)

    opt = torch.optim.Adam(
        model.parameters(),
        lr=config["lr"],
        weight_decay=config["weight_decay"],
    )

    # The graphs do not change between epochs, so move them to the device once
    # instead of on every iteration.
    train_x = train_data.x.to(device)
    train_edges = train_data.edge_index.to(device)
    train_y = train_data.y.to(device)

    val_x = val_data.x.to(device)
    val_edges = val_data.edge_index.to(device)
    val_y = val_data.y.to(device)
    val_y_true = val_data.y.cpu().numpy()

    def eval_for_search():
        """Return (loss, macro-F1) of the current model on the validation graph."""
        model.eval()
        with torch.no_grad():
            out = model(val_x, val_edges)
            loss = crit(out, val_y).item()
            preds = out.argmax(dim=1).cpu().numpy()
        macro_f1 = f1_score(val_y_true, preds, average="macro", zero_division=0)
        return loss, macro_f1

    best_state = None
    best_val_macro = -1.0
    patience_counter = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        opt.zero_grad()
        out = model(train_x, train_edges)
        loss_train = crit(out, train_y)
        loss_train.backward()
        opt.step()

        val_loss, val_macro = eval_for_search()

        if verbose and (epoch % 20 == 0 or epoch == 1):
            print(f"[{config.get('name','?')}] Epoch {epoch:03d} "
                  f"- train_loss={loss_train.item():.4f} "
                  f"- val_loss={val_loss:.4f} "
                  f"- val_macro={val_macro:.4f}")

        if val_macro > best_val_macro + 1e-4:
            best_val_macro = val_macro
            # Snapshot the weights in memory (detached CPU copies) rather than
            # round-tripping through a file on disk; the clone keeps the
            # snapshot independent of later optimizer updates.
            best_state = {
                key: value.detach().cpu().clone()
                for key, value in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            if verbose:
                print(f"Early stop (no improve {patience} epochs)")
            break

    # Restore the best epoch's weights before handing the model back.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val_macro


In [11]:
# ============================================================
# 9. Hàm tune threshold + evaluate_with_threshold
# ============================================================

@torch.no_grad()
def tune_threshold_on_val(model, data, thresholds=None):
    """
    Sweep decision thresholds on class-1 probability and pick the best one.

    The model is run once on ``data`` (using the module-level ``device``);
    for every threshold, nodes with class-1 softmax probability at or above
    it are predicted as class 1. The threshold with the highest macro-F1
    wins (the first one on ties). The whole sweep is printed.

    Args:
        model: Trained model called as ``model(x, edge_index)``.
        data: Graph object exposing ``x``, ``edge_index`` and ``y``.
        thresholds: Iterable of thresholds; defaults to 17 evenly spaced
            values from 0.1 to 0.9.

    Returns:
        ``(best_threshold, results)`` where ``results`` is a list of
        ``(threshold, f1_scam, macro_f1)`` tuples in sweep order.
    """
    model.eval()
    candidates = np.linspace(0.1, 0.9, 17) if thresholds is None else thresholds

    logits = model(data.x.to(device), data.edge_index.to(device))
    scam_prob = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
    labels = data.y.cpu().numpy()

    results = []
    for cutoff in candidates:
        predicted = (scam_prob >= cutoff).astype(int)
        results.append((
            cutoff,
            f1_score(labels, predicted, pos_label=1, zero_division=0),
            f1_score(labels, predicted, average="macro", zero_division=0),
        ))

    # Selection criterion is macro-F1 (third field of each row).
    best_t, best_f1_scam, best_macro = max(results, key=lambda row: row[2])

    print("\n=== Threshold search on VAL ===")
    print("\n".join(
        f"th={cutoff:.2f}  F1_scam={scam_f1:.4f}  macro_F1={macro:.4f}"
        for cutoff, scam_f1, macro in results
    ))

    print(f"\n>>> Best threshold = {best_t:.2f} "
          f"(F1_scam={best_f1_scam:.4f}, macro_F1={best_macro:.4f})")

    return best_t, results


@torch.no_grad()
def evaluate_with_threshold(model, data, threshold, name="TEST"):
    """
    Print classification metrics for ``model`` on ``data`` at a fixed threshold.

    Nodes whose class-1 softmax probability is at or above ``threshold`` are
    predicted as class 1. Prints the class-1 F1, micro-F1, macro-F1, the
    confusion matrix (rows/columns ordered 0, 1) and a classification report.
    Uses the module-level ``device``. Returns nothing.

    Args:
        model: Trained model called as ``model(x, edge_index)``.
        data: Graph object exposing ``x``, ``edge_index`` and ``y``.
        threshold: Decision threshold on the class-1 probability.
        name: Label of the split, used in the printed header.
    """
    model.eval()
    logits = model(data.x.to(device), data.edge_index.to(device))
    scam_prob = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
    y_true = data.y.cpu().numpy()
    preds = (scam_prob >= threshold).astype(int)

    scores = [
        ("F1 (scam):", f1_score(y_true, preds, pos_label=1, zero_division=0)),
        ("Micro-F1:", f1_score(y_true, preds, average="micro", zero_division=0)),
        ("Macro-F1:", f1_score(y_true, preds, average="macro", zero_division=0)),
    ]
    cm = confusion_matrix(y_true, preds, labels=[0, 1])

    print(f"\n{name} with threshold={threshold:.2f}")
    for caption, value in scores:
        print(caption, value)
    print("Confusion matrix:\n", cm)
    print("\nClassification report:")
    print(classification_report(y_true, preds, digits=4))

In [12]:
# ============================================================
# 10. Grid search over architecture x loss, keep the best model
# ============================================================

# Each row: (name, hid_dim, num_layers, lr, weight_decay).
# dropout and alpha are the same for every architecture.
arch_space = [
    {"name": arch_name, "hid_dim": width, "num_layers": depth, "dropout": 0.5,
     "lr": learning_rate, "weight_decay": decay, "alpha": 0.1}
    for arch_name, width, depth, learning_rate, decay in [
        ("arch1", 64,  8,  1e-2, 5e-4),
        ("arch2", 64,  16, 1e-2, 5e-4),
        ("arch3", 128, 16, 5e-3, 5e-4),
        ("arch4", 128, 32, 5e-3, 1e-3),
    ]
]

# Loss variants: unweighted CE, inverse-frequency CE, and two focal losses.
loss_configs = [
    {"name": "CE_no_weight",   "loss_type": "ce",
     "class_weights": torch.tensor([1.0, 1.0])},
    {"name": "CE_inv_freq",    "loss_type": "ce",
     "class_weights": norm_inv_freq},
    {"name": "Focal_gamma1.5", "loss_type": "focal",
     "class_weights": norm_inv_freq, "gamma": 1.5},
    {"name": "Focal_gamma2.0", "loss_type": "focal",
     "class_weights": norm_inv_freq, "gamma": 2.0},
]

best_model, best_arch, best_loss_cfg = None, None, None
best_val_macro = -1.0

# Architectures form the outer dimension, losses the inner one.
for arch, lc in [(a, l) for a in arch_space for l in loss_configs]:
    # Per-run config: the architecture settings with a combined run name,
    # plus gamma when the loss is focal.
    cfg = {**arch, "name": arch["name"] + "_" + lc["name"]}
    if lc["loss_type"] == "focal":
        cfg["gamma"] = lc["gamma"]

    print(f"\n=== Training config: {cfg['name']} (loss={lc['loss_type']}) ===")
    model_cfg, val_macro = train_one_config(
        cfg,
        loss_type=lc["loss_type"],
        class_weights=lc["class_weights"],
        max_epochs=300,
        patience=40,
        verbose=False,
    )
    print(f"Config {cfg['name']} val_macro = {val_macro:.4f}")

    # Strict comparison: on ties the earlier configuration is kept.
    if val_macro > best_val_macro:
        best_val_macro, best_model = val_macro, model_cfg
        best_arch, best_loss_cfg = arch, lc

print("\n>>> BEST CONFIG OVERALL")
print("Best arch:", best_arch)
print("Best loss config:", best_loss_cfg)
print("Best val macro-F1:", best_val_macro)



=== Training config: arch1_CE_no_weight (loss=ce) ===
Config arch1_CE_no_weight val_macro = 0.4444

=== Training config: arch1_CE_inv_freq (loss=ce) ===
Config arch1_CE_inv_freq val_macro = 0.8062

=== Training config: arch1_Focal_gamma1.5 (loss=focal) ===
Config arch1_Focal_gamma1.5 val_macro = 0.8071

=== Training config: arch1_Focal_gamma2.0 (loss=focal) ===
Config arch1_Focal_gamma2.0 val_macro = 0.6712

=== Training config: arch2_CE_no_weight (loss=ce) ===
Config arch2_CE_no_weight val_macro = 0.4444

=== Training config: arch2_CE_inv_freq (loss=ce) ===
Config arch2_CE_inv_freq val_macro = 0.5663

=== Training config: arch2_Focal_gamma1.5 (loss=focal) ===
Config arch2_Focal_gamma1.5 val_macro = 0.5663

=== Training config: arch2_Focal_gamma2.0 (loss=focal) ===
Config arch2_Focal_gamma2.0 val_macro = 0.5663

=== Training config: arch3_CE_no_weight (loss=ce) ===
Config arch3_CE_no_weight val_macro = 0.4444

=== Training config: arch3_CE_inv_freq (loss=ce) ===
Config arch3_CE_inv_fr

In [13]:
# ============================================================
# 11. Tune the threshold on VAL, then evaluate on train / val / test
# ============================================================

# The threshold is chosen on the validation graph only.
best_threshold, _ = tune_threshold_on_val(best_model, val_data)

# Report every split at that same threshold.
for split_name, split_data in (("TRAIN", train_data), ("VAL", val_data), ("TEST", test_data)):
    evaluate_with_threshold(best_model, split_data, best_threshold, name=split_name)


=== Threshold search on VAL ===
th=0.10  F1_scam=0.3433  macro_F1=0.2173
th=0.15  F1_scam=0.3543  macro_F1=0.2754
th=0.20  F1_scam=0.3586  macro_F1=0.2966
th=0.25  F1_scam=0.3607  macro_F1=0.3037
th=0.30  F1_scam=0.6200  macro_F1=0.7608
th=0.35  F1_scam=0.6526  macro_F1=0.7859
th=0.40  F1_scam=0.6804  macro_F1=0.8063
th=0.45  F1_scam=0.6758  macro_F1=0.8047
th=0.50  F1_scam=0.6781  macro_F1=0.8071
th=0.55  F1_scam=0.5862  macro_F1=0.7560
th=0.60  F1_scam=0.4032  macro_F1=0.6572
th=0.65  F1_scam=0.3819  macro_F1=0.6463
th=0.70  F1_scam=0.3677  macro_F1=0.6391
th=0.75  F1_scam=0.3538  macro_F1=0.6319
th=0.80  F1_scam=0.3384  macro_F1=0.6237
th=0.85  F1_scam=0.3272  macro_F1=0.6178
th=0.90  F1_scam=0.3127  macro_F1=0.6100

>>> Best threshold = 0.50 (F1_scam=0.6781, macro_F1=0.8071)

TRAIN with threshold=0.50
F1 (scam): 0.8303492944714319
Micro-F1: 0.9264747162850383
Macro-F1: 0.891708250194773
Confusion matrix:
 [[37233  2667]
 [ 1000  8974]]

Classification report:
              precisi