In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [2]:
# ============================================================
# 0. Import
# ============================================================
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
from torch_geometric.nn import GCN2Conv

In [3]:
# ============================================================
# 1. Đọc dữ liệu Elliptic, merge, tạo label, xử lý cột
# ============================================================

# Sửa lại cho đúng path trên máy bạn
DATA_DIR = "/kaggle/input/elliptic/Elliptic++ Dataset"
TXS_FEATURES_FILE = "txs_features.csv"
TXS_CLASSES_FILE  = "txs_classes.csv"
TXS_EDGELIST_FILE = "txs_edgelist.csv"

df_feat = pd.read_csv(f"{DATA_DIR}/{TXS_FEATURES_FILE}")
df_cls  = pd.read_csv(f"{DATA_DIR}/{TXS_CLASSES_FILE}")
df_edge = pd.read_csv(f"{DATA_DIR}/{TXS_EDGELIST_FILE}")

print("txs_features shape:", df_feat.shape)
print("txs_classes  shape:", df_cls.shape)
print("txs_edgelist shape:", df_edge.shape)

# Merge theo txId
df = df_feat.merge(df_cls, on="txId", how="left")

# Bỏ các dòng không có nhãn
df = df.dropna()
df["class"] = df["class"].astype(int)

print("\nClass raw distribution (1=licit, 2=illicit, 3=unknown):")
print(df["class"].value_counts())

# Bỏ class=3 (unknown)
df = df[df["class"] != 3].copy()

# Map nhãn: 1 -> 0, 2 -> 1
label_map = {1: 0, 2: 1}
df["label"] = df["class"].map(label_map)

print("\nLabel distribution (0=licit,1=illicit):")
print(df["label"].value_counts())

# Đảm bảo có cột Time step
if "Time step" not in df.columns:
    raise ValueError("Không tìm thấy cột 'Time step' trong df!")

txs_features shape: (203769, 184)
txs_classes  shape: (203769, 2)
txs_edgelist shape: (234355, 2)

Class raw distribution (1=licit, 2=illicit, 3=unknown):
class
3    156759
2     41500
1      4545
Name: count, dtype: int64

Label distribution (0=licit,1=illicit):
label
1    41500
0     4545
Name: count, dtype: int64


In [4]:
# ============================================================
# 2. Chọn feature: bỏ ID, nhãn, time_step
# ============================================================

cols_to_exclude = {"txId", "class", "label", "Time step"}

# Chỉ lấy numeric
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in numeric_cols if c not in cols_to_exclude]

print("\nSố feature:", len(feature_cols))
print("Một vài feature đầu:", feature_cols[:10])

X_all = df[feature_cols].astype(float).values
y_all = df["label"].values
txid_all = df["txId"].astype(int).values
time_all = df["Time step"].values

# Chuẩn hóa feature
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all)


Số feature: 182
Một vài feature đầu: ['Local_feature_1', 'Local_feature_2', 'Local_feature_3', 'Local_feature_4', 'Local_feature_5', 'Local_feature_6', 'Local_feature_7', 'Local_feature_8', 'Local_feature_9', 'Local_feature_10']


In [5]:
# ============================================================
# 3. Chia train / val / test theo Time step
#    - train: Time step <= 30
#    - val  : 31..35
#    - test : > 35
# (Không dùng Time step làm feature, chỉ dùng để chia tập)
# ============================================================

unique_ts = np.sort(df["Time step"].unique())
print("\nTime steps unique:", unique_ts)

train_ts = unique_ts[unique_ts <= 30]
val_ts   = unique_ts[(unique_ts > 30) & (unique_ts <=39)]
test_ts  = unique_ts[unique_ts > 39]

# Nếu vì lý do nào đó mà val/test rỗng, fallback chia tỉ lệ
if (len(train_ts) == 0) or (len(val_ts) == 0) or (len(test_ts) == 0):
    print("\n[Cảnh báo] Tập time_step chia theo 34/41 không hợp lệ, fallback 60/20/20 theo time.")
    n_ts = len(unique_ts)
    idx_train_end = int(0.6 * n_ts)
    idx_val_end   = int(0.8 * n_ts)
    train_ts = unique_ts[:idx_train_end]
    val_ts   = unique_ts[idx_train_end:idx_val_end]
    test_ts  = unique_ts[idx_val_end:]

print("\nTime-step TRAIN:", train_ts[0], "->", train_ts[-1])
print("Time-step VAL  :", val_ts[0],   "->", val_ts[-1])
print("Time-step TEST :", test_ts[0],  "->", test_ts[-1])

train_mask = np.isin(time_all, train_ts)
val_mask   = np.isin(time_all, val_ts)
test_mask  = np.isin(time_all, test_ts)

X_train = X_all_scaled[train_mask]
X_val   = X_all_scaled[val_mask]
X_test  = X_all_scaled[test_mask]

y_train = y_all[train_mask]
y_val   = y_all[val_mask]
y_test  = y_all[test_mask]

txid_train = txid_all[train_mask]
txid_val   = txid_all[val_mask]
txid_test  = txid_all[test_mask]

print("\nKích thước:")
print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)
print("\nPhân bố nhãn train:")
print(pd.Series(y_train).value_counts())
print("\nPhân bố nhãn val:")
print(pd.Series(y_val).value_counts())
print("\nPhân bố nhãn test:")
print(pd.Series(y_test).value_counts())


Time steps unique: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49]

Time-step TRAIN: 1 -> 30
Time-step VAL  : 31 -> 39
Time-step TEST : 40 -> 49

Kích thước:
Train: (26750, 182) Val: (8221, 182) Test: (11074, 182)

Phân bố nhãn train:
1    23796
0     2954
Name: count, dtype: int64

Phân bố nhãn val:
1    7266
0     955
Name: count, dtype: int64

Phân bố nhãn test:
1    10438
0      636
Name: count, dtype: int64


In [6]:
# ============================================================
# 4. Build edge_index cho từng subset (train/val/test)
#    Dùng txs_edgelist, chuyển sang graph vô hướng
# ============================================================

def build_edge_index(df_edges, valid_txids):
    """
    df_edges: DataFrame với cột txId1, txId2 (ID node gốc trong toàn graph)
    valid_txids: mảng txId thuộc tập node (train/val/test)
    Trả:
      edge_index: LongTensor [2, num_edges] với index nội bộ [0..num_nodes-1]
    """
    node_ids = np.asarray(valid_txids, dtype=np.int64)
    id2idx = {tid: i for i, tid in enumerate(node_ids)}

    mask = df_edges["txId1"].isin(node_ids) & df_edges["txId2"].isin(node_ids)
    edges_sub = df_edges.loc[mask, ["txId1", "txId2"]]

    if len(edges_sub) == 0:
        # graph rỗng, tạo edge_index size (2, 0)
        edges_idx = np.zeros((2, 0), dtype=np.int64)
        edge_index = torch.tensor(edges_idx, dtype=torch.long)
        return edge_index

    src_idx = edges_sub["txId1"].map(id2idx).values
    dst_idx = edges_sub["txId2"].map(id2idx).values
    edges_idx = np.vstack([src_idx, dst_idx])

    edge_index = torch.tensor(edges_idx, dtype=torch.long)
    edge_index = to_undirected(edge_index)
    return edge_index

edge_index_train = build_edge_index(df_edge, txid_train)
edge_index_val   = build_edge_index(df_edge, txid_val)
edge_index_test  = build_edge_index(df_edge, txid_test)

print("\nTrain edges:", edge_index_train.size(1),
      "Val edges:", edge_index_val.size(1),
      "Test edges:", edge_index_test.size(1))


Train edges: 40860 Val edges: 13172 Test edges: 18792


In [7]:
# ============================================================
# 5. Tạo PyG Data object
# ============================================================

X_train_gcn = torch.tensor(X_train, dtype=torch.float)
X_val_gcn   = torch.tensor(X_val,   dtype=torch.float)
X_test_gcn  = torch.tensor(X_test,  dtype=torch.float)

y_train_gcn = torch.tensor(y_train, dtype=torch.long)
y_val_gcn   = torch.tensor(y_val,   dtype=torch.long)
y_test_gcn  = torch.tensor(y_test,  dtype=torch.long)

train_data = Data(x=X_train_gcn, edge_index=edge_index_train, y=y_train_gcn)
val_data   = Data(x=X_val_gcn,   edge_index=edge_index_val,   y=y_val_gcn)
test_data  = Data(x=X_test_gcn,  edge_index=edge_index_test,  y=y_test_gcn)

# Lưu txId (cho debug nếu cần)
train_data.node_ids = torch.tensor(txid_train, dtype=torch.long)
val_data.node_ids   = torch.tensor(txid_val,   dtype=torch.long)
test_data.node_ids  = torch.tensor(txid_test,  dtype=torch.long)

print("\ntrain_data:", train_data)
print("val_data  :", val_data)
print("test_data :", test_data)

print("\nCheck NaN in train features:", torch.isnan(train_data.x).any().item())
print("Check Inf in train features:", torch.isinf(train_data.x).any().item())


train_data: Data(x=[26750, 182], edge_index=[2, 40860], y=[26750], node_ids=[26750])
val_data  : Data(x=[8221, 182], edge_index=[2, 13172], y=[8221], node_ids=[8221])
test_data : Data(x=[11074, 182], edge_index=[2, 18792], y=[11074], node_ids=[11074])

Check NaN in train features: False
Check Inf in train features: False


In [8]:
# ============================================================
# 6. Device + class weights (giống blte_gcn)
# ============================================================

if hasattr(torch, "xpu") and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("\nDevice:", device)

class_sample_count = torch.bincount(train_data.y, minlength=2).float()
eps = 1e-8
inv_freq = 1.0 / (class_sample_count + eps)
norm_inv_freq = inv_freq / inv_freq.min()

print("Class counts (train):", class_sample_count.tolist())
print("Class weights (inv_freq normalized):", norm_inv_freq.tolist())


Device: cuda
Class counts (train): [2954.0, 23796.0]
Class weights (inv_freq normalized): [8.05551815032959, 1.0]


In [9]:
from torch_geometric.nn import SAGEConv
class GraphSAGE(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, num_layers=2, dropout=0.5):
        super().__init__()
        self.dropout = dropout

        convs = []
        # layer đầu: in_dim -> hid_dim
        convs.append(SAGEConv(in_dim, hid_dim))
        # các layer hidden: hid_dim -> hid_dim
        for _ in range(num_layers - 1):
            convs.append(SAGEConv(hid_dim, hid_dim))
        self.convs = nn.ModuleList(convs)

        self.lin_out = nn.Linear(hid_dim, out_dim)

    def forward(self, x, edge_index):
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        out = self.lin_out(x)
        return out

In [10]:


class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, weight=None, reduction="mean"):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, logits, target):
        logp = F.log_softmax(logits, dim=1)
        p = logp.exp()

        target = target.view(-1, 1)
        logp_t = logp.gather(1, target).squeeze(1)
        p_t    = p.gather(1, target).squeeze(1)

        focal_term = (1 - p_t) ** self.gamma
        loss = - focal_term * logp_t

        if self.weight is not None:
            w = self.weight[target.squeeze(1)].view(-1)
            loss = loss * w

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        else:
            return loss


def train_one_config(config, loss_type="ce", class_weights=None,
                     max_epochs=400, patience=30, verbose=False):
    in_dim  = train_data.x.size(1)
    out_dim = 2

    model = GraphSAGE(
        in_dim=in_dim,
        hid_dim=config["hid_dim"],
        out_dim=out_dim,
        num_layers=config["num_layers"],
        dropout=config.get("dropout", 0.5),
    ).to(device)

    cw = class_weights.to(device) if class_weights is not None else None

    if loss_type == "ce":
        crit = nn.CrossEntropyLoss(weight=cw)
    elif loss_type == "focal":
        crit = FocalLoss(gamma=config.get("gamma", 2.0), weight=cw)
    else:
        raise ValueError("loss_type must be 'ce' or 'focal'")

    opt = torch.optim.Adam(
        model.parameters(),
        lr=config["lr"],
        weight_decay=config["weight_decay"],
    )

    def eval_for_search(data):
        model.eval()
        with torch.no_grad():
            out = model(data.x.to(device), data.edge_index.to(device))
            loss = crit(out, data.y.to(device)).item()
            preds = out.argmax(dim=1).cpu().numpy()
            y_true = data.y.cpu().numpy()
            macro_f1 = f1_score(y_true, preds, average="macro", zero_division=0)
        return loss, macro_f1

    best_state = None
    best_val_macro = -1.0
    patience_counter = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        opt.zero_grad()
        out = model(train_data.x.to(device), train_data.edge_index.to(device))
        loss_train = crit(out, train_data.y.to(device))
        loss_train.backward()
        opt.step()

        val_loss, val_macro = eval_for_search(val_data)

        if verbose and (epoch % 20 == 0 or epoch == 1):
            print(f"[{config.get('name','?')}] Epoch {epoch:03d} "
                  f"- train_loss={loss_train.item():.4f} "
                  f"- val_loss={val_loss:.4f} "
                  f"- val_macro={val_macro:.4f}")

        if val_macro > best_val_macro + 1e-4:
            best_val_macro = val_macro
            best_state = torch.save(model.state_dict(), "tmp_best_tx_model.pt")
            best_state = torch.load("tmp_best_tx_model.pt", map_location="cpu")
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            if verbose:
                print(f"Early stop (no improve {patience} epochs)")
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val_macro

In [11]:
# ============================================================
# 8. Hàm tune threshold + evaluate_with_threshold
# ============================================================

@torch.no_grad()
def tune_threshold_on_val(model, data, thresholds=None):
    model.eval()
    if thresholds is None:
        thresholds = np.linspace(0.1, 0.9, 17)

    out = model(data.x.to(device), data.edge_index.to(device))
    probs = F.softmax(out, dim=1)[:, 1].cpu().numpy()  # xác suất class=1 (illicit)
    y_true = data.y.cpu().numpy()

    results = []
    for t in thresholds:
        preds = (probs >= t).astype(int)
        macro_f1 = f1_score(y_true, preds, average="macro", zero_division=0)
        results.append((t, macro_f1))

    best_t, best_macro = max(results, key=lambda x: x[1])
    print("\nThreshold search (val):")
    for t, m in results:
        print(f"  t={t:.2f}  macro-F1={m:.4f}")
    print(f"\nBest threshold on VAL: t={best_t:.2f}, macro-F1={best_macro:.4f}")
    return best_t, best_macro


@torch.no_grad()
def evaluate_with_threshold(model, data, threshold=0.5, name="SET"):
    model.eval()
    out = model(data.x.to(device), data.edge_index.to(device))
    probs = F.softmax(out, dim=1)[:, 1].cpu().numpy()
    y_true = data.y.cpu().numpy()
    preds = (probs >= threshold).astype(int)

    f1_scam  = f1_score(y_true, preds, pos_label=1, zero_division=0)
    micro_f1 = f1_score(y_true, preds, average="micro", zero_division=0)
    macro_f1 = f1_score(y_true, preds, average="macro", zero_division=0)
    cm = confusion_matrix(y_true, preds, labels=[0, 1])

    print(f"\n{name} with threshold={threshold:.2f}")
    print("F1 (illicit=1):", f1_scam)
    print("Micro-F1:", micro_f1)
    print("Macro-F1:", macro_f1)
    print("Confusion matrix:\n", cm)
    print("\nClassification report:")
    print(classification_report(y_true, preds, digits=4))

In [12]:
# ============================================================
# 9. Grid search arch × loss, chọn best model theo macro-F1 VAL
# ============================================================

arch_space = [
    {"name": "arch1", "hid_dim": 64,  "num_layers": 8,  "dropout": 0.5,
     "lr": 1e-2, "weight_decay": 5e-4, "alpha": 0.1},
    {"name": "arch2", "hid_dim": 64,  "num_layers": 16, "dropout": 0.5,
     "lr": 1e-2, "weight_decay": 5e-4, "alpha": 0.1},
    {"name": "arch3", "hid_dim": 128, "num_layers": 16, "dropout": 0.5,
     "lr": 5e-3, "weight_decay": 5e-4, "alpha": 0.1},
    {"name": "arch4", "hid_dim": 128, "num_layers": 32, "dropout": 0.5,
     "lr": 5e-3, "weight_decay": 1e-3,"alpha": 0.1},
]

loss_configs = [
    {"name": "CE_no_weight",   "loss_type": "ce",
     "class_weights": torch.tensor([1.0, 1.0])},
    {"name": "CE_inv_freq",    "loss_type": "ce",
     "class_weights": norm_inv_freq},
    {"name": "Focal_gamma1.5", "loss_type": "focal",
     "class_weights": norm_inv_freq, "gamma": 1.5},
    {"name": "Focal_gamma2.0", "loss_type": "focal",
     "class_weights": norm_inv_freq, "gamma": 2.0},
]

best_model = None
best_arch  = None
best_loss_cfg = None
best_val_macro = -1.0

for arch in arch_space:
    for lc in loss_configs:
        cfg = arch.copy()
        cfg["name"] = arch["name"] + "_" + lc["name"]
        if lc["loss_type"] == "focal":
            cfg["gamma"] = lc["gamma"]

        print(f"\n=== Training config: {cfg['name']} (loss={lc['loss_type']}) ===")
        model_cfg, val_macro = train_one_config(
            cfg,
            loss_type=lc["loss_type"],
            class_weights=lc["class_weights"],
            max_epochs=300,
            patience=40,
            verbose=False,
        )
        print(f"Config {cfg['name']}  val_macro = {val_macro:.4f}")

        if val_macro > best_val_macro:
            best_val_macro = val_macro
            best_model = model_cfg
            best_arch = arch
            best_loss_cfg = lc

print("\n>>> BEST CONFIG OVERALL")
print("Best arch:", best_arch)
print("Best loss config:", best_loss_cfg)
print("Best val macro-F1:", best_val_macro)


=== Training config: arch1_CE_no_weight (loss=ce) ===
Config arch1_CE_no_weight  val_macro = 0.4692

=== Training config: arch1_CE_inv_freq (loss=ce) ===
Config arch1_CE_inv_freq  val_macro = 0.8552

=== Training config: arch1_Focal_gamma1.5 (loss=focal) ===
Config arch1_Focal_gamma1.5  val_macro = 0.8601

=== Training config: arch1_Focal_gamma2.0 (loss=focal) ===
Config arch1_Focal_gamma2.0  val_macro = 0.8578

=== Training config: arch2_CE_no_weight (loss=ce) ===
Config arch2_CE_no_weight  val_macro = 0.4692

=== Training config: arch2_CE_inv_freq (loss=ce) ===
Config arch2_CE_inv_freq  val_macro = 0.5770

=== Training config: arch2_Focal_gamma1.5 (loss=focal) ===
Config arch2_Focal_gamma1.5  val_macro = 0.5770

=== Training config: arch2_Focal_gamma2.0 (loss=focal) ===
Config arch2_Focal_gamma2.0  val_macro = 0.5770

=== Training config: arch3_CE_no_weight (loss=ce) ===
Config arch3_CE_no_weight  val_macro = 0.4692

=== Training config: arch3_CE_inv_freq (loss=ce) ===
Config arch3_

In [13]:
# ============================================================
# 10. Tuning threshold & Evaluate trên train/val/test
# ============================================================

best_threshold, _ = tune_threshold_on_val(best_model, val_data)

evaluate_with_threshold(best_model, train_data, best_threshold, name="TRAIN")
evaluate_with_threshold(best_model, val_data,   best_threshold, name="VAL")
evaluate_with_threshold(best_model, test_data,  best_threshold, name="TEST")


Threshold search (val):
  t=0.10  macro-F1=0.7339
  t=0.15  macro-F1=0.8473
  t=0.20  macro-F1=0.8641
  t=0.25  macro-F1=0.8617
  t=0.30  macro-F1=0.8630
  t=0.35  macro-F1=0.8649
  t=0.40  macro-F1=0.8672
  t=0.45  macro-F1=0.8654
  t=0.50  macro-F1=0.8601
  t=0.55  macro-F1=0.8452
  t=0.60  macro-F1=0.8142
  t=0.65  macro-F1=0.7862
  t=0.70  macro-F1=0.7524
  t=0.75  macro-F1=0.7216
  t=0.80  macro-F1=0.6893
  t=0.85  macro-F1=0.6541
  t=0.90  macro-F1=0.6178

Best threshold on VAL: t=0.40, macro-F1=0.8672

TRAIN with threshold=0.40
F1 (illicit=1): 0.995940004627974
Micro-F1: 0.9927850467289719
Macro-F1: 0.9817868730166535
Confusion matrix:
 [[ 2885    69]
 [  124 23672]]

Classification report:
              precision    recall  f1-score   support

           0     0.9588    0.9766    0.9676      2954
           1     0.9971    0.9948    0.9959     23796

    accuracy                         0.9928     26750
   macro avg     0.9779    0.9857    0.9818     26750
weighted avg     0.9