In [1]:
!unzip -q "/content/Labeled-Transactions-based-Dataset-of-Ethereum-Network-master.zip" -d "/content/blte"

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import seaborn as sns

# ==========================
# 0. General configuration
# ==========================
# Seed passed to the data splits, the estimators and the randomized search below.
RANDOM_STATE = 42
# Path of the dataset inside the extracted archive.
DATA_PATH = r"/content/blte/Labeled-Transactions-based-Dataset-of-Ethereum-Network-master/FinalDataset.xlsx"
# The file is an Excel workbook (.xlsx), so it is loaded with pd.read_excel.

In [3]:
# ==========================
# 1. Load the final dataset
# ==========================
df = pd.read_excel(DATA_PATH)

# Quick sanity check of what was loaded: row count, then the raw column names.
print("Số dòng ban đầu:", df.shape[0])
print("Các cột:", list(df.columns))

Số dòng ban đầu: 71250
Các cột: ['hash', 'nonce', 'transaction_index', 'from_address', 'to_address', 'value', 'gas', 'gas_price', 'input', 'receipt_cumulative_gas_used', 'receipt_gas_used', 'block_timestamp', 'block_number', 'block_hash', 'from_scam', 'to_scam', 'from_category', 'to_category']


In [4]:
# ==========================
# 2. Build the transaction-level label
# ==========================
# Scam-flag columns actually present in the loaded frame.
scam_flag_cols = [col for col in ("from_scam", "to_scam") if col in df.columns]
if not scam_flag_cols:
    # Without any flag column the label cannot be derived. Fail with an explicit
    # message rather than an opaque AttributeError from calling .astype on a bool.
    raise KeyError("Neither 'from_scam' nor 'to_scam' is present; cannot build the label.")

# Missing flags (NaN) are treated as "not scam".
df[scam_flag_cols] = df[scam_flag_cols].fillna(0)

# 1 = abnormal when the sender or the receiver is flagged as scam, else 0.
df["label"] = df[scam_flag_cols].eq(1).any(axis=1).astype(int)

LABEL_COL = "label"

print("Phân bố nhãn (0=normal, 1=abnormal):")
print(df[LABEL_COL].value_counts())

# Target vector used by the split and training cells.
y = df[LABEL_COL].values

Phân bố nhãn (0=normal, 1=abnormal):
label
0    57000
1    14250
Name: count, dtype: int64


In [5]:
# ==========================
# 3. Select features (avoid label leakage)
# ==========================

# Identifier-like columns that are NOT used as features:
drop_id_cols = [
    "hash",          # transaction hash
    "from_address",  # sender address
    "to_address",    # receiver address
    "block_hash",
    "input"          # raw input data, long text
]

# Label-related columns: the scam flags the label is built from, the category
# columns, and the label itself.
drop_label_cols = [
    "from_scam", "to_scam",
    "from_category", "to_category",
    LABEL_COL
]

cols_to_drop = [c for c in drop_id_cols + drop_label_cols if c in df.columns]

# Start from every column except the dropped ones.
feature_candidates = [c for c in df.columns if c not in cols_to_drop]
X_df = df[feature_candidates].copy()

# ---- HANDLE block_timestamp BEFORE THE NUMERIC FILTER ----
# A datetime/string timestamp is converted to whole seconds since the Unix epoch;
# a column that is already numeric (unix time) is kept unchanged.
# is_numeric_dtype also copes with pandas extension dtypes (e.g. tz-aware
# datetimes), which np.issubdtype cannot interpret.
if "block_timestamp" in X_df.columns and not pd.api.types.is_numeric_dtype(X_df["block_timestamp"]):
    # errors="coerce" turns unparseable cells into NaT; utc=True gives one
    # consistent timezone (naive values are interpreted as UTC).
    parsed_ts = pd.to_datetime(X_df["block_timestamp"], errors="coerce", utc=True)

    n_bad_ts = int(parsed_ts.isna().sum())
    if n_bad_ts:
        print(f"Warning: {n_bad_ts} block_timestamp value(s) could not be parsed and become NaN")

    # Epoch arithmetic instead of astype("int64") // 10**9: it does not depend on
    # the datetime resolution and maps NaT to NaN rather than a garbage integer.
    unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
    X_df["block_timestamp"] = (parsed_ts - unix_epoch) // pd.Timedelta(seconds=1)

# Keep only numeric columns for the ML models.
non_numeric = X_df.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric:
    print("Bỏ cột không phải số:", non_numeric)
    X_df = X_df.drop(columns=non_numeric)

# NOTE(review): block_timestamp / block_number stay in the feature set; confirm
# they do not act as a proxy for the label in this dataset.
feature_cols = X_df.columns.tolist()
print("Feature dùng để train:", feature_cols)

# Feature matrix, row-aligned with `y` (no rows are added or removed here).
X = X_df.values


Feature dùng để train: ['nonce', 'transaction_index', 'value', 'gas', 'gas_price', 'receipt_cumulative_gas_used', 'receipt_gas_used', 'block_timestamp', 'block_number']


In [6]:
# ==========================
# 4. Train / validation / test split (stratified)
# ==========================
# Fractions of the FULL dataset reserved for the test and validation sets.
TEST_SIZE = 0.15
VAL_SIZE  = 0.15

# Step 1: hold out the TEST set, preserving the class ratio.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Step 2: split what is left into TRAIN and VAL.
# VAL_SIZE is relative to the full data, so rescale it to the remaining part.
val_ratio_in_temp = VAL_SIZE / (1.0 - TEST_SIZE)  # ~0.1765

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_ratio_in_temp, stratify=y_temp, random_state=RANDOM_STATE
)

def show_stats(name, yy):
    """Print the class-0 / class-1 counts of a label array and the class-1 share.

    Args:
        name: Short tag for the split, left-aligned in a 5-character field.
        yy: Array of non-negative integer labels (0 = normal, 1 = abnormal).

    The share is reported as 0 when there are no class-0/class-1 samples at all.
    """
    # minlength=2 guarantees both slots exist even if a class is absent.
    tally = np.bincount(yy, minlength=2)
    normal_count, scam_count = tally[0], tally[1]
    total = normal_count + scam_count
    if total > 0:
        scam_share = scam_count / total
    else:
        scam_share = 0
    print(f"{name:5s}: 0 = {normal_count:6d}, 1 = {scam_count:6d}, scam_ratio = {scam_share:.6f}")

print("\n=== Phân bố nhãn sau khi chia ===")
# Report the full label vector first, then each split, in a fixed order.
for split_name, split_labels in (
    ("ALL", y),
    ("Train", y_train),
    ("Val", y_val),
    ("Test", y_test),
):
    show_stats(split_name, split_labels)



=== Phân bố nhãn sau khi chia ===
ALL  : 0 =  57000, 1 =  14250, scam_ratio = 0.200000
Train: 0 =  39900, 1 =   9974, scam_ratio = 0.199984
Val  : 0 =   8550, 1 =   2138, scam_ratio = 0.200037
Test : 0 =   8550, 1 =   2138, scam_ratio = 0.200037


In [7]:
# ==========================
# 5. Feature standardization
# ==========================
# The scaler statistics come from the training split only; the same fitted
# transform is then applied to every split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

# Merge train + val for the AutoML stage (the search runs its own internal CV).
X_train_full = np.concatenate([X_train_scaled, X_val_scaled], axis=0)
y_train_full = np.hstack([y_train, y_val])

In [8]:
# ==========================
# 6. Helper: auto-tune a single model
# ==========================
def tune_model(name, base_estimator, param_dist, X, y, n_iter=40):
    """Run a randomized hyper-parameter search for one estimator.

    Args:
        name: Label used in the printed progress / result lines.
        base_estimator: Estimator instance handed to the search.
        param_dist: Parameter distributions (or lists) to sample from.
        X: Feature matrix the search is fitted on.
        y: Target vector matching `X`.
        n_iter: Number of parameter settings to sample.

    Returns:
        The `best_estimator_` of the fitted search.
    """
    print(f"\n===== AutoML cho model: {name} =====")

    search_config = dict(
        estimator=base_estimator,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring="f1",  # F1 of class 1 (abnormal)
        cv=5,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=1,
    )
    tuner = RandomizedSearchCV(**search_config)
    tuner.fit(X, y)

    print(f"Best params ({name}):", tuner.best_params_)
    print(f"Best CV F1 ({name}): {tuner.best_score_:.4f}")
    return tuner.best_estimator_

# Class-imbalance ratio -> available as scale_pos_weight for the boosters.
n_pos = (y_train_full == 1).sum()
n_neg = (y_train_full == 0).sum()
if n_pos > 0:
    scale_pos_weight = n_neg / n_pos
else:
    # No positive samples: fall back to a neutral weight instead of dividing by zero.
    scale_pos_weight = 1.0
print("\nTỷ lệ n_neg / n_pos (scale_pos_weight):", scale_pos_weight)



Tỷ lệ n_neg / n_pos (scale_pos_weight): 4.000165125495377


7. AutoML cho RF / XGB / LGBM

In [None]:
# 7.1 Random Forest
rf_base = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)

# Search space sampled by the randomized search.
rf_param_dist = dict(
    n_estimators=randint(200, 600),
    max_depth=randint(3, 30),
    min_samples_split=randint(2, 50),
    min_samples_leaf=randint(1, 20),
    max_features=["sqrt", "log2", None],
    class_weight=[None, "balanced"],
)

best_rf = tune_model(
    "RandomForest",
    rf_base,
    rf_param_dist,
    X_train_full,
    y_train_full,
)



===== AutoML cho model: RandomForest =====
Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [None]:
# 7.2 XGBoost
# NOTE(review): `use_label_encoder` looks version-dependent (deprecated in recent
# XGBoost releases) -- confirm it is still wanted for the installed version.
xgb_base = XGBClassifier(
    tree_method="hist",
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=RANDOM_STATE,
)

# Search space; scale_pos_weight is pinned to the measured class ratio.
xgb_param_dist = dict(
    n_estimators=randint(200, 800),
    max_depth=randint(3, 12),
    learning_rate=uniform(0.01, 0.29),
    subsample=uniform(0.6, 0.4),
    colsample_bytree=uniform(0.6, 0.4),
    min_child_weight=randint(1, 10),
    gamma=uniform(0.0, 5.0),
    scale_pos_weight=[scale_pos_weight],
)

best_xgb = tune_model(
    "XGBoost",
    xgb_base,
    xgb_param_dist,
    X_train_full,
    y_train_full,
)

In [None]:
# 7.3 LightGBM
lgbm_base = LGBMClassifier(
    random_state=RANDOM_STATE,
    objective="binary",
    # LightGBM only applies `subsample` (row bagging) when the bagging frequency
    # is non-zero. With the default of 0 the `subsample` values sampled below
    # would be silently ignored, so bagging is enabled at every iteration.
    subsample_freq=1,
    n_jobs=-1
)

# Search space; scale_pos_weight is pinned to the measured class ratio.
lgbm_param_dist = {
    "n_estimators": randint(200, 800),
    "max_depth": randint(-1, 15),
    "num_leaves": randint(16, 256),
    "learning_rate": uniform(0.01, 0.29),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "min_child_samples": randint(5, 50),
    "scale_pos_weight": [scale_pos_weight]
}
best_lgbm = tune_model("LightGBM", lgbm_base, lgbm_param_dist, X_train_full, y_train_full)

In [None]:
# ==========================
# 8. Evaluation on TEST (real class distribution)
# ==========================
models = dict(
    RandomForest=best_rf,
    XGBoost=best_xgb,
    LightGBM=best_lgbm,
)

# Tick labels shared by both axes of every confusion-matrix plot.
class_names = ["normal", "abnormal"]

for name, model in models.items():
    print(f"\n===== {name} trên TEST =====")

    # Predict once and reuse the predictions for every metric below.
    y_pred_test = model.predict(X_test_scaled)
    print("Accuracy:", accuracy_score(y_test, y_pred_test))
    print("F1 (abnormal):", f1_score(y_test, y_pred_test))
    print(classification_report(y_test, y_pred_test, digits=4))

    cm = confusion_matrix(y_test, y_pred_test)
    print("Confusion matrix:\n", cm)

    # Plot the confusion matrix (optional)
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f"Confusion Matrix (Test) - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    plt.show()