In [1]:
# =========================================
# 0. Import & cấu hình chung
# =========================================
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.base import clone
from scipy.stats import randint, uniform

RANDOM_STATE = 42

# Thư mục chứa các file Elliptic (sửa lại cho đúng máy bạn)
DATA_DIR = r"D:\elliptic\Elliptic_Dataset"
TXS_FEATURES_FILE = "txs_features.csv"
TXS_CLASSES_FILE  = "txs_classes.csv"
TXS_EDGELIST_FILE = "txs_edgelist.csv"  # nếu cần cho GNN thì dùng thêm, ở đây chưa cần

In [2]:
# =========================================
# 1. Load dataset
# =========================================
import os

df_txs_features = pd.read_csv(os.path.join(DATA_DIR, TXS_FEATURES_FILE))
df_txs_classes  = pd.read_csv(os.path.join(DATA_DIR, TXS_CLASSES_FILE))

print("txs_features shape:", df_txs_features.shape)
print("txs_classes  shape:", df_txs_classes.shape)
print("txs_features columns:", df_txs_features.columns.tolist())
print("txs_classes  columns:", df_txs_classes.columns.tolist())


txs_features shape: (203769, 184)
txs_classes  shape: (203769, 2)
txs_features columns: ['txId', 'Time step', 'Local_feature_1', 'Local_feature_2', 'Local_feature_3', 'Local_feature_4', 'Local_feature_5', 'Local_feature_6', 'Local_feature_7', 'Local_feature_8', 'Local_feature_9', 'Local_feature_10', 'Local_feature_11', 'Local_feature_12', 'Local_feature_13', 'Local_feature_14', 'Local_feature_15', 'Local_feature_16', 'Local_feature_17', 'Local_feature_18', 'Local_feature_19', 'Local_feature_20', 'Local_feature_21', 'Local_feature_22', 'Local_feature_23', 'Local_feature_24', 'Local_feature_25', 'Local_feature_26', 'Local_feature_27', 'Local_feature_28', 'Local_feature_29', 'Local_feature_30', 'Local_feature_31', 'Local_feature_32', 'Local_feature_33', 'Local_feature_34', 'Local_feature_35', 'Local_feature_36', 'Local_feature_37', 'Local_feature_38', 'Local_feature_39', 'Local_feature_40', 'Local_feature_41', 'Local_feature_42', 'Local_feature_43', 'Local_feature_44', 'Local_feature_45',

In [3]:
# =========================================
# 2. Merge & xử lý label
# =========================================
# Giả định:
#   - df_txs_features có cột 'txId' và 'Time step' (hoặc 'time_step')
#   - df_txs_classes có cột 'txId' và 'class'
#   - class:
#       1 = licit
#       2 = illicit
#       3 = unknown
# Ta drop class=3, và tạo label nhị phân: 0=licit, 1=illicit

df = df_txs_features.merge(df_txs_classes, on="txId", how="left")

if "class" not in df.columns:
    raise ValueError("Không tìm thấy cột 'class' sau khi merge!")

print("\nPhân bố class ban đầu:")
print(df["class"].value_counts(dropna=False))

# Bỏ các giao dịch không có nhãn hoặc nhãn 'unknown' = 3
df = df.dropna(subset=["class"]).copy()
df["class"] = df["class"].astype(int)
df = df[df["class"] != 3].copy()

# Tạo label nhị phân: 0=licit, 1=illicit
df["label"] = (df["class"] == 2).astype(int)

LABEL_COL = "label"
print("\nPhân bố label sau khi bỏ unknown (0=licit,1=illicit):")
print(df[LABEL_COL].value_counts())


Phân bố class ban đầu:
class
3    157205
2     42019
1      4545
Name: count, dtype: int64

Phân bố label sau khi bỏ unknown (0=licit,1=illicit):
label
1    42019
0     4545
Name: count, dtype: int64


In [4]:
# =========================================
# 3. Chọn feature (loại id/time/label)
# =========================================
# Cột id / thời gian không dùng làm feature
possible_ts_cols = ["Time step", "time_step"]
ts_col = None
for c in possible_ts_cols:
    if c in df.columns:
        ts_col = c
        break

if ts_col is None:
    raise ValueError("Không tìm thấy cột time-step (ví dụ 'Time step' hoặc 'time_step')!")

drop_id_cols = ["txId", ts_col]
drop_label_cols = ["class", LABEL_COL]

cols_to_drop = [c for c in drop_id_cols + drop_label_cols if c in df.columns]

feature_cols = [c for c in df.columns if c not in cols_to_drop]

X_df = df[feature_cols].copy()
y = df[LABEL_COL].values
time_steps = df[ts_col].values

print("\nSố feature:", len(feature_cols))
print("Một vài feature đầu:", feature_cols[:10])

# Chỉ giữ các cột số
num_cols = X_df.select_dtypes(include=[np.number]).columns.tolist()
X_df = X_df[num_cols].copy()
print("\nSố feature numeric:", len(num_cols))


Số feature: 182
Một vài feature đầu: ['Local_feature_1', 'Local_feature_2', 'Local_feature_3', 'Local_feature_4', 'Local_feature_5', 'Local_feature_6', 'Local_feature_7', 'Local_feature_8', 'Local_feature_9', 'Local_feature_10']

Số feature numeric: 182


In [5]:
# =========================================
# 4. Chia train/val/test THEO TIME_STEP (30/10/10)
# =========================================
unique_ts = np.sort(df[ts_col].unique())
n_ts = len(unique_ts)
print("\nSố time-step khác nhau:", n_ts)
print("Các time-step đầu:", unique_ts[:10], "...", unique_ts[-10:])

# Nếu đúng Elliptic (49 time-step), ta dùng 30/10/9
# Nếu >= 50, dùng đúng 30/10/10
# Nếu ít hơn thì chia theo tỉ lệ 60/20/20
if n_ts == 49:
    train_ts = unique_ts[:30]
    val_ts   = unique_ts[30:40]
    test_ts  = unique_ts[40:]
elif n_ts >= 50:
    train_ts = unique_ts[:30]
    val_ts   = unique_ts[30:40]
    test_ts  = unique_ts[40:50]
else:
    # fallback: chia theo tỉ lệ tương đối
    idx_train_end = int(0.6 * n_ts)
    idx_val_end   = int(0.8 * n_ts)
    train_ts = unique_ts[:idx_train_end]
    val_ts   = unique_ts[idx_train_end:idx_val_end]
    test_ts  = unique_ts[idx_val_end:]

print("\nTime-step TRAIN:", train_ts[0], "->", train_ts[-1])
print("Time-step VAL  :", val_ts[0],   "->", val_ts[-1])
print("Time-step TEST :", test_ts[0],  "->", test_ts[-1])

train_mask = df[ts_col].isin(train_ts)
val_mask   = df[ts_col].isin(val_ts)
test_mask  = df[ts_col].isin(test_ts)

X_train = X_df[train_mask].values
y_train = y[train_mask]

X_val   = X_df[val_mask].values
y_val   = y[val_mask]

X_test  = X_df[test_mask].values
y_test  = y[test_mask]

print("\nKích thước:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val  :", X_val.shape,   "y_val  :", y_val.shape)
print("X_test :", X_test.shape,  "y_test :", y_test.shape)

print("\nPhân bố nhãn train:")
print(pd.Series(y_train).value_counts())
print("\nPhân bố nhãn val:")
print(pd.Series(y_val).value_counts())
print("\nPhân bố nhãn test:")
print(pd.Series(y_test).value_counts())


Số time-step khác nhau: 49
Các time-step đầu: [ 1  2  3  4  5  6  7  8  9 10] ... [40 41 42 43 44 45 46 47 48 49]

Time-step TRAIN: 1 -> 30
Time-step VAL  : 31 -> 40
Time-step TEST : 41 -> 49

Kích thước:
X_train: (26905, 182) y_train: (26905,)
X_val  : (9686, 182) y_val  : (9686,)
X_test : (9973, 182) y_test : (9973,)

Phân bố nhãn train:
1    23951
0     2954
Name: count, dtype: int64

Phân bố nhãn val:
1    8619
0    1067
Name: count, dtype: int64

Phân bố nhãn test:
1    9449
0     524
Name: count, dtype: int64


In [6]:
# =========================================
# 5. Chuẩn hóa feature (fit trên TRAIN)
# =========================================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)


In [7]:
# =========================================
# 6. AutoML đơn giản: random search trên VAL (macro-F1)
#    Không dùng k-fold
# =========================================
def sample_param(dist, rng):
    """Lấy 1 giá trị từ distribution hoặc list."""
    if hasattr(dist, "rvs"):
        # scipy distribution
        return dist.rvs(random_state=rng)
    # list / tuple / set ...
    dist = list(dist)
    return dist[rng.randint(0, len(dist))]

def random_search_single_model(
    name,
    base_estimator,
    param_dist,
    X_train, y_train,
    X_val,   y_val,
    n_iter=30,
    scoring="macro"
):
    """
    Random search đơn giản:
      - Mỗi iter: sample 1 bộ siêu tham số
      - Train trên TRAIN
      - Đánh giá trên VAL theo F1-macro
      - Chọn bộ tốt nhất, rồi train lại trên TRAIN+VAL với bộ đó
    """
    print(f"\n===== Random search cho {name} (không k-fold, dùng VAL) =====")
    rng = np.random.RandomState(RANDOM_STATE)
    best_f1 = -1.0
    best_params = None

    for i in range(n_iter):
        params = {k: sample_param(v, rng) for k, v in param_dist.items()}

        model = clone(base_estimator)
        model.set_params(**params)
        model.fit(X_train, y_train)

        y_val_pred = model.predict(X_val)
        # macro-F1 dùng để chọn model
        f1_macro = f1_score(y_val, y_val_pred, average="macro")

        print(f"Iter {i+1:02d}/{n_iter}: F1_macro(val) = {f1_macro:.4f}, params = {params}")

        if f1_macro > best_f1:
            best_f1 = f1_macro
            best_params = params

    print(f"\n>>> {name} – best F1_macro(val) = {best_f1:.4f}")
    print("Best params:", best_params)

    # Train lại trên TRAIN+VAL với best_params trước khi test
    X_train_full = np.vstack([X_train, X_val])
    y_train_full = np.concatenate([y_train, y_val])

    best_model = clone(base_estimator)
    best_model.set_params(**best_params)
    best_model.fit(X_train_full, y_train_full)

    return best_model

# Tính scale_pos_weight cho XGB/LGBM
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
print("\nscale_pos_weight (train):", scale_pos_weight)


scale_pos_weight (train): 0.12333514258277316


In [8]:
# =========================================
# 7. Định nghĩa search space & chạy AutoML cho RF / XGB / LGBM
# =========================================

# 1) RandomForest
rf_base = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
rf_param_dist = {
    "n_estimators": randint(200, 600),
    "max_depth": randint(3, 30),
    "min_samples_split": randint(2, 50),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2", None],
    "class_weight": [None, "balanced"]
}

best_rf = random_search_single_model(
    "RandomForest",
    rf_base,
    rf_param_dist,
    X_train_scaled, y_train,
    X_val_scaled,   y_val,
    n_iter=30
)

# 2) XGBoost
xgb_base = XGBClassifier(
    random_state=RANDOM_STATE,
    tree_method="hist",      # hoặc "gpu_hist" nếu chạy GPU
    eval_metric="logloss",
    use_label_encoder=False
)

xgb_param_dist = {
    "n_estimators": randint(200, 800),
    "max_depth": randint(3, 12),
    "learning_rate": uniform(0.01, 0.29),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "min_child_weight": randint(1, 10),
    "gamma": uniform(0.0, 5.0),
    "scale_pos_weight": [scale_pos_weight]
}

best_xgb = random_search_single_model(
    "XGBoost",
    xgb_base,
    xgb_param_dist,
    X_train_scaled, y_train,
    X_val_scaled,   y_val,
    n_iter=30
)

# 3) LightGBM
lgbm_base = LGBMClassifier(
    random_state=RANDOM_STATE,
    objective="binary",
    n_jobs=-1
)

lgbm_param_dist = {
    "n_estimators": randint(200, 800),
    "max_depth": randint(-1, 15),
    "num_leaves": randint(16, 256),
    "learning_rate": uniform(0.01, 0.29),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "min_child_samples": randint(5, 50),
    "scale_pos_weight": [scale_pos_weight]
}

best_lgbm = random_search_single_model(
    "LightGBM",
    lgbm_base,
    lgbm_param_dist,
    X_train_scaled, y_train,
    X_val_scaled,   y_val,
    n_iter=30
)



===== Random search cho RandomForest (không k-fold, dùng VAL) =====
Iter 01/30: F1_macro(val) = 0.9220, params = {'n_estimators': 302, 'max_depth': 22, 'min_samples_split': 30, 'min_samples_leaf': 15, 'max_features': None, 'class_weight': 'balanced'}
Iter 02/30: F1_macro(val) = 0.8852, params = {'n_estimators': 388, 'max_depth': 23, 'min_samples_split': 40, 'min_samples_leaf': 19, 'max_features': None, 'class_weight': None}
Iter 03/30: F1_macro(val) = 0.9181, params = {'n_estimators': 287, 'max_depth': 23, 'min_samples_split': 37, 'min_samples_leaf': 8, 'max_features': None, 'class_weight': 'balanced'}
Iter 04/30: F1_macro(val) = 0.8360, params = {'n_estimators': 508, 'max_depth': 4, 'min_samples_split': 25, 'min_samples_leaf': 12, 'max_features': 'log2', 'class_weight': 'balanced'}
Iter 05/30: F1_macro(val) = 0.9213, params = {'n_estimators': 585, 'max_depth': 23, 'min_samples_split': 34, 'min_samples_leaf': 12, 'max_features': 'log2', 'class_weight': 'balanced'}
Iter 06/30: F1_macro

Parameters: { "use_label_encoder" } are not used.



Iter 01/30: F1_macro(val) = 0.8944, params = {'n_estimators': 302, 'max_depth': 6, 'learning_rate': np.float64(0.28570714885887566), 'subsample': np.float64(0.892797576724562), 'colsample_bytree': np.float64(0.8394633936788146), 'min_child_weight': 7, 'gamma': np.float64(2.229163764267956), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 02/30: F1_macro(val) = 0.8571, params = {'n_estimators': 414, 'max_depth': 10, 'learning_rate': np.float64(0.10677549723031632), 'subsample': np.float64(0.6571467271687763), 'colsample_bytree': np.float64(0.8603553891795411), 'min_child_weight': 5, 'gamma': np.float64(4.8495492608099715), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 03/30: F1_macro(val) = 0.8766, params = {'n_estimators': 691, 'max_depth': 8, 'learning_rate': np.float64(0.010225842093894155), 'subsample': np.float64(0.996884623716487), 'colsample_bytree': np.float64(0.8469926038510867), 'min_child_weight': 6, 'gamma': np.float64(0.03533152609858703), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 04/30: F1_macro(val) = 0.8456, params = {'n_estimators': 760, 'max_depth': 5, 'learning_rate': np.float64(0.12091397746747719), 'subsample': np.float64(0.9932923543227152), 'colsample_bytree': np.float64(0.786705157299192), 'min_child_weight': 5, 'gamma': np.float64(3.0377242595071916), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 05/30: F1_macro(val) = 0.8709, params = {'n_estimators': 220, 'max_depth': 11, 'learning_rate': np.float64(0.02886496196573106), 'subsample': np.float64(0.9795542149013333), 'colsample_bytree': np.float64(0.9862528132298237), 'min_child_weight': 2, 'gamma': np.float64(1.9270825126995805), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 06/30: F1_macro(val) = 0.9234, params = {'n_estimators': 545, 'max_depth': 7, 'learning_rate': np.float64(0.0769592094304232), 'subsample': np.float64(0.6964101864104046), 'colsample_bytree': np.float64(0.8733054075301833), 'min_child_weight': 8, 'gamma': np.float64(0.17194260557609198), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 07/30: F1_macro(val) = 0.8987, params = {'n_estimators': 405, 'max_depth': 3, 'learning_rate': np.float64(0.0850461946640049), 'subsample': np.float64(0.8650089137415928), 'colsample_bytree': np.float64(0.7246844304357644), 'min_child_weight': 6, 'gamma': np.float64(1.039708314340944), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 08/30: F1_macro(val) = 0.8699, params = {'n_estimators': 676, 'max_depth': 4, 'learning_rate': np.float64(0.2347885187747232), 'subsample': np.float64(0.9757995766256756), 'colsample_bytree': np.float64(0.9579309401710595), 'min_child_weight': 8, 'gamma': np.float64(2.852219872026997), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 09/30: F1_macro(val) = 0.8869, params = {'n_estimators': 451, 'max_depth': 11, 'learning_rate': np.float64(0.2887398870613112), 'subsample': np.float64(0.9378135394712606), 'colsample_bytree': np.float64(0.8989280440549523), 'min_child_weight': 5, 'gamma': np.float64(2.9337558283192413), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 10/30: F1_macro(val) = 0.8674, params = {'n_estimators': 416, 'max_depth': 11, 'learning_rate': np.float64(0.09591931665418388), 'subsample': np.float64(0.6661067756252009), 'colsample_bytree': np.float64(0.6062545626964776), 'min_child_weight': 9, 'gamma': np.float64(3.861223846483287), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 11/30: F1_macro(val) = 0.8813, params = {'n_estimators': 671, 'max_depth': 5, 'learning_rate': np.float64(0.21498862971580895), 'subsample': np.float64(0.8916028672163949), 'colsample_bytree': np.float64(0.9085081386743783), 'min_child_weight': 5, 'gamma': np.float64(4.631504392566745), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 12/30: F1_macro(val) = 0.9045, params = {'n_estimators': 240, 'max_depth': 9, 'learning_rate': np.float64(0.2565111875590418), 'subsample': np.float64(0.7797802696552814), 'colsample_bytree': np.float64(0.6381640465961645), 'min_child_weight': 7, 'gamma': np.float64(1.554911608578311), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 13/30: F1_macro(val) = 0.8614, params = {'n_estimators': 298, 'max_depth': 10, 'learning_rate': np.float64(0.08966931996711859), 'subsample': np.float64(0.8244973703390804), 'colsample_bytree': np.float64(0.7531707499015159), 'min_child_weight': 3, 'gamma': np.float64(3.803925243084487), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 14/30: F1_macro(val) = 0.9405, params = {'n_estimators': 417, 'max_depth': 9, 'learning_rate': np.float64(0.23358048218682267), 'subsample': np.float64(0.7975182385457563), 'colsample_bytree': np.float64(0.8090931317527976), 'min_child_weight': 3, 'gamma': np.float64(0.15714592843367126), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 15/30: F1_macro(val) = 0.8872, params = {'n_estimators': 440, 'max_depth': 6, 'learning_rate': np.float64(0.17334991587315127), 'subsample': np.float64(0.878206434570451), 'colsample_bytree': np.float64(0.6557325817623503), 'min_child_weight': 7, 'gamma': np.float64(2.0519146151781484), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 16/30: F1_macro(val) = 0.8757, params = {'n_estimators': 370, 'max_depth': 6, 'learning_rate': np.float64(0.2834275354618145), 'subsample': np.float64(0.8395461865954144), 'colsample_bytree': np.float64(0.8779139732158818), 'min_child_weight': 2, 'gamma': np.float64(3.1217702406689662), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 17/30: F1_macro(val) = 0.8642, params = {'n_estimators': 773, 'max_depth': 11, 'learning_rate': np.float64(0.04059333535077848), 'subsample': np.float64(0.782613828193164), 'colsample_bytree': np.float64(0.6873761748867334), 'min_child_weight': 4, 'gamma': np.float64(4.462794992449889), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 18/30: F1_macro(val) = 0.8786, params = {'n_estimators': 530, 'max_depth': 9, 'learning_rate': np.float64(0.269866476977813), 'subsample': np.float64(0.7272013899887455), 'colsample_bytree': np.float64(0.6440207698110707), 'min_child_weight': 9, 'gamma': np.float64(3.238450602706812), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 19/30: F1_macro(val) = 0.9178, params = {'n_estimators': 458, 'max_depth': 9, 'learning_rate': np.float64(0.057750197411453104), 'subsample': np.float64(0.8136357677501768), 'colsample_bytree': np.float64(0.7939319885435933), 'min_child_weight': 1, 'gamma': np.float64(0.599326836668414), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 20/30: F1_macro(val) = 0.9157, params = {'n_estimators': 573, 'max_depth': 10, 'learning_rate': np.float64(0.21387549807960157), 'subsample': np.float64(0.7454518409517176), 'colsample_bytree': np.float64(0.9887128330883843), 'min_child_weight': 4, 'gamma': np.float64(1.234380314193006), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 21/30: F1_macro(val) = 0.8465, params = {'n_estimators': 585, 'max_depth': 5, 'learning_rate': np.float64(0.09725470984686319), 'subsample': np.float64(0.713936197750987), 'colsample_bytree': np.float64(0.6147547789418131), 'min_child_weight': 1, 'gamma': np.float64(4.883074779163264), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 22/30: F1_macro(val) = 0.8675, params = {'n_estimators': 419, 'max_depth': 8, 'learning_rate': np.float64(0.01958471254115903), 'subsample': np.float64(0.7380284992106732), 'colsample_bytree': np.float64(0.8537405378805455), 'min_child_weight': 2, 'gamma': np.float64(2.6546729165856817), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 23/30: F1_macro(val) = 0.8989, params = {'n_estimators': 615, 'max_depth': 9, 'learning_rate': np.float64(0.20491930874770478), 'subsample': np.float64(0.9046478461314871), 'colsample_bytree': np.float64(0.6950550175969599), 'min_child_weight': 6, 'gamma': np.float64(1.838915663596266), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 24/30: F1_macro(val) = 0.8838, params = {'n_estimators': 397, 'max_depth': 8, 'learning_rate': np.float64(0.16537465838167995), 'subsample': np.float64(0.6361159080217633), 'colsample_bytree': np.float64(0.9341209982356952), 'min_child_weight': 6, 'gamma': np.float64(3.479064033954409), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 25/30: F1_macro(val) = 0.8726, params = {'n_estimators': 602, 'max_depth': 5, 'learning_rate': np.float64(0.24475530338051746), 'subsample': np.float64(0.7394663949166917), 'colsample_bytree': np.float64(0.6384706204365683), 'min_child_weight': 9, 'gamma': np.float64(3.45468869051233), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 26/30: F1_macro(val) = 0.8472, params = {'n_estimators': 750, 'max_depth': 4, 'learning_rate': np.float64(0.2529359307131251), 'subsample': np.float64(0.8702760468157122), 'colsample_bytree': np.float64(0.8940864476963089), 'min_child_weight': 2, 'gamma': np.float64(4.623468091392814), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 27/30: F1_macro(val) = 0.8852, params = {'n_estimators': 497, 'max_depth': 5, 'learning_rate': np.float64(0.0762795063212169), 'subsample': np.float64(0.6699819708383744), 'colsample_bytree': np.float64(0.9928673373317742), 'min_child_weight': 4, 'gamma': np.float64(2.6482528917800323), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 28/30: F1_macro(val) = 0.7834, params = {'n_estimators': 201, 'max_depth': 3, 'learning_rate': np.float64(0.03699980266371077), 'subsample': np.float64(0.9588863031813307), 'colsample_bytree': np.float64(0.9601672228653322), 'min_child_weight': 7, 'gamma': np.float64(1.6951489552435035), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 29/30: F1_macro(val) = 0.8605, params = {'n_estimators': 298, 'max_depth': 5, 'learning_rate': np.float64(0.2205271468723694), 'subsample': np.float64(0.9588441039810308), 'colsample_bytree': np.float64(0.954834569706047), 'min_child_weight': 8, 'gamma': np.float64(4.438850493804799), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.



Iter 30/30: F1_macro(val) = 0.8545, params = {'n_estimators': 786, 'max_depth': 3, 'learning_rate': np.float64(0.056872327087437995), 'subsample': np.float64(0.9594216754108317), 'colsample_bytree': np.float64(0.842571623863836), 'min_child_weight': 3, 'gamma': np.float64(1.8614138328087155), 'scale_pos_weight': np.float64(0.12333514258277316)}

>>> XGBoost – best F1_macro(val) = 0.9405
Best params: {'n_estimators': 417, 'max_depth': 9, 'learning_rate': np.float64(0.23358048218682267), 'subsample': np.float64(0.7975182385457563), 'colsample_bytree': np.float64(0.8090931317527976), 'min_child_weight': 3, 'gamma': np.float64(0.15714592843367126), 'scale_pos_weight': np.float64(0.12333514258277316)}


Parameters: { "use_label_encoder" } are not used.




===== Random search cho LightGBM (không k-fold, dùng VAL) =====
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 01/30: F1_macro(val) = 0.8349, params = {'n_estimators': 302, 'max_depth': 2, 'num_leaves': 108, 'learning_rate': np.float64(0.0631960890611875), 'subsample': np.float64(0.9118764001091078), 'colsample_bytree': np.float64(0.8387400631785948), 'min_child_samples': 23, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 02/30: F1_macro(val) = 0.9636, params = {'n_estimators': 414, 'max_depth': 9, 'num_leaves': 218, 'learning_rate': np.float64(0.2611910822747312), 'subsample': np.float64(0.8404460046972835), 'colsample_bytree': np.float64(0.8832290311184181), 'min_child_samples': 26, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 03/30: F1_macro(val) = 0.9626, params = {'n_estimators': 508, 'max_depth': 0, 'num_leaves': 103, 'learning_rate': np.float64(0.2514083658321223), 'subsample': np.float64(0.6849



[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012913 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 04/30: F1_macro(val) = 0.9624, params = {'n_estimators': 360, 'max_depth': 10, 'num_leaves': 73, 'learning_rate': np.float64(0.16217936517334897), 'subsample': np.float64(0.7727780074568463), 'colsample_bytree': np.float64(0.7164916560792167), 'min_child_samples': 46, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012924 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 05/30: F1_macro(val) = 0.9647, params = {'n_estimators': 675, 'max_depth': 10, 'num_leaves': 223, 'learning_rate': np.float64(0.07750368872478822), 'subsample': np.float64(0.6362425738131283), 'colsample_bytree': np.float64(0.8473544037332349), 'min_child_samples': 48, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 06/30: F1_macro(val) = 0.9393, params = {'n_estimators': 766, 'max_depth': 2, 'num_leaves': 79, 'learning_rate': np.float64(0.14536123904191417), 'subsample': np.float64(0.9439761626945282), 'colsample_bytree': np.float64(0.8721230154351118), 'min_child_samples': 13, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 07/30: F1_macro(val) = 0.9615, params = {'n_estimators': 366, 'max_depth': 0, 'num_leaves': 147, 'learning_rate': np.float64(0.28323850914860726), 'subsample': np.float64(0.8253152871382157), 'colsample_bytree': np.float64(0.7541666010159664), 'min_child_samples': 30, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 08/30: F1_macro(val) = 0.9653, params = {'n_estimators': 764, 'max_depth': 0, 'num_leaves': 99, 'learning_rate': np.float64(0.07989738514754338), 'subsample': np.float64(0.8733054075301833), 'colsample_bytree': np.float64(0.8439986631130484), 'min_child_samples': 12, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 09/30: F1_macro(val) = 0.9639, params = {'n_estimators': 234, 'max_depth': 12, 'num_leaves': 96, 'learning_rate': np.float64(0.0850461946640049), 'subsample': np.float64(0.8650089137415928), 'colsample_bytree': np.float64(0.7246844304357644), 'min_child_samples': 10, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 10/30: F1_macro(val) = 0.9623, params = {'n_estimators': 765, 'max_depth': 8, 'num_leaves': 19, 'learning_rate': np.float64(0.06360779210240283), 'subsample': np.float64(0.9878



[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011731 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 11/30: F1_macro(val) = 0.9667, params = {'n_estimators': 401, 'max_depth': 12, 'num_leaves': 243, 'learning_rate': np.float64(0.18339099385521468), 'subsample': np.float64(0.9687496940092467), 'colsample_bytree': np.float64(0.6353970008207678), 'min_child_samples': 27, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 12/30: F1_macro(val) = 0.9678, params = {'n_estimators': 451, 'max_depth': 7, 'num_leaves': 205, 'learning_rate': np.float64(0.10434579592134664), 'subsample': np.float64(0.7



[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 13/30: F1_macro(val) = 0.9628, params = {'n_estimators': 416, 'max_depth': 10, 'num_leaves': 203, 'learning_rate': np.float64(0.16738186411589207), 'subsample': np.float64(0.6563696899899051), 'colsample_bytree': np.float64(0.9208787923016158), 'min_child_samples': 5, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 14/30: F1_macro(val) = 0.9647, params = {'n_estimators': 720, 'max_depth': 6, 'num_leaves': 144, 'learning_rate': np.float64(0.06762754764491), 'subsample': np.float64(0.602208846849441), 'colsample_bytree': np.float64(0.9261845713819337), 'min_child_samples': 21, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 15/30: F1_macro(val) = 0.8407, params = {'n_estimators': 591, 'max_depth': 1, 'num_leaves': 178, 'learning_rate': np.float64(0.23366840053892426), 'subsample': np.float64(0.6296178606936361), 'colsample_bytree': np.float64(0.7433862914177091), 'min_child_samples': 45, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 16/30: F1_macro(val) = 0.9612, params = {'n_estimators': 227, 'max_depth': 5, 'num_leaves': 216, 'learning_rate': np.float64(0.19075645677999178), 'subsample': np.float64(0.7323592099410596), 'colsample_bytree': np.float64(0.6254233401144095), 'min_child_samples': 27, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 17/30: F1_macro(val) = 0.9621, params = {'n_estimators': 773, 'max_depth': 6, 'num_leaves': 52, 'learning_rate': np.float64(0.22158579171803858), 'subsample': np.float64(0.855



[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 18/30: F1_macro(val) = 0.8453, params = {'n_estimators': 661, 'max_depth': 1, 'num_leaves': 16, 'learning_rate': np.float64(0.21930156113781324), 'subsample': np.float64(0.6943939678995823), 'colsample_bytree': np.float64(0.7024273291045295), 'min_child_samples': 31, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010501 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 19/30: F1_macro(val) = 0.9636, params = {'n_estimators': 592, 'max_depth': 13, 'num_leaves': 30, 'learning_rate': np.float64(0.042158338035431085), 'subsample': np.float64(0.7757346007463081), 'colsample_bytree': np.float64(0.6806876809341584), 'min_child_samples': 36, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 20/30: F1_macro(val) = 0.9453, params = {'n_estimators': 440, 'max_depth': 2, 'num_leaves': 111, 'learning_rate': np.float64(0.21169966506357696), 'subsample': np.float64(0.6



[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 21/30: F1_macro(val) = 0.9638, params = {'n_estimators': 370, 'max_depth': 11, 'num_leaves': 51, 'learning_rate': np.float64(0.2834275354618145), 'subsample': np.float64(0.8395461865954144), 'colsample_bytree': np.float64(0.8779139732158818), 'min_child_samples': 32, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 22/30: F1_macro(val) = 0.9669, params = {'n_estimators': 777, 'max_depth': 8, 'num_leaves': 60, 'learning_rate': np.float64(0.09573376889293704), 'subsample': np.float64(0.6421977039321082), 'colsample_bytree': np.float64(0.782613828193164), 'min_child_samples': 48, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 23/30: F1_macro(val) = 0.9490, params = {'n_estimators': 755, 'max_depth': 2, 'num_leaves': 45, 'learning_rate': np.float64(0.2661512750864718), 'subsample': np.float64(0.7297380084021096), 'colsample_bytree': np.float64(0.6488351818802693), 'min_child_samples': 29, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 24/30: F1_macro(val) = 0.9688, params = {'n_estimators': 430, 'max_depth': 12, 'num_leaves': 240, 'learning_rate': np.float64(0.08891835232154423), 'subsample': np.float64(0.8590760482165449), 'colsample_bytree': np.float64(0.6002081507981263), 'min_child_samples': 17, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 25/30: F1_macro(val) = 0.9628, params = {'n_estimators': 458, 'max_depth': 5, 'num_leaves': 213, 'learning_rate': np.float64(0.15811671774749406), 'subsample': np.float64(0.7



[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41708
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 26/30: F1_macro(val) = 0.9655, params = {'n_estimators': 573, 'max_depth': 14, 'num_leaves': 111, 'learning_rate': np.float64(0.17184958058502894), 'subsample': np.float64(0.7615344684232164), 'colsample_bytree': np.float64(0.6259568988435926), 'min_child_samples': 16, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011404 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 27/30: F1_macro(val) = 0.9648, params = {'n_estimators': 709, 'max_depth': 5, 'num_leaves': 145, 'learning_rate': np.float64(0.21655847107808818), 'subsample': np.float64(0.6592347719813599), 'colsample_bytree': np.float64(0.9990961940195767), 'min_child_samples': 6, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 28/30: F1_macro(val) = 0.9557, params = {'n_estimators': 419, 'max_depth': 4, 'num_leaves': 102, 'learning_rate': np.float64(0.09080747462861731), 'subsample': np.float64(0.9633063543866615), 'colsample_bytree': np.float64(0.695824756266789), 'min_child_samples': 6, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850
Iter 29/30: F1_macro(val) = 0.9605, params = {'n_estimators': 252, 'max_depth': 10, 'num_leaves': 233, 'learning_rate': np.float64(0.2958386316920742), 'subsample': np.float64(0.6968221086046001), 'colsample_bytree': np.float64(0.8688542189623514), 'min_child_samples': 15, 'scale_pos_weight': np.float64(0.12333514258277316)}




[LightGBM] [Info] Number of positive: 23951, number of negative: 2954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009974 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41712
[LightGBM] [Info] Number of data points in the train set: 26905, number of used features: 182
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.890206 -> initscore=2.092850
[LightGBM] [Info] Start training from score 2.092850




Iter 30/30: F1_macro(val) = 0.9648, params = {'n_estimators': 383, 'max_depth': 9, 'num_leaves': 160, 'learning_rate': np.float64(0.22118274109743927), 'subsample': np.float64(0.7471132530877013), 'colsample_bytree': np.float64(0.8529223322374317), 'min_child_samples': 10, 'scale_pos_weight': np.float64(0.12333514258277316)}

>>> LightGBM – best F1_macro(val) = 0.9688
Best params: {'n_estimators': 430, 'max_depth': 12, 'num_leaves': 240, 'learning_rate': np.float64(0.08891835232154423), 'subsample': np.float64(0.8590760482165449), 'colsample_bytree': np.float64(0.6002081507981263), 'min_child_samples': 17, 'scale_pos_weight': np.float64(0.12333514258277316)}
[LightGBM] [Info] Number of positive: 32570, number of negative: 4021
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42202
[LightGBM] [Info] Number of data points in the train set: 36591, n

In [9]:
# =========================================
# 8. Đánh giá trên TEST (in Accuracy, F1 binary, micro, macro)
# =========================================
def evaluate_on_test(name, model, X_test, y_test):
    print(f"\n===== {name} trên TEST =====")
    y_pred = model.predict(X_test)

    acc      = accuracy_score(y_test, y_pred)
    f1_bin   = f1_score(y_test, y_pred)                 # pos_label=1 mặc định
    f1_micro = f1_score(y_test, y_pred, average="micro")
    f1_macro = f1_score(y_test, y_pred, average="macro")

    print("Accuracy :", acc)
    print("F1 (binary, pos_label=1):", f1_bin)
    print("F1 micro :", f1_micro)
    print("F1 macro :", f1_macro)

    print("\nclassification_report:")
    print(classification_report(y_test, y_pred, digits=4))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

models = {
    "RandomForest": best_rf,
    "XGBoost":      best_xgb,
    "LightGBM":     best_lgbm,
}

for name, model in models.items():
    evaluate_on_test(name, model, X_test_scaled, y_test)


===== RandomForest trên TEST =====
Accuracy : 0.9764363782211972
F1 (binary, pos_label=1): 0.9877098478113069
F1 micro : 0.9764363782211972
F1 macro : 0.851430681481411

classification_report:
              precision    recall  f1-score   support

           0     0.9801    0.5630    0.7152       524
           1     0.9763    0.9994    0.9877      9449

    accuracy                         0.9764      9973
   macro avg     0.9782    0.7812    0.8514      9973
weighted avg     0.9765    0.9764    0.9734      9973

Confusion matrix:
[[ 295  229]
 [   6 9443]]

===== XGBoost trên TEST =====
Accuracy : 0.9655068685450717
F1 (binary, pos_label=1): 0.9818124140848049
F1 micro : 0.9655068685450717
F1 macro : 0.8242395403757358

classification_report:
              precision    recall  f1-score   support

           0     0.6772    0.6565    0.6667       524
           1     0.9810    0.9826    0.9818      9449

    accuracy                         0.9655      9973
   macro avg     0.8291   

