Clone repo

In [None]:
!git clone https://github.com/RicardoKnauer/TabMini.git


Cloning into 'TabMini'...
remote: Enumerating objects: 310, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 310 (delta 21), reused 26 (delta 17), pack-reused 255 (from 1)[K
Receiving objects: 100% (310/310), 651.82 KiB | 18.11 MiB/s, done.
Resolving deltas: 100% (70/70), done.


Import thư viện

In [None]:
import os
import pandas as pd
from glob import glob


Load tất cả 44 dataset vào list

In [None]:
# Đường dẫn tới thư mục chứa dataset
DATA_PATH = "/content/TabMini/plotting/data"

# Lấy tất cả đường dẫn đến các file X.csv và y.csv
x_paths = sorted(glob(f"{DATA_PATH}/*/*/X.csv"))
y_paths = sorted(glob(f"{DATA_PATH}/*/*/y.csv"))

# Kiểm tra số lượng
print(f"Tổng số tập dữ liệu: {len(x_paths)}")  # Nên là 44

# Tạo danh sách các dataset [(name, X_df, y_series)]
datasets = []

for x_path, y_path in zip(x_paths, y_paths):
    dataset_name = os.path.basename(os.path.dirname(x_path))
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path).squeeze()  # squeeze để y thành Series thay vì DataFrame
    datasets.append((dataset_name, X, y))


Tổng số tập dữ liệu: 44


XG Boost

In [None]:
# @title XG Boost
import os
import pandas as pd
import numpy as np
from glob import glob
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report
)
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from tqdm import tqdm
from collections import Counter
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings("ignore")

# Đường dẫn dataset
DATA_PATH = "/content/TabMini/plotting/data"
x_paths = sorted(glob(f"{DATA_PATH}/*/*/X.csv"))
y_paths = sorted(glob(f"{DATA_PATH}/*/*/y.csv"))

# Load dữ liệu
datasets = []
for x_path, y_path in zip(x_paths, y_paths):
    dataset_name = os.path.basename(os.path.dirname(x_path))
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path).squeeze()
    datasets.append((dataset_name, X, y))

# Không dùng grid cố định -> dùng RandomizedSearchCV để tìm hyperparam tốt nhất
param_dist = {
    "learning_rate": uniform(0.01, 0.3),
    "max_depth": randint(3, 10),
    "n_estimators": randint(50, 300),
    "subsample": uniform(0.5, 0.5),
    "colsample_bytree": uniform(0.5, 0.5),
    "gamma": uniform(0, 5),
}

results = []
accuracies = []
macro_avgs = []
weighted_avgs = []
best_params_all = []
skipped = []

print("🔁 Đang tối ưu và đánh giá 44 dataset bằng XGBoost...")
for name, X, y in tqdm(datasets):
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        model = XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            verbosity=0,
            random_state=42,
            n_jobs=-1
        )

        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_dist,
            n_iter=15,
            scoring="f1_macro",
            cv=3,
            verbose=0,
            n_jobs=-1,
            random_state=42
        )

        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

        y_pred = best_model.predict(X_test)
        y_prob = best_model.predict_proba(X_test)[:, 1] if len(np.unique(y)) == 2 else None

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
        rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
        roc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan

        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

        results.append((name, {
            "Precision": prec,
            "Recall": rec,
            "F1-score": f1,
            "ROC AUC": roc
        }))
        accuracies.append(acc)
        macro_avgs.append({
            "Precision": report["macro avg"]["precision"],
            "Recall": report["macro avg"]["recall"],
            "F1-score": report["macro avg"]["f1-score"],
            "ROC AUC": np.nan
        })
        weighted_avgs.append({
            "Precision": report["weighted avg"]["precision"],
            "Recall": report["weighted avg"]["recall"],
            "F1-score": report["weighted avg"]["f1-score"],
            "ROC AUC": np.nan
        })
        best_params_all.append(tuple(sorted(best_params.items())))

    except Exception as e:
        skipped.append(name)
        continue

# Kết quả thành DataFrame
df_result = pd.DataFrame.from_dict(
    {name: metrics for name, metrics in results}, orient="index"
)

# Tổng kết
accuracy_row = pd.Series({
    "Precision": np.nan,
    "Recall": np.nan,
    "F1-score": np.nan,
    "ROC AUC": np.nanmean(df_result["ROC AUC"]),
}, name="Accuracy (mean)")

macro_avg_row = pd.Series(pd.DataFrame(macro_avgs).mean(), name="Macro avg")
weighted_avg_row = pd.Series(pd.DataFrame(weighted_avgs).mean(), name="Weighted avg")

df_result_final = pd.concat([
    df_result,
    pd.DataFrame([accuracy_row, macro_avg_row, weighted_avg_row])
])

# Thống kê hyperparameter phổ biến
most_common_param = Counter(best_params_all).most_common(1)[0][0]
best_hyperparams = dict(most_common_param)

# In kết quả
print("\n📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC)**\n")
with pd.option_context('display.float_format', '{:,.6f}'.format):
    print(df_result_final.fillna(""))

print("\n✅ Hyperparameter tốt nhất phổ biến nhất cho 44 dataset:")
print(best_hyperparams)

if skipped:
    print("\n⚠️ Dataset bị bỏ qua do lỗi:")
    print(skipped)


🔁 Đang tối ưu và đánh giá 44 dataset bằng XGBoost...


100%|██████████| 44/44 [01:40<00:00,  2.28s/it]


📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC)**

                            Precision   Recall F1-score  ROC AUC
analcatdata_aids             0.598214 0.598214 0.598214 0.625000
analcatdata_asbestos         0.819853 0.782468 0.787776 0.912338
analcatdata_bankruptcy       0.937500 0.937500 0.933333 0.955357
analcatdata_creditscore      1.000000 1.000000 1.000000 1.000000
analcatdata_cyyoung8092      0.770833 0.742236 0.754501 0.763975
analcatdata_cyyoung9302      0.812500 0.727273 0.756522 0.867424
analcatdata_fraud            0.737500 0.763889 0.745098 0.861111
analcatdata_japansolvent     0.817460 0.812500 0.811765 0.867188
labor                        0.928571 0.958333 0.939799 0.958333
lupus                        0.652941 0.647727 0.649351 0.710227
parity5                      0.200000 0.200000 0.200000 0.080000
postoperative_patient_data   0.370370 0.500000 0.425532 0.500000
analcatdata_boxing1          0.718750 0.593645 0.576471 0.928094
analcatdata_boxing2  




Light GBM

In [None]:
# @title Light GBM
# Cài đặt nếu chưa có
!pip install lightgbm -q

import os
import pandas as pd
from glob import glob
import numpy as np
from tqdm import tqdm
import time
import warnings
import logging
import json
from collections import Counter

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
import lightgbm as lgb

# Ẩn warnings, log LightGBM
logging.getLogger("lightgbm").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

DATA_PATH = "/content/TabMini/plotting/data"

# Load datasets
x_paths = sorted(glob(f"{DATA_PATH}/*/*/X.csv"))
y_paths = sorted(glob(f"{DATA_PATH}/*/*/y.csv"))
datasets = []
for x_path, y_path in zip(x_paths, y_paths):
    dataset_name = os.path.basename(os.path.dirname(x_path))
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path).squeeze()
    datasets.append((dataset_name, X, y))

print(f"Tổng số tập dữ liệu: {len(datasets)}")

# Hàm train và tính metrics
def train_evaluate_lightgbm(X, y, params, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    return precision, recall, f1, roc_auc, accuracy, len(y)

# Tìm hyperparameter tốt nhất với RandomizedSearchCV
def tune_lightgbm_hyperparameters(X, y, random_state=42):
    param_dist = {
        'learning_rate': [0.001, 0.01, 0.05, 0.1],
        'max_depth': [-1, 3, 5, 7],
        'n_estimators': [50, 100, 200],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'min_child_samples': [5, 10, 20],
    }
    model = lgb.LGBMClassifier(random_state=random_state, n_jobs=-1, verbosity=-1)
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=10,
        cv=3,
        scoring='f1',
        random_state=random_state,
        n_jobs=-1,
        verbose=0
    )
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)
    search.fit(X_train, y_train)
    return search.best_params_

# Đánh giá từng dataset với hyperparameter tốt nhất
results = []
best_params_all = {}
start_time = time.time()
print("🔁 Đang tìm hyperparameter tốt nhất cho 44 dataset...")

for name, X, y in tqdm(datasets):
    try:
        best_params = tune_lightgbm_hyperparameters(X, y)
        best_params_all[name] = best_params
        precision, recall, f1, roc_auc, accuracy, n_samples = train_evaluate_lightgbm(X, y, best_params)
        results.append({
            'Dataset': name,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'ROC AUC': roc_auc,
            'Accuracy': accuracy,
            'n_samples': n_samples
        })
    except Exception as e:
        print(f"Lỗi tại dataset {name}: {e}")

elapsed_time = time.time() - start_time

# Tạo bảng kết quả
df_results = pd.DataFrame(results).set_index('Dataset')

# Tổng hợp macro, weighted, và accuracy mean
macro_avg = df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']].mean()
weights = df_results['n_samples']
weighted_avg = (df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']]
                .multiply(weights, axis=0).sum() / weights.sum())
accuracy_mean = pd.Series({
    'Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'ROC AUC': np.nan,
    'Accuracy': df_results['Accuracy'].mean()
})

df_results.loc['Accuracy (mean)'] = accuracy_mean
df_results.loc['Macro avg'] = macro_avg
df_results.loc['Weighted avg'] = weighted_avg
df_results = df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']]

# In bảng kết quả
print("\n📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC, Accuracy)**\n")
with pd.option_context('display.float_format', '{:,.6f}'.format):
    print(df_results.fillna(""))

# ✅ Tính hyperparameter phổ biến nhất
params_as_strs = [json.dumps(p, sort_keys=True) for p in best_params_all.values()]
most_common_str, count = Counter(params_as_strs).most_common(1)[0]
most_common_params = json.loads(most_common_str)

# Chuyển float64 thành float thường
for k, v in most_common_params.items():
    if isinstance(v, float):
        most_common_params[k] = float(v)

df_common = pd.DataFrame([most_common_params])

# In ra hyperparameter phổ biến nhất
print("\n✅ Hyperparameter tốt nhất phổ biến nhất cho 44 dataset:")
print(df_common.to_string(index=False))

print(f"\n⏱️ Tổng thời gian chạy: {elapsed_time:.2f} giây")
print("✅ Đã xử lý đầy đủ 44 tập dữ liệu.")


Tổng số tập dữ liệu: 44
🔁 Đang tìm hyperparameter tốt nhất cho 44 dataset...


100%|██████████| 44/44 [04:55<00:00,  6.72s/it]


📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC, Accuracy)**

                            Precision   Recall F1-score  ROC AUC  Accuracy
Dataset                                                                   
analcatdata_aids             0.333333 0.400000 0.363636 0.480000  0.300000
analcatdata_asbestos         0.714286 0.625000 0.666667 0.819444  0.705882
analcatdata_bankruptcy       0.833333 1.000000 0.909091 0.920000  0.900000
analcatdata_creditscore      1.000000 1.000000 1.000000 1.000000  1.000000
analcatdata_cyyoung8092      0.500000 0.400000 0.444444 0.613333  0.750000
analcatdata_cyyoung9302      0.500000 0.250000 0.333333 0.766667  0.789474
analcatdata_fraud            0.600000 1.000000 0.750000 0.833333  0.777778
analcatdata_japansolvent     0.833333 0.833333 0.833333 0.800000  0.818182
labor                        1.000000 0.875000 0.933333 0.937500  0.916667
lupus                        0.666667 0.571429 0.615385 0.740260  0.722222
parity5              




CatBoost

In [13]:
# @title  CatBoost
!pip install catboost

import os
import pandas as pd
import numpy as np
import time
import json
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
)
from tqdm import tqdm
from collections import Counter
from scipy.stats import randint, uniform
from glob import glob
import warnings
warnings.filterwarnings("ignore")

DATA_PATH = "/content/TabMini/plotting/data"

# Load datasets
x_paths = sorted(glob(f"{DATA_PATH}/*/*/X.csv"))
y_paths = sorted(glob(f"{DATA_PATH}/*/*/y.csv"))
datasets = []
for x_path, y_path in zip(x_paths, y_paths):
    dataset_name = os.path.basename(os.path.dirname(x_path))
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path).squeeze()
    datasets.append((dataset_name, X, y))

print(f"Tổng số tập dữ liệu: {len(datasets)}")
print("🔁 Đang tìm hyperparameter tốt nhất cho 44 dataset...")

# Tối ưu hyperparameter cho CatBoost
param_distributions = {
    'iterations': randint(100, 300),
    'depth': randint(4, 10),
    'learning_rate': uniform(0.01, 0.2),
    'l2_leaf_reg': uniform(1, 10),
    'border_count': randint(32, 128)
}

results = []
best_params_list = []

start_time = time.time()

for dataset_name, X, y in tqdm(datasets, desc="Đang xử lý"):
    model = CatBoostClassifier(verbose=False, random_seed=42)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    search = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        scoring='f1_weighted',
        n_iter=10,
        cv=skf,
        verbose=0,
        n_jobs=-1
    )
    search.fit(X, y)
    best_model = search.best_estimator_
    best_params_list.append(search.best_params_)

    y_preds, y_trues, y_probas = [], [], []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        y_proba = best_model.predict_proba(X_test)[:, 1] if len(np.unique(y)) == 2 else None

        y_preds.extend(y_pred)
        y_trues.extend(y_test)
        if y_proba is not None:
            y_probas.extend(y_proba)

    precision = precision_score(y_trues, y_preds, average='weighted', zero_division=0)
    recall = recall_score(y_trues, y_preds, average='weighted', zero_division=0)
    f1 = f1_score(y_trues, y_preds, average='weighted', zero_division=0)
    try:
        roc_auc = roc_auc_score(y_trues, y_probas) if len(np.unique(y)) == 2 else np.nan
    except:
        roc_auc = np.nan
    accuracy = accuracy_score(y_trues, y_preds)

    results.append({
        'Dataset': dataset_name,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'ROC AUC': roc_auc,
        'Accuracy': accuracy,
        'n_samples': len(y)
    })

elapsed_time = time.time() - start_time

# Tạo bảng kết quả
df_results = pd.DataFrame(results).set_index("Dataset")

# Tính macro và weighted average
macro_avg = df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']].mean()

weights = df_results['n_samples']
weighted_avg = (df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']]
                .multiply(weights, axis=0).sum() / weights.sum())

accuracy_mean = pd.Series({
    'Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'ROC AUC': np.nan,
    'Accuracy': df_results['Accuracy'].mean()
})

# Thêm dòng tổng hợp
df_results.loc['Accuracy (mean)'] = accuracy_mean
df_results.loc['Macro avg'] = macro_avg
df_results.loc['Weighted avg'] = weighted_avg

# Ẩn cột 'n_samples' khi in
df_display = df_results.drop(columns=['n_samples'], errors='ignore')

print("\n📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC, Accuracy)**\n")
with pd.option_context('display.float_format', '{:,.6f}'.format):
    print(df_display.fillna(""))

# Tìm hyperparameter phổ biến nhất
param_counts = Counter([json.dumps(p, sort_keys=True) for p in best_params_list])
most_common_param = json.loads(param_counts.most_common(1)[0][0])

print("\n✅ Hyperparameter tốt nhất phổ biến nhất cho 44 dataset:")
for k, v in most_common_param.items():
    print(f" {k:20s}: {v}")

print(f"\n⏱️ Tổng thời gian chạy: {elapsed_time:.2f} giây")
print("✅ Đã xử lý đầy đủ 44 tập dữ liệu.")


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Tổng số tập dữ liệu: 44
🔁 Đang tìm hyperparameter tốt nhất cho 44 dataset...


Đang xử lý: 100%|██████████| 44/44 [25:44<00:00, 35.10s/it]


📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC, Accuracy)**

                            Precision   Recall F1-score  ROC AUC  Accuracy
Dataset                                                                   
analcatdata_aids             0.662338 0.660000 0.658772 0.678400  0.660000
analcatdata_asbestos         0.759036 0.759036 0.759036 0.834606  0.759036
analcatdata_bankruptcy       0.882448 0.880000 0.879808 0.953600  0.880000
analcatdata_creditscore      0.990357 0.990000 0.990056 0.987823  0.990000
analcatdata_cyyoung8092      0.826651 0.835052 0.822841 0.859018  0.835052
analcatdata_cyyoung9302      0.903442 0.902174 0.893367 0.883922  0.902174
analcatdata_fraud            0.773898 0.761905 0.766156 0.795756  0.761905
analcatdata_japansolvent     0.887236 0.884615 0.884615 0.880000  0.884615
labor                        0.898909 0.894737 0.891588 0.924324  0.894737
lupus                        0.696393 0.701149 0.695291 0.723077  0.701149
parity5              




Random Forest

In [None]:
# @title Random Forest
# Cài đặt nếu chưa có
!pip install scikit-learn -q

import os
import pandas as pd
from glob import glob
import numpy as np
from tqdm import tqdm
import time
import warnings
from collections import Counter

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
from scipy.stats import randint

warnings.filterwarnings("ignore")

DATA_PATH = "/content/TabMini/plotting/data"

# Load datasets
x_paths = sorted(glob(f"{DATA_PATH}/*/*/X.csv"))
y_paths = sorted(glob(f"{DATA_PATH}/*/*/y.csv"))
datasets = []
for x_path, y_path in zip(x_paths, y_paths):
    dataset_name = os.path.basename(os.path.dirname(x_path))
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path).squeeze()
    datasets.append((dataset_name, X, y))

print(f"Tổng số tập dữ liệu: {len(datasets)}")
print("🔁 Đang tìm hyperparameter tốt nhất cho 44 dataset...")

# Không fix cứng mà sẽ tối ưu RF bằng RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(50, 300),
    'max_depth': [None, 5, 10, 20, 50],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Hàm train, search và evaluate
def search_evaluate_rf(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)

    search = RandomizedSearchCV(
        rf,
        param_distributions=param_distributions,
        n_iter=10,
        scoring='f1',
        n_jobs=-1,
        cv=3,
        random_state=random_state
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)

    if len(np.unique(y)) == 2:
        y_proba = best_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_proba)
    else:
        roc_auc = np.nan

    precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, roc_auc, accuracy, len(y), search.best_params_

# Chạy toàn bộ tập dữ liệu
results = []
best_param_list = []
start_time = time.time()

for name, X, y in tqdm(datasets):
    try:
        precision, recall, f1, roc_auc, accuracy, n_samples, best_params = search_evaluate_rf(X, y)
        results.append({
            'Dataset': name,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            'ROC AUC': roc_auc,
            'Accuracy': accuracy,
            'n_samples': n_samples
        })
        best_param_list.append(str(best_params))
    except Exception as e:
        print(f"Lỗi tại dataset {name}: {e}")

elapsed_time = time.time() - start_time

# Tạo bảng kết quả
df_results = pd.DataFrame(results).set_index('Dataset')

# Tính macro và weighted average
macro_avg = df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']].mean()

weights = df_results['n_samples']
weighted_avg = (df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']]
                .multiply(weights, axis=0).sum() / weights.sum())

accuracy_mean = pd.Series({
    'Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'ROC AUC': np.nan,
    'Accuracy': df_results['Accuracy'].mean()
})

# Thêm dòng tổng hợp
df_results.loc['Accuracy (mean)'] = accuracy_mean
df_results.loc['Macro avg'] = macro_avg
df_results.loc['Weighted avg'] = weighted_avg

# Ẩn cột 'n_samples' khi in
df_display = df_results.drop(columns=['n_samples'], errors='ignore')

print("\n📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC, Accuracy)**\n")
with pd.option_context('display.float_format', '{:,.6f}'.format):
    print(df_display.fillna(""))

# Tìm hyperparameter phổ biến nhất
param_counts = Counter(best_param_list)
most_common_params = param_counts.most_common(1)[0][0]

print("\n✅ Hyperparameter tốt nhất phổ biến nhất cho 44 dataset:")
df_param = pd.DataFrame(eval(most_common_params), index=[0])
df_param.index = ['']
print(df_param.to_string())

print(f"\n⏱️ Tổng thời gian chạy: {elapsed_time:.2f} giây")
print("✅ Đã xử lý đầy đủ 44 tập dữ liệu.")


Tổng số tập dữ liệu: 44
🔁 Đang tìm hyperparameter tốt nhất cho 44 dataset...


100%|██████████| 44/44 [04:35<00:00,  6.25s/it]


📋 **BẢNG KẾT QUẢ TỔNG HỢP (Precision, Recall, F1-score, ROC AUC, Accuracy)**

                            Precision   Recall F1-score  ROC AUC  Accuracy
Dataset                                                                   
analcatdata_aids             0.500000 0.400000 0.444444 0.400000  0.500000
analcatdata_asbestos         0.833333 0.625000 0.714286 0.868056  0.764706
analcatdata_bankruptcy       0.714286 1.000000 0.833333 0.960000  0.800000
analcatdata_creditscore      1.000000 1.000000 1.000000 1.000000  1.000000
analcatdata_cyyoung8092      0.500000 0.400000 0.444444 0.520000  0.750000
analcatdata_cyyoung9302      0.500000 0.250000 0.333333 0.783333  0.789474
analcatdata_fraud            0.333333 0.333333 0.333333 0.666667  0.555556
analcatdata_japansolvent     0.833333 0.833333 0.833333 0.733333  0.818182
labor                        0.875000 0.875000 0.875000 0.937500  0.833333
lupus                        0.666667 0.571429 0.615385 0.733766  0.722222
parity5              




In [None]:
# @title ResNet (silent + summarized output only)
!pip install -q optuna

import os
import pandas as pd
import numpy as np
import time
from collections import Counter
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import optuna
from glob import glob
import warnings
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)  # 🔇 Turn off Optuna logs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load datasets
data_dir = "/content/TabMini/plotting/data"
x_paths = sorted(glob(f"{data_dir}/*/*/X.csv"))
y_paths = sorted(glob(f"{data_dir}/*/*/y.csv"))

datasets = []
for x_path, y_path in zip(x_paths, y_paths):
    dataset_name = os.path.basename(os.path.dirname(x_path))
    X = pd.read_csv(x_path).values.astype(np.float32)
    y = pd.read_csv(y_path).squeeze().values.astype(np.int64)
    datasets.append((dataset_name, X, y))

# ResNet Model
class ResNetTabular(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_blocks, dropout, num_classes):
        super().__init__()
        self.fc_in = nn.Linear(input_dim, hidden_dim)
        self.blocks = nn.Sequential(*[
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU()
            ) for _ in range(num_blocks)
        ])
        self.fc_out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = F.relu(self.fc_in(x))
        residual = x
        for block in self.blocks:
            x = block(x) + residual
            residual = x
        return self.fc_out(x)

# Optuna Objective
def objective(trial, X, y):
    hidden_dim = trial.suggest_categorical("hidden_dim", [64, 128, 256])
    num_blocks = trial.suggest_int("num_blocks", 1, 4)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_classes = len(np.unique(y))

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in skf.split(X, y):
        model = ResNetTabular(X.shape[1], hidden_dim, num_blocks, dropout, num_classes).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        X_train = torch.tensor(X[train_idx]).to(device)
        y_train = torch.tensor(y[train_idx]).to(device)
        X_val = torch.tensor(X[val_idx]).to(device)
        y_val = torch.tensor(y[val_idx]).to(device)

        train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
        for _ in range(20):
            model.train()
            for xb, yb in train_loader:
                optimizer.zero_grad()
                preds = model(xb)
                loss = criterion(preds, yb)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            val_preds = model(X_val).argmax(dim=1).cpu().numpy()
        scores.append(f1_score(y[val_idx], val_preds, average='weighted', zero_division=0))

    return np.mean(scores)

# Evaluation
results = []
best_param_list = []
start_time = time.time()

for dataset_name, X, y in datasets:
    try:
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: objective(trial, X, y), n_trials=10, show_progress_bar=False)
        best_params = study.best_params

        hidden_dim = best_params["hidden_dim"]
        num_blocks = best_params["num_blocks"]
        dropout = best_params["dropout"]
        lr = best_params["lr"]
        batch_size = best_params["batch_size"]
        num_classes = len(np.unique(y))

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        y_trues, y_preds, y_probas = [], [], []

        for train_idx, test_idx in skf.split(X, y):
            model = ResNetTabular(X.shape[1], hidden_dim, num_blocks, dropout, num_classes).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            criterion = nn.CrossEntropyLoss()

            X_train = torch.tensor(X[train_idx]).to(device)
            y_train = torch.tensor(y[train_idx]).to(device)
            X_test = torch.tensor(X[test_idx]).to(device)
            y_test = torch.tensor(y[test_idx]).to(device)

            train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
            for _ in range(20):
                model.train()
                for xb, yb in train_loader:
                    optimizer.zero_grad()
                    preds = model(xb)
                    loss = criterion(preds, yb)
                    loss.backward()
                    optimizer.step()

            model.eval()
            with torch.no_grad():
                preds = model(X_test)
                pred_labels = preds.argmax(dim=1).cpu().numpy()
                probas = F.softmax(preds, dim=1).cpu().numpy()

            y_trues.extend(y[test_idx])
            y_preds.extend(pred_labels)
            if num_classes == 2:
                y_probas.extend(probas[:, 1])

        precision = precision_score(y_trues, y_preds, average='weighted', zero_division=0)
        recall = recall_score(y_trues, y_preds, average='weighted', zero_division=0)
        f1 = f1_score(y_trues, y_preds, average='weighted', zero_division=0)
        accuracy = accuracy_score(y_trues, y_preds)
        roc_auc = roc_auc_score(y_trues, y_probas) if num_classes == 2 else np.nan

        results.append({
            "Dataset": dataset_name,
            "Precision": precision,
            "Recall": recall,
            "F1-score": f1,
            "ROC AUC": roc_auc,
            "Accuracy": accuracy,
            "n_samples": len(y)
        })
        best_param_list.append(json.dumps(best_params, sort_keys=True))
    except:
        continue

# Tổng hợp kết quả
elapsed_time = time.time() - start_time
df_results = pd.DataFrame(results).set_index("Dataset")

macro_avg = df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']].mean()
weights = df_results['n_samples']
weighted_avg = (df_results[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']]
                .multiply(weights, axis=0).sum() / weights.sum())
accuracy_mean = pd.Series({
    'Precision': np.nan,
    'Recall': np.nan,
    'F1-score': np.nan,
    'ROC AUC': np.nan,
    'Accuracy': df_results['Accuracy'].mean()
})
df_results.loc['Accuracy (mean)'] = accuracy_mean
df_results.loc['Macro avg'] = macro_avg
df_results.loc['Weighted avg'] = weighted_avg

# In kết quả chính
df_display = df_results.drop(columns=['n_samples'], errors='ignore')
with pd.option_context('display.float_format', '{:,.6f}'.format):
    print(df_display.fillna(""))

# Thống kê hyperparameter phổ biến
param_counts = Counter(best_param_list)
most_common_params = param_counts.most_common(1)[0][0]
df_param = pd.DataFrame(json.loads(most_common_params), index=[''])
print("\n✅ Hyperparameter tốt nhất phổ biến nhất:")
print(df_param.to_string(index=False))

print(f"\n⏱️ Tổng thời gian chạy: {elapsed_time:.2f} giây")


[W 2025-06-11 15:06:41,087] Trial 3 failed with parameters: {'hidden_dim': 128, 'num_blocks': 3, 'dropout': 0.3131897622997283, 'lr': 0.006155342007827628, 'batch_size': 32} because of the following error: ValueError('Expected more than 1 value per channel when training, got input size torch.Size([1, 128])').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-1-4016964781>", line 109, in <lambda>
    study.optimize(lambda trial: objective(trial, X, y), n_trials=10, show_progress_bar=False)
                                 ^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-1-4016964781>", line 89, in objective
    preds = model(xb)
            ^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^

                            Precision   Recall F1-score  ROC AUC  Accuracy
Dataset                                                                   
analcatdata_asbestos         0.671351 0.662651 0.663533 0.735311  0.662651
analcatdata_creditscore      0.990357 0.990000 0.990056 0.996449  0.990000
analcatdata_cyyoung9302      0.806880 0.826087 0.800605 0.822639  0.826087
analcatdata_fraud            0.697279 0.690476 0.693493 0.694960  0.690476
analcatdata_japansolvent     0.617208 0.615385 0.615385 0.754074  0.615385
labor                        0.933607 0.929825 0.930521 0.986486  0.929825
lupus                        0.655720 0.632184 0.635608 0.694231  0.632184
parity5                      0.322222 0.406250 0.326689 0.375000  0.406250
postoperative_patient_data   0.524313 0.704545 0.601212 0.429036  0.704545
analcatdata_boxing1          0.671751 0.683333 0.674390 0.669719  0.683333
analcatdata_boxing2          0.711885 0.712121 0.710511 0.728931  0.712121
appendicitis             

In [11]:
# @title FT-Transformer
# FT-Transformer for 44 Datasets from TabMini

# ====================== IMPORT ========================
import os
import time
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.stats import uniform, randint

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ====================== DATASET & MODEL ========================
class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class FeatureTokenizer(nn.Module):
    def __init__(self, n_features, d_token):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(n_features, d_token))
        self.bias = nn.Parameter(torch.randn(n_features, d_token))

    def forward(self, x):
        x = x.unsqueeze(-1)
        return x * self.weight.unsqueeze(0) + self.bias.unsqueeze(0)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_token, n_heads, dropout=0.1):
        super().__init__()
        self.d_k = d_token // n_heads
        self.n_heads = n_heads

        self.q = nn.Linear(d_token, d_token)
        self.k = nn.Linear(d_token, d_token)
        self.v = nn.Linear(d_token, d_token)
        self.o = nn.Linear(d_token, d_token)

        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_token)

    def forward(self, x):
        B, N, D = x.shape
        q = self.q(x).view(B, N, self.n_heads, self.d_k).transpose(1, 2)
        k = self.k(x).view(B, N, self.n_heads, self.d_k).transpose(1, 2)
        v = self.v(x).view(B, N, self.n_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.d_k)
        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, N, D)
        return self.norm(self.o(out) + x)

class FeedForward(nn.Module):
    def __init__(self, d_token, d_ff, dropout=0.1):
        super().__init__()
        self.ff = nn.Sequential(
            nn.Linear(d_token, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_token)
        )
        self.norm = nn.LayerNorm(d_token)

    def forward(self, x):
        return self.norm(self.ff(x) + x)

class TransformerBlock(nn.Module):
    def __init__(self, d_token, n_heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_token, n_heads, dropout)
        self.ff = FeedForward(d_token, d_ff, dropout)

    def forward(self, x):
        return self.ff(self.attn(x))

class FTTransformer(nn.Module):
    def __init__(self, n_features, n_classes, d_token, n_layers, n_heads, d_ff, dropout):
        super().__init__()
        self.tokenizer = FeatureTokenizer(n_features, d_token)
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_token))
        self.blocks = nn.Sequential(*[TransformerBlock(d_token, n_heads, d_ff, dropout) for _ in range(n_layers)])
        self.head = nn.Sequential(
            nn.Linear(d_token, d_token // 2),
            nn.ReLU(),
            nn.Linear(d_token // 2, n_classes)
        )

    def forward(self, x):
        B = x.size(0)
        tokens = self.tokenizer(x)
        cls = self.cls_token.expand(B, -1, -1)
        x = torch.cat([cls, tokens], dim=1)
        x = self.blocks(x)
        return self.head(x[:, 0])

class FTTransformerClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, d_token=32, n_layers=2, n_heads=4, d_ff=None,
                 dropout=0.1, lr=1e-3, batch_size=64, epochs=100, patience=10, random_state=42):
        self.d_token = d_token
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff or d_token * 4
        self.dropout = dropout
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.encoder = LabelEncoder()

    def fit(self, X, y):
        torch.manual_seed(self.random_state)
        np.random.seed(self.random_state)
        X = self.scaler.fit_transform(X)
        y = self.encoder.fit_transform(y)
        X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

        self.model = FTTransformer(
            n_features=X.shape[1], n_classes=len(np.unique(y)), d_token=self.d_token,
            n_layers=self.n_layers, n_heads=self.n_heads, d_ff=self.d_ff, dropout=self.dropout
        ).to(device)

        train_loader = DataLoader(TabularDataset(X_train, y_train), batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(TabularDataset(X_val, y_val), batch_size=self.batch_size)

        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()
        best_loss = float('inf')
        counter = 0

        for _ in range(self.epochs):
            self.model.train()
            for xb, yb in train_loader:
                optimizer.zero_grad()
                loss = criterion(self.model(xb.to(device)), yb.to(device))
                loss.backward()
                optimizer.step()
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for xb, yb in val_loader:
                    val_loss += criterion(self.model(xb.to(device)), yb.to(device)).item()
            if val_loss < best_loss:
                best_loss = val_loss
                best_state = self.model.state_dict()
                counter = 0
            else:
                counter += 1
                if counter >= self.patience:
                    break

        self.model.load_state_dict(best_state)
        return self

    def predict(self, X):
        self.model.eval()
        X = self.scaler.transform(X)
        loader = DataLoader(TabularDataset(X, np.zeros(len(X))), batch_size=self.batch_size)
        preds = []
        with torch.no_grad():
            for xb, _ in loader:
                pred = self.model(xb.to(device)).argmax(dim=1).cpu().numpy()
                preds.extend(pred)
        return self.encoder.inverse_transform(preds)

    def predict_proba(self, X):
        self.model.eval()
        X = self.scaler.transform(X)
        loader = DataLoader(TabularDataset(X, np.zeros(len(X))), batch_size=self.batch_size)
        probs = []
        with torch.no_grad():
            for xb, _ in loader:
                prob = F.softmax(self.model(xb.to(device)), dim=1).cpu().numpy()
                probs.extend(prob)
        return np.array(probs)

# ====================== ĐÁNH GIÁ 44 DATASET ========================
DATA_PATH = "/content/TabMini/plotting/data"
x_paths = sorted(glob(f"{DATA_PATH}/*/*/X.csv"))
y_paths = sorted(glob(f"{DATA_PATH}/*/*/y.csv"))
datasets = [(os.path.basename(os.path.dirname(x)), pd.read_csv(x), pd.read_csv(y).squeeze()) for x, y in zip(x_paths, y_paths)]

param_dist = {
    "d_token": [16, 32],
    "n_layers": randint(1, 3),
    "n_heads": [2, 4],
    "dropout": uniform(0.1, 0.3),
    "lr": uniform(1e-4, 1e-2),
    "batch_size": [32, 64],
    "epochs": [50],
    "patience": [10]
}

results = []
best_param_list = []
start = time.time()

for name, X, y in tqdm(datasets):
    try:
        if len(X) < 50: continue
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
        model = FTTransformerClassifier(random_state=42)
        search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=3, scoring='f1_macro', n_jobs=1, random_state=42)
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_param_list.append(str(search.best_params_))
        y_pred = best_model.predict(X_test)
        y_prob = best_model.predict_proba(X_test)

        average = 'binary' if len(np.unique(y)) == 2 else 'macro'
        roc = roc_auc_score(y_test, y_prob[:, 1]) if average == 'binary' else np.nan

        results.append({
            'Dataset': name,
            'Precision': precision_score(y_test, y_pred, average=average, zero_division=0),
            'Recall': recall_score(y_test, y_pred, average=average, zero_division=0),
            'F1-score': f1_score(y_test, y_pred, average=average, zero_division=0),
            'ROC AUC': roc,
            'Accuracy': accuracy_score(y_test, y_pred),
            'n_samples': len(y_test)
        })
    except Exception as e:
        print(f"⚠️ Dataset {name} error: {e}")

elapsed = time.time() - start

# ====================== KẾT QUẢ ========================
df = pd.DataFrame(results).set_index('Dataset')
macro_avg = df[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']].mean()
weights = df['n_samples']
weighted_avg = (df[['Precision', 'Recall', 'F1-score', 'ROC AUC', 'Accuracy']].multiply(weights, axis=0).sum() / weights.sum())
accuracy_mean = pd.Series({'Precision': np.nan, 'Recall': np.nan, 'F1-score': np.nan, 'ROC AUC': np.nan, 'Accuracy': df['Accuracy'].mean()})

df.loc['Accuracy (mean)'] = accuracy_mean
df.loc['Macro avg'] = macro_avg
df.loc['Weighted avg'] = weighted_avg
df_display = df.drop(columns=['n_samples'], errors='ignore')

print("\n📋 **BẢNG KẾT QUẢ TỔNG HỢP FT-Transformer (Precision, Recall, F1-score, ROC AUC, Accuracy)**\n")
with pd.option_context('display.float_format', '{:,.6f}'.format):
    print(df_display.fillna(""))

param_counts = Counter(best_param_list)
most_common_param = param_counts.most_common(1)[0][0]
print("\n✅ Hyperparameter tốt nhất phổ biến nhất cho 44 dataset:")
print(pd.DataFrame(eval(most_common_param), index=['']).to_string())
print(f"\n⏱️ Tổng thời gian chạy: {elapsed:.2f} giây")
print("✅ Đã xử lý xong 44 dataset")



Using device: cpu


100%|██████████| 44/44 [16:35<00:00, 22.63s/it]


📋 **BẢNG KẾT QUẢ TỔNG HỢP FT-Transformer (Precision, Recall, F1-score, ROC AUC, Accuracy)**

                            Precision   Recall F1-score  ROC AUC  Accuracy
Dataset                                                                   
analcatdata_aids             0.000000 0.000000 0.000000 0.560000  0.400000
analcatdata_asbestos         0.800000 1.000000 0.888889 0.930556  0.882353
analcatdata_bankruptcy       0.714286 1.000000 0.833333 1.000000  0.800000
analcatdata_creditscore      1.000000 0.933333 0.965517 1.000000  0.950000
analcatdata_cyyoung8092      0.428571 0.600000 0.500000 0.613333  0.700000
analcatdata_cyyoung9302      0.666667 0.500000 0.571429 0.783333  0.842105
analcatdata_japansolvent     0.800000 0.666667 0.727273 0.733333  0.727273
labor                        0.833333 0.625000 0.714286 0.875000  0.666667
lupus                        0.571429 0.571429 0.571429 0.805195  0.666667
postoperative_patient_data   0.000000 0.000000 0.000000 0.384615  0.722222
analca


