In [1]:
import pathlib

import pandas as pd
import numpy as np
from scipy.io import arff
from gosdt.model.threshold_guess import compute_thresholds, cut
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imodels.tree.gosdt.pygosdt import OptimalTreeClassifier
import random
import torch
import sklearn
from gosdt import GOSDT
from metatree.model_metatree import LlamaForMetaTree as MetaTree
from metatree.decision_tree_class import DecisionTree, DecisionTreeForest
from metatree.run_train import preprocess_dimension_patch

from transformers import AutoConfig

model_name_or_path = "yzhuang/MetaTree"

config = AutoConfig.from_pretrained(model_name_or_path)
model = MetaTree.from_pretrained(
    model_name_or_path,
    config=config,
)   
decision_tree_forest = DecisionTreeForest()

ensemble_size = 1
seed = 42
# Load the new dataset
arff_file = arff.loadarff('./dataset/8.dataset_60_waveform-5000.arff')
df = pd.DataFrame(arff_file[0])
print(df)
mapping = {
    b'0': 0,
    b'1': 1,
    b'2': 2,
    b'3': 3,
    b'4': 4,
    b'5': 5,
    b'6': 6,
    b'7': 7,
    b'8': 8,
    b'9': 9,
    b'10': 10
}
df['Result'] = df['class'].apply(lambda x: mapping.get(x, 0))
# print(df['class'].unique())

# Drop the original binaryClass column
df.drop(columns=['class'], inplace=True)
print(df)
X, y = df.iloc[:,:-1], df.iloc[:,-1]
df.info()
df = df.astype(int)
df.info()
feature_names = df.columns[:-1]

print("Dataset Shapes X={}, y={}, Num of Classes={}".format(X.shape, y.shape, len(set(y))))

train_idx, test_idx = sklearn.model_selection.train_test_split(range(X.shape[0]), test_size=0.3, random_state=seed)
print(len(train_idx))

# Dimension 
random.seed(seed)
feature_idx = np.random.choice(X.shape[1], 10, replace=False)
X = X.iloc[:, feature_idx]  # Updated line

test_X, test_y = X.iloc[test_idx], y.iloc[test_idx]  # Updated line

for i in range(ensemble_size):
    # Sample Train and Test Data
    random.seed(seed+i+1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Updated line

    input_x = torch.tensor(train_X.values, dtype=torch.float32)  # Convert DataFrame to NumPy array before tensor
    input_y = torch.nn.functional.one_hot(torch.tensor(train_y.values, dtype= torch.long)).float()  # Convert Series to NumPy array before tensor

    batch = {"input_x": input_x, "input_y": input_y, "input_y_clean": input_y}
    batch = preprocess_dimension_patch(batch, n_feature=10, n_class=10)
    model.depth = 2
    outputs = model.generate_decision_tree(batch['input_x'], batch['input_y'], depth=model.depth)
    decision_tree_forest.add_tree(DecisionTree(auto_dims=outputs.metatree_dimensions, auto_thresholds=outputs.tentative_splits, input_x=batch['input_x'], input_y=batch['input_y'], depth=model.depth))

    print("Decision Tree Features: ", [x.argmax(dim=-1) for x in outputs.metatree_dimensions])
    print("Decision Tree Threasholds: ", outputs.tentative_splits)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


        x1    x2    x3    x4    x5    x6    x7    x8    x9   x10  ...   x32  \
0    -0.23 -1.21  1.20  1.23 -0.10  0.12  2.49  1.19  1.34  0.58  ... -0.86   
1     0.38  0.38 -0.31 -0.09  1.52  1.35  1.49  3.81  2.33  1.34  ...  1.28   
2    -0.69  1.00  1.08  1.48  2.44  3.39  3.09  4.08  5.48  3.61  ...  0.29   
3     0.40  0.68  0.27  1.39  1.03 -0.32 -1.23 -0.50  0.11  0.87  ...  0.43   
4    -0.81  1.59 -0.69  1.16  4.22  4.98  4.52  2.54  5.60  4.66  ...  0.62   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
4995  0.44  0.56  1.84  1.94  3.43  4.88  4.04  2.11  1.83  0.78  ... -2.56   
4996  1.18 -0.48  1.81  1.51  1.41  3.61  3.75  3.80  3.44  3.71  ...  0.37   
4997  0.64  0.81 -0.38 -0.88  1.55  0.10  0.42 -0.93 -0.70  1.54  ...  1.61   
4998  0.18  1.65  1.91  2.07  4.28  3.61  4.46  4.62  4.80  0.25  ... -0.65   
4999  2.05 -1.99  1.66  2.18  2.22  2.53  3.09  2.20  1.42  0.62  ...  0.28   

       x33   x34   x35   x36   x37   x38   x39   x4

In [2]:
tree_pred = decision_tree_forest.predict(torch.tensor(test_X.values, dtype=torch.float32))

accuracy = accuracy_score(test_y.values, tree_pred.argmax(dim=-1).squeeze(0))
print("MetaTree Test Accuracy: ", accuracy)


MetaTree Test Accuracy:  0.7073333333333334


In [3]:
cart_ensemble = []

for i in range(ensemble_size):
    random.seed(seed + i + 1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Use .iloc for row indexing
    
    clf = sklearn.tree.DecisionTreeClassifier(max_depth=2, random_state=seed + i + 1)
    clf.fit(train_X.values, train_y.values)
    cart_ensemble.append(clf)

overall_pred = np.zeros((test_X.values.shape[0], len(set(test_y))))
for clf in cart_ensemble:
    overall_pred += clf.predict_proba(test_X.values)
overall_pred = overall_pred / len(cart_ensemble)

accuracy = accuracy_score(test_y.values, overall_pred.argmax(axis=-1))
print("CART Test Accuracy: ", accuracy)


CART Test Accuracy:  0.7146666666666667


In [4]:
# # Tiếp tục từ đoạn code của bạn
# # Định nghĩa các tham số cho mô hình GOSDT
# gosdt_params = {
#     'regularization': 0.01,
#     'time_limit': 60,  # giới hạn thời gian tính toán là 60 giây
#     'verbose': True
# }
# 
# # Khởi tạo mô hình GOSDT với các tham số đã định nghĩa
# gosdt_model = GOSDT(gosdt_params)
# 
# # Huấn luyện mô hình GOSDT với tập dữ liệu huấn luyện
# gosdt_model.fit(train_X, train_y)
# 
# # Dự đoán kết quả trên tập dữ liệu kiểm tra
# predictions = gosdt_model.predict(test_X)
# 
# # Tính toán độ chính xác của mô hình
# accuracy = accuracy_score(test_y, predictions)
# print(f'Độ chính xác của mô hình GOSDT: {accuracy}')


In [5]:
# gosdt_ensemble = []
# 
# for i in range(ensemble_size):
#     random.seed(seed + i + 1)
#     subset_idx = random.sample(train_idx, 256)
#     train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Use .iloc for row indexing
#     glf = OptimalTreeClassifier(random_state=seed + i + 1)
#     print(glf.max_depth())
#     gosdt_ensemble.append(predictions)
# overall_pred_gosdt = np.zeros((test_X.values.shape[0], len(set(test_y))))
# for glf in gosdt_ensemble:
#     overall_pred_gosdt += glf.predict_proba(test_X.values)
# overall_pred_gosdt = overall_pred_gosdt / len(gosdt_ensemble)
# 
# accuracy = accuracy_score(test_y.values, overall_pred_gosdt.argmax(axis=-1))
# print("GOSDT Test Accuracy: ", accuracy)
#     