In [1]:
import pathlib

import pandas as pd
import numpy as np
from scipy.io import arff
from gosdt.model.threshold_guess import compute_thresholds, cut
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imodels.tree.gosdt.pygosdt import OptimalTreeClassifier
import random
import torch
import sklearn
from gosdt import GOSDT
from metatree.model_metatree import LlamaForMetaTree as MetaTree
from metatree.decision_tree_class import DecisionTree, DecisionTreeForest
from metatree.run_train import preprocess_dimension_patch

from transformers import AutoConfig

model_name_or_path = "yzhuang/MetaTree"

config = AutoConfig.from_pretrained(model_name_or_path)
model = MetaTree.from_pretrained(
    model_name_or_path,
    config=config,
)   
decision_tree_forest = DecisionTreeForest()

ensemble_size = 1
seed = 42
# Load the new dataset
arff_file = arff.loadarff('./dataset/5.dataset_30_page-blocks.arff')
df = pd.DataFrame(arff_file[0])
print(df)
mapping = {
    b'0': 0,
    b'1': 1,
    b'2': 2,
    b'3': 3,
    b'4': 4,
    b'5': 5,
    b'6': 6,
    b'7': 7,
    b'8': 8,
    b'9': 9,
    b'10': 10
}
df['Result'] = df['class'].apply(lambda x: mapping.get(x, 0))
# print(df['class'].unique())

# Drop the original binaryClass column
df.drop(columns=['class'], inplace=True)
print(df)
X, y = df.iloc[:,:-1], df.iloc[:,-1]
df.info()
df = df.astype(int)
df.info()
feature_names = df.columns[:-1]

print("Dataset Shapes X={}, y={}, Num of Classes={}".format(X.shape, y.shape, len(set(y))))

train_idx, test_idx = sklearn.model_selection.train_test_split(range(X.shape[0]), test_size=0.3, random_state=seed)
print(len(train_idx))

# Dimension 
random.seed(seed)
feature_idx = np.random.choice(X.shape[1], 6, replace=False)
X = X.iloc[:, feature_idx]  # Updated line

test_X, test_y = X.iloc[test_idx], y.iloc[test_idx]  # Updated line

for i in range(ensemble_size):
    # Sample Train and Test Data
    random.seed(seed+i+1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Updated line

    input_x = torch.tensor(train_X.values, dtype=torch.float32)  # Convert DataFrame to NumPy array before tensor
    input_y = torch.nn.functional.one_hot(torch.tensor(train_y.values, dtype= torch.long)).float()  # Convert Series to NumPy array before tensor

    batch = {"input_x": input_x, "input_y": input_y, "input_y_clean": input_y}
    batch = preprocess_dimension_patch(batch, n_feature=10, n_class=10)
    model.depth = 2
    outputs = model.generate_decision_tree(batch['input_x'], batch['input_y'], depth=model.depth)
    decision_tree_forest.add_tree(DecisionTree(auto_dims=outputs.metatree_dimensions, auto_thresholds=outputs.tentative_splits, input_x=batch['input_x'], input_y=batch['input_y'], depth=model.depth))

    print("Decision Tree Features: ", [x.argmax(dim=-1) for x in outputs.metatree_dimensions])
    print("Decision Tree Threasholds: ", outputs.tentative_splits)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


      height  lenght    area    eccen  p_black  p_and  mean_tr  blackpix  \
0        5.0     7.0    35.0    1.400    0.400  0.657     2.33      14.0   
1        6.0     7.0    42.0    1.167    0.429  0.881     3.60      18.0   
2        6.0    18.0   108.0    3.000    0.287  0.741     4.43      31.0   
3        5.0     7.0    35.0    1.400    0.371  0.743     4.33      13.0   
4        6.0     3.0    18.0    0.500    0.500  0.944     2.25       9.0   
...      ...     ...     ...      ...      ...    ...      ...       ...   
5468     4.0   524.0  2096.0  131.000    0.542  0.603    40.57    1136.0   
5469     7.0     4.0    28.0    0.571    0.714  0.929    10.00      20.0   
5470     6.0    95.0   570.0   15.833    0.300  0.911     1.64     171.0   
5471     7.0    41.0   287.0    5.857    0.213  0.801     1.36      61.0   
5472     8.0     1.0     8.0    0.125    1.000  1.000     8.00       8.0   

      blackand  wb_trans class  
0         23.0       6.0  b'1'  
1         37.0       

In [2]:
tree_pred = decision_tree_forest.predict(torch.tensor(test_X.values, dtype=torch.float32))

accuracy = accuracy_score(test_y.values, tree_pred.argmax(dim=-1).squeeze(0))
print("MetaTree Test Accuracy: ", accuracy)


MetaTree Test Accuracy:  0.8909866017052375


In [3]:
cart_ensemble = []

for i in range(ensemble_size):
    random.seed(seed + i + 1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Use .iloc for row indexing
    
    clf = sklearn.tree.DecisionTreeClassifier(max_depth=2, random_state=seed + i + 1)
    clf.fit(train_X.values, train_y.values)
    cart_ensemble.append(clf)

overall_pred = np.zeros((test_X.values.shape[0], len(set(test_y))))
for clf in cart_ensemble:
    overall_pred += clf.predict_proba(test_X.values)
overall_pred = overall_pred / len(cart_ensemble)

accuracy = accuracy_score(test_y.values, overall_pred.argmax(axis=-1))
print("CART Test Accuracy: ", accuracy)


CART Test Accuracy:  0.012789281364190013


In [4]:
# # Tiếp tục từ đoạn code của bạn
# # Định nghĩa các tham số cho mô hình GOSDT
# gosdt_params = {
#     'regularization': 0.01,
#     'time_limit': 60,  # giới hạn thời gian tính toán là 60 giây
#     'verbose': True
# }
# 
# # Khởi tạo mô hình GOSDT với các tham số đã định nghĩa
# gosdt_model = GOSDT(gosdt_params)
# 
# # Huấn luyện mô hình GOSDT với tập dữ liệu huấn luyện
# gosdt_model.fit(train_X, train_y)
# 
# # Dự đoán kết quả trên tập dữ liệu kiểm tra
# predictions = gosdt_model.predict(test_X)
# 
# # Tính toán độ chính xác của mô hình
# accuracy = accuracy_score(test_y, predictions)
# print(f'Độ chính xác của mô hình GOSDT: {accuracy}')


In [5]:
# gosdt_ensemble = []
# 
# for i in range(ensemble_size):
#     random.seed(seed + i + 1)
#     subset_idx = random.sample(train_idx, 256)
#     train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Use .iloc for row indexing
#     glf = OptimalTreeClassifier(random_state=seed + i + 1)
#     print(glf.max_depth())
#     gosdt_ensemble.append(predictions)
# overall_pred_gosdt = np.zeros((test_X.values.shape[0], len(set(test_y))))
# for glf in gosdt_ensemble:
#     overall_pred_gosdt += glf.predict_proba(test_X.values)
# overall_pred_gosdt = overall_pred_gosdt / len(gosdt_ensemble)
# 
# accuracy = accuracy_score(test_y.values, overall_pred_gosdt.argmax(axis=-1))
# print("GOSDT Test Accuracy: ", accuracy)
#     