In [3]:
import pathlib

import pandas as pd
import numpy as np
from scipy.io import arff
from gosdt.model.threshold_guess import compute_thresholds, cut
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imodels.tree.gosdt.pygosdt import OptimalTreeClassifier
import random
import torch
import sklearn
from gosdt import GOSDT
from metatree.model_metatree import LlamaForMetaTree as MetaTree
from metatree.decision_tree_class import DecisionTree, DecisionTreeForest
from metatree.run_train import preprocess_dimension_patch

from transformers import AutoConfig

# --- Model -----------------------------------------------------------------
# Pretrained MetaTree checkpoint identifier passed to `from_pretrained`.
model_name_or_path = "yzhuang/MetaTree"

config = AutoConfig.from_pretrained(model_name_or_path)
model = MetaTree.from_pretrained(
    model_name_or_path,
    config=config,
)
decision_tree_forest = DecisionTreeForest()

# --- Experiment configuration ------------------------------------------------
ensemble_size = 1
seed = 42

# --- Load the dataset ----------------------------------------------------------
arff_file = arff.loadarff('./dataset/16.fried.arff')
df = pd.DataFrame(arff_file[0])
print(df)

# Binary target: 1 when the ARFF label is the byte string b'P', otherwise 0.
df['Result'] = df['binaryClass'].apply(lambda label: 1 if label == b'P' else 0)

# Drop the original binaryClass column (non-inplace so re-running the cell
# after a fresh load behaves the same way).
df = df.drop(columns=['binaryClass'])
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# NOTE: the whole frame is intentionally NOT cast to int here. The features
# are floats, so `df.astype(int)` would truncate them in `df`; `Result` is
# already an integer column and `X`/`y` never used the cast frame.
df.info()
feature_names = df.columns[:-1]

print("Dataset Shapes X={}, y={}, Num of Classes={}".format(X.shape, y.shape, len(set(y))))

# --- Train / test split (on row positions) ----------------------------------------
train_idx, test_idx = sklearn.model_selection.train_test_split(
    range(X.shape[0]), test_size=0.3, random_state=seed
)
print(len(train_idx))

# --- Feature sub-selection ----------------------------------------------------
# Seed BOTH generators: the feature subset below is drawn with NumPy's global
# RNG, which `random.seed` alone does not control.
random.seed(seed)
np.random.seed(seed)
feature_idx = np.random.choice(X.shape[1], 6, replace=False)
X = X.iloc[:, feature_idx]

test_X, test_y = X.iloc[test_idx], y.iloc[test_idx]

# --- Build the MetaTree forest ------------------------------------------------------
# Tree depth is the same for every ensemble member, so set it once.
model.depth = 2

for i in range(ensemble_size):
    # Draw a per-tree training subset of 256 rows, reproducibly.
    random.seed(seed + i + 1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]

    # Convert to tensors: float features and one-hot float labels.
    input_x = torch.tensor(train_X.values, dtype=torch.float32)
    input_y = torch.nn.functional.one_hot(torch.tensor(train_y.values, dtype=torch.long)).float()

    batch = {"input_x": input_x, "input_y": input_y, "input_y_clean": input_y}
    # presumably pads features/classes to the sizes the model expects — TODO confirm
    batch = preprocess_dimension_patch(batch, n_feature=10, n_class=10)

    outputs = model.generate_decision_tree(batch['input_x'], batch['input_y'], depth=model.depth)
    decision_tree_forest.add_tree(
        DecisionTree(
            auto_dims=outputs.metatree_dimensions,
            auto_thresholds=outputs.tentative_splits,
            input_x=batch['input_x'],
            input_y=batch['input_y'],
            depth=model.depth,
        )
    )

    print("Decision Tree Features: ", [x.argmax(dim=-1) for x in outputs.metatree_dimensions])
    print("Decision Tree Threasholds: ", outputs.tentative_splits)


          X1     X2     X3     X4     X5     X6     X7     X8     X9    X10  \
0      0.487  0.072  0.004  0.833  0.765  0.600  0.132  0.886  0.073  0.342   
1      0.223  0.401  0.659  0.528  0.843  0.713  0.580  0.473  0.572  0.528   
2      0.903  0.913  0.940  0.979  0.561  0.744  0.627  0.818  0.309  0.510   
3      0.791  0.857  0.359  0.844  0.155  0.948  0.114  0.292  0.412  0.991   
4      0.326  0.593  0.085  0.927  0.926  0.633  0.431  0.326  0.031  0.730   
...      ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
40763  0.911  0.960  0.205  0.224  0.703  0.531  0.061  1.000  0.337  0.444   
40764  0.673  0.533  0.086  0.097  0.123  0.405  0.674  0.752  0.772  0.624   
40765  0.052  0.644  0.645  0.838  0.466  0.360  0.177  0.849  0.153  0.683   
40766  0.835  0.074  0.780  0.916  0.621  0.410  0.342  0.466  0.896  0.447   
40767  0.537  0.490  0.169  0.638  0.726  0.605  0.907  0.394  0.495  0.991   

      binaryClass  
0            b'N'  
1          

In [4]:
# Score the MetaTree forest on the held-out rows.
test_features = torch.tensor(test_X.values, dtype=torch.float32)
tree_pred = decision_tree_forest.predict(test_features)

# Highest-scoring class along the last axis, then squeeze dim 0.
predicted_labels = tree_pred.argmax(dim=-1).squeeze(0)
accuracy = accuracy_score(test_y.values, predicted_labels)
print("MetaTree Test Accuracy: ", accuracy)


MetaTree Test Accuracy:  0.7427029678685307


In [5]:
# CART baseline: depth-2 sklearn trees fitted on per-tree 256-row subsets
# drawn with the seeds `seed + i + 1`, then averaged by predicted probability.
cart_ensemble = []

for i in range(ensemble_size):
    random.seed(seed + i + 1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Use .iloc for row indexing

    clf = sklearn.tree.DecisionTreeClassifier(max_depth=2, random_state=seed + i + 1)
    clf.fit(train_X.values, train_y.values)
    cart_ensemble.append(clf)

# Size the accumulator from the full label vector rather than the test split,
# so the column count does not depend on which classes landed in the test set.
n_classes = len(set(y))
overall_pred = np.zeros((test_X.values.shape[0], n_classes))
for clf in cart_ensemble:
    # predict_proba only has columns for the classes seen during fit
    # (ordered as clf.classes_). Scatter by label so a subset that missed a
    # class cannot cause a shape mismatch. Assumes labels are 0..n_classes-1,
    # which holds for the 0/1 `Result` column.
    overall_pred[:, clf.classes_] += clf.predict_proba(test_X.values)
overall_pred = overall_pred / len(cart_ensemble)

accuracy = accuracy_score(test_y.values, overall_pred.argmax(axis=-1))
print("CART Test Accuracy: ", accuracy)


CART Test Accuracy:  0.7424576894775571


In [6]:
# # Continuing from your code
# # Define the parameters for the GOSDT model
# gosdt_params = {
#     'regularization': 0.01,
#     'time_limit': 60,  # limit computation time to 60 seconds
#     'verbose': True
# }
# 
# # Initialize the GOSDT model with the parameters defined above
# gosdt_model = GOSDT(gosdt_params)
# 
# # Train the GOSDT model on the training data
# gosdt_model.fit(train_X, train_y)
# 
# # Predict on the test data
# predictions = gosdt_model.predict(test_X)
# 
# # Compute the model's accuracy
# accuracy = accuracy_score(test_y, predictions)
# print(f'Độ chính xác của mô hình GOSDT: {accuracy}')


In [7]:
# Disabled GOSDT ensemble experiment, kept for reference only.
# NOTE(review): as written this appears incomplete — `glf` is created but
# never fitted, the loop appends `predictions` (not assigned in this cell)
# instead of `glf`, yet the averaging loop below calls `predict_proba` on
# each ensemble member. Fix these before re-enabling.
# gosdt_ensemble = []
# 
# for i in range(ensemble_size):
#     random.seed(seed + i + 1)
#     subset_idx = random.sample(train_idx, 256)
#     train_X, train_y = X.iloc[subset_idx], y.iloc[subset_idx]  # Use .iloc for row indexing
#     glf = OptimalTreeClassifier(random_state=seed + i + 1)
#     print(glf.max_depth())
#     gosdt_ensemble.append(predictions)
# overall_pred_gosdt = np.zeros((test_X.values.shape[0], len(set(test_y))))
# for glf in gosdt_ensemble:
#     overall_pred_gosdt += glf.predict_proba(test_X.values)
# overall_pred_gosdt = overall_pred_gosdt / len(gosdt_ensemble)
# 
# accuracy = accuracy_score(test_y.values, overall_pred_gosdt.argmax(axis=-1))
# print("GOSDT Test Accuracy: ", accuracy)
#     