In [1]:
! pip install wheel

# Install the UCI ML repository client (ucimlrepo)
! pip install ucimlrepo

from sklearn.metrics import classification_report
from gplearn.genetic import SymbolicClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import pandas as pd 
import math
from statistics import mean 
from statistics import stdev
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')




In [2]:
# Load the benchmark datasets from the UCI ML repository
from ucimlrepo import fetch_ucirepo

# UCI repository id of every benchmark dataset, listed in evaluation order
uci_ids = {
    'iris': 53,
    'wine_quality': 186,
    'rice_cammeo_and_osmancik': 545,
    'breast_cancer_wisconsin_original': 15,
    'magic_gamma_telescope': 159,
    'banknote_authentication': 267,
    'yeast': 110,
    'letter_recognition': 59,
}

# Fetch each dataset once, in the order declared above
fetched = {name: fetch_ucirepo(id=uci_id) for name, uci_id in uci_ids.items()}

# Keep the individual dataset names available at notebook level
iris = fetched['iris']
wine_quality = fetched['wine_quality']
rice_cammeo_and_osmancik = fetched['rice_cammeo_and_osmancik']
breast_cancer_wisconsin_original = fetched['breast_cancer_wisconsin_original']
magic_gamma_telescope = fetched['magic_gamma_telescope']
banknote_authentication = fetched['banknote_authentication']
yeast = fetched['yeast']
letter_recognition = fetched['letter_recognition']

# Parallel lists consumed by the experiment cells: dataset objects and their labels
datasets = list(fetched.values())
datasets_strings = list(fetched.keys())

In [3]:
# Configure the classifiers

#----------------------------------------------------------------------------------------------
# CART
from sklearn.tree import DecisionTreeClassifier

#----------------------------------------------------------------------------------------------
# C4.5
from ohmt.trees.univariate.c4 import C45
from ohmt.trees.splits.evaluation import gini

#----------------------------------------------------------------------------------------------
# OC1
from ohmt.trees.multivariate.oc1 import OC1
from ohmt.trees.splits.evaluation import gini

#----------------------------------------------------------------------------------------------
# SVM_ODT
! pip install Stree
from stree import Stree



# Experiment 1: Proposed method against Standard Decision Tree Algorithms

In [4]:
#----------------------------------------------------------------------------------------------
# CART
#
# Benchmark scikit-learn's DecisionTreeClassifier on every dataset in `datasets`.
# Each dataset is evaluated on 10 stratified 70/30 shuffle splits: the tree is fitted on
# the training part and scored on the held-out part. `cart_df` ends up indexed by dataset
# name, holding the mean accuracy ('mean_score') and the standard deviation of the
# accuracy across the 10 runs ('mean_std').

col_names = ['database', 'mean_score', 'mean_std']

# Iterate over datasets
acc_scores = []
std_scores = []
for ds_cnt, ds in enumerate(datasets):
    # Features and targets as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with any missing feature value (vectorised; avoids a slow iterrows scan)
    empty_row_idx = X.index[X.isnull().any(axis=1)]
    if len(empty_row_idx) >= 1:
        X = X.drop(index=empty_row_idx)
        y = y.drop(index=empty_row_idx)

    X = StandardScaler().fit_transform(X)
    # Use the last target column as a 1-D label vector, as the C4.5 / OC1 cells do;
    # passing the 2-D targets frame makes scikit-learn emit a conversion warning.
    y = y.iloc[:, -1].to_numpy()

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3)

    acc_each_run = []
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit on the training split and score on the held-out split.
        # cross_val_score must not be used here: it clones and refits the estimator on
        # folds of the data it is given, so the model trained above would be ignored.
        model = DecisionTreeClassifier().fit(X_train, y_train)
        acc_each_run.append(accuracy_score(y_test, model.predict(X_test)))

    # Mean and spread of the accuracy over the 10 runs
    acc_scores.append(round(mean(acc_each_run), 3))
    std_scores.append(round(stdev(acc_each_run), 3))

cart_df = pd.DataFrame(
    {'database': datasets_strings, 'mean_score': acc_scores, 'mean_std': std_scores},
    columns=col_names,
).set_index("database")


In [5]:
#----------------------------------------------------------------------------------------------
# C4.5
#
# Benchmark the ohmt C45 tree on every dataset in `datasets`.
# Each dataset is evaluated on 10 stratified 70/30 train/test splits. `c45_df` ends up
# indexed by dataset name, holding the mean accuracy ('mean_score') and the standard
# deviation of the accuracy across the 10 runs ('mean_std').

col_names = ['database', 'mean_score', 'mean_std']

# Iterate over datasets
acc_scores = []
std_scores = []
for ds_cnt, ds in enumerate(datasets):
    # Features and targets as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with any missing feature value (vectorised; avoids a slow iterrows scan)
    empty_row_idx = X.index[X.isnull().any(axis=1)]
    if len(empty_row_idx) >= 1:
        X = X.drop(index=empty_row_idx)
        y = y.drop(index=empty_row_idx)

    # `ds.data.features` already excludes the target, so keep every column
    # (slicing off the last column would silently discard a real feature).
    X = X.values
    # Encode the class labels as integers and keep the last target column as a 1-D vector
    ordinal_encoder = OrdinalEncoder()
    y = ordinal_encoder.fit_transform(y)
    y = y[:, -1].astype(int)

    # Generate the 10 runs to calculate the accuracy and standard deviation
    acc_each_run = []
    for _ in range(10):
        # Test and train sets
        train_features, test_features, train_labels, test_labels = train_test_split(
            X, y, test_size=.3, stratify=y)
        c45_tree = C45()
        c45_tree = c45_tree.fit(train_features, train_labels, max_depth=10,
                                min_eps=0.000000000000001, min_samples=10,
                                node_fitness_function=gini)

        # Evaluate tree on the held-out split
        predicted_test_labels = c45_tree.predict(test_features)
        acc_each_run.append(accuracy_score(test_labels, predicted_test_labels))

    # Spread is computed over this cell's own run accuracies; the previous code read
    # `std_each_run`, which is never defined in this cell.
    ave_score = mean(acc_each_run)
    ave_std = stdev(acc_each_run)

    acc_scores.append(round(ave_score, 3))
    std_scores.append(round(ave_std, 3))

c45_df = pd.DataFrame(
    {'database': datasets_strings, 'mean_score': acc_scores, 'mean_std': std_scores},
    columns=col_names,
).set_index("database")


ValueError: 'list' argument must have no negative elements

# Experiment 2: Proposed method against Oblique Decision Tree Algorithms  

In [None]:
#----------------------------------------------------------------------------------------------
# OC1
#
# Benchmark the ohmt OC1 oblique tree on every dataset in `datasets`.
# Each dataset is evaluated on 10 stratified 70/30 train/test splits. `oc1_df` ends up
# indexed by dataset name, holding the mean accuracy ('mean_score') and the standard
# deviation of the accuracy across the 10 runs ('mean_std').

col_names = ['database', 'mean_score', 'mean_std']

# Iterate over datasets
acc_scores = []
std_scores = []
for ds_cnt, ds in enumerate(datasets):
    # Features and targets as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with any missing feature value (vectorised; avoids a slow iterrows scan)
    empty_row_idx = X.index[X.isnull().any(axis=1)]
    if len(empty_row_idx) >= 1:
        X = X.drop(index=empty_row_idx)
        y = y.drop(index=empty_row_idx)

    # `ds.data.features` already excludes the target, so keep every column
    # (slicing off the last column would silently discard a real feature).
    X = X.values
    # Encode the class labels as integers and keep the last target column as a 1-D vector
    ordinal_encoder = OrdinalEncoder()
    y = ordinal_encoder.fit_transform(y)
    y = y[:, -1].astype(int)

    # Generate the 10 runs to calculate the accuracy and standard deviation
    acc_each_run = []
    for _ in range(10):
        # Test and train sets
        train_features, test_features, train_labels, test_labels = train_test_split(
            X, y, test_size=.3, stratify=y)
        oc1_tree = OC1()
        oc1_tree = oc1_tree.fit(train_features, train_labels, max_depth=10,
                                min_eps=0.000000000000001, min_samples=10,
                                node_fitness_function=gini)

        # Evaluate tree on the held-out split
        predicted_test_labels = oc1_tree.predict(test_features)
        acc_each_run.append(accuracy_score(test_labels, predicted_test_labels))

    # Spread is computed over this cell's own run accuracies; the previous code read
    # `std_each_run`, which is never defined in this cell.
    ave_score = mean(acc_each_run)
    ave_std = stdev(acc_each_run)

    acc_scores.append(round(ave_score, 3))
    std_scores.append(round(ave_std, 3))

oc1_df = pd.DataFrame(
    {'database': datasets_strings, 'mean_score': acc_scores, 'mean_std': std_scores},
    columns=col_names,
).set_index("database")


In [None]:
#----------------------------------------------------------------------------------------------
#  SVM_ODT
#
# Score the Stree classifier on every dataset with cross_val_score (cv=10) over the
# standardised features, and collect the mean / std of the fold accuracies per dataset
# in `svm_tree_df` (indexed by dataset name).

# Result table skeleton: one row per dataset
col_names = ['database', 'mean_score', 'mean_std']
svm_tree_df = pd.DataFrame(columns=col_names)
svm_tree_df['database'] = datasets_strings

# Per-dataset results, filled in dataset order
acc_scores, std_scores = [], []
for ds_cnt, ds in enumerate(datasets):
    # Features and targets as pandas dataframes
    X, y = ds.data.features, ds.data.targets

    # Remove every row that has at least one missing feature value
    empty_row_idx = X.index[X.isnull().any(axis=1)].tolist()
    if empty_row_idx:
        X = X.drop(index=empty_row_idx)
        y = y.drop(index=empty_row_idx)

    # Standardise the features before handing them to the SVM-based tree
    X = StandardScaler().fit_transform(X)

    clf = Stree(random_state=1, max_features="auto")
    val_acc_scores = cross_val_score(clf, X, y, cv=10, n_jobs=-1)

    # Summarise the fold accuracies for this dataset
    ave_score = val_acc_scores.mean().round(3)
    ave_std = val_acc_scores.std().round(3)
    acc_scores.append(ave_score)
    std_scores.append(ave_std)

svm_tree_df['mean_score'] = acc_scores
svm_tree_df['mean_std'] = std_scores
svm_tree_df = svm_tree_df.set_index("database")
