In [None]:
! pip install wheel

# Install the UCI repo
! pip install ucimlrepo

import math
import time
import warnings
from statistics import mean, stdev

import pandas as pd
from gplearn.genetic import SymbolicClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import (
    StratifiedShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Ignore all warnings
warnings.filterwarnings('ignore')


In [None]:
# Import the databases
from ucimlrepo import fetch_ucirepo

# Fetch each dataset by its repository id
iris = fetch_ucirepo(id=53)
wine_quality = fetch_ucirepo(id=186)
rice_cammeo_and_osmancik = fetch_ucirepo(id=545)
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)
student_performance = fetch_ucirepo(id=320)
drug_consumption = fetch_ucirepo(id=373)
yeast = fetch_ucirepo(id=110)
letter_recognition = fetch_ucirepo(id=59)

# Single name -> dataset mapping; the two parallel lists used by the
# experiment cells are derived from it so their order cannot drift apart.
datasets_by_name = {
    'iris': iris,
    'wine_quality': wine_quality,
    'rice_cammeo_and_osmancik': rice_cammeo_and_osmancik,
    'breast_cancer_wisconsin_original': breast_cancer_wisconsin_original,
    'student_performance': student_performance,
    'drug_consumption': drug_consumption,
    'yeast': yeast,
    'letter_recognition': letter_recognition,
}

datasets = list(datasets_by_name.values())

datasets_strings = list(datasets_by_name.keys())

In [None]:
# Configure the classifiers

#----------------------------------------------------------------------------------------------
# CART
from sklearn.tree import DecisionTreeClassifier

#----------------------------------------------------------------------------------------------
# C4.5
from ohmt.trees.univariate.c4 import C45
from ohmt.trees.splits.evaluation import gini

#----------------------------------------------------------------------------------------------
# OC1
from ohmt.trees.multivariate.oc1 import OC1
from ohmt.trees.splits.evaluation import gini

#----------------------------------------------------------------------------------------------
# SVM_ODT
! pip install Stree
from stree import Stree

# Experiment 1: Proposed method against Standard Decision Tree Algorithms

In [None]:
#----------------------------------------------------------------------------------------------
# CART
#
# Benchmarks scikit-learn's DecisionTreeClassifier on every entry of `datasets`.
# For each dataset: 10 stratified 70/30 shuffle splits; on each split the tree is
# fitted on the training part and scored on the held-out part. The per-dataset
# summary (accuracy, fitted tree depth, training time: mean and standard deviation
# over the 10 runs) is stored in `cart_df`, indexed by dataset name.

col_names = ['database', 'mean_score', 'mean_std','mean_size', 'size_std', 'ave_time','std_time']
cart_rows = []

# Iterate over datasets
for ds_name, ds in zip(datasets_strings, datasets):
    # data as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with a missing feature value (vectorised; the former
    # iterrows() scan was a Python-level loop over every row).
    complete_rows = X.notnull().all(axis=1).values
    X = X[complete_rows]
    # Use the last target column as a 1-D Series, matching the `y[:, -1]`
    # selection of the C4.5 / OC1 cells; a 2-D target would otherwise be
    # treated as multi-output and break the stratified split.
    y = y[complete_rows].iloc[:, -1]

    # NOTE(review): StandardScaler needs numeric features - confirm that every
    # dataset in `datasets` has numeric-only feature columns.
    X = StandardScaler().fit_transform(X)

    # Fixed seeds so that Restart & Run All reproduces the same table.
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=1)
    dtc = DecisionTreeClassifier(max_depth=10, random_state=1)

    acc_each_run = []
    depth_each_run = []
    time_each_run = []

    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        start_time = time.perf_counter()
        model = dtc.fit(X_train, y_train)
        end_time = time.perf_counter()

        # Score the model that was just trained and timed. The previous
        # cross_val_score(model, X_test, y_test) call cloned the estimator and
        # refitted it on folds of the test split, so the trained model was
        # never evaluated.
        acc_each_run.append(accuracy_score(y_test, model.predict(X_test)))
        # tree_.max_depth is the depth of the fitted tree, not the configured cap.
        depth_each_run.append(model.tree_.max_depth)
        time_each_run.append(end_time - start_time)

    cart_rows.append({
        'database': ds_name,
        'mean_score': round(mean(acc_each_run), 3),
        'mean_std': round(stdev(acc_each_run), 3),
        'mean_size': round(mean(depth_each_run), 3),
        'size_std': round(stdev(depth_each_run), 3),
        'ave_time': round(mean(time_each_run), 3),
        'std_time': round(stdev(time_each_run), 3),
    })

# Build the result frame once from the collected rows.
cart_df = pd.DataFrame(cart_rows, columns=col_names).set_index("database")

In [None]:
#----------------------------------------------------------------------------------------------
# C4.5
#
# Benchmarks the ohmt C45 tree on every entry of `datasets`.
# For each dataset: 10 stratified 70/30 train/test splits; on each split the tree
# is fitted on the training part and scored on the held-out part. The per-dataset
# summary (accuracy, tree depth, training time: mean and standard deviation over
# the 10 runs) is stored in `c45_df`, indexed by dataset name.

col_names =  ['database', 'mean_score', 'mean_std','mean_size', 'size_std', 'ave_time','std_time']
c45_rows = []

# Iterate over datasets
for ds_name, ds in zip(datasets_strings, datasets):
    # data as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with a missing feature value (vectorised; the former
    # iterrows() scan was a Python-level loop over every row).
    complete_rows = X.notnull().all(axis=1).values
    X = X[complete_rows]
    y = y[complete_rows]

    # `ds.data.features` holds features only (the labels live in
    # `ds.data.targets`), so every column is kept. The previous `[:, :-1]`
    # slice silently discarded the last feature.
    # NOTE(review): StandardScaler needs numeric features - confirm that every
    # dataset in `datasets` has numeric-only feature columns.
    X = StandardScaler().fit_transform(X.values)

    # Encode the labels as integers and keep the last target column as a 1-D array.
    ordinal_encoder = OrdinalEncoder()
    y = ordinal_encoder.fit_transform(y)[:, -1].astype(int)

    # Generate the 10 runs to calculate the accuracy and standard deviation
    acc_each_run = []
    depth_each_run = []
    time_each_run = []
    for run in range(10):
        # Test and train sets; seeding with the run index gives ten different
        # but reproducible splits.
        train_features, test_features, train_labels, test_labels = train_test_split(
            X, y, test_size=.3, stratify=y, random_state=run)

        c45_tree = C45()
        start_time = time.perf_counter()
        c45_tree = c45_tree.fit(train_features, train_labels, max_depth=10, min_eps=1e-15,
                                min_samples=10, node_fitness_function=gini)
        end_time = time.perf_counter()

        # Largest value stored in the tree's `depth` mapping.
        tree_size = max(c45_tree.depth.values())

        # Evaluate tree
        predicted_test_labels = c45_tree.predict(test_features)

        acc_each_run.append(accuracy_score(test_labels, predicted_test_labels))
        depth_each_run.append(tree_size)
        time_each_run.append(end_time - start_time)

    c45_rows.append({
        'database': ds_name,
        'mean_score': round(mean(acc_each_run), 3),
        'mean_std': round(stdev(acc_each_run), 3),
        'mean_size': round(mean(depth_each_run), 3),
        'size_std': round(stdev(depth_each_run), 3),
        'ave_time': round(mean(time_each_run), 3),
        'std_time': round(stdev(time_each_run), 3),
    })

# Build the result frame once from the collected rows.
c45_df = pd.DataFrame(c45_rows, columns=col_names).set_index("database")

# Experiment 2: Proposed method against Oblique Decision Tree Algorithms  

In [None]:
#----------------------------------------------------------------------------------------------
# OC1
#
# Benchmarks the ohmt OC1 oblique tree on every entry of `datasets`.
# For each dataset: 10 stratified 70/30 train/test splits; on each split the tree
# is fitted on the training part and scored on the held-out part. The per-dataset
# summary (accuracy, tree depth, training time: mean and standard deviation over
# the 10 runs) is stored in `oc1_df`, indexed by dataset name.

col_names =  ['database', 'mean_score', 'mean_std','mean_size', 'size_std', 'ave_time','std_time']
oc1_rows = []

# Iterate over datasets
for ds_name, ds in zip(datasets_strings, datasets):
    # data as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with a missing feature value (vectorised; the former
    # iterrows() scan was a Python-level loop over every row).
    complete_rows = X.notnull().all(axis=1).values
    X = X[complete_rows]
    y = y[complete_rows]

    # `ds.data.features` holds features only (the labels live in
    # `ds.data.targets`), so every column is kept. The previous `[:, :-1]`
    # slice silently discarded the last feature.
    # NOTE(review): StandardScaler needs numeric features - confirm that every
    # dataset in `datasets` has numeric-only feature columns.
    X = StandardScaler().fit_transform(X.values)

    # Encode the labels as integers and keep the last target column as a 1-D array.
    ordinal_encoder = OrdinalEncoder()
    y = ordinal_encoder.fit_transform(y)[:, -1].astype(int)

    # Generate the 10 runs to calculate the accuracy and standard deviation
    acc_each_run = []
    depth_each_run = []
    time_each_run = []
    for run in range(10):
        # Test and train sets; seeding with the run index gives ten different
        # but reproducible splits.
        train_features, test_features, train_labels, test_labels = train_test_split(
            X, y, test_size=.3, stratify=y, random_state=run)

        oc1_tree = OC1()
        start_time = time.perf_counter()
        oc1_tree = oc1_tree.fit(train_features, train_labels, max_depth=10, min_eps=1e-15,
                                min_samples=10, node_fitness_function=gini)
        end_time = time.perf_counter()

        # Largest value stored in the tree's `depth` mapping.
        tree_size = max(oc1_tree.depth.values())

        # Evaluate tree
        predicted_test_labels = oc1_tree.predict(test_features)

        acc_each_run.append(accuracy_score(test_labels, predicted_test_labels))
        depth_each_run.append(tree_size)
        time_each_run.append(end_time - start_time)

    oc1_rows.append({
        'database': ds_name,
        'mean_score': round(mean(acc_each_run), 3),
        'mean_std': round(stdev(acc_each_run), 3),
        'mean_size': round(mean(depth_each_run), 3),
        'size_std': round(stdev(depth_each_run), 3),
        'ave_time': round(mean(time_each_run), 3),
        'std_time': round(stdev(time_each_run), 3),
    })

# Build the result frame once from the collected rows.
oc1_df = pd.DataFrame(oc1_rows, columns=col_names).set_index("database")

In [None]:
#----------------------------------------------------------------------------------------------
#  SVM_ODT
#
# Benchmarks the Stree SVM-based oblique tree on every entry of `datasets`.
# For each dataset: 10 stratified 70/30 shuffle splits; on each split the tree is
# fitted on the training part and scored on the held-out part. The per-dataset
# summary (accuracy, tree depth, training time: mean and standard deviation over
# the 10 runs) is stored in `svm_tree_df`, indexed by dataset name.

col_names =   ['database', 'mean_score', 'mean_std','mean_size', 'size_std', 'ave_time','std_time']
svm_tree_rows = []

# Iterate over datasets
for ds_name, ds in zip(datasets_strings, datasets):
    # data as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with a missing feature value (vectorised; the former
    # iterrows() scan was a Python-level loop over every row).
    complete_rows = X.notnull().all(axis=1).values
    X = X[complete_rows]
    # Use the last target column as a 1-D Series, matching the `y[:, -1]`
    # selection of the C4.5 / OC1 cells.
    y = y[complete_rows].iloc[:, -1]

    # NOTE(review): StandardScaler needs numeric features - confirm that every
    # dataset in `datasets` has numeric-only feature columns.
    X = StandardScaler().fit_transform(X)

    # Fixed seed so that Restart & Run All reproduces the same splits.
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=1)
    svm_tree = Stree(random_state=1, max_depth=10, multiclass_strategy="ovr")

    # Fresh per-dataset accumulators. The accuracy list used to be left
    # un-reset here (the depth/time lists were reset twice instead), so scores
    # were appended to whatever lists earlier cells had left in the kernel.
    acc_each_run = []
    depth_each_run = []
    time_each_run = []

    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        start_time = time.perf_counter()
        model = svm_tree.fit(X_train, y_train)
        end_time = time.perf_counter()

        # Score the model that was just trained and timed. The previous
        # cross_val_score(model, X_test, y_test) call cloned the estimator and
        # refitted it on folds of the test split, so the trained model was
        # never evaluated.
        acc_each_run.append(accuracy_score(y_test, model.predict(X_test)))

        # `max_depth` is the configured cap (always 10 here), not the depth the
        # tree actually reached. Prefer the fitted depth when the model exposes
        # it and fall back to the original value otherwise.
        # NOTE(review): presumably Stree sets `depth_` during fit - confirm
        # against the installed Stree version.
        depth_each_run.append(getattr(model, "depth_", model.max_depth))
        time_each_run.append(end_time - start_time)

    svm_tree_rows.append({
        'database': ds_name,
        'mean_score': round(mean(acc_each_run), 3),
        'mean_std': round(stdev(acc_each_run), 3),
        'mean_size': round(mean(depth_each_run), 3),
        'size_std': round(stdev(depth_each_run), 3),
        'ave_time': round(mean(time_each_run), 3),
        # This column was previously never written and stayed empty.
        'std_time': round(stdev(time_each_run), 3),
    })

# Build the result frame once from the collected rows.
svm_tree_df = pd.DataFrame(svm_tree_rows, columns=col_names).set_index("database")

# Experiment 3: The Effect of Hybrid Initialisation of the First Population

In [None]:
# Warm-started genetic-programming classifier
#
# Benchmarks a one-vs-rest ensemble of gplearn SymbolicClassifier models on every
# entry of `datasets`. For each dataset: 10 stratified 70/30 shuffle splits; on
# each split the ensemble is fitted on the training part and scored on the
# held-out part. The per-dataset summary (accuracy, program depth, training time:
# mean and standard deviation over the 10 runs) is stored in `warm_started_df`,
# indexed by dataset name.

col_names = ['database', 'mean_score', 'mean_std','mean_size', 'size_std', 'ave_time','std_time']
warm_started_rows = []

# Operators available to the evolved programs (loop-invariant, so defined once).
function_set = ['add', 'sub', 'mul']

# Iterate over datasets
for ds_name, ds in zip(datasets_strings, datasets):
    # data as pandas dataframes
    X = ds.data.features
    y = ds.data.targets

    # Drop rows with a missing feature value (vectorised; the former
    # iterrows() scan was a Python-level loop over every row).
    complete_rows = X.notnull().all(axis=1).values
    X = X[complete_rows]
    # Use the last target column as a 1-D Series, matching the `y[:, -1]`
    # selection of the C4.5 / OC1 cells.
    y = y[complete_rows].iloc[:, -1]

    # NOTE(review): StandardScaler needs numeric features - confirm that every
    # dataset in `datasets` has numeric-only feature columns.
    X = StandardScaler().fit_transform(X)

    # Fixed seed so that Restart & Run All reproduces the same splits.
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=1)

    acc_each_run = []
    depth_each_run = []
    time_each_run = []

    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Initialise the SymbolicClassifier
        # NOTE(review): the four operator probabilities below sum to 1.2; stock
        # gplearn is believed to reject sums above 1.0 at fit time - confirm
        # against the gplearn build this notebook runs with.
        base_cl = SymbolicClassifier(population_size = 100, generations = 50, stopping_criteria = 0.01, function_set=function_set,
                  p_crossover = 0.95,parsimony_coefficient=0.0008,p_subtree_mutation = 0.1, p_hoist_mutation = 0.05, p_point_mutation = 0.1,
                  max_samples = 0.9, verbose = 0, warm_start=True, random_state=1)

        # Wrap the base classifier with OneVsRestClassifier
        ovr_classifier = OneVsRestClassifier(base_cl)

        # Train the classifier (only the fit itself is timed)
        start_time = time.perf_counter()
        ovr_classifier.fit(X_train,y_train)
        end_time = time.perf_counter()

        # Score the ensemble that was just trained and timed. The previous
        # cross_val_score(ovr_classifier, X_test, y_test) call refitted clones
        # on folds of the test split: ten extra untimed GP runs per split, with
        # the trained ensemble never evaluated.
        acc_each_run.append(accuracy_score(y_test, ovr_classifier.predict(X_test)))

        # OneVsRestClassifier fits clones of `base_cl`, so `base_cl` itself is
        # never fitted; the evolved programs live on `estimators_`. Record the
        # deepest of the per-class programs.
        # NOTE(review): relies on gplearn's fitted `_program.depth_` - confirm
        # this is the size measure intended for the 'mean_size' column.
        program_depths = [est._program.depth_
                          for est in ovr_classifier.estimators_
                          if hasattr(est, "_program")]
        depth_each_run.append(max(program_depths))
        time_each_run.append(end_time - start_time)

    warm_started_rows.append({
        'database': ds_name,
        'mean_score': round(mean(acc_each_run), 3),
        'mean_std': round(stdev(acc_each_run), 3),
        'mean_size': round(mean(depth_each_run), 3),
        'size_std': round(stdev(depth_each_run), 3),
        'ave_time': round(mean(time_each_run), 3),
        'std_time': round(stdev(time_each_run), 3),
    })

# Build the result frame once from the collected rows.
warm_started_df = pd.DataFrame(warm_started_rows, columns=col_names).set_index("database")