In [1]:
# Install the UCI Machine Learning Repository client (ucimlrepo)
! pip install ucimlrepo

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from gplearn.genetic import SymbolicClassifier
from sklearn.multiclass import OneVsRestClassifier
import time
from statistics import mean, stdev
import warnings
# Ignore all warnings
warnings.filterwarnings('ignore')



In [2]:
# Download the benchmark datasets through the ucimlrepo client
from ucimlrepo import fetch_ucirepo

# Short dataset name -> repository id, listed in benchmark order
uci_ids = {
    'iris': 53,
    'wine_quality': 186,
    'rice_cammeo_and_osmancik': 545,
    'breast_cancer_wisconsin_original': 15,
    'student_performance': 320,
    'drug_consumption': 373,
    'yeast': 110,
    'letter_recognition': 59,
}

# Parallel lists: dataset names and the fetched dataset objects (same order)
datasets_strings = list(uci_ids)
datasets = [fetch_ucirepo(id=uci_id) for uci_id in uci_ids.values()]

# Individual handles, one per dataset, bound to the same fetched objects
(iris, wine_quality, rice_cammeo_and_osmancik, breast_cancer_wisconsin_original,
 student_performance, drug_consumption, yeast, letter_recognition) = datasets


In [3]:
# Benchmark a one-vs-rest SymbolicClassifier on every entry of `datasets` and
# collect accuracy, program depth and training time per dataset in `results_df`.

# One seed for both the shuffle splits and the genetic search, so that a
# Restart & Run All reproduces the same table.
RANDOM_SEED = 42


def _max_program_depth(fitted_ovr):
    """Return the depth of the deepest evolved program in a fitted one-vs-rest model.

    OneVsRestClassifier fits clones of the estimator it wraps, so the evolved
    programs are found on ``fitted_ovr.estimators_`` and not on the template
    instance that was passed to its constructor.

    Parameters
    ----------
    fitted_ovr : OneVsRestClassifier
        Already fitted wrapper around SymbolicClassifier estimators.

    Returns
    -------
    int
        Largest ``depth_`` among the per-class programs; 0 if no estimator
        exposes an evolved program.
    """
    depths = [est._program.depth_
              for est in fitted_ovr.estimators_
              if hasattr(est, '_program')]
    return max(depths) if depths else 0


# Create the (still empty) summary table, one row per dataset
col_names = ['database', 'mean_score', 'mean_std','mean_size', 'size_std', 'ave_time','std_time']
results_df = pd.DataFrame(columns=col_names)
results_df['database'] = datasets_strings

# SymbolicClassifier settings; identical for every fit, so they are built once.
# NOTE(review): the original used p_crossover=0.95, which made the four
# genetic-operator probabilities sum to 1.2. They must total 1.0 or less, so
# crossover was lowered to 0.75 and the mutation rates were kept as given.
# Confirm this is the intended operator mix.
function_set = ['add', 'sub', 'mul']
gp_settings = dict(
    population_size=100,
    generations=50,
    stopping_criteria=0.01,
    function_set=function_set,
    p_crossover=0.75,
    parsimony_coefficient=0.0008,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=0,
    random_state=RANDOM_SEED,
)

# Per-dataset aggregates, filled in dataset order
acc_scores = []
std_scores = []
depth_scores = []
depth_std_scores = []
ave_time_scores = []
std_time_scores = []

for ds in datasets:
    # Features and targets as pandas objects
    # NOTE(review): StandardScaler needs numeric features and the one-vs-rest
    # fit needs a single label column. Confirm every dataset satisfies this.
    X = ds.data.features
    y = ds.data.targets

    # Drop rows that have a missing value in any feature, from both X and y
    rows_with_missing = X.index[X.isnull().any(axis=1)]
    if len(rows_with_missing) >= 1:
        X = X.drop(index=rows_with_missing)
        y = y.drop(index=rows_with_missing)

    X = StandardScaler().fit_transform(X)

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=RANDOM_SEED)

    acc_each_run = []
    std_each_run = []
    depth_each_run = []
    time_each_run = []

    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # perf_counter is monotonic, so it is the safer clock for durations
        start_time = time.perf_counter()

        # Initialise the SymbolicClassifier and wrap it for multi-class use
        base_cl = SymbolicClassifier(**gp_settings)
        ovr_classifier = OneVsRestClassifier(base_cl)

        # Train the classifier
        ovr_classifier.fit(X_train, y_train)

        end_time = time.perf_counter()

        # NOTE(review): cross_val_score clones and re-fits the classifier on
        # folds of the held-out 30%, so the model trained (and timed) above is
        # not the one being scored. Confirm this protocol is intended.
        score = cross_val_score(ovr_classifier, X_test, y_test, cv=10)
        exec_duration = end_time - start_time
        tree_depth = _max_program_depth(ovr_classifier)

        acc_each_run.append(score.mean().round(3))
        std_each_run.append(score.std().round(3))
        depth_each_run.append(tree_depth)
        time_each_run.append(exec_duration)

    # Aggregate the ten splits of this dataset
    acc_scores.append(mean(acc_each_run))
    std_scores.append(mean(std_each_run))
    depth_scores.append(mean(depth_each_run))
    depth_std_scores.append(stdev(depth_each_run))
    ave_time_scores.append(mean(time_each_run))
    std_time_scores.append(stdev(time_each_run))

results_df['mean_score'] = acc_scores
results_df['mean_std'] = std_scores
results_df['mean_size'] = depth_scores
results_df['size_std'] = [round(x, 3) for x in depth_std_scores]
results_df['ave_time'] = [round(x, 3) for x in ave_time_scores]
results_df['std_time'] = [round(x, 3) for x in std_time_scores]
results_df = results_df.set_index("database")
results_df

KeyboardInterrupt: 