# Imports

In [254]:
import pandas as pd
import numpy as np

#sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer

# Utils
import random
import os
from tqdm import tqdm_notebook as tqdm


# pretty table
from prettytable import ALL as ALL
from prettytable import PrettyTable

SEED = 42

In [166]:
def set_seed():
    """Re-seed Python's ``random`` module and NumPy's global RNG with ``SEED``."""
    for seeder in (random.seed, np.random.seed):
        seeder(SEED)


set_seed()

# Cascading Classifier

A basic implementation of the cascading meta-learner, built from simple decision trees of increasing depth.
For each sample, if the current tree's confidence is not above 95%, the sample is passed on to the next tree.

In [167]:
class CascadingTreeClassifier:
    """Cascade of decision trees of increasing depth.

    Tree number ``k`` (1-based) is a ``DecisionTreeClassifier`` with
    ``max_depth=k``. At prediction time every sample walks the cascade from
    the shallowest tree and stops at the first tree whose highest class
    probability is strictly greater than ``threshold``; if no tree is that
    confident, the last tree's answer is used.

    Attributes
    ----------
    depth : int
        Number of trees in the cascade.
    threshold : float
        Confidence a tree must exceed to answer a sample.
    trees : list
        The cascade, ordered from shallowest to deepest tree.
    tree_counter : dict
        ``level -> number of samples that tree was asked about``.
    descison_counter : dict
        ``level -> number of samples that tree gave the final answer for``.
        Both counters are cumulative over every prediction call.
    """

    def __init__(self, cascading_depth, threshold):
        self.depth = cascading_depth
        self.threshold = threshold
        self.trees = self.generate_trees(cascading_depth)
        self.tree_counter = {i + 1: 0 for i in range(cascading_depth)}
        self.descison_counter = {i + 1: 0 for i in range(cascading_depth)}

    @staticmethod
    def generate_trees(cascading_depth):
        """Return trees with ``max_depth`` 1..cascading_depth, seeded ``SEED + depth``."""
        return [
            DecisionTreeClassifier(max_depth=depth, random_state=SEED + depth)
            for depth in range(1, cascading_depth + 1)
        ]

    def fit(self, X, y):
        """Fit every tree of the cascade on the full ``(X, y)`` and return ``self``."""
        for tree in self.trees:
            tree.fit(X, y)
        return self

    def predict_instance(self, x):
        """Return the ``(1, n_classes)`` probabilities for the single sample ``x``.

        Raises
        ------
        ValueError
            If the cascade holds no trees.
        """
        last_level = len(self.trees)
        for level, tree in enumerate(self.trees, start=1):
            confidence_arr = tree.predict_proba(x.reshape(1, -1))
            self.tree_counter[level] += 1
            # The last tree always answers, confident or not.
            if confidence_arr.max() > self.threshold or level == last_level:
                self.descison_counter[level] += 1
                return confidence_arr

        raise ValueError("cannot predict with an empty cascade")

    def predict_proba(self, X):
        """Return an ``(n_samples, n_classes)`` array of class probabilities."""
        predictions = np.apply_along_axis(func1d=self.predict_instance, axis=1, arr=X)
        # predict_instance yields (1, n_classes) per row; drop that middle axis.
        return np.squeeze(predictions, axis=1)

    def predict(self, X):
        """Return the predicted class label of every row of ``X``.

        The winning probability column is translated through the first tree's
        ``classes_`` so that the original labels are returned. For labels that
        are already ``0..n_classes-1`` this equals the column position.
        """
        proba_pred = self.predict_proba(X)
        winners = np.argmax(proba_pred, axis=1)
        classes = getattr(self.trees[0], 'classes_', None)
        if classes is None:
            return winners
        return np.asarray(classes)[winners]

    


`generate_batch_indices` generates overlapping slices (batches of indices) of a given index range.

In [168]:
def generate_batch_indices(X_len, batch_num, op, y=None):
    """Split ``range(X_len)`` into ``batch_num`` equally sized, overlapping batches.

    Every batch holds ``ceil(X_len / (batch_num - (batch_num - 1) * op))``
    indices, so a larger ``op`` produces larger batches that overlap more.

    Parameters
    ----------
    X_len : int
        Size of the index range to split (number of rows or of features).
    batch_num : int
        Number of batches to produce.
    op : float
        Overlap fraction used in the batch-size formula above.
    y : array-like, optional
        When omitted (feature splitting) batches are drawn with ``random``,
        going through all indices before any of them is reused. When given
        (data splitting) each batch is one stratified fold of ``y`` topped up
        with randomly chosen indices from the other folds.

    Returns
    -------
    list
        ``batch_num`` batches: lists of ints for feature splitting, NumPy
        integer arrays for data splitting.
    """
    batch_size = int(np.ceil(X_len / (batch_num - (batch_num - 1) * op)))
    batches = []

    if y is None:  # features splitting
        idxs = set()  # indices not used yet in the current pass over the range
        for _ in range(batch_num):
            b = []
            b_size_left = batch_size

            if len(idxs) < batch_size:
                # Use up the leftovers of this pass before starting a new one.
                b = sorted(idxs)
                b_size_left -= len(b)
                idxs = set()

            if not idxs:
                idxs = set(range(X_len))

            # random.sample() rejects sets on Python >= 3.11, so sample from a
            # sorted list; excluding `b` keeps a batch free of duplicates.
            candidates = sorted(idxs.difference(b))
            b += random.sample(candidates, b_size_left)
            batches.append(b)

            idxs = idxs.difference(b)

        return batches

    # data splitting
    skf = StratifiedKFold(n_splits=batch_num, shuffle=True, random_state=SEED)
    X = np.arange(X_len)
    for others_indices, batch_indices in skf.split(X, y):
        # A fold can already be as large as the batch; never ask for a negative draw.
        num_to_choose = max(batch_size - len(batch_indices), 0)
        extra_indices = np.random.choice(others_indices, num_to_choose, replace=False)
        batches.append(np.concatenate([batch_indices, extra_indices]))

    return batches

#  Changes to the basic implementation

We suggest 3 changes to the basic implementation:
 - AdvancedCascadingTreeClassifier: each tree is trained on a different slice of the data, with overlap between the slices
 - FeatureCascadingTreeClassifier: each tree is trained on a different slice of the features, with overlap between the slices
 - CombineCascadingTreeClassifier: each tree is trained on both a different slice of the data and a different slice of the features, with overlap between the slices

In [169]:
class AdvancedCascadingTreeClassifier(CascadingTreeClassifier):
    """Cascade in which every tree is fitted on its own overlapping batch of rows."""

    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage

    def fit(self, X, y):
        """Fit tree ``k`` on the ``k``-th stratified row batch and return ``self``."""
        row_batches = generate_batch_indices(
            X.shape[0], self.depth, self.overlapping_percentage, y
        )

        for position in range(len(self.trees)):
            rows = row_batches[position]
            self.trees[position].fit(X[rows, :], y[rows])

        return self

In [170]:
class FeatureCascadingTreeClassifier(CascadingTreeClassifier):
    """Cascade in which every tree sees its own overlapping subset of the features."""

    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage

    def fit(self, X, y):
        """Draw one feature batch per tree, fit each tree on its columns, return ``self``."""
        self.feature_indices = generate_batch_indices(
            X.shape[1], self.depth, self.overlapping_percentage
        )

        for position in range(len(self.trees)):
            columns = self.feature_indices[position]
            self.trees[position].fit(X[:, columns], y)

        return self

    def predict_instance(self, x):
        """Return the ``(1, n_classes)`` probabilities for the single sample ``x``.

        Each tree is shown only the features it was fitted on. The first tree
        whose top probability exceeds ``threshold`` answers; otherwise the
        last tree does.
        """
        for level, tree in enumerate(self.trees, start=1):
            columns = self.feature_indices[level - 1]
            sample = x[columns].reshape(1, -1)
            confidence_arr = tree.predict_proba(sample)
            self.tree_counter[level] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[level] += 1
                return confidence_arr

        # Nobody was confident enough: credit the last tree with the decision.
        self.descison_counter[level] += 1
        return confidence_arr

In [171]:
class CombineCascadingTreeClassifier(CascadingTreeClassifier):
    """Cascade in which every tree gets its own batch of rows and its own batch of features."""

    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage

    def fit(self, X, y):
        """Fit tree ``k`` on the ``k``-th row batch restricted to the ``k``-th feature batch."""
        # Feature batches are drawn first, then the stratified row batches.
        self.feature_indices = generate_batch_indices(
            X.shape[1], self.depth, self.overlapping_percentage
        )
        row_batches = generate_batch_indices(
            X.shape[0], self.depth, self.overlapping_percentage, y
        )

        for position in range(len(self.trees)):
            rows = row_batches[position]
            columns = self.feature_indices[position]
            subset = X.take(rows, axis=0).take(columns, axis=1)
            self.trees[position].fit(subset, y[rows])

        return self

    def predict_instance(self, x):
        """Return the ``(1, n_classes)`` probabilities for the single sample ``x``.

        Each tree is shown only the features it was fitted on. The first tree
        whose top probability exceeds ``threshold`` answers; otherwise the
        last tree does.
        """
        for level, tree in enumerate(self.trees, start=1):
            columns = self.feature_indices[level - 1]
            sample = x[columns].reshape(1, -1)
            confidence_arr = tree.predict_proba(sample)
            self.tree_counter[level] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[level] += 1
                return confidence_arr

        # Nobody was confident enough: credit the last tree with the decision.
        self.descison_counter[level] += 1
        return confidence_arr

# Evaluate

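The `evaluate` function extracts X and y from a dataset, then trains and evaluates each of the classifiers on it.
For each classifier it returns the train/test accuracy and log loss of the configuration that achieved the minimal test log loss.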

In [252]:
def evaluate(extract_X_y):
    """Evaluate the four cascading classifiers on one dataset.

    The data is split 80/20 (stratified, seeded with ``SEED``). The vanilla
    cascade is evaluated once. Each of the three overlapping variants is
    evaluated for every overlap in ``np.arange(0.85, 0.99, 0.01)``, and the
    scores of the overlap with the lowest test log loss are kept.

    Parameters
    ----------
    extract_X_y : callable
        Zero-argument function returning ``(X, y)``.

    Returns
    -------
    dict
        ``'<name>_test_acc'``, ``'<name>_test_loss'``, ``'<name>_train_acc'``
        and ``'<name>_train_loss'`` for ``<name>`` in ``vanila``, ``advanced``,
        ``features`` and ``combined``.
    """
    cascading_depth, threshold = 15, 0.95

    # prepare train/test
    X, y = extract_X_y()
    y = LabelEncoder().fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED, stratify=y
    )

    def score(prefix, cdt):
        # Fit/evaluate one classifier and label its four scores with `prefix`.
        test_acc, test_loss, train_acc, train_loss = evaluate_by_cdt(
            cdt, X_train, X_test, y_train, y_test
        )
        return {
            f'{prefix}_test_acc': test_acc,
            f'{prefix}_test_loss': test_loss,
            f'{prefix}_train_acc': train_acc,
            f'{prefix}_train_loss': train_loss,
        }

    eval_results = {}

    # vanila cdt evaluate
    set_seed()
    eval_results.update(score('vanila', CascadingTreeClassifier(cascading_depth, threshold)))

    overlapping_variants = (
        ('advanced', AdvancedCascadingTreeClassifier),   # overlapping data
        ('features', FeatureCascadingTreeClassifier),    # overlapping features
        ('combined', CombineCascadingTreeClassifier),    # overlapping data + features
    )
    for prefix, classifier_cls in overlapping_variants:
        min_test_loss = np.inf
        for op in np.arange(0.85, 0.99, 0.01):
            set_seed()
            # Every variant is built with the swept overlap `op`; previously the
            # feature/combined sweeps ignored it and re-ran one fixed overlap.
            scores = score(prefix, classifier_cls(cascading_depth, threshold, op))
            if scores[f'{prefix}_test_loss'] < min_test_loss:
                eval_results.update(scores)
                min_test_loss = scores[f'{prefix}_test_loss']

    return eval_results
    
    
    
    

In [173]:
def evaluate_by_cdt(cdt, X_train, X_test, y_train, y_test):
    """Fit ``cdt`` on the train split and score it on both splits.

    Returns
    -------
    tuple
        ``(test_acc, test_loss, train_acc, train_loss)`` where the accuracies
        come from ``accuracy_score`` and the losses from ``log_loss``.
    """
    cdt.fit(X_train, y_train)

    # Train split first, then test split; probabilities before labels.
    train_probs = cdt.predict_proba(X_train)
    train_preds = cdt.predict(X_train)
    test_probs = cdt.predict_proba(X_test)
    test_preds = cdt.predict(X_test)

    test_acc = accuracy_score(y_test, test_preds)
    test_loss = log_loss(y_test, test_probs)
    train_acc = accuracy_score(y_train, train_preds)
    train_loss = log_loss(y_train, train_probs)

    return test_acc, test_loss, train_acc, train_loss
    

## Datasets Functions

The various datasets on which we checked the 4 classifiers. Each function loads its dataset and pre-processes it.

In [174]:
def fetal_health():
    if not os.path.exists(r'./datasets/fetal_health.csv'):
        !kaggle datasets download -d andrewmvd/fetal-health-classification  -f 'fetal_health.csv' -p './datasets/'
    df_fetal = pd.read_csv(r'./datasets/fetal_health.csv')
    X, y = df_fetal.iloc[:,:-1].values, df_fetal['fetal_health'].astype(int).values
    
    return X,y

In [175]:
def Frogs_MFCCs(): # 
    if not os.path.exists(r'./datasets/Frogs_MFCCs.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00406/Anuran%20Calls%20(MFCCs).zip' -P './datasets/'
        !unzip -q './datasets/Anuran Calls (MFCCs).zip' 'Frogs_MFCCs.csv' -d './datasets/' && rm './datasets/Anuran Calls (MFCCs).zip'

    df_frogs = pd.read_csv(r'./datasets/Frogs_MFCCs.csv')
    X = df_frogs.iloc[:, :22].values
    y = df_frogs['Genus'].values
    
    return X,y


In [176]:
def avila(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00459/
    if not os.path.exists(r'./datasets/avila-tr.txt'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip' -P './datasets/'
        !unzip -q './datasets/avila.zip' 'avila/avila-tr.txt' -d './datasets/' && rm './datasets/avila.zip'
        !mv './datasets/avila/avila-tr.txt' './datasets/avila-tr.txt' && rmdir './datasets/avila'

    col_names = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'Class']
    df_avila = pd.read_csv(r'./datasets/avila-tr.txt', delimiter = ",", names = col_names)
    df_avila = df_avila[df_avila['Class'] != 'B']
    X, y = df_avila.iloc[:,:-1].values, df_avila['Class'].values
    
    return X,y


In [177]:
def log2(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00542/
    
    if not os.path.exists(r'./datasets/log2.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00542/log2.csv' -P './datasets/'


    df_log2 = pd.read_csv(r'./datasets/log2.csv')
    cols_order = list(df_log2.columns)
    cols_order[-1], cols_order[cols_order.index('Action')] = cols_order[cols_order.index('Action')], cols_order[-1]
    df_log2 = df_log2[cols_order]
    X, y = df_log2.iloc[:,:-1].values, df_log2['Action'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [178]:
def wine_quality_red(): # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
    if not os.path.exists(r'./datasets/winequality-red.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv' -P './datasets/'


    df_wine_quality_red = pd.read_csv(r'./datasets/winequality-red.csv', delimiter=';')
    X, y = df_wine_quality_red.iloc[:,:-1].values, df_wine_quality_red['quality'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [187]:
def wine_quality_white(): # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
    if not os.path.exists(r'./datasets/winequality-white.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv' -P './datasets/'


    df_wine_quality_white = pd.read_csv(r'./datasets/winequality-white.csv', delimiter=';')
    df_wine_quality_white = df_wine_quality_white[df_wine_quality_white['quality'] != 9]
    X, y = df_wine_quality_white.iloc[:,:-1].values, df_wine_quality_white['quality'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [180]:
def CTG(): # https://archive.ics.uci.edu/ml/datasets/cardiotocography
    """Load the cardiotocography (CTG) workbook.

    Returns ``(X, y)``: min-max scaled features (all columns but the last) and
    the ``CLASS`` column as labels.
    """
    # NOTE(review): the existence check and the download both use 'CTG.xls',
    # but the file read below is 'CTG.xlsx'. On a fresh checkout the download
    # does not create the file that is read. Presumably the .xlsx is a manually
    # prepared copy of the downloaded workbook -- TODO confirm and make the
    # paths consistent.
    if not os.path.exists(r'./datasets/CTG.xls'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00193/CTG.xls' -P './datasets/'


    df_ctg = pd.read_excel(r'./datasets/CTG.xlsx')
    X, y = df_ctg.iloc[:,:-1].values, df_ctg['CLASS'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y
    

In [181]:
def Dry_beans(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00602/
    if not os.path.exists(r'./datasets/Dry_Bean_Dataset.xlsx'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip' -P './datasets/'
        !unzip -q './datasets/DryBeanDataset.zip' 'DryBeanDataset/Dry_Bean_Dataset.xlsx' -d './datasets/' && rm './datasets/DryBeanDataset.zip'
        !mv './datasets/DryBeanDataset/Dry_Bean_Dataset.xlsx' './datasets/Dry_Bean_Dataset.xlsx' && rmdir './datasets/DryBeanDataset'

    df_dry_beans = pd.read_excel(r'./datasets/Dry_Bean_Dataset.xlsx')
    X, y = df_dry_beans.iloc[:,:-1].values, df_dry_beans['Class'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y
    

In [None]:

# NOTE(review): this cell defines Dry_beans a second time with the same body as
# the definition in the previous cell; whichever cell runs last silently
# shadows the other. One of the two definitions should be removed.
def Dry_beans(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00602/
    """Load the Dry Bean workbook, downloading and unpacking it when absent.

    Returns ``(X, y)``: min-max scaled features (all columns but the last) and
    the ``Class`` column as labels.
    """
    if not os.path.exists(r'./datasets/Dry_Bean_Dataset.xlsx'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip' -P './datasets/'
        !unzip -q './datasets/DryBeanDataset.zip' 'DryBeanDataset/Dry_Bean_Dataset.xlsx' -d './datasets/' && rm './datasets/DryBeanDataset.zip'
        !mv './datasets/DryBeanDataset/Dry_Bean_Dataset.xlsx' './datasets/Dry_Bean_Dataset.xlsx' && rmdir './datasets/DryBeanDataset'

    df_dry_beans = pd.read_excel(r'./datasets/Dry_Bean_Dataset.xlsx')
    X, y = df_dry_beans.iloc[:,:-1].values, df_dry_beans['Class'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y

In [215]:
def Data_Cortex_Nuclear(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00342/
    if not os.path.exists(r'./datasets/Data_Cortex_Nuclear.xls'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls' -P './datasets/'

    df_nuclear = pd.read_excel(r'./datasets/Data_Cortex_Nuclear.xls')
    df_nuclear.drop(['MouseID', 'BAD_N', 'BCL2_N', 'pCFOS_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N', 'ELK_N', 'MEK_N', 'Bcatenin_N', 'Genotype', 'Treatment', 'Behavior'], axis=1, inplace=True)
    df_nuclear.drop([987, 988, 989], axis=0, inplace=True)

    X, y = df_nuclear.iloc[:,:-1].values, df_nuclear['class'].values
    X = MinMaxScaler().fit_transform(X)
    
    return X,y
    

In [229]:

def turkiye_student(): #h ttps://archive.ics.uci.edu/ml/machine-learning-databases/00262/
    if not os.path.exists(r'./datasets/turkiye-student-evaluation_generic.csv'):
        !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00262/turkiye-student-evaluation_generic.csv' -P './datasets/'

    df_students = pd.read_csv(r'./datasets/turkiye-student-evaluation_generic.csv')
    cols_order = list(df_students.columns)
    cols_order[-1], cols_order[cols_order.index('class')] = cols_order[cols_order.index('class')], cols_order[-1]
    df_students = df_students[cols_order]
#     df_students.drop(['instr'], axis=1, inplace=True)
    X, y = df_students.iloc[:,:-1].values, df_students['class'].values
#     X = MinMaxScaler().fit_transform(X)
        
    return X,y


# Compare Classifiers

We run every classifier on all the datasets, compute the loss and accuracy on both the train and test splits, and present the results in a table.

In [None]:
datasets = [fetal_health, Frogs_MFCCs, avila, log2, wine_quality_red, wine_quality_white, CTG, Dry_beans, Data_Cortex_Nuclear, turkiye_student]
# datasets = [wine_quality_white]
result_table = PrettyTable(hrules=ALL)
# Leaves_sha_64

result_table.field_names = ['Dataset',
                            'Vanila_Acc', 'Vanila_Loss',
                            'Advanced_Acc' , 'Advanced_loss',
                            'Features_Acc' , 'Features_loss',
                            'Combined_Acc' , 'Combined_loss']

# ANSI escape sequences used to highlight the best test scores of each dataset.
BOLD = '\033[1m'
BgCYAN = '\033[46m'
BgORANGE = '\033[43m'
END = '\033[0m'

# Key prefixes of the evaluate() results, in table-column order.
CLASSIFIER_PREFIXES = ('vanila', 'advanced', 'features', 'combined')

for dataset in tqdm(datasets):
    result = evaluate(dataset)
    result_table.add_row([dataset.__name__, '', '', '', '', '', '', '', ''])

    # Format into a separate dict so the raw numbers stay available for ranking.
    formatted = {}
    for key, value in result.items():
        if key.endswith('loss'):
            formatted[key] = f'{value:.4f}'
        else:
            formatted[key] = f'{value * 100:.2f}%'

    train_row = ['train']
    for prefix in CLASSIFIER_PREFIXES:
        train_row += [formatted[f'{prefix}_train_acc'], formatted[f'{prefix}_train_loss']]
    result_table.add_row(train_row)

    acc_scores = [formatted[f'{prefix}_test_acc'] for prefix in CLASSIFIER_PREFIXES]
    loss_scores = [formatted[f'{prefix}_test_loss'] for prefix in CLASSIFIER_PREFIXES]

    # Rank on the numeric scores: the formatted strings compare
    # lexicographically (e.g. '100.00%' sorts below '99.50%').
    best_acc = int(np.argmax([result[f'{prefix}_test_acc'] for prefix in CLASSIFIER_PREFIXES]))
    best_loss = int(np.argmin([result[f'{prefix}_test_loss'] for prefix in CLASSIFIER_PREFIXES]))
    acc_scores[best_acc] = BgCYAN + BOLD + acc_scores[best_acc] + END
    loss_scores[best_loss] = BgORANGE + BOLD + loss_scores[best_loss] + END

    test_row = ['test']
    for acc_cell, loss_cell in zip(acc_scores, loss_scores):
        test_row += [acc_cell, loss_cell]
    result_table.add_row(test_row)

print(result_table)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

## Conclusions

As we can see, most of the upgraded classifiers achieved better performance than the vanilla version of the classifier.
