# Imports

In [119]:
import pandas as pd
import numpy as np

#sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

# Utils
import random

SEED = 42

In [120]:
def set_seed():
    random.seed(SEED)
    np.random.seed(SEED)
set_seed()

# Cascading Classifier

In [249]:
class CascadingTreeClassifier:
    def __init__(self, cascading_depth, threshold):
        self.depth = cascading_depth
        self.threshold = threshold
        self.trees = self.generate_trees(cascading_depth)
        self.tree_counter = {i+1: 0 for i in range(cascading_depth)}
        self.descison_counter = {i+1: 0 for i in range(cascading_depth)}

    @staticmethod
    def generate_trees(cascading_depth):
        trees = [DecisionTreeClassifier(max_depth=i, random_state=SEED+i) for i in range(1, cascading_depth + 1)]
        return trees

    def fit(self, X, y):
        _ = [t.fit(X, y) for t in self.trees]
        return self


    def predict_instance(self, x):
        for i, tree in enumerate(self.trees):
            confidence_arr = tree.predict_proba(x.reshape(1, - 1))
            self.tree_counter[i + 1] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[i+1] += 1
                return confidence_arr
        
        self.descison_counter[i+1] += 1
        return confidence_arr

    def predict_proba(self, X):
        predictions = np.apply_along_axis(func1d=self.predict_instance, axis=1, arr=X)
        predictions = np.squeeze(predictions, axis=1)
        return predictions

    def predict(self, X):
        proba_pred = self.predict_proba(X)
        classes_pred = np.argmax(proba_pred, axis=1)
        return classes_pred

    


In [231]:
def generate_batch_indices(indices_range, batch_num, op):
    batches = []
    idxs = set()
    
    batch_size = np.ceil(indices_range / (batch_num - (batch_num - 1) * op)).astype(int)

    for _ in range(batch_num):
        b = []
        b_size_left = batch_size

        if len(idxs) < batch_size:
            b = list(idxs)
            b_size_left -= len(idxs)
            idxs = set()

        if not idxs:
            idxs = set(range(indices_range))


        b += random.sample(idxs, b_size_left)
        batches.append(b)
        
        idxs = idxs - set(b)
    
    return batches

In [216]:
class AdvancedCascadingTreeClassifier(CascadingTreeClassifier):
    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage

    def fit(self, X, y):        
        batch_indices = generate_batch_indices(X.shape[0], self.depth, self.overlapping_percentage)
        
        for i, tree in enumerate(self.trees):
            tree.fit(X[batch_indices[i],:], y[batch_indices[i]])
            
        return self

In [217]:
class FeatureCascadingTreeClassifier(CascadingTreeClassifier):
    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage
        

    def fit(self, X, y):
        self.feature_indices = generate_batch_indices(X.shape[1], self.depth, self.overlapping_percentage)

        for i, tree in enumerate(self.trees):
            tree.fit(X[:,self.feature_indices[i]], y)
            
        return self
    
    def predict_instance(self, x):
        for i, tree in enumerate(self.trees):
            confidence_arr = tree.predict_proba(x[self.feature_indices[i]].reshape(1, - 1))
            self.tree_counter[i + 1] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[i+1] += 1
                return confidence_arr
        
        self.descison_counter[i+1] += 1
        return confidence_arr

In [218]:
class CombineCascadingTreeClassifier(CascadingTreeClassifier):
    def __init__(self, cascading_depth, threshold, overlapping_percentage):
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage
        

    def fit(self, X, y):
        self.feature_indices = generate_batch_indices(X.shape[1], self.depth, self.overlapping_percentage)
        batch_indices = generate_batch_indices(X.shape[0], self.depth, self.overlapping_percentage)

        for i, tree in enumerate(self.trees):
            tree.fit(X.take(batch_indices[i], axis=0).take(self.feature_indices[i], axis=1), y[batch_indices[i]])
            
        return self
    
    def predict_instance(self, x):
        for i, tree in enumerate(self.trees):
            confidence_arr = tree.predict_proba(x[self.feature_indices[i]].reshape(1, - 1))
            self.tree_counter[i + 1] += 1
            if confidence_arr.max() > self.threshold:
                self.descison_counter[i+1] += 1
                return confidence_arr
        
        self.descison_counter[i+1] += 1
        return confidence_arr

# Dataset 1

In [2]:
!kaggle datasets download -d andrewmvd/fetal-health-classification  -f 'fetal_health.csv' -p ./datasets/

Downloading fetal_health.csv to ./datasets
100%|████████████████████████████████████████| 223k/223k [00:00<00:00, 1.02MB/s]



## Preprocessing

In [123]:
df_fetal = pd.read_csv(r'./datasets/fetal_health.csv')
df_fetal

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0


In [155]:
X, y = df_fetal.iloc[:,:-1].values, df_fetal['fetal_health'].astype(int).values
y =  LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

In [268]:
set_seed()
cdt = CascadingTreeClassifier(15, 0.95)

In [269]:
cdt.fit(X_train, y_train)

<__main__.CascadingTreeClassifier at 0x7f16c9cf52e0>

In [270]:
train_probs = cdt.predict_proba(X_train)
train_preds = cdt.predict(X_train)
test_probs = cdt.predict_proba(X_test)
test_preds = cdt.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9108
log loss test: 1.7332
acc train 0.9665
log loss train: 0.1477


In [253]:
set_seed()
cdt_advanced = AdvancedCascadingTreeClassifier(15, 0.95, 0.9)

In [254]:
cdt_advanced.fit(X_train, y_train)

<__main__.AdvancedCascadingTreeClassifier at 0x7f16c9fb7ca0>

In [255]:
train_probs = cdt_advanced.predict_proba(X_train)
train_preds = cdt_advanced.predict(X_train)
test_probs = cdt_advanced.predict_proba(X_test)
test_preds = cdt_advanced.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9061
log loss test: 1.3895
acc train 0.9288
log loss train: 0.8918


In [265]:
set_seed()
cdt_features = FeatureCascadingTreeClassifier(15, 0.95, 0.95)

In [266]:
cdt_features.fit(X_train, y_train)

<__main__.FeatureCascadingTreeClassifier at 0x7f16c9e259d0>

In [267]:
train_probs = cdt_features.predict_proba(X_train)
train_preds = cdt_features.predict(X_train)
test_probs = cdt_features.predict_proba(X_test)
test_preds = cdt_features.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9178
log loss test: 1.5655
acc train 0.9688
log loss train: 0.1433


In [280]:
set_seed()
cdt_combine = CombineCascadingTreeClassifier(15, 0.95, 0.98)

In [281]:
cdt_combine.fit(X_train, y_train)

<__main__.CombineCascadingTreeClassifier at 0x7f16ca14e4c0>

In [282]:
train_probs = cdt_combine.predict_proba(X_train)
train_preds = cdt_combine.predict(X_train)
test_probs = cdt_combine.predict_proba(X_test)
test_preds = cdt_combine.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9108
log loss test: 0.8598
acc train 0.9418
log loss train: 0.2792


# Dataset 2
https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29

## Preprocessing

In [288]:
df_frogs = pd.read_csv(r'./datasets/Frogs_MFCCs.csv')
df_frogs

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [292]:
df_frogs[['Family', 'Genus', 'Species']].value_counts()

Family           Genus          Species               
Leptodactylidae  Adenomera      AdenomeraHylaedactylus    3478
Hylidae          Hypsiboas      HypsiboasCordobae         1121
Leptodactylidae  Adenomera      AdenomeraAndre             672
Dendrobatidae    Ameerega       Ameeregatrivittata         542
Hylidae          Hypsiboas      HypsiboasCinerascens       472
                 Dendropsophus  HylaMinuta                 310
Leptodactylidae  Leptodactylus  LeptodactylusFuscus        270
Hylidae          Scinax         ScinaxRuber                148
                 Osteocephalus  OsteocephalusOophagus      114
Bufonidae        Rhinella       Rhinellagranulosa           68
dtype: int64

In [299]:
X = df_frogs.iloc[:, :22].values
y = df_frogs['Genus'].values
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

In [301]:
set_seed()
cdt = CascadingTreeClassifier(15, 0.95)

In [302]:
cdt.fit(X_train, y_train)

<__main__.CascadingTreeClassifier at 0x7f16c8219790>

In [303]:
train_probs = cdt.predict_proba(X_train)
train_preds = cdt.predict(X_train)
test_probs = cdt.predict_proba(X_test)
test_preds = cdt.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9243
log loss test: 1.3792
acc train 0.9600
log loss train: 0.2172


In [304]:
set_seed()
cdt_advanced = AdvancedCascadingTreeClassifier(15, 0.95, 0.9)

In [305]:
cdt_advanced.fit(X_train, y_train)

<__main__.AdvancedCascadingTreeClassifier at 0x7f16c7eea9d0>

In [306]:
train_probs = cdt_advanced.predict_proba(X_train)
train_preds = cdt_advanced.predict(X_train)
test_probs = cdt_advanced.predict_proba(X_test)
test_preds = cdt_advanced.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9117
log loss test: 1.9747
acc train 0.9350
log loss train: 1.3029


In [307]:
set_seed()
cdt_features = FeatureCascadingTreeClassifier(15, 0.95, 0.95)

In [308]:
cdt_features.fit(X_train, y_train)

<__main__.FeatureCascadingTreeClassifier at 0x7f16c7eea8e0>

In [309]:
train_probs = cdt_features.predict_proba(X_train)
train_preds = cdt_features.predict(X_train)
test_probs = cdt_features.predict_proba(X_test)
test_preds = cdt_features.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9263
log loss test: 1.1128
acc train 0.9569
log loss train: 0.2252


In [310]:
set_seed()
cdt_combine = CombineCascadingTreeClassifier(15, 0.95, 0.98)

In [311]:
cdt_combine.fit(X_train, y_train)

<__main__.CombineCascadingTreeClassifier at 0x7f16c8002850>

In [312]:
train_probs = cdt_combine.predict_proba(X_train)
train_preds = cdt_combine.predict(X_train)
test_probs = cdt_combine.predict_proba(X_test)
test_preds = cdt_combine.predict(X_test)
print(f'acc test {accuracy_score(y_test, test_preds):.4f}')
print(f'log loss test: {log_loss(y_test, test_probs):.4f}')
print(f'acc train {accuracy_score(y_train, train_preds):.4f}')
print(f'log loss train: {log_loss(y_train, train_probs):.4f}')

acc test 0.9263
log loss test: 1.3124
acc train 0.9501
log loss train: 0.5693
