# PIP install

In [21]:
!pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


# Imports

In [1]:
import pandas as pd
import numpy as np

#sklearn
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

In [7]:

# Download only `fetal_health.csv` from the Kaggle dataset into ./datasets/.
# NOTE(review): presumably requires the Kaggle CLI to be installed and configured — confirm.
!kaggle datasets download -d andrewmvd/fetal-health-classification  -f 'fetal_health.csv' -p ./datasets/

fetal_health.csv: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
# Load the fetal-health table from the local datasets folder.
df_fetal = pd.read_csv(r'./datasets/fetal_health.csv')
# Bare last expression so the notebook renders the frame as the cell output.
df_fetal

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0


# Cascading Classifier

In [3]:
class CascadingTreeClassifier:
    """Cascade of decision trees of increasing depth with confident early exit.

    The cascade holds ``cascading_depth`` trees; the k-th tree (1-based) is a
    ``DecisionTreeClassifier(max_depth=k)``. At prediction time a sample is
    shown to the trees in order, and the first tree whose highest class
    probability is strictly greater than ``threshold`` decides. If no tree is
    that confident, the deepest tree decides.

    Parameters
    ----------
    cascading_depth : int
        Number of trees in the cascade; must be at least 1.
    threshold : float
        Value a tree's top class probability must exceed to stop the cascade.
    random_state : int, RandomState instance or None, default=None
        Forwarded to every ``DecisionTreeClassifier`` so runs can be made
        reproducible.

    Attributes
    ----------
    trees : list
        The trees, shallowest first.
    tree_counter : dict
        ``{level: n}`` - how many samples were evaluated by the tree at
        ``level`` (1-based). Accumulates over every prediction call.
    descison_counter : dict
        ``{level: n}`` - how many samples were decided by the tree at
        ``level``. The misspelled name is kept for backward compatibility;
        ``decision_counter`` is a read alias. Accumulates like ``tree_counter``.
    classes_ : ndarray
        Sorted unique labels seen by ``fit``.
    """

    def __init__(self, cascading_depth, threshold, random_state=None):
        if cascading_depth < 1:
            raise ValueError(f'cascading_depth must be >= 1, got {cascading_depth}')
        self.depth = cascading_depth
        self.threshold = threshold
        self.random_state = random_state
        self.trees = self.generate_trees(cascading_depth, random_state)
        self.tree_counter = {level: 0 for level in range(1, cascading_depth + 1)}
        self.descison_counter = {level: 0 for level in range(1, cascading_depth + 1)}

    @property
    def decision_counter(self):
        """Correctly spelled alias of ``descison_counter`` (same dict object)."""
        return self.descison_counter

    @staticmethod
    def generate_trees(cascading_depth, random_state=None):
        """Return unfitted trees with ``max_depth`` 1, 2, ..., ``cascading_depth``."""
        return [
            DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
            for max_depth in range(1, cascading_depth + 1)
        ]

    def fit(self, X, y):
        """Fit every tree of the cascade on the full ``(X, y)`` and return ``self``."""
        # Remember the label values so `predict` can return labels, not column indices.
        self.classes_ = np.unique(y)
        for tree in self.trees:
            tree.fit(X, y)
        return self

    def predict_instance(self, x):
        """Run one sample through the cascade.

        Parameters
        ----------
        x : array-like of shape (n_features,)
            A single sample.

        Returns
        -------
        ndarray of shape (1, n_classes)
            Class probabilities from the first tree that is confident enough,
            or from the deepest tree when none is.

        Notes
        -----
        Updates ``tree_counter`` for every tree consulted and
        ``descison_counter`` for the tree that produced the answer.
        """
        if not self.trees:
            raise ValueError('the cascade contains no trees')

        sample = np.asarray(x).reshape(1, -1)
        deepest_level = len(self.trees)
        for level, tree in enumerate(self.trees, start=1):
            confidence_arr = tree.predict_proba(sample)
            self.tree_counter[level] += 1
            # The deepest tree always decides, so the loop never falls through.
            if level == deepest_level or confidence_arr.max() > self.threshold:
                self.descison_counter[level] += 1
                return confidence_arr

    def predict_proba(self, X):
        """Return class probabilities, one row per sample of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array.
        """
        samples = np.asarray(X)
        if samples.ndim != 2 or samples.shape[0] == 0:
            raise ValueError(
                'X must be a non-empty 2-D array of shape (n_samples, n_features), '
                f'got shape {samples.shape}'
            )
        return np.vstack([self.predict_instance(row) for row in samples])

    def predict(self, X):
        """Return the most probable class label for every sample of ``X``.

        Labels are taken from ``classes_``; if ``fit`` of this class has not
        set it, the argmax column index is returned instead.
        Note that this calls ``predict_proba`` and therefore also advances the
        usage counters.
        """
        best_columns = np.argmax(self.predict_proba(X), axis=1)
        classes = getattr(self, 'classes_', None)
        if classes is None:
            return best_columns
        return classes[best_columns]

    


# Preprocessing, training and evaluation

In [4]:
# 15-level cascade; a tree decides once its top class probability exceeds 0.95.
cdt = CascadingTreeClassifier(cascading_depth=15, threshold=0.95)

In [7]:
# Features: every column except the last one. Target: the `fetal_health` column,
# cast to int and re-encoded as consecutive integers starting at 0.
X = df_fetal.iloc[:, :-1].values
y = LabelEncoder().fit_transform(df_fetal['fetal_health'].astype(int).values)

# Hold out 20% of the rows; stratify on the label and fix the seed of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [64]:
# Train the cascade on the training split.
cdt.fit(X=X_train, y=y_train)

<__main__.CascadingTreeClassifier at 0x7f42c12c3100>

In [65]:
# Probabilities feed the log-loss, hard predictions feed the accuracy.
train_probs, train_preds = cdt.predict_proba(X_train), cdt.predict(X_train)
test_probs, test_preds = cdt.predict_proba(X_test), cdt.predict(X_test)

# Held-out metrics first, then the training metrics for comparison.
print(
    f'acc test {accuracy_score(y_test, test_preds):.4f}',
    f'log loss test: {log_loss(y_test, test_probs):.4f}',
    f'acc train {accuracy_score(y_train, train_preds):.4f}',
    f'log loss train: {log_loss(y_train, train_probs):.4f}',
    sep='\n',
)

acc test 0.9131
log loss test: 1.6521
acc train 0.9665
log loss train: 0.1477


In [14]:
class AdvancedCascadingTreeClassifier(CascadingTreeClassifier):
    """Cascade whose trees are each trained on a different, overlapping chunk.

    The training rows are shuffled and cut into ``cascading_depth`` equally
    sized windows; consecutive windows share roughly
    ``overlapping_percentage`` of their rows. The i-th tree of the cascade is
    fitted on the i-th window only.

    Parameters
    ----------
    cascading_depth : int
        Number of trees (and therefore chunks); must be at least 1.
    threshold : float
        Confidence a tree's top class probability must exceed to stop early.
    overlapping_percentage : float
        Fraction in ``[0, 1]`` of a chunk shared with the next chunk.
    random_state : int or None, default=None
        Seed for the shuffle and for the trees. With ``None`` the shuffle uses
        NumPy's global random state and the trees keep their own setting.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels of the whole training set, set by ``fit``.
    """

    def __init__(self, cascading_depth, threshold, overlapping_percentage, random_state=None):
        if cascading_depth < 1:
            raise ValueError(f'cascading_depth must be >= 1, got {cascading_depth}')
        if not 0 <= overlapping_percentage <= 1:
            raise ValueError(
                f'overlapping_percentage must be within [0, 1], got {overlapping_percentage}'
            )
        super().__init__(cascading_depth, threshold)
        self.overlapping_percentage = overlapping_percentage
        self.random_state = random_state
        if random_state is not None:
            for tree in self.trees:
                tree.set_params(random_state=random_state)

    def _chunk_layout(self, n_samples):
        """Return ``(chunk_size, starts)`` describing ``self.depth`` windows.

        With ``k`` chunks of size ``c`` overlapping by ``p * c`` the covered
        length is ``c * (k - (k - 1) * p)``; solving for ``c`` with that length
        equal to ``n_samples`` gives the chunk size. Start offsets are spread
        evenly from 0 to ``n_samples - c`` using integer arithmetic, so there
        are always exactly ``k`` chunks, the last one ends on the last row and
        no row is left out.
        """
        n_chunks = self.depth
        overlap = self.overlapping_percentage
        chunk_size = int(np.ceil(n_samples / (n_chunks - (n_chunks - 1) * overlap)))
        chunk_size = max(1, min(n_samples, chunk_size))
        if n_chunks == 1:
            return chunk_size, [0]
        slack = n_samples - chunk_size
        starts = [(i * slack) // (n_chunks - 1) for i in range(n_chunks)]
        return chunk_size, starts

    def fit(self, X, y):
        """Shuffle ``(X, y)``, split it into overlapping chunks and fit one tree per chunk.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the data is empty or ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError('cannot fit on an empty dataset')
        if y.shape[0] != n_samples:
            raise ValueError(
                f'X and y have inconsistent numbers of samples: {n_samples} != {y.shape[0]}'
            )

        # Shuffle through an index permutation so X and y keep their own dtypes
        # (stacking them into one array would cast the labels to float).
        if self.random_state is None:
            order = np.random.permutation(n_samples)
        else:
            order = np.random.RandomState(self.random_state).permutation(n_samples)
        X, y = X[order], y[order]

        self.classes_ = np.unique(y)

        chunk_size, starts = self._chunk_layout(n_samples)
        for tree, start in zip(self.trees, starts):
            stop = start + chunk_size
            tree.fit(X[start:stop], y[start:stop])

        return self

    def _align_to_known_classes(self, tree, proba):
        """Expand ``proba`` to one column per entry of ``classes_``.

        A tree trained on a chunk that lacks some class reports fewer columns;
        the missing classes get probability 0 so that every row has the same
        width and column meaning.
        """
        if len(tree.classes_) == len(self.classes_):
            return proba
        aligned = np.zeros((proba.shape[0], len(self.classes_)), dtype=proba.dtype)
        # Both arrays are sorted and tree.classes_ is a subset of classes_.
        aligned[:, np.searchsorted(self.classes_, tree.classes_)] = proba
        return aligned

    def predict_instance(self, x):
        """Run one sample through the cascade and return a ``(1, n_classes)`` row.

        Same early-exit rule as the parent class (first tree whose top
        probability exceeds ``threshold``, otherwise the deepest tree), but
        every tree's output is first aligned to the full set of classes.
        Updates ``tree_counter`` and ``descison_counter``.
        """
        sample = np.asarray(x).reshape(1, -1)
        deepest_level = len(self.trees)
        for level, tree in enumerate(self.trees, start=1):
            confidence_arr = self._align_to_known_classes(tree, tree.predict_proba(sample))
            self.tree_counter[level] += 1
            # The deepest tree always decides, so the loop never falls through.
            if level == deepest_level or confidence_arr.max() > self.threshold:
                self.descison_counter[level] += 1
                return confidence_arr

In [36]:
# Same depth and threshold as the plain cascade; neighbouring chunks overlap by 10%.
cdt_advanced = AdvancedCascadingTreeClassifier(
    cascading_depth=15,
    threshold=0.95,
    overlapping_percentage=0.1,
)

In [37]:
# Train the chunked cascade on the training split.
cdt_advanced.fit(X=X_train, y=y_train)

<__main__.AdvancedCascadingTreeClassifier at 0x7ff167d75130>

In [38]:
# Probabilities feed the log-loss, hard predictions feed the accuracy.
train_probs, train_preds = cdt_advanced.predict_proba(X_train), cdt_advanced.predict(X_train)
test_probs, test_preds = cdt_advanced.predict_proba(X_test), cdt_advanced.predict(X_test)

# Held-out metrics first, then the training metrics for comparison.
print(
    f'acc test {accuracy_score(y_test, test_preds):.4f}',
    f'log loss test: {log_loss(y_test, test_probs):.4f}',
    f'acc train {accuracy_score(y_train, train_preds):.4f}',
    f'log loss train: {log_loss(y_train, train_probs):.4f}',
    sep='\n',
)

acc test 0.8498
log loss test: 2.9457
acc train 0.8635
log loss train: 2.6105
