In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import json
import glob
from numpy import array

In [2]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier 
from sklearn.cluster import KMeans, MiniBatchKMeans, spectral_clustering, Birch
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import Ridge, Lasso, RidgeClassifier, LogisticRegression
from sklearn.svm import SVR, NuSVR, LinearSVR, SVC
from cache_decorator import Cache
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from multiprocessing import cpu_count
from pathlib import Path
from xgboost import XGBRegressor, XGBClassifier
from mord import OrdinalRidge, LogisticAT, LogisticIT

In [50]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA, TruncatedSVD
import skfuzzy as fuzz

class GroupedPCA(BaseEstimator, TransformerMixin):

    def __init__(self, n_components=2, mask=None):  
        # mask should contain selected cols. Suppose it is boolean to avoid code overhead
        self.n_components = n_components
        self.mask = mask

        n_groups = np.max(self.mask)
        self.indices = []
        for i in range(0, n_groups+1):
            indeces_list = []
            for j, k in enumerate(self.mask):
                if k == i:
                    indeces_list.append(j)
            self.indices.append(indeces_list)

    def fit(self, X, y = None):
        self.pca_list = []
        for i, idx in enumerate(self.indices[1:]): 
                pca = PCA(n_components=self.n_components)
                pca.fit(X[:, idx])
                self.pca_list.append(pca)
        return self

    def transform(self, X, y = None):
        transformed_cols = X[:, self.indices[0]]
        for k, idx in enumerate(self.indices[1:]):
            pca_transformed = self.pca_list[k].transform(X[:, idx])
            transformed_cols = np.hstack([transformed_cols, pca_transformed]) 
        return transformed_cols

class GroupedSVD(BaseEstimator, TransformerMixin):

    def __init__(self, n_components=2, mask=None):  
        # mask should contain selected cols. Suppose it is boolean to avoid code overhead
        self.n_components = n_components
        self.mask = mask

        n_groups = np.max(self.mask)
        self.indices = []
        for i in range(0, n_groups+1):
            indeces_list = []
            for j, k in enumerate(self.mask):
                if k == i:
                    indeces_list.append(j)
            self.indices.append(indeces_list)

    def fit(self, X, y = None):
        self.svd_list = []
        for i, idx in enumerate(self.indices[1:]): 
                svd = TruncatedSVD(n_components=self.n_components)
                svd.fit(X[:, idx])
                self.svd_list.append(svd)
        return self

    def transform(self, X, y = None):
        transformed_cols = X[:, self.indices[0]]
        for k, idx in enumerate(self.indices[1:]):
            svd_transformed = self.svd_list[k].transform(X[:, idx])
            transformed_cols = np.hstack([transformed_cols, svd_transformed])
        return transformed_cols
        
class MaskedPCA(BaseEstimator, TransformerMixin):

    def __init__(self, n_components=2, mask=None):  
        # mask should contain selected cols. Suppose it is boolean to avoid code overhead
        self.n_components = n_components
        self.mask = mask

    def fit(self, X, y = None):
        
        self.pca = PCA(n_components=self.n_components)
        mask = self.mask
        mask = self.mask if self.mask is not None else slice(None)

        self.pca.fit(X[:, mask])
        return self

    def transform(self, X, y = None):
        mask = self.mask if self.mask is not None else slice(None)
        pca_transformed = self.pca.transform(X[:, mask])
        if self.mask is not None:
            remaining_cols = X[:, ~mask]
            return np.hstack([remaining_cols, pca_transformed])
        else:
            return pca_transformed
        

class MaskedSVD(BaseEstimator, TransformerMixin):

    def __init__(self, n_components=2, mask=None):  
        # mask should contain selected cols. Suppose it is boolean to avoid code overhead
        self.n_components = n_components
        self.mask = mask

    def fit(self, X, y = None):
        self.svd = TruncatedSVD(n_components=self.n_components)
        mask = self.mask
        mask = self.mask if self.mask is not None else slice(None)
        self.svd.fit(X[:, mask])
        return self

    def transform(self, X, y = None):
        mask = self.mask if self.mask is not None else slice(None)
        svd_transformed = self.svd.transform(X[:, mask])
        if self.mask is not None:
            remaining_cols = X[:, ~mask]
            return np.hstack([remaining_cols, svd_transformed])
        else:
            return svd_transformed
        
class ChainedEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, regressor = GradientBoostingRegressor(), clf = RandomForestClassifier(), binary_threshold = 2):
        self.binary_threshold = binary_threshold
        self.regressor = regressor
        self.clf = clf
        
    def fit(self, X, y = None):
        self.regressor.fit(X, y)
        
        self.sub_estimators = {}
        for i in range(2,7,1):
            class_0_idx = np.where(y == i-1)
            class_1_idx = np.where(y == i)
            class_2_idx = np.where(y == i + 1)
            
            data_0 = X[class_0_idx, :]
            data_0 = data_0.reshape((data_0.shape[1], data_0.shape[2]))
            
            data_1 = X[class_1_idx, :]
            data_1 = data_1.reshape((data_1.shape[1], data_1.shape[2]))
            
            data_2 = X[class_2_idx, :]
            data_2 = data_2.reshape((data_2.shape[1], data_2.shape[2]))
            
            sub_x_1 = np.vstack((data_0, data_1))
            sub_y_1 = [0 for r in range(0, data_0.shape[0])] + [1 for r in range(0, data_1.shape[0])]
            clf_1 = self.clf
            clf_1.fit(sub_x_1, sub_y_1)
            self.sub_estimators[str(i-1)+str(i)] = clf_1
            
            sub_x_2 = np.vstack((data_1, data_2))
            sub_y_2 = [1 for r in range(0, data_1.shape[0])] + [0 for r in range(0, data_2.shape[0])]
            clf_2 = self.clf   
            clf_2.fit(sub_x_2, sub_y_2)
            self.sub_estimators[str(i)+str(i+1)] = clf_2
            
            sub_x_3 = np.vstack((data_0, data_2))
            sub_y_3 = [0 for r in range(0, data_0.shape[0])] + [1 for r in range(0, data_2.shape[0])]
            clf_3 = self.clf
            clf_3.fit(sub_x_3, sub_y_3)
            self.sub_estimators[str(i-1)+str(i+1)] = clf_3

        return self
    
    def predict(self, X):        
        regressor_preds = self.regressor.predict(X)
        clf_predictions = []
        for index, p in enumerate(regressor_preds):
            p = int(np.round(p))
            if p > 7:
                p = 7
            clf_preds = [self.sub_estimators[k].predict([X[index, :]]) for k in self.sub_estimators.keys() if str(p) in k]

            if p == 1:
                p = p if clf_preds[0] == 0 else p+1
            elif p == 7:
                p = p if clf_preds[0] == 0 else p-1
            elif len(clf_preds) > 1:
                if clf_preds[0] == 0 or clf_preds[1] == 0:
                    p = p-1 if self.sub_estimators[str(p-1)+str(p+1)].predict([X[index, :]]) == 0 else p+1
            clf_predictions.append(p)
        return np.array(clf_predictions)
    
class GranularBinaryClassifier(BaseEstimator, TransformerMixin):
    def __init__(self, estimator = RandomForestClassifier(), binary_threshold = 2):
        self.binary_threshold = binary_threshold
        self.estimator = estimator
        
    def recursive_ensamble(self, X, y, binary_threshold, dict_estimators, key = "0"):
        if len(np.unique(y)) == 1:
            dict_estimators[key] = int(np.unique(y)[0])
            return dict_estimators
        else:
            class_0_idx = np.where(y <= binary_threshold)
            class_1_idx = np.where(y > binary_threshold)
  
            data_0 = X[class_0_idx, :]
            data_0 = data_0.reshape((data_0.shape[1], data_0.shape[2]))
            labels_0 = y[class_0_idx]
            
            data_1 = X[class_1_idx, :]
            data_1 = data_1.reshape((data_1.shape[1], data_1.shape[2]))
            labels_1 = y[class_1_idx]
            
            sub_x = np.vstack((data_0, data_1))
            sub_y = [0 for r in range(0, data_0.shape[0])] + [1 for r in range(0, data_1.shape[0])]

            t_0 = (np.min(labels_0)+np.max(labels_0)) // 2
            t_1 = (np.min(labels_1)+np.max(labels_1)) // 2

            dict_estimators[key] = self.estimator
            dict_estimators[key].fit(sub_x, sub_y)

            dict_estimators = self.recursive_ensamble(data_0, labels_0, t_0, dict_estimators, key+"0")
            dict_estimators = self.recursive_ensamble(data_1, labels_1, t_1,  dict_estimators,  key+"1")
        return dict_estimators
            
    def fit(self, X, y = None):
        self.dict_estimators = self.recursive_ensamble(X, y, self.binary_threshold, {})

        return self

    def predict(self, X):        
        predictions = []
        for index in range(0, X.shape[0]):
            key = "0"
            while type(self.dict_estimators[key]) != int:
                p = self.dict_estimators[key].predict([X[index, :]])[0]
                key = key+str(p)
            
            predictions.append(self.dict_estimators[key])
        return np.array(predictions)
    
class FuzzyCMeans(BaseEstimator, TransformerMixin):
    def __init__(self, n_centers = 2, m = 2, error = 0.0005, maxiter=1000):
        self.n_centers = n_centers
        self.error = error
        self.maxiter = maxiter
        self.m = m
            
    def fit(self, X, y = None):
        self.cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X.T, self.n_centers, self.m, error=self.error, maxiter=self.maxiter, init=None)
        return self

    def predict(self, X):        
        u, u0, d, jm, p, fpc = fuzz.cluster.cmeans_predict(X.T, self.cntr, self.m, error=self.error, maxiter=self.maxiter)
        cluster_membership = np.argmax(u, axis=0) + 1
        return cluster_membership
            

In [10]:
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import RootMeanSquaredError, Accuracy
from tensorflow.keras.optimizers import SGD, Adam
from extra_keras_metrics import get_standard_binary_metrics
from tensorflow.keras import regularizers

def baseline_model():
    # create model
    model = Sequential()

    model.add(Dense(8, activation='relu', kernel_initializer='ones'))
    model.add(Dense(1,activation='sigmoid'))
    
    # Compile model
    model.compile(loss="binary_crossentropy",optimizer='nadam', metrics=get_standard_binary_metrics())
    return model

In [51]:
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import RootMeanSquaredError, Accuracy
from tensorflow.keras.optimizers import SGD, Adam
from extra_keras_metrics import get_standard_binary_metrics
from tensorflow.keras import regularizers

def get_dataset(df, dataset_settings):
    X = df.dropna()
    X.Sesso = pd.Categorical(X.Sesso)
    X.Sesso = X.Sesso.cat.codes
    
    no_injury_parts = {
                        "TESTA" : 5,
                        "TORACE" : 5,
                        "ADDOME" : 5,
                        "SCHELETRO" : 5
                      }

    y = X.filter(["Altezza di precipitazione (m)"])
    y = y.rename(columns={"Altezza di precipitazione (m)" : "Altezza"})
    X = X.drop(["Unnamed: 0","Altezza di precipitazione (m)", "Casi", "mgh"], axis = 1)

    if dataset_settings.get("only_BMI") == True:
        X = X.drop(["Peso", "Altezza soggetto"], axis = 1)
    
    if dataset_settings.get("total_dmg") == True:
        def total_dmg(row):
            max_dmg = 0
            row["total_dmg"] = 0

            for k, v in no_injury_parts.items():
                row["total_dmg"] = row["total_dmg"] + row[k]
                max_dmg = max_dmg + v

            row["total_dmg"] = row["total_dmg"] / max_dmg
            
            return row
                
        X = X.apply(lambda row: total_dmg(row), axis = 1)
    
    if dataset_settings.get("percentage") == True:
        for k, v in no_injury_parts.items():
            X[k] = X[k].apply(lambda x: x / v)
        
    if dataset_settings.get("type_of_labels") == "floors":
        heights = y.sort_values('Altezza', ascending = True).Altezza.unique()
        binarize = dataset_settings.get("binarize")
        
        for i, v in enumerate(heights):
            y.Altezza = y.Altezza.replace(v, i+1)
            if binarize != None:
                if i+1 <= binarize:
                    y.Altezza = y.Altezza.replace(i+1, 0)
                else:
                    y.Altezza = y.Altezza.replace(i+1, 1)
        if binarize != None:    
            print(f"RATIO low floors vs high floors: {len(y.loc[y['Altezza'] == 0])/len(y)} // {len(y.loc[y['Altezza'] == 1])/len(y)}") 
    
    dataset_variants = dataset_settings.get('dataset_variants')
    dataset_list = []
    for dataset_variant in dataset_variants:
        if dataset_variant == "complete":
            X_ = X
        elif dataset_variant == "only_binary":
            if dataset_settings.get("total_dmg") == True:
                X_ = X.drop(["TESTA", "TORACE", "ADDOME", "SCHELETRO", "total_dmg"], axis = 1)
            else:
                X_ = X.drop(["TESTA", "TORACE", "ADDOME", "SCHELETRO"], axis = 1)

        elif dataset_variant == "only_totals":
            if dataset_settings.get("total_dmg") == True:
                X_ = X.filter(["Sesso", "Età", "Altezza soggetto", "Peso", "BMI", "TESTA", "TORACE", "ADDOME", "SCHELETRO", "total_dmg"])
            else:
                X_ = X.filter(["Sesso", "Età", "Altezza soggetto", "Peso", "BMI", "TESTA", "TORACE", "ADDOME", "SCHELETRO"])   
        else:
            raise Exception(f"{dataset_variant} is not an available variant of the dataset")
                       
        dataset_list.append({"dataset_variant": dataset_variant, "X": X_, "y" : y})

    return dataset_list


def baseline_model():
    # create model
    model = Sequential()

    model.add(Dense(8, activation='relu', kernel_initializer='ones'))
    model.add(Dense(1,activation='sigmoid'))
    
    # Compile model
    model.compile(loss="binary_crossentropy",optimizer='nadam', metrics=get_standard_binary_metrics())
    return model

def eval_params(params, X, mask):
    parsed_params = []
    for spaces in params:
        parsed_spaces = {}
        for k,v in spaces.items():
            parsed_spaces[k] = eval(v)
        parsed_params.append(parsed_spaces)
    return parsed_params



In [54]:
cpu_1 = 7
cpu_2 = 7
@Cache(
    cache_path="results/{experiment_name}.json"
)
def experiment(path, experiment_name, experiment_setup):
    path_str = "data/victims_of_fall_V1.csv"
    df = pd.read_csv(path_str)
    binarize = None if "binarize" not in experiment_setup.keys() else eval(experiment_setup["binarize"])
        
    dataset_list = get_dataset(df, experiment_setup["dataset_settings"])

    all_performance = []
    for dataset in dataset_list:
        dataset_variant = dataset["dataset_variant"]
        print(dataset_variant)

        X = dataset["X"].to_numpy()
        y = dataset["y"].to_numpy(dtype = 'int32').ravel()

        #TYPE OF TASK
        task = experiment_setup['task']
        
        #THE MASK IS REQUIRED WHEN MASKEDPCA OR MASKEDSVD IS USED
        if experiment_setup["dataset_settings"]["only_BMI"] == 1:
            mask = np.arange(X.shape[1]) > 2
        else:
            mask = np.arange(X.shape[1]) > 4

        #BUILDING THE PIPELINE
        pipe_steps = []
        for key, value in experiment_setup["pipe"].items():
            pipe_steps.append((key, eval(value)))
        pipe = Pipeline(pipe_steps)    
        
        hp_optimizer = experiment_setup["hp_optimizer"]
        metrics = hp_optimizer.get("metrics")
        
        params_list = []
        parsed_params = {}
        for parameter, values in hp_optimizer.get("params")[0].items():
            parsed_params[parameter] = eval(values)
        params_list.append(parsed_params)
            
        list_skf = []
                
        if "n_split_outer_cv" in hp_optimizer.keys():
            cv_type = "n_split_outer_cv"
            list_skf.append(StratifiedKFold(n_splits=hp_optimizer[cv_type], shuffle = True, random_state=42))
        
        if "n_split_inner_cv" in hp_optimizer.keys():
            cv_type = "n_split_inner_cv"
            list_skf.append(StratifiedKFold(n_splits=hp_optimizer[cv_type], shuffle = True, random_state=42))
            
        if hp_optimizer["type"] == 'GridSearchCV':
            optimizer = GridSearchCV(pipe, parsed_params, n_jobs = cpu_1, cv=list_skf[-1], verbose=1, scoring = metrics, refit = metrics[0], return_train_score=True).fit(X,y)
        elif hp_optimizer["type"] == 'RandomizeSearchCV':
            n_iter = hp_optimizer["n_iter"] if "n_iter" in hp_optimizer.keys() else 100
            print(f"n_iter:{n_iter}")
            optimizer = RandomizedSearchCV(pipe, parsed_params, n_iter = n_iter, cv=list_skf[-1], verbose=1, scoring = metrics, refit = metrics[0], return_train_score=True).fit(X,y)
        elif hp_optimizer["type"] == 'BayesSearchCV':
            n_iter = hp_optimizer["n_iter"] if "n_iter" in hp_optimizer.keys() else 100
            print(f"n_iter:{n_iter}")  
            optimizer = BayesSearchCV(pipe, parsed_params, n_jobs = cpu_1, n_iter = n_iter, cv=list_skf[-1], verbose=1, scoring = metrics, refit = metrics[0], return_train_score=True).fit(X,y)

        if "n_split_inner_cv" in hp_optimizer.keys():
            cv_dic = cross_validate(optimizer, X, y, cv=list_skf[0], n_jobs = cpu_2, scoring=metrics, return_estimator=True, verbose = 2, return_train_score=True)
            best_params_cv = [estimator.best_params_ for estimator in cv_dic["estimator"]]
            
            scores_test_dict = {}
            scores_train_dict = {}
            for metric in metrics:
                scores_test_dict[metric] = np.mean(cv_dic[f"test_{metric}"])
                scores_train_dict[metric] = np.mean(cv_dic[f"train_{metric}"])

            cv_results = str(cv_dic)
        else:
            best_params_cv = [optimizer.best_params_]
            best_model = pd.DataFrame(optimizer.cv_results_).iloc[optimizer.best_index_]

            scores_test_dict = {}
            scores_train_dict = {}
            for metric in metrics:
                scores_test_dict[metric] = best_model[f"mean_test_{metric}"]
                scores_train_dict[metric] = best_model[f"mean_train_{metric}"]

            cv_results = str(optimizer.cv_results_)
            
        score = {
            "experiment_name": experiment_name,
            "dataset_variant": dataset_variant,
            "estimator" : experiment_setup['pipe']['estimator'],
            "task": task,
            "hp_optimizer": hp_optimizer['type'],
            "cv_type": cv_type,
            "mean_test_score": scores_test_dict,
            "mean_train_score": scores_train_dict,
            "best_params": str(best_params_cv),
            "cv_results" : cv_results,
            "experiment_setup": experiment_setup
        }
        
        all_performance.append(score)
                
    return all_performance

In [None]:
%%time

import json
from types import SimpleNamespace
experiment_name = "Experiment_12_22_12_21"
with open (f"./input_exp/{experiment_name}.json", "r") as myfile:
    data=myfile.read().replace('\n','')

experiment_setup = json.loads(data)

all_performance = experiment("./results/", experiment_name, experiment_setup)

In [94]:
from numpy import nan
from numpy import array
for performance in all_performance:
    print(f"Dataset variant: {performance['dataset_variant']}")
    print(f"Estimator: {performance['estimator']}")
    print(f"Mean test score: {performance['mean_test_score']}")
    print(f"Mean train score: {performance['mean_train_score']}")
    print(f"best_params: {eval(performance['best_params'])[0]}")
    print("----------------")

Dataset variant: complete
Estimator: KMeans(n_clusters = 7)
Mean test score: {'accuracy': 0.15831932773109242, 'f1_micro': 0.15831932773109242, 'f1_macro': 0.12977050727050726, 'neg_root_mean_squared_error': -2.8543852984878653, 'neg_mean_absolute_error': -2.2566386554621847}
Mean train score: {'accuracy': 0.13620212471524984, 'f1_micro': 0.13620212471524984, 'f1_macro': 0.11545181741383063, 'neg_root_mean_squared_error': -2.934951387840884, 'neg_mean_absolute_error': -2.339177364756978}
best_params: {'feature_extraction__n_components': 3, 'feature_extraction': MaskedPCA(mask=array([False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True]),
          n_components=3), 'estimator__tol': 0.001, 'estimator__max_iter': 900, 'estimator__algorithm': 'elkan'}
----------------
Dataset variant: only_totals
Estimator: KMeans(n_clusters = 7

In [96]:
from numpy import nan
from numpy import array
for performance in all_performance:
    print(f"Dataset variant: {performance['dataset_variant']}")
    print(f"Estimator: {performance['estimator']}")
    print(f"Mean test score: {performance['mean_test_score']}")
    print(f"Mean train score: {performance['mean_train_score']}")
    print(f"best_params: {eval(performance['best_params'])[0]}")
    print("----------------")

Dataset variant: complete
Estimator: DecisionTreeClassifier()
Mean test score: {'accuracy': 0.774033613445378, 'f1_micro': 0.7740336134453781, 'f1_macro': 0.6992170456320336, 'neg_root_mean_squared_error': -0.47051608781896526, 'neg_mean_absolute_error': -0.2259663865546219}
Mean train score: {'accuracy': 0.8485001383832579, 'f1_micro': 0.8485001383832577, 'f1_macro': 0.8022112226935935, 'neg_root_mean_squared_error': -0.3866963867649651, 'neg_mean_absolute_error': -0.15149986161674225}
best_params: {'scaler': MinMaxScaler(), 'feature_extraction__n_components': 3, 'feature_extraction': MaskedSVD(mask=array([False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True]),
          n_components=3), 'estimator__min_samples_split': 16, 'estimator__min_samples_leaf': 4, 'estimator__max_depth': 5, 'estimator__criterion': 'gini'}
----------------
Dataset

In [2]:
%%time
import json

#This block run all the experiments
files = glob.glob(".\\input_exp\\*.json")
for file in files:
    with open (file, "r") as myfile:
        data=myfile.read().replace('\n','')
    splits = file.split("\\")
    filename = splits[2][:-5]

    experiment_setup = json.loads(data)
    all_performance = experiment("./results/", filename, experiment_setup)

NameError: name 'experiment' is not defined

In [85]:
import glob
import json
files = glob.glob("./results/*.json")

#This block acquire all the results obtained
filter_floor_labels = True

#SET THIS TO 'regressor' to get the best regression models
#or set it to 'clf' to get the best models in the binary classification task
all_experiments_results = []
for i, file in enumerate(files):
    with open (file, "r") as myfile:
        
        data=myfile.read().replace('\n','')
        experiment_results = json.loads(data)
        
        if type(experiment_results) == dict:
            experiment_results = [experiment_results]
            
        all_experiments_results.append(pd.DataFrame(experiment_results))

df = pd.concat(all_experiments_results)
df = df.reset_index().drop('index', axis = 1)

In [88]:
filter_dict = {"task" : "Regression", "cv_type" : "n_split_inner_cv"}
target_metric = "neg_root_mean_squared_error"
filtered_df = df
for k, v in filter_dict.items():
    filtered_df = filtered_df.loc[filtered_df[k] == v]

filtered_df = filtered_df.reset_index().drop('index', axis = 1)
filtered_df = filtered_df.sort_values(by = 'mean_test_score', axis=0, ascending = False, key=lambda x: x.apply(lambda y: y[target_metric]))

In [None]:
filtered_df

In [95]:
filter_dict = {"task" : "Classification", "cv_type" : "n_split_inner_cv"}
target_metric = "accuracy"
filtered_df = df
for k, v in filter_dict.items():
    filtered_df = filtered_df.loc[filtered_df[k] == v]

filtered_df = filtered_df.reset_index().drop('index', axis = 1)
filtered_df = filtered_df.sort_values(by = 'mean_test_score', axis=0, ascending = False, key=lambda x: x.apply(lambda y: y[target_metric]))

In [87]:
filtered_df

Unnamed: 0,experiment_name,dataset_variant,estimator,task,hp_optimizer,cv_type,mean_test_score,mean_train_score,best_params,cv_results,experiment_setup
49,Experiment_4_08_12_21,only_totals,RidgeClassifier(),Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.8240336134453781, 'f1_micro': 0...","{'accuracy': 0.8289338102233292, 'f1_micro': 0...","[{'scaler': StandardScaler(), 'feature_extract...","{'fit_time': array([5.554003 , 5.64599991, 5....",{'dataset_settings': {'dataset_variants': ['co...
40,Experiment_3_08_12_21,only_totals,SVC(),Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.8210924369747898, 'f1_micro': 0...","{'accuracy': 0.8478327052862404, 'f1_micro': 0...","[{'scaler': StandardScaler(), 'feature_extract...","{'fit_time': array([7.58810472, 7.39698505, 7....",{'dataset_settings': {'dataset_variants': ['co...
50,Experiment_4_08_12_21,only_binary,RidgeClassifier(),Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.8151260504201681, 'f1_micro': 0...","{'accuracy': 0.8367535287730729, 'f1_micro': 0...","[{'scaler': StandardScaler(), 'feature_extract...","{'fit_time': array([5.86500692, 6.15400267, 6....",{'dataset_settings': {'dataset_variants': ['co...
65,Experiment_6_08_12_21,only_binary,"XGBClassifier(C = 2, kernel='rbf')",Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.8094957983193277, 'f1_micro': 0...","{'accuracy': 0.9097294075067595, 'f1_micro': 0...","[{'scaler': MinMaxScaler(), 'feature_extractio...","{'fit_time': array([73.43499851, 75.50999522, ...",{'dataset_settings': {'dataset_variants': ['co...
58,Experiment_5_08_12_21,only_totals,"XGBClassifier(C = 2, kernel='rbf')",Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.8094957983193277, 'f1_micro': 0...","{'accuracy': 0.9498105213855357, 'f1_micro': 0...","[{'scaler': StandardScaler(), 'feature_extract...","{'fit_time': array([81.93052936, 76.83855128, ...",{'dataset_settings': {'dataset_variants': ['co...
...,...,...,...,...,...,...,...,...,...,...,...
15,Experiment_12_22_12_21,complete,KMeans(n_clusters = 7),Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.15831932773109242, 'f1_micro': ...","{'accuracy': 0.13620212471524984, 'f1_micro': ...","[{'feature_extraction__n_components': 3, 'feat...","{'fit_time': array([18.2070601 , 18.91806054, ...",{'dataset_settings': {'dataset_variants': ['co...
9,Experiment_11_22_12_21,complete,"FuzzyCMeans(n_centers = 7, m = 2)",Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.1405042016806723, 'f1_micro': 0...","{'accuracy': 0.14794873432543487, 'f1_micro': ...","[{'feature_extraction__n_components': 24, 'fea...","{'fit_time': array([15.09903526, 11.22470641, ...",{'dataset_settings': {'dataset_variants': ['co...
17,Experiment_12_22_12_21,only_binary,KMeans(n_clusters = 7),Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.13781512605042018, 'f1_micro': ...","{'accuracy': 0.11470268889314683, 'f1_micro': ...","[{'feature_extraction__n_components': 17, 'fea...","{'fit_time': array([20.09005904, 19.76706648, ...",{'dataset_settings': {'dataset_variants': ['co...
16,Experiment_12_22_12_21,only_totals,KMeans(n_clusters = 7),Classification,RandomizeSearchCV,n_split_inner_cv,"{'accuracy': 0.11142857142857143, 'f1_micro': ...","{'accuracy': 0.10980498605522557, 'f1_micro': ...","[{'feature_extraction__n_components': 4, 'feat...","{'fit_time': array([18.08812666, 17.72712469, ...",{'dataset_settings': {'dataset_variants': ['co...


In [96]:
#THIS IS USED TO EXPLODE THE DICT OF THE SCORES AS COLUMNS OF THE RESULTED FILTERED_DF
values = {}
df_table = filtered_df[:0]
estimators = {}
for index, record in filtered_df.iterrows():
    cv_result = record.cv_results
    
    i_scoring = cv_result.rfind("scoring")
    i_test = cv_result[i_scoring:].find("'test")
    cv_result = "{"+record[i_scoring+i_test:]
    cv_result = eval(cv_result)
    
    for k, list_scores in cv_result.items():
        if values.get(k) == None:
            values[k] = []
        mean = np.round(np.mean(np.abs(list_scores)), 2)
        std = np.round(np.std(np.abs(list_scores)), 2)
        
        values[k].append(str(mean)+" (std. "+str(std)+")")
    
for k, v in values.items():
    filtered_df[k] = v

TypeError: eval() arg 1 must be a string, bytes or code object

In [118]:
#THIS IS USED TO EXPLODE THE DICT OF THE SCORES AS COLUMNS OF THE RESULTED FILTERED_DF
values = {}
df_table = filtered_df[:0]
estimators = {}
for index, record in filtered_df.iterrows():
    cv_result = record.cv_results

    i_scoring = cv_result.rfind("scoring")
    i_test = cv_result[i_scoring:].find("'test")
    cv_result = "{"+cv_result[i_scoring+i_test:]
    cv_result = eval(cv_result)
    
    for k, list_scores in cv_result.items():
        if values.get(k) == None:
            values[k] = []
        mean = np.round(np.mean(np.abs(list_scores)), 2)
        std = np.round(np.std(np.abs(list_scores)), 2)
        
        values[k].append(str(mean)+" (std. "+str(std)+")")
    
for k, v in values.items():
    filtered_df[k] = v

In [119]:
#THIS IS USED TO EXPLODE THE DICT OF THE SCORES AS COLUMNS OF THE RESULTED FILTERED_DF
values = {}
for record in filtered_df.mean_test_score:
    for k, v in record.items():
        if values.get(k) == None:
            values[k] = []
        values[k].append(v)

for k, v in values.items():
    filtered_df[k] = v

In [101]:
names = {'dataset_variant': 'Dataset variant', 'estimator': 'Model', 'test_accuracy': 'Accuracy'}
df_table = filtered_df.filter(['dataset_variant', 'estimator', 'test_accuracy'])
df_table = df_table.rename(columns = names)

df_table.Model = df_table.Model.apply(lambda x: x if x.find('(') == -1 else x[:x.find('(')])
filtered_table = df_table[:0]
estimators = {}
for index, row in df_table.iterrows():
    if estimators.get(row.Model) == None:
        estimators[row.Model] = 1
        filtered_table = filtered_table.append(row)
        

In [102]:
filtered_table

Unnamed: 0,Dataset variant,Model,Accuracy
43,only_totals,RidgeClassifier,0.82 (std. 0.06)
34,only_totals,SVC,0.82 (std. 0.06)
59,only_binary,XGBClassifier,0.81 (std. 0.06)
26,only_binary,RandomForestClassifier,0.81 (std. 0.08)
1,only_totals,GradientBoostingClassifier,0.8 (std. 0.06)
70,only_totals,DecisionTreeClassifier,0.78 (std. 0.06)
38,only_binary,OneVsRestClassifier,0.36 (std. 0.08)
45,complete,OneVsOneClassifier,0.35 (std. 0.07)
80,only_binary,LogisticRegression,0.32 (std. 0.07)
13,only_totals,LogisticIT,0.31 (std. 0.07)


In [30]:
filtered_table.to_latex("classification.txt")

In [86]:
names = {'dataset_variant': 'Dataset variant', 'estimator': 'Model', 'test_neg_root_mean_squared_error': 'RMSE', 'test_r2': 'R squared', 'test_neg_mean_absolute_error' : 'MAE'}
df_table = filtered_df.filter(['dataset_variant', 'estimator', 'test_neg_root_mean_squared_error', 'test_r2', 'test_neg_mean_absolute_error'])
df_table = df_table.rename(columns = names)

df_table.Model = df_table.Model.apply(lambda x: x if x.find('(') == -1 else x[:x.find('(')])
filtered_table = df_table[:0]
estimators = {}
for index, row in df_table.iterrows():
    if estimators.get(row.Model) == None:
        estimators[row.Model] = 1
        filtered_table = filtered_table.append(row)
        

In [89]:
filtered_table.to_latex()

'\\begin{tabular}{llllll}\n\\toprule\n{} & Dataset variant &                      Model &              RMSE &         R squared &               MAE \\\\\n\\midrule\n7  &     only\\_totals &      RandomForestRegressor &  1.47 (std. 0.23) &  0.43 (std. 0.18) &  1.16 (std. 0.17) \\\\\n12 &        complete &  GradientBoostingRegressor &  1.51 (std. 0.18) &  0.41 (std. 0.15) &  1.23 (std. 0.14) \\\\\n33 &        complete &                        SVR &   1.54 (std. 0.2) &  0.39 (std. 0.16) &  1.21 (std. 0.16) \\\\\n25 &     only\\_totals &                      Ridge &  1.54 (std. 0.17) &  0.38 (std. 0.14) &  1.23 (std. 0.14) \\\\\n20 &     only\\_binary &      DecisionTreeRegressor &  1.65 (std. 0.18) &  0.29 (std. 0.15) &  1.35 (std. 0.15) \\\\\n\\bottomrule\n\\end{tabular}\n'

In [191]:
names = {'dataset_variant': 'Dataset variant', 'estimator': 'Model', 'test_accuracy': 'Accuracy'}
df_table = filtered_df.filter(['dataset_variant', 'estimator', 'test_accuracy'])
df_table = df_table.rename(columns = names)

name_model = "ChainedEstimator"
filtered_table = df_table[:0]
estimators = {}
for index, row in df_table.iterrows():

    if estimators.get(row.Model) == None and name_model in row.Model:
        str_regressor = 'regressor = '
        i_regressor = row.Model.find('regressor = ')
        i_comma = row.Model.find(',')

        regressor = row.Model[i_regressor+len(str_regressor):i_comma]
        i_p = regressor.find('(')
        regressor = regressor[:i_p]
        row['Regressor'] = regressor
        
        str_clf = 'clf = '
        i_clf = row.Model.find(str_clf)
        clf = row.Model[i_clf+len(str_clf):]
        i_p = clf.find('(')
        clf = clf[:i_p]
        row['Classifier'] = clf
        print(regressor)
        print(clf)

        estimators[row.Model] = 1
        filtered_table = filtered_table.append(row)
        

RandomForestClassifier
RidgeClassifier
RandomForestRegressor
SVC
GradientBoostingRegressor
SVC
RandomForestRegressor
DecisionTreeClassifier


In [193]:
filtered_table.to_latex()

'\\begin{tabular}{llllll}\n\\toprule\n{} & Dataset variant &                                              Model &          Accuracy &              Classifier &                  Regressor \\\\\n\\midrule\n70 &     only\\_totals &  ChainedEstimator(regressor = RandomForestClass... &  0.29 (std. 0.06) &         RidgeClassifier &     RandomForestClassifier \\\\\n79 &     only\\_totals &  ChainedEstimator(regressor = RandomForestRegre... &  0.28 (std. 0.06) &                     SVC &      RandomForestRegressor \\\\\n82 &     only\\_totals &  ChainedEstimator(regressor = GradientBoostingR... &  0.26 (std. 0.06) &                     SVC &  GradientBoostingRegressor \\\\\n4  &     only\\_totals &  ChainedEstimator(regressor = RandomForestRegre... &  0.26 (std. 0.05) &  DecisionTreeClassifier &      RandomForestRegressor \\\\\n\\bottomrule\n\\end{tabular}\n'

In [None]:
    """i_x = row.Model.find('(')
    temp_row = row.Model[i_x+1:]
    i_x = temp_row.find('(')
    temp_row = temp_row[:i_x]
    i_x = temp_row.find('=')
    temp_row = temp_row[i_x+1:]
    i_x = temp_row.find('regressor')"""

In [156]:
filtered_table.to_latex()

'\\begin{tabular}{llll}\n\\toprule\n{} & Dataset variant &                    Model &          Accuracy \\\\\n\\midrule\n39 &        complete &                      SVC &  0.18 (std. 0.04) \\\\\n23 &     only\\_binary &   DecisionTreeClassifier &  0.18 (std. 0.05) \\\\\n49 &     only\\_totals &          RidgeClassifier &  0.17 (std. 0.03) \\\\\n\\bottomrule\n\\end{tabular}\n'

In [145]:
filtered_table.to_latex() #OneVsRestClassifier

'\\begin{tabular}{llll}\n\\toprule\n{} & Dataset variant &                   Model &          Accuracy \\\\\n\\midrule\n38 &     only\\_binary &           XGBClassifier &  0.36 (std. 0.08) \\\\\n18 &        complete &                     SVC &   0.34 (std. 0.1) \\\\\n54 &        complete &  DecisionTreeClassifier &  0.31 (std. 0.06) \\\\\n16 &     only\\_totals &         RidgeClassifier &   0.3 (std. 0.04) \\\\\n\\bottomrule\n\\end{tabular}\n'

In [148]:
filtered_table.to_latex() #OneVsOneClassifier

'\\begin{tabular}{llll}\n\\toprule\n{} & Dataset variant &                   Model &          Accuracy \\\\\n\\midrule\n45 &        complete &           XGBClassifier &  0.35 (std. 0.07) \\\\\n31 &     only\\_totals &                     SVC &  0.35 (std. 0.08) \\\\\n61 &     only\\_totals &  DecisionTreeClassifier &  0.34 (std. 0.11) \\\\\n27 &        complete &         RidgeClassifier &  0.32 (std. 0.09) \\\\\n\\bottomrule\n\\end{tabular}\n'

In [150]:
filtered_table.to_latex() #OneVsOneClassifier

'\\begin{tabular}{llll}\n\\toprule\n{} & Dataset variant &                               Model &          Accuracy \\\\\n\\midrule\n39 &        complete &                                 SVC &  0.18 (std. 0.04) \\\\\n23 &     only\\_binary &  estimator = DecisionTreeClassifier &  0.18 (std. 0.05) \\\\\n49 &     only\\_totals &                     RidgeClassifier &  0.17 (std. 0.03) \\\\\n\\bottomrule\n\\end{tabular}\n'

In [117]:
best_results = filtered_df.iloc[0]
cv_result = best_results.cv_results.replace("\n",'')
cv_result = cv_result.replace(" ", '')
try:
    cv_dic = eval(best_results.cv_results)
except:
    i_scoring = cv_result.rfind("scoring")
    i_test = cv_result[i_scoring:].find("'test")
    cv_result = "{"+cv_result[i_scoring+i_test:]
metrics = best_results.experiment_setup['hp_optimizer']['metrics']
for metric in metrics:
    print(cv_dic[f"test_{metric}"])
    print(cv_dic[f"train_{metric}"])

NameError: name 'cv_dic' is not defined

In [179]:
best_results.experiment_setup

{'dataset_settings': {'dataset_variants': ['complete',
   'only_totals',
   'only_binary'],
  'type_of_labels': 'floors',
  'only_BMI': 1},
 'task': 'Regression',
 'pipe': {'feature_extraction': 'GroupedSVD(mask=mask)',
  'scaler': 'MinMaxScaler()',
  'estimator': "RandomForestRegressor(criterion = 'squared_error')"},
 'hp_optimizer': {'type': 'RandomizeSearchCV',
  'n_iter': 100,
  'params': [{'feature_extraction__n_components': '[1, 2, 3]',
    'estimator__n_estimators': '[100, 200, 250]',
    'estimator__max_depth': '[10, 15]',
    'estimator__max_features': "['sqrt', 'log2']"}],
  'metrics': ['neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_error'],
  'n_split_outer_cv': 10,
  'n_split_inner_cv': 10}}

In [177]:
eval(best_results.best_params)[1]

{'feature_extraction__n_components': 2,
 'estimator__n_estimators': 200,
 'estimator__max_features': 'sqrt',
 'estimator__max_depth': 10}

In [20]:
filtered_df['best_params'][25]

"[{'feature_extraction__n_components': 2, 'estimator__n_estimators': 250, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 15}, {'feature_extraction__n_components': 2, 'estimator__n_estimators': 200, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 10}, {'feature_extraction__n_components': 2, 'estimator__n_estimators': 200, 'estimator__max_features': 'log2', 'estimator__max_depth': 10}, {'feature_extraction__n_components': 1, 'estimator__n_estimators': 200, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 15}, {'feature_extraction__n_components': 2, 'estimator__n_estimators': 250, 'estimator__max_features': 'log2', 'estimator__max_depth': 15}, {'feature_extraction__n_components': 2, 'estimator__n_estimators': 200, 'estimator__max_features': 'log2', 'estimator__max_depth': 15}, {'feature_extraction__n_components': 1, 'estimator__n_estimators': 200, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 10}, {'feature_extraction__n_components': 2, 'estim

In [19]:
filtered_df['estimator'][25]

"RandomForestRegressor(criterion = 'squared_error')"

In [121]:
filtered_df.iloc[2].best_params

"[{'feature_extraction__n_components': 4, 'estimator__tol': 0.0001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 1e-05, 'estimator__C': 0.5}, {'feature_extraction__n_components': 4, 'estimator__tol': 0.001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 0.001, 'estimator__C': 0.5}, {'feature_extraction__n_components': 5, 'estimator__tol': 0.001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 0.01, 'estimator__C': 1.0}, {'feature_extraction__n_components': 4, 'estimator__tol': 0.001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 0.001, 'estimator__C': 0.5}, {'feature_extraction__n_components': 5, 'estimator__tol': 0.0001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 0.001, 'estimator__C': 0.5}, {'feature_extraction__n_components': 4, 'estimator__tol': 0.001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 0.01, 'estimator__C': 0.5}, {'feature_extraction__n_components': 4, 'estimator__tol': 0.0001, 'estimator__kernel': 'rbf', 'estimator__epsilon': 1e-05, 'estimator_