In [19]:
#!pip3 install pandas numpy flaml scikit-learn lightgbm xgboost tqdm

Collecting tqdm
  Using cached tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.66.2-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.2


In [9]:
# Standard library imports
from itertools import product

# Third party imports
import numpy as np
import pandas as pd
from flaml.default import (ExtraTreesClassifier,
                           LGBMClassifier,
                           XGBClassifier)
from lightgbm import LGBMClassifier as DefaultLGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier as DefaultExtraTreesClassifier
from sklearn.metrics import auc, log_loss, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
from xgboost import XGBClassifier as DefaultXGBClassifier

def cartesian_product(*lists):
    return list(product(*lists))

def pr_auc_score(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)


path_datasets = '../datasets/'
metadata = {
    'nasa.csv' : {
        'type' : 'binary',
        'target' : 'Hazardous'},
    
    #just replaced yes-> 1 and no->0 in the target variable with the following regex ,(yes)$ and ,(no)$
    'banking_dataset_train.csv' : {
        'type' : 'binary',
        'target' : 'y',
        'link': 'https://www.kaggle.com/datasets/rashmiranu/banking-dataset-classification/data'
        },

    'bank_churn_classification_cleaned_train.csv' : {
        'type' : 'binary',
        'target' : 'Exited',
        'link': 'https://www.kaggle.com/datasets/ikjotsingh221/bank-churn-classification-cleaned?select=train.csv'
        },

    'heart.csv': {
        'type' : 'binary',
        'target' : 'output',
        'link': 'https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset'
    },

    'Naive-Bayes-Classification-Data.csv': {
        'type' : 'binary',
        'target' : 'diabetes',
        'link': 'https://www.kaggle.com/datasets/himanshunakrani/naive-bayes-classification-data/data'
    
    },

    'EEG_Eye_State_Classification.csv': {
        'type': 'binary',
        'target': 'eyeDetection',
        'link': 'https://www.kaggle.com/datasets/robikscube/eye-state-classification-eeg-dataset?select=EEG_Eye_State_Classification.csv'
    },

    #deleted all observations with #NUM in the target variable
    'waterQuality1.csv': {
        'type': 'binary',
        'target': 'is_safe',
        'link': 'https://www.kaggle.com/datasets/mssmartypants/water-quality'
    },

    'Churn_Modelling.csv': {
        'type': 'binary',
        'target': 'Exited',
        'link': 'https://www.kaggle.com/datasets/shrutimechlearn/churn-modelling'
    },

    'predictive_maintenance.csv': {
        'type': 'binary',
        'target': 'Target',
        'cols_to_drop': ['Failure_Type'],
        'link': 'https://www.kaggle.com/datasets/shivamb/machine-predictive-maintenance-classification'
    },

    'patient_hospital_death.csv': {
        'type': 'binary',
        'target': 'hospital_death',
        'link': 'https://www.kaggle.com/datasets/mitishaagarwal/patient'
    }
 }

datasets = [
            'nasa.csv', 
            'banking_dataset_train.csv', 
            'bank_churn_classification_cleaned_train.csv', 
            'heart.csv', 
            'Naive-Bayes-Classification-Data.csv', 
            'EEG_Eye_State_Classification.csv', 
            'waterQuality1.csv', 
            'Churn_Modelling.csv', 
            'predictive_maintenance.csv',
            'patient_hospital_death.csv'
            ]

preprocessings = [
    'numeric_subselect',
    'ohe_min_freq_1',
    'ohe_min_freq_10'
]
results = []
for dataset, preprocessing in tqdm(cartesian_product(datasets, preprocessings)):

    target = metadata.get(dataset).get('target')

    df = pd.read_csv(path_datasets + dataset)
    df.columns = [col.replace(' ','_').replace('[','_').replace(']','_').replace(':','_') for col in df.columns]
    if 'cols_to_drop' in metadata.get(dataset):
        df = df.drop(metadata.get(dataset).get('cols_to_drop'), axis=1)
        
    
    X = df.drop(target, axis=1)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=666)

    if preprocessing == 'numeric_subselect':
        #THIS IS A SELECTOR FOR ONLY NUMERIC COLUMNS
        X_train = X_train.select_dtypes(exclude='object')
        X_test = X_test.select_dtypes(exclude='object')

    elif preprocessing == 'ohe_min_freq_1':
        ohe = OneHotEncoder(handle_unknown='ignore', min_frequency=0.01)
        X_train = ohe.fit_transform(X_train)
        X_test = ohe.transform(X_test)

    elif preprocessing == 'ohe_min_freq_10':
        ohe = OneHotEncoder(handle_unknown='ignore', min_frequency=0.1)
        X_train = ohe.fit_transform(X_train)
        X_test = ohe.transform(X_test)
        
    estimators = {
        'LGBMClassifier' : {
            'challenger': LGBMClassifier(random_state=0), 
            'default': DefaultLGBMClassifier(random_state=0, verbose=-1)
            },
        
        'XGBClassifier' : {
            'challenger': XGBClassifier(random_state=0), 
            'default': DefaultXGBClassifier(random_state=0)
            },
        
        'ExtraTreesClassifier' : {
            'challenger': ExtraTreesClassifier(), 
            'default': DefaultExtraTreesClassifier()
            },
    }

    for name, estimator_dict in estimators.items():

        flaml_estimator = estimator_dict['challenger']
        default_estimator = estimator_dict['default']

        #suggest hyperparameters
        (hyperparams, estimator_name, X_transformed, y_transformed) = flaml_estimator.suggest_hyperparams(X_train, y_train)

        #Obtain challenger loss
        flaml_estimator.fit(X_train, y_train)
        y_hat = flaml_estimator.predict_proba(X_test)[:,1]
        flaml_loss = log_loss(y_test, y_hat)
        flaml_rocauc = roc_auc_score(y_test, y_hat)
        flaml_prauc = pr_auc_score(y_test, y_hat)

        #Obtain defult loss
        default_estimator.fit(X_train, y_train)
        y_hat = default_estimator.predict_proba(X_test)[:,1]
        default_loss = log_loss(y_test, y_hat)
        default_rocauc = roc_auc_score(y_test, y_hat)
        default_prauc = pr_auc_score(y_test, y_hat)

        # print(f'{name} =====================================')
        # print(f'Optimal Zero Shot parameters: {hyperparams}')

        # print(f'Zero Shot - {name} Log Loss: {flaml_loss}')
        # print(f'Zero Shot - {name} ROC AUC: {flaml_rocauc}')
        # print(f'Zero Shot - {name} PR AUC: {flaml_prauc}')

        # print(f'Default - {name} Log Loss: {default_loss}')
        # print(f'Default - {name} ROC AUC: {default_rocauc}')
        # print(f'Default - {name} PR AUC: {default_prauc}')

        # print('=====================================')
        # print('\n\n')

        results.append(
            #dataset, model_name, <metrics>...]
            [dataset, name, preprocessing,
            flaml_loss, default_loss, 
            flaml_rocauc, default_rocauc,
            flaml_prauc, default_prauc]
        )

 90%|█████████ | 27/30 [2:13:42<14:51, 297.14s/it]  


ValueError: Input X contains NaN.
ExtraTreesClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [17]:
df_out = pd.DataFrame(results, columns=['dataset', 'model', 'preprocessing', 'flaml_loss', 'default_loss', 'flaml_rocauc', 'default_rocauc', 'flaml_prauc', 'default_prauc'])

df_out['is_challenger_better_loss'] = df_out['flaml_loss'] < df_out['default_loss']
df_out['is_challenger_better_rocauc'] = df_out['flaml_rocauc'] > df_out['default_rocauc']
df_out['is_challenger_better_prauc'] = df_out['flaml_prauc'] > df_out['default_prauc']

In [18]:
df_out.to_csv('flaml_results.csv', index=False)

In [20]:
df_out.is_challenger_better_loss.value_counts(normalize=True)

is_challenger_better_loss
False    0.626506
True     0.373494
Name: proportion, dtype: float64

In [21]:
df_out.is_challenger_better_rocauc.value_counts(normalize=True)

is_challenger_better_rocauc
False    0.60241
True     0.39759
Name: proportion, dtype: float64

In [22]:
df_out.is_challenger_better_prauc.value_counts(normalize=True)

is_challenger_better_prauc
False    0.662651
True     0.337349
Name: proportion, dtype: float64

In [28]:
df_out[df_out['model'] == 'LGBMClassifier'].is_challenger_better_prauc.value_counts(normalize=True)

is_challenger_better_prauc
False    0.75
True     0.25
Name: proportion, dtype: float64

In [25]:
df_out

Unnamed: 0,dataset,model,preprocessing,flaml_loss,default_loss,flaml_rocauc,default_rocauc,flaml_prauc,default_prauc,is_challenger_better_loss,is_challenger_better_rocauc,is_challenger_better_prauc
0,nasa.csv,LGBMClassifier,numeric_subselect,0.040178,0.018107,0.996472,0.999784,0.992763,0.998881,False,False,False
1,nasa.csv,XGBClassifier,numeric_subselect,0.122087,0.019112,0.994768,0.999706,0.989135,0.998489,False,False,False
2,nasa.csv,ExtraTreesClassifier,numeric_subselect,0.051255,0.124659,0.998288,0.994759,0.995054,0.977440,True,True,True
3,nasa.csv,LGBMClassifier,ohe_min_freq_1,0.368324,0.342187,0.776740,0.821472,0.328873,0.442876,False,False,False
4,nasa.csv,XGBClassifier,ohe_min_freq_1,0.331277,0.338389,0.835603,0.822947,0.471918,0.466430,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
78,predictive_maintenance.csv,LGBMClassifier,ohe_min_freq_10,0.154406,0.154406,0.557594,0.557594,0.387345,0.387345,True,False,False
79,predictive_maintenance.csv,XGBClassifier,ohe_min_freq_10,0.154397,0.154406,0.557594,0.557594,0.387345,0.387345,True,False,False
80,predictive_maintenance.csv,ExtraTreesClassifier,ohe_min_freq_10,0.154406,0.154406,0.557594,0.557594,0.387345,0.387345,True,False,False
81,patient_hospital_death.csv,LGBMClassifier,numeric_subselect,0.200812,0.190773,0.890482,0.889088,0.540153,0.547468,False,True,False


In [29]:
2+2

4