In [1]:
import os

# Use the author's local dataset directory only as a fallback: if DATA_DIR is
# already exported in the environment, keep that value so the notebook also runs
# on machines with a different directory layout.
# NOTE(review): this runs before the `src.experiments` imports in the next cell;
# presumably those modules read DATA_DIR — confirm before reordering cells.
os.environ.setdefault('DATA_DIR', "/home/loren/Code/evaluation/data/datasets")

In [2]:
from multiple_classifiers import friedman_test, nemenyi_friedman_test, bonferroni_dunn_test
import pandas as pd

from src.experiments.print_results import get_results
from src.experiments.data import datasets, imbalanced_distribution_datasets
from src.experiments.print_results import all_metrics
from src.experiments.utils import Keys, get_clf_full_name

# Import the results

In [3]:
from src.experiments.imbalanced_distribution import DEFAULT_CLFS

# Experiment configuration: which datasets, metric and experiment to load, and
# which regressors / transformers to compare. These notebook-level names are
# read by the result-loading loop in the following cell.
datasets_ = list(imbalanced_distribution_datasets.keys())  # dataset names are the keys of the project mapping
metric = Keys.average_smape
experiment_ = 'imbalanced_distribution'
suffix_ = ""
from_pkl_ = False  # the loading loop passes from_text=not from_pkl_
clfs = DEFAULT_CLFS
feature_transformer_name = None
# None is listed first; presumably it means "no transformer" (the baseline) — TODO confirm.
incl_transformers = [None, Keys.transformer_quantile_normal, Keys.transformer_quantile_uniform,
                             Keys.transformer_powertransformer, Keys.transformer_lntransformer]
# One space-stripped full name per (clf, transformer) pair, grouped by clf.
# NOTE(review): the result-loading loop rebinds column_order for each clf, so this
# all-classifier list is not consumed by any cell shown here — confirm it is still needed.
column_order = [get_clf_full_name(clf.name, transformer, feature_transformer_name).replace(' ', '')
                                              for clf in clfs for transformer in incl_transformers]

In [4]:
# Load one results object per regressor, keyed by the regressor's name.
all_results = {}
for clf in clfs:
    # Column names for this regressor only: one per entry of incl_transformers,
    # with spaces removed. Note that this rebinds the notebook-level column_order.
    column_order = [get_clf_full_name(clf.name, transformer, feature_transformer_name).replace(' ', '') for transformer in incl_transformers]
    # The metric name is passed with spaces removed, like the column names above.
    all_results[clf.name] = get_results(datasets_, metric.replace(' ', ''), experiment_,
                                    # Disabled substring filters, kept for reference
                                    # (presumably for runs with a feature transformer — TODO confirm):
                                    # present_substring=f"__f_{feature_transformer_name}".replace(' ', ''),
                                    # absent_substring='__f_',
                                    suffix=suffix_, from_text=not from_pkl_,
                                    column_order=column_order)

# Run the statistical tests

In [5]:
def average_improvement(x, lower_is_better=False):
    """Mean improvement of every non-baseline column over the first column.

    The first column of ``x`` is treated as the baseline. For each remaining
    column the per-row improvement is computed and then averaged:

    * ``lower_is_better=True``: relative reduction in percent,
      ``(baseline - value) / baseline * 100``.
    * ``lower_is_better=False``: plain difference ``value - baseline``
      (this branch is not scaled to a percentage).

    Returns a one-column DataFrame indexed by the compared column names; the
    single column is labelled with the baseline column's name.
    """
    reference_name = x.columns[0]
    challengers = list(x.columns[1:])
    reference = x[[reference_name]].values  # kept 2-D, shape (rows, 1)

    def _gain(candidate):
        # Per-row improvement of one candidate column over the baseline.
        if lower_is_better:
            return (reference - candidate) / reference * 100
        return candidate - reference

    mean_gains = [_gain(x[[name]].values).mean() for name in challengers]
    return pd.DataFrame(mean_gains, index=challengers, columns=[reference_name])

In [6]:
def run_stats(to_compare, control: str, lower_is_better=True):
    """Print the statistical comparison of the columns of ``to_compare``.

    Printed in order: the Friedman test results, the average ranks, the
    wins/ties/losses table, the Nemenyi post-hoc p-values and significant
    differences, the Bonferroni-Dunn post-hoc results against ``control``,
    and the average improvement relative to the first column. Every test is
    run at alpha = 0.05. Nothing is returned.

    Args:
        to_compare: results table; it is handed to ``BlockDesign`` and to
            ``average_improvement``.
        control: name passed as the control to ``bonferroni_dunn_test``.
        lower_is_better: direction of the metric; ``BlockDesign`` receives the
            negation as ``higher_is_better``.
    """
    from classifier_comparisons import BlockDesign

    divider = "##############################################"
    alpha = 0.05

    def _show_posthoc(outcome):
        # Each post-hoc test returns (p-values, significant-differences);
        # the printed payload is each object's `.data` attribute.
        p_vals, significant = outcome
        print("p values: ", p_vals.data)
        print("sign diffs", significant.data)
        print(divider)

    design = BlockDesign(to_compare, threshold=0.001, precision=3, higher_is_better=(not lower_is_better))

    print("test results: \n", friedman_test(design, alpha=alpha))
    print(divider)

    print("average ranks: \n", design.to_ranks())
    print(divider)

    # NOTE(review): wins/ties/losses comes from a second BlockDesign built with
    # default settings (no threshold / precision / higher_is_better) — confirm
    # that the default direction matches `lower_is_better`.
    print("wins ties losses: \n", BlockDesign(to_compare).to_wins_ties_losses())
    print(divider)

    _show_posthoc(nemenyi_friedman_test(design, alpha=alpha))
    _show_posthoc(bonferroni_dunn_test(design, alpha=alpha, control=control))

    print("average improvement compared with the first column: \n", average_improvement(to_compare, lower_is_better))

## Lasso

In [7]:
# Lasso: run the full statistical comparison on the results loaded for this
# regressor. The first results column is passed as the Bonferroni-Dunn control.
# lower_is_better=True: the configured metric is Keys.average_smape, presumably
# an error measure where smaller values are better.
clf = "LassoTuned"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729           5.76  0.217803   
Iman Davenport  0.05       2.633532       1.514019  0.218708   

               significant? (test stat > crit value)  
Friedman test                                  False  
Iman Davenport                                 False  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                 average rank  standard deviation
LassoTuned                               3.9            1.577973
LassoTuned__Quantile(normal)             2.3            1.100000
LassoTuned__Quantile(uniform)            3.2            1.077033
LassoTuned__PowerTransformer             2.9            1.135782
LassoTuned__LogTransformer(ln)           2.7            1.552417
#############

## Ridge

In [8]:
# Ridge: run the full statistical comparison on the results loaded for this
# regressor. The first results column is passed as the Bonferroni-Dunn control.
# lower_is_better=True: the configured metric is Keys.average_smape, presumably
# an error measure where smaller values are better.
clf = "RidgeRegressionTuned"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729           8.26  0.082505   
Iman Davenport  0.05       2.633532       2.342155  0.073369   

               significant? (test stat > crit value)  
Friedman test                                  False  
Iman Davenport                                 False  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                           average rank  standard deviation
RidgeRegressionTuned                              4.10            1.374773
RidgeRegressionTuned__Quantile(normal)            2.20            1.077033
RidgeRegressionTuned__Quantile(uniform)           3.10            1.374773
RidgeRegressionTuned__PowerTransformer            2.55            1.059481
RidgeRegressionTuned__LogTra

## Gradient Boosted Trees Regressor

In [9]:
# Gradient boosting: run the full statistical comparison on the results loaded
# for this regressor. The first results column is passed as the Bonferroni-Dunn
# control. lower_is_better=True: the configured metric is Keys.average_smape,
# presumably an error measure where smaller values are better.
clf = "GradientBoostingRegressorWrapper"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729          12.56  0.013638   
Iman Davenport  0.05       2.633532       4.119534   0.00753   

               significant? (test stat > crit value)  
Friedman test                                   True  
Iman Davenport                                  True  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                                     average rank  \
GradientBoostingRegressorWrapper                             3.5   
GradientBoostingRegressorWrapper__Quantile(normal)           4.1   
GradientBoostingRegressorWrapper__Quantile(unif...           3.2   
GradientBoostingRegressorWrapper__PowerTransformer           2.2   
GradientBoostingRegressorWrapper__LogTransforme...           2.

## SVR

In [10]:
# SVR: run the full statistical comparison on the results loaded for this
# regressor. The first results column is passed as the Bonferroni-Dunn control.
# lower_is_better=True: the configured metric is Keys.average_smape, presumably
# an error measure where smaller values are better.
clf = "SupportVectorRegressorWrapper"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729          22.16  0.000186   
Iman Davenport  0.05       2.633532      11.179372  0.000005   

               significant? (test stat > crit value)  
Friedman test                                   True  
Iman Davenport                                  True  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                                    average rank  \
SupportVectorRegressorWrapper                               4.6   
SupportVectorRegressorWrapper__Quantile(normal)             1.4   
SupportVectorRegressorWrapper__Quantile(uniform)            3.5   
SupportVectorRegressorWrapper__PowerTransformer             2.6   
SupportVectorRegressorWrapper__LogTransformer(ln)           2.9   

