In [1]:
import os

# Dataset location for this notebook. `setdefault` keeps a DATA_DIR that is
# already defined in the environment (e.g. on another machine or in CI) and only
# falls back to the original local path when the variable is unset.
# NOTE(review): presumably read by the `src.experiments` modules imported in the
# next cell, which is why it is set before those imports — confirm.
os.environ.setdefault('DATA_DIR', "/home/loren/Code/evaluation/data/datasets")

In [2]:
from multiple_classifiers import friedman_test, nemenyi_friedman_test, bonferroni_dunn_test
import pandas as pd

from src.experiments.print_results import get_results
from src.experiments.data import datasets, imbalanced_distribution_datasets
from src.experiments.print_results import all_metrics
from src.experiments.utils import Keys, get_clf_full_name

# Import the results

In [3]:
from src.experiments.imbalanced_distribution import DEFAULT_CLFS

# --- Experiment configuration -------------------------------------------------
# Which experiment / metric to load and how the result files are located.
experiment_ = 'imbalanced_distribution'
metric = Keys.average_rse
datasets_ = [*imbalanced_distribution_datasets.keys()]
suffix_ = ""
from_pkl_ = False  # the loading cell passes `from_text=not from_pkl_` to get_results

# Estimators and transformer variants to compare.
clfs = DEFAULT_CLFS
feature_transformer_name = None
# The leading None entry comes first, so the corresponding name is the first
# column in every per-estimator column order (presumably the untransformed
# baseline — confirm against get_clf_full_name).
incl_transformers = [
    None,
    Keys.transformer_quantile_normal,
    Keys.transformer_quantile_uniform,
    Keys.transformer_powertransformer,
    Keys.transformer_lntransformer,
]

# Full names (spaces stripped) for every estimator x transformer combination.
# Note: the results-loading cell rebinds `column_order` per estimator.
column_order = [
    get_clf_full_name(estimator.name, target_transformer, feature_transformer_name).replace(' ', '')
    for estimator in clfs
    for target_transformer in incl_transformers
]

In [4]:
# Load one results table per estimator, keyed by the estimator's name.
# `column_order` is rebuilt on every iteration so that it only lists the
# transformer variants of the current estimator, in `incl_transformers` order.
all_results = {}
for clf in clfs:
    column_order = [
        get_clf_full_name(clf.name, transformer, feature_transformer_name).replace(' ', '')
        for transformer in incl_transformers
    ]
    all_results[clf.name] = get_results(
        datasets_,
        metric.replace(' ', ''),
        experiment_,
        # Disabled keyword arguments, kept for reference:
        # present_substring=f"__f_{feature_transformer_name}".replace(' ', ''),
        # absent_substring='__f_',
        suffix=suffix_,
        from_text=not from_pkl_,
        column_order=column_order,
    )

# Run the statistical tests

In [5]:
def average_improvement(x, lower_is_better=False):
    """Average improvement of every column of ``x`` over its first column.

    The first column of ``x`` is the baseline. For each remaining column the
    element-wise improvement over the baseline is computed and averaged over
    all rows:

    * ``lower_is_better`` truthy: ``(baseline - column) / baseline * 100``,
      i.e. the relative reduction in percent.
    * otherwise: ``column - baseline``, i.e. the plain difference.

    NOTE(review): the two branches use different units (percent vs. absolute
    difference) — confirm that this asymmetry is intended.

    Parameters
    ----------
    x : pandas.DataFrame
        Table whose first column is the baseline.
    lower_is_better : bool, default False
        Selects which of the two formulas above is applied.

    Returns
    -------
    pandas.DataFrame
        One row per non-baseline column (indexed by column name) and a single
        column named after the baseline column.
    """
    reference_name = x.columns[0]
    candidate_names = list(x.columns[1:])
    reference = x[[reference_name]].values

    def _mean_gain(name):
        # Kept as (n_rows, 1) arrays, mirroring the baseline's shape.
        candidate = x[[name]].values
        if not lower_is_better:
            return (candidate - reference).mean()
        return ((reference - candidate) / reference * 100).mean()

    gains = [_mean_gain(name) for name in candidate_names]
    return pd.DataFrame(gains, index=candidate_names, columns=[reference_name])

In [8]:
def run_stats(to_compare, control: str, lower_is_better=True):
    """Print a series of multiple-comparison statistics for one results table.

    Printed, in order: the result of ``friedman_test``, the average ranks, the
    wins/ties/losses table, the Nemenyi post-hoc p-values and significance
    flags, the Bonferroni-Dunn p-values and significance flags against
    ``control``, and the average improvement of every column over the first
    column of ``to_compare``.

    Parameters
    ----------
    to_compare : pandas.DataFrame
        Results table handed to ``BlockDesign`` and ``average_improvement``.
        Assumed to hold one row per dataset and one column per method —
        TODO confirm against ``get_results``.
    control : str
        Name of the method passed to ``bonferroni_dunn_test`` as the control.
    lower_is_better : bool, default True
        Direction of the metric. Forwarded (negated) to ``BlockDesign`` as
        ``higher_is_better`` and as-is to ``average_improvement``.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    from classifier_comparisons import BlockDesign

    separator = "##############################################"
    higher_is_better = not lower_is_better

    block_design = BlockDesign(to_compare, threshold=0.001, precision=3, higher_is_better=higher_is_better)
    test_results = friedman_test(block_design, alpha=0.05)
    print("test results: \n", test_results)
    print(separator)
    average_ranks = block_design.to_ranks()
    print("average ranks: \n", average_ranks)
    print(separator)
    # Fix: this design used to be built with default settings only, so the
    # wins/ties/losses table ignored `lower_is_better` while every other
    # statistic honoured it. The direction is now passed explicitly; threshold
    # and precision are left at their defaults, as before.
    # NOTE(review): assumes to_wins_ties_losses honours higher_is_better — confirm.
    wins = BlockDesign(to_compare, higher_is_better=higher_is_better).to_wins_ties_losses()
    print("wins ties losses: \n", wins)
    print(separator)

    # Nemenyi post-hoc test.
    p_values, sign_diffs = nemenyi_friedman_test(block_design, alpha=0.05)
    print("p values: ", p_values.data)
    print("sign diffs", sign_diffs.data)
    print(separator)

    # Bonferroni-Dunn post-hoc test against the `control` method.
    p_values, sign_diffs = bonferroni_dunn_test(block_design, alpha=0.05, control=control)
    print("p values: ", p_values.data)
    print("sign diffs", sign_diffs.data)

    print(separator)
    print("average improvement compared with the first column: \n", average_improvement(to_compare, lower_is_better))

## Lasso

In [9]:
# Lasso: run the statistical tests on the LassoTuned results table, using its
# first column as the Bonferroni-Dunn control.
# NOTE: `clf` is rebound to a plain string key here; the results-loading loop
# used the same name for estimator objects (accessed via `clf.name`).
clf = "LassoTuned"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729          16.96  0.001968   
Iman Davenport  0.05       2.633532          6.625   0.00042   

               significant? (test stat > crit value)  
Friedman test                                   True  
Iman Davenport                                  True  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                 average rank  standard deviation
LassoTuned                              4.25            0.602080
LassoTuned__Quantile(normal)            1.50            0.500000
LassoTuned__Quantile(uniform)           3.55            1.709532
LassoTuned__PowerTransformer            3.05            0.722842
LassoTuned__LogTransformer(ln)          2.65            1.265899
#############

## Ridge

In [11]:
# Ridge: run the statistical tests on the RidgeRegressionTuned results table,
# using its first column as the Bonferroni-Dunn control.
clf = "RidgeRegressionTuned"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729          15.66  0.003511   
Iman Davenport  0.05       2.633532       5.790468  0.001053   

               significant? (test stat > crit value)  
Friedman test                                   True  
Iman Davenport                                  True  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                           average rank  standard deviation
RidgeRegressionTuned                              4.05            0.788987
RidgeRegressionTuned__Quantile(normal)            1.60            0.768115
RidgeRegressionTuned__Quantile(uniform)           3.85            1.550000
RidgeRegressionTuned__PowerTransformer            2.80            0.900000
RidgeRegressionTuned__LogTra

## Gradient Boosted Trees Regressor

In [13]:
# Gradient boosting: run the statistical tests on the
# GradientBoostingRegressorWrapper results table, using its first column as the
# Bonferroni-Dunn control.
clf = "GradientBoostingRegressorWrapper"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729           5.66  0.226019   
Iman Davenport  0.05       2.633532       1.483401  0.227608   

               significant? (test stat > crit value)  
Friedman test                                  False  
Iman Davenport                                 False  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                                     average rank  \
GradientBoostingRegressorWrapper                            2.05   
GradientBoostingRegressorWrapper__Quantile(normal)          3.30   
GradientBoostingRegressorWrapper__Quantile(unif...          3.60   
GradientBoostingRegressorWrapper__PowerTransformer          3.20   
GradientBoostingRegressorWrapper__LogTransforme...          2.8

## SVR

In [14]:
# SVR: run the statistical tests on the SupportVectorRegressorWrapper results
# table, using its first column as the Bonferroni-Dunn control.
clf = "SupportVectorRegressorWrapper"
to_compare = all_results[clf]
run_stats(to_compare, control=to_compare.columns[0], lower_is_better=True)

Number of compared methods: 5
Number of datasets: 10
Number of compared methods: 5
Number of datasets: 10
test results: 
                alpha critical value test statistic   p value  \
Friedman test   0.05       9.487729          16.54  0.002374   
Iman Davenport  0.05       2.633532       6.345269  0.000569   

               significant? (test stat > crit value)  
Friedman test                                   True  
Iman Davenport                                  True  
##############################################
Number of compared methods: 5
Number of datasets: 10
average ranks: 
                                                    average rank  \
SupportVectorRegressorWrapper                              4.50   
SupportVectorRegressorWrapper__Quantile(normal)            1.85   
SupportVectorRegressorWrapper__Quantile(uniform)           3.10   
SupportVectorRegressorWrapper__PowerTransformer            3.25   
SupportVectorRegressorWrapper__LogTransformer(ln)          2.30   

