In [1]:
#!L
from catboost import CatBoostRegressor, monoforest, Pool, cv
from catboost.utils import create_cd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, lasso_path
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm import tqdm as tqdm
from scipy import stats

In [2]:
#!L
import csv
import json
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import seaborn as sns
import sys
import warnings

In [13]:
#!L
%pip install seaborn

Collecting seaborn
  Downloading seaborn-0.11.1-py3-none-any.whl (285 kB)
[K     |████████████████████████████████| 285 kB 2.5 MB/s 
[?25hCollecting matplotlib>=2.2
  Downloading matplotlib-3.3.3-cp37-cp37m-manylinux1_x86_64.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 5.9 MB/s 
[?25hCollecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 7.1 MB/s 
[?25hCollecting numpy>=1.15
  Downloading numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
[K     |████████████████████████████████| 14.8 MB 427 kB/s 
[?25hCollecting pandas>=0.23
  Downloading pandas-1.2.0-cp37-cp37m-manylinux1_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 11.2 MB/s 
[?25hCollecting pillow>=6.2.0
  Downloading Pillow-8.1.0-cp37-cp37m-manylinux1_x86_64.whl (2.2 MB)
[K     |████████████████

In [3]:
#!L
PICKLE_DUMPS_PATH = os.path.join('resources', 'pickle_dumps')

def save_via_pickle(obj, filepath):
    with open(filepath, 'wb') as output_file:
        pickle.dump(obj, output_file)
        
def load_from_pickle(filepath):
    with open(filepath, 'rb') as input_file:
        return pickle.load(input_file)
    
def get_pickle_dump_path(dump_name):
    return os.path.join(PICKLE_DUMPS_PATH, f'{dump_name}.pkl')

In [22]:
#!L
diffs = {}
diffs['bootstrap'] = load_from_pickle(get_pickle_dump_path('diabetes_bootstrap_argset_diffs_240'))
diffs['~LightGBM'] = load_from_pickle(get_pickle_dump_path('diabetes_LightGBM_argset_diffs_240'))
diffs['empty'] = load_from_pickle(get_pickle_dump_path('diabetes_empty_argset_diffs_240'))
diffs['default'] = load_from_pickle(get_pickle_dump_path('diabetes_default_argset_diffs_240'))
diffs['border count 32'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_32_argset_diffs_240'))
diffs['border count 64'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_64_argset_diffs_240'))
diffs['border count 96'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_96_argset_diffs_240'))
diffs['border count 128'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_128_argset_diffs_240'))
diffs['border count 160'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_160_argset_diffs_240'))
diffs['border count 192'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_192_argset_diffs_240'))
diffs['border count 224'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_224_argset_diffs_240'))
diffs['border count 255'] = load_from_pickle(get_pickle_dump_path('diabetes_border_count_255_argset_diffs_240'))

In [92]:
#!L
diffs['default without best iterations'] = load_from_pickle(get_pickle_dump_path('default_without_best_iteration_diffs_240'))

In [29]:
#!L
diffs.keys()

dict_keys(['bootstrap', '~LightGBM', 'empty', 'default', 'border count 32', 'border count 64', 'border count 96', 'border count 128', 'border count 160', 'border count 192', 'border count 224', 'border count 255'])

In [44]:
#!L
default_diffs = diffs['default'][:25]
results = []
for key in diffs.keys():
    if key == 'default':
        continue
    results.append({})
    current_diffs = diffs[key][:25]
    print(f'{key} argset')
    print('Two-sided tests')
    results[-1]['First model'] = 'default'
    results[-1]['Second model'] = key
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='two-sided', mode='exact')
    results[-1]['Two-sided, 25'] = p_value
    print(f'Default argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['default'], diffs[key])
    results[-1]['Two-sided, 240'] = p_value
    print(f'Default argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print(f'One-sided tests: median(\'default\' - \'{key}\' is positive)')
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='greater', mode='exact')
    results[-1]['One-sided, 25'] = p_value
    print(f'Default argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['default'], diffs[key], alternative='greater')
    results[-1]['One-sided, 240'] = p_value
    print(f'Default argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print()

bootstrap argset
Two-sided tests
Default argset vs bootstrap argset, 25 values, Wilcoxon test p-value: 1.8298625946044922e-05, statistic: 19.0
Default argset vs bootstrap argset, all values, Wilcoxon test p-value: 1.5875089696418545e-23, statistic: 3751.0
One-sided tests: median('default' - 'bootstrap' is positive)
Default argset vs bootstrap argset, 25 values, Wilcoxon test p-value: 0.9999924600124359, statistic: 19.0
Default argset vs bootstrap argset, all values, Wilcoxon test p-value: 1.0, statistic: 3751.0

~LightGBM argset
Two-sided tests
Default argset vs ~LightGBM argset, 25 values, Wilcoxon test p-value: 0.3665854334831238, statistic: 128.0
Default argset vs ~LightGBM argset, all values, Wilcoxon test p-value: 0.6910987292710071, statistic: 14150.0
One-sided tests: median('default' - '~LightGBM' is positive)
Default argset vs ~LightGBM argset, 25 values, Wilcoxon test p-value: 0.8237401843070984, statistic: 128.0
Default argset vs ~LightGBM argset, all values, Wilcoxon test p-

In [52]:
#!L
print(os.linesep.join([str(result['One-sided, 25']) for result in results]))

0.9999924600124359
0.8237401843070984
0.10999318957328796
0.14361464977264404
0.6345053911209106
0.09086492657661438
0.3360375761985779
0.19049182534217834
0.6144202947616577
0.8237401843070984
0.8237401843070984


In [56]:
#!L
for key, diffs1 in diffs.items():
    correlations = []
    for second_key, diffs2 in diffs.items():
        correlations.append((second_key, stats.pearsonr(diffs1, diffs2)[0]))
    correlations.sort(key=lambda x: x[0])
    print(key)
    print(os.linesep.join([str(corr[1]) for corr in correlations]))
    print(os.linesep.join([corr[0] for corr in correlations]))
    print()

bootstrap
1.0
0.22963869175192675
0.22111783894630183
0.2427262430072845
0.23470221625047263
0.23470221625047263
0.281571907265297
0.17405730305284803
0.18332146474918679
0.24665794333884475
0.15576467121392362
0.23470221625047263
bootstrap
border count 128
border count 160
border count 192
border count 224
border count 255
border count 32
border count 64
border count 96
default
empty
~LightGBM

~LightGBM
0.23470221625047263
0.8092801553650384
0.8833811707582003
0.9683878200568593
1.0
1.0
0.5734801106251974
0.6142420335157097
0.6384980358294536
0.9356942731898966
0.5367101088393159
1.0
bootstrap
border count 128
border count 160
border count 192
border count 224
border count 255
border count 32
border count 64
border count 96
default
empty
~LightGBM

empty
0.15576467121392362
0.47057652316148657
0.5285821505219656
0.5443441271176704
0.5367101088393159
0.5367101088393159
0.38107627533673505
0.5030471932052726
0.43403138318349593
0.5385846287937043
1.0
0.5367101088393159
bootstrap
border

In [23]:
#!L
for key, diffs1 in diffs.items():
    for second_key, diffs2 in diffs.items():
        if key == second_key:
            continue
        print(f'{key} argset and {second_key} argset correlation: {stats.pearsonr(diffs1, diffs2)[0]}')
    print()

bootstrap argset and ~LightGBM argset correlation: 0.23470221625047263
bootstrap argset and empty argset correlation: 0.15576467121392362
bootstrap argset and default argset correlation: 0.24665794333884475
bootstrap argset and border count 32 argset correlation: 0.281571907265297
bootstrap argset and border count 64 argset correlation: 0.17405730305284803
bootstrap argset and border count 96 argset correlation: 0.18332146474918679
bootstrap argset and border count 128 argset correlation: 0.22963869175192675
bootstrap argset and border count 160 argset correlation: 0.22111783894630183
bootstrap argset and border count 192 argset correlation: 0.2427262430072845
bootstrap argset and border count 224 argset correlation: 0.23470221625047263
bootstrap argset and border count 255 argset correlation: 0.23470221625047263

~LightGBM argset and bootstrap argset correlation: 0.23470221625047263
~LightGBM argset and empty argset correlation: 0.5367101088393159
~LightGBM argset and default argset c

In [None]:
#!L
default_diffs = diffs['default'][:25]
results = []
for key in diffs.keys():
    if key == 'default':
        continue
    results.append({})
    current_diffs = diffs[key][:25]
    print(f'{key} argset')
    print('Two-sided tests')
    results[-1]['First model'] = 'default'
    results[-1]['Second model'] = key
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='two-sided', mode='exact')
    results[-1]['Two-sided, 25'] = p_value
    print(f'Default argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['default'], diffs[key])
    results[-1]['Two-sided, 240'] = p_value
    print(f'Default argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print(f'One-sided tests: median(\'default\' - \'{key}\' is positive)')
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='greater', mode='exact')
    results[-1]['One-sided, 25'] = p_value
    print(f'Default argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['default'], diffs[key], alternative='greater')
    results[-1]['One-sided, 240'] = p_value
    print(f'Default argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print()

In [105]:
#!L
default_diffs = diffs['bootstrap'][:25]
results = []
for key in diffs.keys():
    if key == 'bootstrap':
        continue
    results.append({})
    current_diffs = diffs[key][:25]
    print(f'{key} argset')
    print('Two-sided tests')
    results[-1]['First model'] = 'bootstrap'
    results[-1]['Second model'] = key
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='two-sided', mode='exact')
    results[-1]['Two-sided, 25'] = p_value
    print(f'bootstrap argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['bootstrap'], diffs[key])
    results[-1]['Two-sided, 240'] = p_value
    print(f'bootstrap argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print(f'One-sided tests: median(\'bootstrap\' - \'{key}\' is positive)')
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='less', mode='exact')
    results[-1]['One-sided, 25'] = p_value
    print(f'bootstrap argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['bootstrap'], diffs[key], alternative='less')
    results[-1]['One-sided, 240'] = p_value
    print(f'bootstrap argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print()

~LightGBM argset
Two-sided tests
bootstrap argset vs ~LightGBM argset, 25 values, Wilcoxon test p-value: 3.1948089599609375e-05, statistic: 22.0
bootstrap argset vs ~LightGBM argset, all values, Wilcoxon test p-value: 3.185401980348059e-23, statistic: 3826.0
One-sided tests: median('bootstrap' - '~LightGBM' is positive)
bootstrap argset vs ~LightGBM argset, 25 values, Wilcoxon test p-value: 0.9999866783618927, statistic: 303.0
bootstrap argset vs ~LightGBM argset, all values, Wilcoxon test p-value: 1.0, statistic: 25335.0

empty argset
Two-sided tests
bootstrap argset vs empty argset, 25 values, Wilcoxon test p-value: 1.233816146850586e-05, statistic: 17.0
bootstrap argset vs empty argset, all values, Wilcoxon test p-value: 7.612120704396165e-26, statistic: 3192.0
One-sided tests: median('bootstrap' - 'empty' is positive)
bootstrap argset vs empty argset, 25 values, Wilcoxon test p-value: 0.9999949634075165, statistic: 308.0
bootstrap argset vs empty argset, all values, Wilcoxon test p

In [69]:
#!L
default_diffs = diffs['~LightGBM'][:25]
results = []
for key in diffs.keys():
    if key == '~LightGBM':
        continue
    results.append({})
    current_diffs = diffs[key][:25]
    print(f'{key} argset')
    print('Two-sided tests')
    results[-1]['First model'] = '~LightGBM'
    results[-1]['Second model'] = key
    try:
        w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='two-sided', mode='exact')
    except:
        w, p_value = 0, 1
    results[-1]['Two-sided, 25'] = p_value
    print(f'~LightGBM argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    try:
        w, p_value = stats.wilcoxon(diffs['~LightGBM'], diffs[key])
    except:
        w, p_value = 0, 1
    results[-1]['Two-sided, 240'] = p_value
    print(f'~LightGBM argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print(f'One-sided tests: median(\'~LightGBM\' - \'{key}\' is positive)')
    try:
        w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='greater', mode='exact')
    except:
        w, p_value = 0, 0
    results[-1]['One-sided, 25'] = p_value
    print(f'~LightGBM argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    try:
        w, p_value = stats.wilcoxon(diffs['~LightGBM'], diffs[key], alternative='greater')
    except:
        w, p_value = 0, 0
    results[-1]['One-sided, 240'] = p_value
    print(f'~LightGBM argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print()

bootstrap argset
Two-sided tests
~LightGBM argset vs bootstrap argset, 25 values, Wilcoxon test p-value: 3.1948089599609375e-05, statistic: 22.0
~LightGBM argset vs bootstrap argset, all values, Wilcoxon test p-value: 3.185401980348059e-23, statistic: 3826.0
One-sided tests: median('~LightGBM' - 'bootstrap' is positive)
~LightGBM argset vs bootstrap argset, 25 values, Wilcoxon test p-value: 0.9999866783618927, statistic: 22.0
~LightGBM argset vs bootstrap argset, all values, Wilcoxon test p-value: 1.0, statistic: 3826.0

empty argset
Two-sided tests
~LightGBM argset vs empty argset, 25 values, Wilcoxon test p-value: 0.10139739513397217, statistic: 101.0
~LightGBM argset vs empty argset, all values, Wilcoxon test p-value: 0.004742682960116934, statistic: 11521.0
One-sided tests: median('~LightGBM' - 'empty' is positive)
~LightGBM argset vs empty argset, 25 values, Wilcoxon test p-value: 0.050698697566986084, statistic: 224.0
~LightGBM argset vs empty argset, all values, Wilcoxon test p-



In [75]:
#!L
print(os.linesep.join([str(result['One-sided, 240']) for result in results]))

1.0
0.002371341480058467
0.34554936463550356
4.7316555742117085e-05
0.0010264902840726646
0.0031591888538776013
0.030387035900168956
0.14430518555033106
0.742557417317687
0
0


In [100]:
#!L
default_diffs = diffs['default'][:25]
results = []
for key in diffs.keys():
    if key == 'default':
        continue
    results.append({})
    current_diffs = diffs[key][:25]
    print(f'{key} argset')
    print('Two-sided tests')
    results[-1]['First model'] = 'default'
    results[-1]['Second model'] = key
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='two-sided', mode='exact')
    results[-1]['Two-sided, 25'] = p_value
    print(f'Default argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['default'], diffs[key])
    results[-1]['Two-sided, 240'] = p_value
    print(f'Default argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print(f'One-sided tests: median(\'default\' - \'{key}\' is positive)')
    w, p_value = stats.wilcoxon(default_diffs, current_diffs, alternative='greater', mode='exact')
    results[-1]['One-sided, 25'] = p_value
    print(f'Default argset vs {key} argset, 25 values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    w, p_value = stats.wilcoxon(diffs['default'], diffs[key], alternative='greater')
    results[-1]['One-sided, 240'] = p_value
    print(f'Default argset vs {key} argset, all values, Wilcoxon test p-value: {p_value}, statistic: {w}')
    print()

bootstrap argset
Two-sided tests
Default argset vs bootstrap argset, 25 values, Wilcoxon test p-value: 1.8298625946044922e-05, statistic: 19.0
Default argset vs bootstrap argset, all values, Wilcoxon test p-value: 1.5875089696418545e-23, statistic: 3751.0
One-sided tests: median('default' - 'bootstrap' is positive)
Default argset vs bootstrap argset, 25 values, Wilcoxon test p-value: 0.9999924600124359, statistic: 19.0
Default argset vs bootstrap argset, all values, Wilcoxon test p-value: 1.0, statistic: 3751.0

~LightGBM argset
Two-sided tests
Default argset vs ~LightGBM argset, 25 values, Wilcoxon test p-value: 0.3665854334831238, statistic: 128.0
Default argset vs ~LightGBM argset, all values, Wilcoxon test p-value: 0.6910987292710071, statistic: 14150.0
One-sided tests: median('default' - '~LightGBM' is positive)
Default argset vs ~LightGBM argset, 25 values, Wilcoxon test p-value: 0.8237401843070984, statistic: 128.0
Default argset vs ~LightGBM argset, all values, Wilcoxon test p-

In [104]:
#!L
print(os.linesep.join([str(result['Two-sided, 240']) for result in results]))

1.5875089696418545e-23
0.6910987292710071
0.0009376325813599984
3.944367796875008e-05
0.0012696210355790337
0.0058731027994309674
0.04385896822163693
0.42865327468848
0.9121697967839073
0.6910987292710071
0.6910987292710071
0.005303252410723073


In [106]:
#!L
np.mean(diffs['bootstrap'])

3.2735725937128057

In [107]:
#!L
np.mean(diffs['default'])

0.7078175024710792

In [None]:
#!L
