In [118]:
import csv
import json
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import seaborn as sns
import sys
import warnings
import scipy

In [123]:
from scipy import stats
from collections import defaultdict

In [124]:
# Directory that holds the experiment log files (they are read as CSV by the notebook).
LOGS_DIR = os.path.join('resources', 'logs')

In [130]:
# Build logs[dataset][(argset, depth)] -> DataFrame from the files in LOGS_DIR.
# Names starting with 'new' are filed under 'diabetes', names starting with
# 'imat' under 'imat2009'. The argset is the second '_'-separated token of the
# file name; the depth is the last character of the part before the first '.'.
logs = defaultdict(dict)
for filename in os.listdir(LOGS_DIR):
    is_diabetes = filename.startswith('new')
    if not filename.endswith('log') or not (is_diabetes or filename.startswith('imat')):
        continue
    file_prefix = 'diabetes' if is_diabetes else 'imat2009'
    argset = filename.split('_')[1]
    stem = filename.split('.')[0]
    try:
        depth = int(stem[-1:])
    except ValueError:
        # The name does not end with a digit, so no depth can be read from it.
        continue
    print(file_prefix, argset, depth)
    logs[file_prefix][(argset, depth)] = pd.read_csv(os.path.join(LOGS_DIR, filename))
# Freeze into a plain dict so that a mistyped dataset name raises KeyError
# instead of silently creating an empty entry.
logs = dict(logs)

diabetes bootstrap 4
diabetes default 2
diabetes bootstrap 3
imat2009 default 4
imat2009 bootstrap 3
diabetes bootstrap 2
imat2009 default 3
diabetes bootstrap 1
imat2009 default 2
diabetes default 3
imat2009 bootstrap 2
imat2009 bootstrap 4
diabetes default 1
diabetes default 4


In [139]:
# For every dataset and depth, compare the test-set gap
# (regressor RMSE - lasso RMSE) of the default argset with the gap of the
# bootstrap argset using a paired Wilcoxon signed-rank test.
for dataset_title in ['imat2009', 'diabetes']:
    print(f'Dataset: {dataset_title}')
    dataset_logs = logs.get(dataset_title, {})
    for depth in [1, 2, 3, 4]:
        # The paired test needs both runs. Checking only the default log would
        # raise KeyError below whenever the bootstrap log is missing.
        if ('default', depth) not in dataset_logs or ('bootstrap', depth) not in dataset_logs:
            continue
        print(f'Depth: {depth}')
        default_log = dataset_logs[('default', depth)]
        bootstrap_log = dataset_logs[('bootstrap', depth)]

        default_diff = default_log['test reg rmse'] - default_log['test lasso rmse']
        bootstrap_diff = bootstrap_log['test reg rmse'] - bootstrap_log['test lasso rmse']
        # The two logs may differ in length: rows are paired by position and
        # both series are truncated to the shorter one.
        common_len = min(default_diff.shape[0], bootstrap_diff.shape[0])
        # alternative='less': the one-sided alternative is that
        # (default_diff - bootstrap_diff) is shifted below zero.
        p = stats.wilcoxon(default_diff.iloc[:common_len],
                           bootstrap_diff.iloc[:common_len],
                           alternative='less')[1]
        print(f'Bootstrap diffs are worse, p-value {p} (based on {common_len}-dim vectors)')

    print()

Dataset: imat2009
Depth: 2
Bootstrap diffs are worse, p-value 0.058450055270219244 (based on 200-dim vectors)
Depth: 3
Bootstrap diffs are worse, p-value 0.300329522036154 (based on 200-dim vectors)
Depth: 4
Bootstrap diffs are worse, p-value 0.9188007121396837 (based on 61-dim vectors)

Dataset: diabetes
Depth: 1
Bootstrap diffs are worse, p-value 0.9982214807371874 (based on 200-dim vectors)
Depth: 2
Bootstrap diffs are worse, p-value 0.9999999998424869 (based on 200-dim vectors)
Depth: 3
Bootstrap diffs are worse, p-value 0.9444708824157715 (based on 21-dim vectors)
Depth: 4
Bootstrap diffs are worse, p-value 0.9999999999999997 (based on 200-dim vectors)



## Для обоих датасетов наблюдается существенное ухудшение разности скоров регрессора и лассо (лассо существеннее недоучивается). Однако для маленького датасета это наблюдается всегда, а для imat2009 ухудшение возрастает в терминах p-value при увеличении глубины. Возможно, это связано с тем, что для маленького датасета увеличение глубины не дает улучшения в плане результатов регрессора, а для imat2009 улучшение происходит (см. ниже).

In [150]:
# For every dataset and depth, run a one-sample Wilcoxon signed-rank test on
# the test-set gap (regressor RMSE - lasso RMSE) of each argset and print the
# 5th/95th percentiles of that gap.
for dataset_title in ['imat2009', 'diabetes']:
    print(f'Dataset: {dataset_title}')
    dataset_logs = logs.get(dataset_title, {})
    for depth in [1, 2, 3, 4]:
        # Both logs are used below. Checking only the default log would raise
        # KeyError whenever the bootstrap log is missing.
        if ('default', depth) not in dataset_logs or ('bootstrap', depth) not in dataset_logs:
            continue
        print(f'Depth: {depth}')
        default_log = dataset_logs[('default', depth)]
        bootstrap_log = dataset_logs[('bootstrap', depth)]

        default_diff = default_log['test reg rmse'] - default_log['test lasso rmse']
        bootstrap_diff = bootstrap_log['test reg rmse'] - bootstrap_log['test lasso rmse']
        # alternative='greater': the one-sided alternative is that the diffs are
        # shifted above zero, so a p-value close to 1 means there is no evidence
        # of positive diffs.
        p = stats.wilcoxon(default_diff, alternative='greater')[1]
        print(f'Default diffs are negative, p-value {p}')
        # NOTE(review): the printed interval is the 5th-95th percentile range of
        # the observed diffs, not a confidence interval for their location.
        print(f'Confidence interval: [{np.percentile(default_diff, 5)}; {np.percentile(default_diff, 95)}]')
        p = stats.wilcoxon(bootstrap_diff, alternative='greater')[1]
        print(f'Bootstrap diffs are negative, p-value {p}')
        print(f'Confidence interval: [{np.percentile(bootstrap_diff, 5)}; {np.percentile(bootstrap_diff, 95)}]')

    print()

Dataset: imat2009
Depth: 2
Default diffs are negative, p-value 1.0
Confidence interval: [-0.28881392172524434; -0.14336039274348247]
Bootstrap diffs are negative, p-value 1.0
Confidence interval: [-0.286675356560148; -0.1472046058359014]
Depth: 3
Default diffs are negative, p-value 1.0
Confidence interval: [-0.2911402544635137; -0.1484648358955075]
Bootstrap diffs are negative, p-value 1.0
Confidence interval: [-0.28018821073682626; -0.14510953886336395]
Depth: 4
Default diffs are negative, p-value 1.0
Confidence interval: [-0.28524718887941775; -0.1457945269748979]
Bootstrap diffs are negative, p-value 0.9999999999944442
Confidence interval: [-0.27861454944965336; -0.16714273663906787]

Dataset: diabetes
Depth: 1
Default diffs are negative, p-value 0.9999911687084061
Confidence interval: [-3.2237030818791035; 1.409568885202918]
Bootstrap diffs are negative, p-value 0.9999999999998184
Confidence interval: [-3.315059213075999; 1.333970662607862]
Depth: 2
Default diffs are negative, p-va

## Тем не менее, на обоих датасетах разности в обеих (!) конфигурациях меньше нуля по критерию Уилкоксона.

In [142]:
# For every dataset and depth, compare the 'leaves distribution variance'
# columns (train subset and all data) of the default and bootstrap argsets with
# a paired Wilcoxon signed-rank test.
for dataset_title in ['imat2009', 'diabetes']:
    print(f'Dataset: {dataset_title}')
    dataset_logs = logs.get(dataset_title, {})
    for depth in [1, 2, 3, 4]:
        # Both logs are used below. Checking only the default log would raise
        # KeyError whenever the bootstrap log is missing.
        if ('default', depth) not in dataset_logs or ('bootstrap', depth) not in dataset_logs:
            continue
        print(f'Depth: {depth}')
        default_log = dataset_logs[('default', depth)]
        bootstrap_log = dataset_logs[('bootstrap', depth)]

        # (column to compare, suffix inserted into the printed message)
        for column, label in [('leaves distribution variance (train)', ' (train subset)'),
                              ('leaves distribution variance (all)', '')]:
            a, b = default_log[column], bootstrap_log[column]
            # Logs may differ in length: pair rows by position up to the shorter one.
            common_size = min(a.shape[0], b.shape[0])
            # alternative='less': the one-sided alternative is that (a - b),
            # i.e. default minus bootstrap, is shifted below zero.
            p = stats.wilcoxon(a.iloc[:common_size],
                               b.iloc[:common_size], alternative='less')[1]
            print(f'log(w) + 1 is more variative for default argset{label}, p-value: {p}')

    print()

Dataset: imat2009
Depth: 2
log(w) + 1 is more variative for default argset (train subset), p-value: 0.9948200223531455
log(w) + 1 is more variative for default argset, p-value: 0.7339249192295272
Depth: 3
log(w) + 1 is more variative for default argset (train subset), p-value: 1.0
log(w) + 1 is more variative for default argset, p-value: 1.0
Depth: 4
log(w) + 1 is more variative for default argset (train subset), p-value: 0.9999999999935505
log(w) + 1 is more variative for default argset, p-value: 0.9999999999908817

Dataset: diabetes
Depth: 1
log(w) + 1 is more variative for default argset (train subset), p-value: 1.0
log(w) + 1 is more variative for default argset, p-value: 1.0
Depth: 2
log(w) + 1 is more variative for default argset (train subset), p-value: 1.0
log(w) + 1 is more variative for default argset, p-value: 1.0
Depth: 3
log(w) + 1 is more variative for default argset (train subset), p-value: 0.9954919815063477
log(w) + 1 is more variative for default argset, p-value: 0.99

## Для всех конфигураций и всех датасетов распределения log(w) + 1 более вариативны для дефолтного набора параметров.

In [145]:
# For every dataset, compare the test regressor RMSE at depth d with the one at
# depth d + 1 (separately for each argset) using a paired Wilcoxon signed-rank test.
for dataset_title in ['imat2009', 'diabetes']:
    print(f'Dataset: {dataset_title}')
    dataset_logs = logs.get(dataset_title, {})
    for depth in [1, 2, 3]:
        if ('default', depth) not in dataset_logs:
            continue
        print(f'Depth: {depth} -> {depth + 1}')

        for sub_key in ['default', 'bootstrap']:
            # The comparison needs the logs of both depths for this argset. The
            # check above only covers ('default', depth), so skip here instead
            # of raising KeyError on a missing log.
            if (sub_key, depth) not in dataset_logs or (sub_key, depth + 1) not in dataset_logs:
                continue
            reg_scores_1 = dataset_logs[(sub_key, depth)]['test reg rmse']
            reg_scores_2 = dataset_logs[(sub_key, depth + 1)]['test reg rmse']
            # Logs may differ in length: pair rows by position up to the shorter one.
            common_size = min(reg_scores_1.shape[0], reg_scores_2.shape[0])
            # alternative='less': the one-sided alternative is that
            # (RMSE at depth - RMSE at depth + 1) is shifted below zero.
            p = stats.wilcoxon(reg_scores_1.iloc[:common_size],
                               reg_scores_2.iloc[:common_size],
                               alternative='less')[1]
            print(f'Bigger depth for {sub_key} datasets makes scores better, p-value: {p}')
    print()

Dataset: imat2009
Depth: 2 -> 3
Bigger depth for default datasets makes scores better, p-value: 0.9999999999976759
Bigger depth for bootstrap datasets makes scores better, p-value: 0.9999999999507848
Depth: 3 -> 4
Bigger depth for default datasets makes scores better, p-value: 0.9923861617620119
Bigger depth for bootstrap datasets makes scores better, p-value: 0.999575644152517

Dataset: diabetes
Depth: 1 -> 2
Bigger depth for default datasets makes scores better, p-value: 4.230584877953538e-12
Bigger depth for bootstrap datasets makes scores better, p-value: 0.0013808982251721012
Depth: 2 -> 3
Bigger depth for default datasets makes scores better, p-value: 0.0003892207965681651
Bigger depth for bootstrap datasets makes scores better, p-value: 0.004508018493652344
Depth: 3 -> 4
Bigger depth for default datasets makes scores better, p-value: 1.880149070706101e-05
Bigger depth for bootstrap datasets makes scores better, p-value: 1.1920928955078125e-05



## Для imat2009 увеличение глубины приводит к улучшению скоров регрессора (в обеих конфигурациях). Для маленького датасета - нет!

In [146]:
# For every dataset, compare the test lasso RMSE at depth d with the one at
# depth d + 1 (separately for each argset) using a paired Wilcoxon signed-rank test.
for dataset_title in ['imat2009', 'diabetes']:
    print(f'Dataset: {dataset_title}')
    dataset_logs = logs.get(dataset_title, {})
    for depth in [1, 2, 3]:
        if ('default', depth) not in dataset_logs:
            continue
        print(f'Depth: {depth} -> {depth + 1}')

        for sub_key in ['default', 'bootstrap']:
            # The comparison needs the logs of both depths for this argset. The
            # check above only covers ('default', depth), so skip here instead
            # of raising KeyError on a missing log.
            if (sub_key, depth) not in dataset_logs or (sub_key, depth + 1) not in dataset_logs:
                continue
            lasso_scores_1 = dataset_logs[(sub_key, depth)]['test lasso rmse']
            lasso_scores_2 = dataset_logs[(sub_key, depth + 1)]['test lasso rmse']
            # Logs may differ in length: pair rows by position up to the shorter one.
            common_size = min(lasso_scores_1.shape[0], lasso_scores_2.shape[0])
            # alternative='less': the one-sided alternative is that
            # (RMSE at depth - RMSE at depth + 1) is shifted below zero.
            p = stats.wilcoxon(lasso_scores_1.iloc[:common_size],
                               lasso_scores_2.iloc[:common_size],
                               alternative='less')[1]
            print(f'Bigger depth for {sub_key} datasets makes lasso better, p-value: {p}')
    print()

Dataset: imat2009
Depth: 2 -> 3
Bigger depth for default datasets makes lasso better, p-value: 0.9999999996377407
Bigger depth for bootstrap datasets makes lasso better, p-value: 0.999999561238125
Depth: 3 -> 4
Bigger depth for default datasets makes lasso better, p-value: 0.12823830756288407
Bigger depth for bootstrap datasets makes lasso better, p-value: 0.7280561422435816

Dataset: diabetes
Depth: 1 -> 2
Bigger depth for default datasets makes lasso better, p-value: 1.2838172633545329e-11
Bigger depth for bootstrap datasets makes lasso better, p-value: 1.9006641073632203e-13
Depth: 2 -> 3
Bigger depth for default datasets makes lasso better, p-value: 0.3718318214285564
Bigger depth for bootstrap datasets makes lasso better, p-value: 0.8037080764770508
Depth: 3 -> 4
Bigger depth for default datasets makes lasso better, p-value: 0.26089476167565756
Bigger depth for bootstrap datasets makes lasso better, p-value: 0.12862396240234375



## На маленьком датасете скоры лассо скорее не улучшаются. На большом в конфигурации с бутстрапом улучшение вроде есть всегда, в дефолтной - не всегда.

# Дальше идут какие-то старые выводы, на них можно не смотреть :)

## Корреляции между изменениями разностей при увеличении глубины нет

In [None]:
# NOTE(review): default_test_diffs / bootstrap_test_diffs are not defined in
# the cells shown here -- confirm where they come from before re-running.
# Relative change of the diffs between keys 1 and 2 (presumably depths), per argset.
default_rel_change = np.abs(default_test_diffs[2] - default_test_diffs[1]) / np.abs(default_test_diffs[1])
bootstrap_rel_change = np.abs(bootstrap_test_diffs[2] - bootstrap_test_diffs[1]) / np.abs(bootstrap_test_diffs[1])
# One-sided paired test; the cell displays only the p-value.
stats.wilcoxon(default_rel_change, bootstrap_rel_change, alternative='greater').pvalue

In [None]:
# NOTE(review): default_test_diffs / bootstrap_test_diffs are not defined in
# the cells shown here -- confirm where they come from before re-running.
# Relative change of the diffs between keys 2 and 4 (presumably depths), per argset.
default_rel_change = np.abs(default_test_diffs[4] - default_test_diffs[2]) / np.abs(default_test_diffs[2])
bootstrap_rel_change = np.abs(bootstrap_test_diffs[4] - bootstrap_test_diffs[2]) / np.abs(bootstrap_test_diffs[2])
# One-sided paired test; the cell displays only the p-value.
stats.wilcoxon(default_rel_change, bootstrap_rel_change, alternative='greater').pvalue

## При увеличении глубины бутстрап меняется в плане разностей медленнее

In [None]:
# P-value of the one-sided paired Wilcoxon test between default_test_diffs[1] and [2].
# NOTE(review): default_test_diffs is not defined in the cells shown here.
stats.wilcoxon(default_test_diffs[1], default_test_diffs[2], alternative='less').pvalue

In [None]:
# P-value of the one-sided paired Wilcoxon test between default_test_diffs[2] and [3].
# NOTE(review): default_test_diffs is not defined in the cells shown here.
stats.wilcoxon(default_test_diffs[2], default_test_diffs[3], alternative='less').pvalue

In [None]:
# P-value of the one-sided paired Wilcoxon test between default_test_diffs[3] and [4].
# NOTE(review): default_test_diffs is not defined in the cells shown here.
stats.wilcoxon(default_test_diffs[3], default_test_diffs[4], alternative='less').pvalue

In [None]:
# P-value of the one-sided paired Wilcoxon test between bootstrap_test_diffs[1] and [2].
# NOTE(review): bootstrap_test_diffs is not defined in the cells shown here.
stats.wilcoxon(bootstrap_test_diffs[1], bootstrap_test_diffs[2], alternative='less').pvalue

In [None]:
# P-value of the one-sided paired Wilcoxon test between bootstrap_test_diffs[2] and [4].
# NOTE(review): bootstrap_test_diffs is not defined in the cells shown here.
stats.wilcoxon(bootstrap_test_diffs[2], bootstrap_test_diffs[4], alternative='less').pvalue

In [None]:
# Peek at the first five rows of a log. default_log is the loop variable left
# over from whichever comparison cell above was executed last.
default_log.head(n=5)

In [None]:
# Paired one-sided Wilcoxon test of the leaves variance, default vs bootstrap,
# for each depth and for both the 'train' and the 'all' entries.
# NOTE(review): default_leaves_var / bootstrap_leaves_var are not defined in
# the cells shown here -- confirm where they come from before re-running.
for depth in [1, 2, 4]:
    for subset in ['train', 'all']:
        p = stats.wilcoxon(default_leaves_var[depth][subset],
                           bootstrap_leaves_var[depth][subset],
                           alternative='less')[1]
        # The label is taken from the key actually compared: the 'all' entry
        # used to be printed as "(test)".
        print(f'Variance of default > variance of bootstrap ({subset}): {p}')