In [1]:
import sklearn

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import glob
import re
import sklearn.datasets as data

In [3]:
# Execute the helper notebook inside this kernel's namespace.
# NOTE(review): presumably this is what provides read_crime, read_ames_housing,
# read_phishing and read_mushroom used by the `datasets` cell below -- confirm.
%run data_load_wrapper.ipynb

In [4]:
# Collect the result CSVs that sit next to this notebook.  glob returns matches
# in arbitrary, filesystem-dependent order, while later cells pick frames by
# position (lst_dfs[0]), so sort the matches to make that order reproducible.
res_glob = sorted(glob.glob('./*real_data*csv*'))
# Bare file names (directory prefix stripped), in the same order as res_glob.
res_glob_names = [val.split('/')[-1] for val in res_glob]

In [5]:
res_glob

['./classification_real_data_ds0_9_rs10_small_fixed.csv',
 './regression_real_data_ds0_9_rs10_small.csv']

In [6]:
# Plain copy of the matched file names; the following cell rebinds lst_dfs
# to the DataFrames loaded from these files.
lst_dfs = list(res_glob_names)

In [7]:
# Load every matched results file into its own DataFrame, keeping file order.
lst_dfs = list(map(pd.read_csv, res_glob_names))

In [8]:
# df_full = pd.concat(lst_dfs)
# Work on the first loaded file only.  Per the file listing displayed above that
# is the classification results file; this depends on the order glob returned.
df = lst_dfs[0]

In [9]:
# load_boston was removed from scikit-learn in release 1.2, where looking it up
# raises ImportError.  Fall back to the dataset's 13 column names so this cell
# still runs on current scikit-learn; older releases keep using the loader.
try:
    boston_feature_names = data.load_boston().feature_names
except (ImportError, AttributeError):
    boston_feature_names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'])

# Feature names of every dataset, keyed by the label that the feature-engineering
# functions below match against the results' `dataset` column.
datasets = {
    'boston': boston_feature_names,
    'diabetes': data.load_diabetes().feature_names,
    'crime': read_crime().feature_names,
    'ames_housing': read_ames_housing().feature_names,
    'wine': data.load_wine().feature_names,
    'breast_cancer': data.load_breast_cancer().feature_names,
    'phishing': read_phishing().feature_names,
    'mushroom': read_mushroom().feature_names,
}

In [10]:
def get_feature_sets(df):
    """Build, for every row, the set of features dropped so far in its experiment.

    Rows are walked in order.  A row whose ``iteration`` is 0 starts a new
    experiment; each row is then assigned every ``dropped_feature`` value seen
    from that start row up to and including itself, minus the ``'full_set'``
    marker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``iteration`` and ``dropped_feature`` columns, ordered so
        that the rows of one experiment are contiguous.

    Returns
    -------
    pandas.Series
        Series of sets named ``'dropped_feature_set'`` with a fresh 0..n-1
        index and exactly one entry per row of ``df``.

    Raises
    ------
    KeyError
        If the rows accumulated for some position do not contain
        ``'full_set'``.
    """
    iterations = df['iteration'].tolist()
    dropped = df['dropped_feature'].tolist()

    lst = []
    # Features seen since the current experiment started.  Carrying this set
    # forward replaces re-slicing the frame for every row.
    seen = set()
    # Walk every row of the frame (not a fixed number of rows).
    for iteration, feature in zip(iterations, dropped):
        # at the start of a new experiment forget the previous one
        if iteration == 0:
            seen = set()
        seen.add(feature)

        # each row gets its own copy, without the 'full_set' marker
        all_dropped = set(seen)
        all_dropped.remove('full_set')
        lst.append(all_dropped)

    feature_arr = pd.Series(lst, name='dropped_feature_set')
    return feature_arr

In [11]:
def get_clasif_features(df):
    """Add baseline-relative metrics and feature-set columns to classification results.

    Relies on two module-level names: the ``datasets`` mapping (dataset label ->
    feature names) and ``get_feature_sets``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw results.  The columns read here are ``dataset``, ``random_state``,
        ``data_split``, ``model``, ``imp_type``, ``iteration``,
        ``dropped_feature`` and ``mcc_test``.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended with:

        * ``init_mcc_test`` -- highest ``mcc_test`` among the run's
          ``full_set`` rows
        * ``max_feature_cnt`` -- highest ``iteration`` of the dataset plus one,
          and ``f_cnt_10perc`` / ``f_cnt_20perc`` / ``f_cnt_30perc``, the
          rounded 10/20/30 % shares of it
        * ``mcc_delta``, ``mcc_perc``, ``pp_coef`` -- ``mcc_test`` relative to
          ``init_mcc_test``
        * ``feature_cnt``, ``feature_dropped``, ``feature_dropped_perc``
        * ``max_perc`` -- best ``mcc_perc`` of the run over ``iteration > 0``
        * ``full_feature_lst`` / ``full_feature_set``, ``dropped_feature_set``,
          ``current_feature_set`` and its string form ``current_feature_str``

        Every merge is an inner join, so rows without a ``full_set`` baseline,
        runs that never go past iteration 0 and datasets missing from
        ``datasets`` are dropped.
    """
    # one experiment run = one combination of these keys
    run_keys = ['dataset', 'random_state', 'data_split', 'model', 'imp_type']

    # baseline metric: the run that still uses the full feature set
    init_metric = df.query('dropped_feature == "full_set"')
    init_metric = init_metric.groupby(run_keys).mcc_test.max()
    init_metric = init_metric.rename('init_mcc_test')

    # merge into original df
    df_full = pd.merge(df, init_metric, on=run_keys, how='inner')

    # number of features per dataset, derived from the highest iteration index
    max_feature_cnt = df[['dataset', 'iteration']]
    max_feature_cnt = max_feature_cnt.groupby(['dataset']).max() + 1

    max_feature_cnt['f_cnt_10perc'] = np.round((max_feature_cnt.iteration * 0.1)).astype(int)
    max_feature_cnt['f_cnt_20perc'] = np.round((max_feature_cnt.iteration * 0.2)).astype(int)
    max_feature_cnt['f_cnt_30perc'] = np.round((max_feature_cnt.iteration * 0.3)).astype(int)
    max_feature_cnt = max_feature_cnt.rename(columns={'iteration': 'max_feature_cnt'})

    # merge into original df
    df_full = pd.merge(df_full, max_feature_cnt, on=['dataset'], how='inner')

    df_full['mcc_delta'] = df_full.mcc_test - df_full.init_mcc_test
    df_full['mcc_perc'] = abs(df_full.mcc_test / df_full.init_mcc_test) - 1
    df_full['feature_cnt'] = df_full.max_feature_cnt - df_full.iteration
    df_full['pp_coef'] = df_full.mcc_test / df_full.init_mcc_test

    # best relative score per run, looking only at the non-base models
    max_perc = df_full.query('iteration > 0')
    max_perc = max_perc.groupby(run_keys).mcc_perc.max()
    max_perc = max_perc.rename('max_perc')

    df_full = pd.merge(df_full, max_perc, on=run_keys, how='inner')

    df_full['feature_dropped'] = df_full.max_feature_cnt - df_full.feature_cnt
    df_full['feature_dropped_perc'] = (1 - (df_full.max_feature_cnt - df_full.feature_dropped)
                                       / df_full.max_feature_cnt)

    # attach every dataset's complete feature list / set
    features_df = pd.DataFrame(datasets.items(), columns=['dataset', 'full_feature_lst'])
    features_df['full_feature_set'] = features_df.full_feature_lst.apply(set)

    df_full = df_full.merge(features_df, on='dataset')

    # joined by position: get_feature_sets returns a 0..n-1 index, which matches
    # the fresh index produced by the merge above
    dropped_feature_set = get_feature_sets(df_full)
    df_full = df_full.join(dropped_feature_set)

    # features still in use = all features minus the ones dropped so far
    df_full['current_feature_set'] = (df_full['full_feature_set'] - df_full['dropped_feature_set'])
    # string form so the set can serve as a groupby key
    df_full['current_feature_str'] = df_full.current_feature_set.apply(str)

    return df_full

In [12]:
def get_regr_features(df):
    """Add baseline-relative metrics and feature-set columns to regression results.

    Relies on two module-level names: the ``datasets`` mapping (dataset label ->
    feature names) and ``get_feature_sets``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw results.  The columns read here are ``dataset``, ``random_state``,
        ``data_split``, ``model``, ``imp_type``, ``iteration``,
        ``dropped_feature`` and ``mse_test``.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended with:

        * ``init_mse_test`` -- highest ``mse_test`` among the run's
          ``full_set`` rows
        * ``max_feature_cnt`` -- highest ``iteration`` of the dataset plus one,
          and ``f_cnt_10perc`` / ``f_cnt_20perc`` / ``f_cnt_30perc``, the
          rounded 10/20/30 % shares of it
        * ``mse_delta`` (``init_mse_test - mse_test``), ``mse_perc``
          (``mse_delta / init_mse_test``) and ``pp_coef``
          (``init_mse_test / mse_test``)
        * ``feature_cnt``, ``feature_dropped``, ``feature_dropped_perc``
        * ``max_perc`` -- best ``mse_perc`` of the run over ``iteration > 0``
        * ``full_feature_lst`` / ``full_feature_set``, ``dropped_feature_set``,
          ``current_feature_set`` and its string form ``current_feature_str``

        Every merge is an inner join, so rows without a ``full_set`` baseline,
        runs that never go past iteration 0 and datasets missing from
        ``datasets`` are dropped.
    """
    # one experiment run = one combination of these keys
    run_keys = ['dataset', 'random_state', 'data_split', 'model', 'imp_type']

    # baseline metric: the run that still uses the full feature set
    init_metric = df.query('dropped_feature == "full_set"')
    init_metric = init_metric.groupby(run_keys).mse_test.max()
    init_metric = init_metric.rename('init_mse_test')

    # merge into original df
    df_full = pd.merge(df, init_metric, on=run_keys, how='inner')

    # number of features per dataset, derived from the highest iteration index
    max_feature_cnt = df[['dataset', 'iteration']]
    max_feature_cnt = max_feature_cnt.groupby(['dataset']).max() + 1

    max_feature_cnt['f_cnt_10perc'] = np.round((max_feature_cnt.iteration * 0.1)).astype(int)
    max_feature_cnt['f_cnt_20perc'] = np.round((max_feature_cnt.iteration * 0.2)).astype(int)
    max_feature_cnt['f_cnt_30perc'] = np.round((max_feature_cnt.iteration * 0.3)).astype(int)
    max_feature_cnt = max_feature_cnt.rename(columns={'iteration': 'max_feature_cnt'})

    # merge into original df
    df_full = pd.merge(df_full, max_feature_cnt, on=['dataset'], how='inner')

    # positive delta / perc means the error went down relative to the baseline
    df_full['mse_delta'] = df_full.init_mse_test - df_full.mse_test
    df_full['mse_perc'] = df_full.mse_delta / df_full.init_mse_test
    df_full['feature_cnt'] = df_full.max_feature_cnt - df_full.iteration
    df_full['pp_coef'] = df_full.init_mse_test / df_full.mse_test

    # best relative score per run, looking only at the non-base models
    max_perc = df_full.query('iteration > 0')
    max_perc = max_perc.groupby(run_keys).mse_perc.max()
    max_perc = max_perc.rename('max_perc')

    df_full = pd.merge(df_full, max_perc, on=run_keys, how='inner')

    df_full['feature_dropped'] = df_full.max_feature_cnt - df_full.feature_cnt
    df_full['feature_dropped_perc'] = (1 - (df_full.max_feature_cnt - df_full.feature_dropped)
                                       / df_full.max_feature_cnt)

    # attach every dataset's complete feature list / set
    features_df = pd.DataFrame(datasets.items(), columns=['dataset', 'full_feature_lst'])
    features_df['full_feature_set'] = features_df.full_feature_lst.apply(set)

    df_full = df_full.merge(features_df, on='dataset')

    # joined by position: get_feature_sets returns a 0..n-1 index, which matches
    # the fresh index produced by the merge above
    dropped_feature_set = get_feature_sets(df_full)
    df_full = df_full.join(dropped_feature_set)

    # features still in use = all features minus the ones dropped so far
    df_full['current_feature_set'] = (df_full['full_feature_set'] - df_full['dropped_feature_set'])
    # string form so the set can serve as a groupby key
    df_full['current_feature_str'] = df_full.current_feature_set.apply(str)

    return df_full

In [35]:
def get_feature_sets(df):
    """Collect, row by row, the features dropped so far within each experiment.

    A row with ``iteration == 0`` opens a new experiment.  Every row receives
    the set of ``dropped_feature`` values found between the opening row of its
    experiment and itself (both included), with ``'full_set'`` taken out.

    Returns a ``pandas.Series`` of sets named ``'dropped_feature_set'`` that has
    one entry per row of ``df`` and a default 0..n-1 index.  Raises ``KeyError``
    when a window holds no ``'full_set'`` entry.
    """
    iteration_col = df['iteration'].tolist()
    dropped_col = df['dropped_feature'].tolist()

    collected = []
    window_start = 0
    for pos, step in enumerate(iteration_col):
        # a zero iteration marks the first row of the next experiment
        if step == 0:
            window_start = pos

        # everything dropped from the experiment's first row up to this one
        window = set(dropped_col[window_start:pos + 1])
        window.remove('full_set')
        collected.append(window)

    return pd.Series(collected, name='dropped_feature_set')

In [60]:
def case3_regr(df):
    """Case 3 (regression): best coalition among the rows at the 20 % feature mark.

    Keeps the rows whose ``feature_cnt`` equals ``f_cnt_20perc``, averages
    ``mse_test`` per (dataset, model, coalition) and returns, for every
    (dataset, model), the coalition(s) with the lowest average MSE.

    NOTE(review): the original comment described the filter as "only use 80% of
    features"; the condition itself selects rows where ``feature_cnt`` matches
    the 20 % count -- confirm which reading is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``f_cnt_20perc``, ``feature_cnt``, ``dataset``, ``model``,
        ``current_feature_str``, ``mse_test`` and ``init_mse_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (dataset, model, current_feature_str) with columns
        ``mse_test``, ``init_mse_test_mean`` and ``mse_gain_loss_pct``
        (``(init_mse_test_mean / mse_test - 1) * 100``, rounded to 2 places).
    """
    case3 = df[df.f_cnt_20perc == df.feature_cnt]

    relevant_columns = ['dataset', 'current_feature_str',
                        'mse_test', 'init_mse_test', 'model']
    case3 = case3[relevant_columns]

    # Average only the numeric baseline column: a frame-wide mean() would also
    # try to average current_feature_str, which pandas >= 2 rejects.
    init_mse_mean = (case3.groupby(['dataset', 'model'], sort=False)['init_mse_test']
                          .mean()
                          .rename('init_mse_test_mean'))
    case3 = case3.join(init_mse_mean, on=['dataset', 'model'])

    # mean score of every coalition (feature set)
    case3 = (case3.groupby(['dataset', 'model', 'current_feature_str'], sort=False)
                  [['mse_test', 'init_mse_test_mean']]
                  .mean())
    case3 = case3.sort_values(['dataset', 'model', 'mse_test'], ascending=False)

    # keep the coalition(s) reaching the lowest MSE per (dataset, model)
    min_mse = case3.groupby(['dataset', 'model'])['mse_test'].min().rename('min_mse')
    case3 = case3.join(min_mse, on=['dataset', 'model'])
    # copy so the column assignment below does not write into a slice
    case3 = case3.query('mse_test == min_mse').copy()

    case3['mse_gain_loss_pct'] = round((((case3['init_mse_test_mean'] / case3['mse_test']) - 1) * 100), 2)

    case3 = case3.drop(columns=['min_mse'])

    return case3

In [87]:
def case2_regr(df):
    """Case 2 (regression): most features dropped without losing accuracy.

    Only rows with ``mse_test <= init_mse_test`` are considered.  For every
    (dataset, model, data_split, random_state) the row with the highest
    ``feature_dropped_perc`` is taken; per (dataset, model) only the rows that
    reach the overall maximum are kept and averaged per coalition.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``data_split``, ``random_state``, ``dataset``, ``imp_type``,
        ``current_feature_str``, ``feature_dropped_perc``, ``model``,
        ``mse_test`` and ``init_mse_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (dataset, model, current_feature_str) with columns
        ``feature_dropped_perc``, ``mse_test`` and ``init_mse_test_mean``,
        sorted by dataset and model in descending order.
    """
    # case2 main condition: no accuracy lost
    no_loss = df.query('mse_test <= init_mse_test')

    relevant_columns = ['data_split', 'random_state', 'dataset', 'imp_type',
                        'current_feature_str', 'feature_dropped_perc', 'model', 'mse_test', 'init_mse_test']
    no_loss = no_loss[relevant_columns]

    # get features sets that maximize feature dropped perc
    idx = no_loss.groupby(['dataset', 'model', 'data_split', 'random_state'],
                          sort=False)['feature_dropped_perc'].idxmax()
    best_runs = no_loss.loc[idx].sort_values('feature_dropped_perc', ascending=False)

    # Average only the numeric baseline column: a frame-wide mean() would also
    # try to average the string columns, which pandas >= 2 rejects.
    init_mse = (best_runs.groupby(['dataset', 'model'], sort=False)['init_mse_test']
                         .mean()
                         .rename('init_mse_test_mean'))
    best_runs = best_runs.join(init_mse, on=['dataset', 'model'])

    max_feature_dropped = (best_runs.groupby(['dataset', 'model']).feature_dropped_perc
                                    .max()
                                    .rename('max_feature_dropped'))
    best_runs = best_runs.merge(max_feature_dropped, on=['dataset', 'model'])
    best_runs = best_runs.query('feature_dropped_perc == max_feature_dropped')

    grouping = ['dataset', 'model', 'current_feature_str', 'feature_dropped_perc', 'mse_test', 'init_mse_test_mean']
    final_df = best_runs[grouping].groupby(['dataset', 'model', 'current_feature_str']).mean()
    # NOTE(review): drop_duplicates compares column values only, so rows of
    # different (dataset, model, coalition) keys that happen to share identical
    # numbers collapse into one -- confirm this is intended.
    final_df = final_df.drop_duplicates().sort_values(['dataset', 'model'], ascending=False)

    return final_df

In [108]:
def case1_regr(df):
    """Case 1 (regression): best-scoring coalition per (dataset, model).

    Averages ``mse_test`` per (dataset, model, coalition), keeps the
    coalition(s) with the lowest average MSE and, among those, the ones using
    the fewest features.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``model``, ``iteration``, ``mse_test``, ``init_mse_test``,
        ``current_feature_str``, ``imp_type``, ``max_perc``, ``random_state``,
        ``data_split``, ``dataset`` and ``current_feature_set``.

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset, model and current_feature_str, with columns
        ``mse_test``, ``init_mse_test`` and ``mse_gain_loss_pct``
        (``(init_mse_test / mse_test - 1) * 100``, rounded to 2 places).
    """
    grouping = ['model', 'iteration', 'mse_test', 'init_mse_test', 'current_feature_str', 'imp_type',
                'max_perc', 'random_state', 'data_split', 'dataset', 'current_feature_set']
    # copy so that adding a column does not write into a slice of `df`
    case1 = df[grouping].copy()
    case1['current_feature_cnt'] = case1['current_feature_set'].apply(len)

    # avg coalition mse -- average the numeric columns only: a frame-wide mean()
    # would also hit the string / set columns, which pandas >= 2 rejects
    metric_cols = ['mse_test', 'init_mse_test', 'current_feature_cnt']
    mse_coalition = case1.groupby(['dataset', 'model', 'current_feature_str'])[metric_cols].mean()
    mse_coalition = mse_coalition.sort_values(['dataset', 'model', 'mse_test'], ascending=False)

    # lowest average MSE reached per (dataset, model)
    min_coal_mse = mse_coalition.groupby(['dataset', 'model']).mse_test.min().rename('min_coal_mse')
    mse_coalition = mse_coalition.join(min_coal_mse)
    mse_coalition = mse_coalition.query('mse_test == min_coal_mse')

    # among equally good coalitions prefer the smallest feature set
    min_f = mse_coalition.groupby(['dataset', 'model']).current_feature_cnt.min().rename('min_feat_cnt')
    mse_coalition = mse_coalition.join(min_f)
    mse_coalition = mse_coalition.query('current_feature_cnt == min_feat_cnt').copy()

    mse_coalition['mse_gain_loss_pct'] = round(
        (mse_coalition['init_mse_test'] / mse_coalition['mse_test'] - 1) * 100, 2)

    case1 = mse_coalition.sort_values(['dataset', 'model', 'mse_test'], ascending=False)[
        ['mse_test', 'init_mse_test', 'mse_gain_loss_pct']]

    return case1

In [633]:
def parity_analysis_regr(df_case1):
    """Compare baseline ("all") and optimized ("opt") MSE across the three models.

    Parameters
    ----------
    df_case1 : pandas.DataFrame
        Result of ``case1_regr``: ``dataset`` and ``model`` available as index
        levels (or columns after ``reset_index``) plus ``mse_test`` and
        ``init_mse_test`` columns.

    Returns
    -------
    (pct_diff, res1) : tuple of pandas.DataFrame
        ``res1`` holds one row per dataset with the columns ``DT(opt)``,
        ``DT(all)``, ``RF(all)``, ``RF(opt)``, ``GBDT(all)``, ``GBDT(opt)``.
        ``pct_diff`` holds the ratios between them in percent, rounded to two
        places.  A column named ``XvsY`` is ``Y / X`` here.  Both frames carry
        ``task='regression'`` and ``metric='mse'``.

    Raises
    ------
    ValueError
        If the data does not contain exactly three models (the six column
        labels are assigned by position).

    NOTE(review): pivot lays the models out in sorted order within each metric,
    so the positional labels assume the model names sort as DT, RF, GBDT --
    confirm against the model names in the data.
    """
    res1 = df_case1.reset_index()
    res1 = res1[['dataset', 'model', 'mse_test', 'init_mse_test']]

    # case 1 can return several tied coalitions for one (dataset, model), and
    # pivot rejects duplicate pairs -- keep the first row of each pair
    res1 = res1.groupby(['dataset', 'model']).first().reset_index()

    res1 = res1.pivot(index='dataset', columns='model',
                      values=['init_mse_test', 'mse_test']).sort_values('dataset', ascending=False)

    # init_mse_test block first ("all"), then the mse_test block ("opt")
    res1.columns = ['DT(all)', 'RF(all)', 'GBDT(all)', 'DT(opt)', 'RF(opt)', 'GBDT(opt)']

    # copy so the label columns added below do not write into a slice
    res1 = res1[['DT(opt)', 'DT(all)', 'RF(all)', 'RF(opt)', 'GBDT(all)', 'GBDT(opt)']].copy()

    pct_diff = res1.copy()
    res1['task'] = 'regression'
    res1['metric'] = 'mse'

    # compare simple vs complex
    pct_diff['DT(all)vsRF(all)'] = pct_diff['RF(all)'] / pct_diff['DT(all)']
    pct_diff['DT(all)vsGBDT(all)'] = pct_diff['GBDT(all)'] / pct_diff['DT(all)']

    # simple opt vs complex unopt
    pct_diff['DT(opt)vsRF(all)'] = pct_diff['RF(all)'] / pct_diff['DT(opt)']
    pct_diff['DT(opt)vsGBDT(all)'] = pct_diff['GBDT(all)'] / pct_diff['DT(opt)']

    # reduce complex model complexity
    pct_diff['RF(opt)vsRF(all)'] = pct_diff['RF(all)'] / pct_diff['RF(opt)']
    pct_diff['GBDT(opt)vsGBDT(all)'] = pct_diff['GBDT(all)'] / pct_diff['GBDT(opt)']

    pct_diff = round(pct_diff[['DT(all)vsRF(all)', 'DT(opt)vsRF(all)',
                               'DT(all)vsGBDT(all)', 'DT(opt)vsGBDT(all)',
                               'RF(opt)vsRF(all)', 'GBDT(opt)vsGBDT(all)',
                               ]
                              ] * 100, 2)
    pct_diff['task'] = 'regression'
    pct_diff['metric'] = 'mse'

    return pct_diff, res1

In [634]:
# parity_regr, parity_regr_all = parity_analysis_regr(res1_regr)

In [635]:
def case3_clasif(df):
    """Case 3 (classification): best coalition among the rows at the 20 % feature mark.

    Keeps the rows whose ``feature_cnt`` equals ``f_cnt_20perc``, averages
    ``mcc_test`` per (dataset, model, coalition) and returns, for every
    (dataset, model), the coalition(s) with the highest average MCC.

    NOTE(review): the original comment described the filter as "only use 80% of
    features"; the condition itself selects rows where ``feature_cnt`` matches
    the 20 % count -- confirm which reading is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``f_cnt_20perc``, ``feature_cnt``, ``dataset``, ``model``,
        ``current_feature_str``, ``mcc_test`` and ``init_mcc_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (dataset, model, current_feature_str) with columns
        ``mcc_test``, ``init_mcc_test_mean`` and ``mcc_gain_loss``
        (``(mcc_test / init_mcc_test_mean - 1) * 100``, rounded to 2 places).
    """
    case3 = df[df.f_cnt_20perc == df.feature_cnt]

    relevant_columns = ['dataset', 'current_feature_str',
                        'mcc_test', 'init_mcc_test', 'model']
    case3 = case3[relevant_columns]

    # Average only the numeric baseline column: a frame-wide mean() would also
    # try to average current_feature_str, which pandas >= 2 rejects.
    init_mcc = (case3.groupby(['dataset', 'model'], sort=False)['init_mcc_test']
                     .mean()
                     .rename('init_mcc_test_mean'))
    case3 = case3.join(init_mcc, on=['dataset', 'model'])

    # mean score of every coalition (feature set)
    case3 = (case3.groupby(['dataset', 'model', 'current_feature_str'], sort=False)
                  [['mcc_test', 'init_mcc_test_mean']]
                  .mean())
    case3 = case3.sort_values(['dataset', 'model', 'mcc_test'], ascending=False)

    # keep the coalition(s) reaching the highest MCC per (dataset, model)
    max_mcc = case3.groupby(['dataset', 'model'])['mcc_test'].max().rename('max_mcc')
    case3 = case3.join(max_mcc, on=['dataset', 'model'])
    # copy so the column assignment below does not write into a slice
    case3 = case3.query('mcc_test == max_mcc').copy()

    case3['mcc_gain_loss'] = round((case3['mcc_test'] / case3['init_mcc_test_mean'] - 1) * 100, 2)

    case3 = case3.drop(columns=['max_mcc'])

    return case3

In [636]:
def case2_clasif(df):
    """Case 2 (classification): most features dropped without losing accuracy.

    Only rows with ``mcc_test >= init_mcc_test`` are considered.  For every
    (dataset, model, data_split, random_state) the row with the highest
    ``feature_dropped_perc`` is taken; per (dataset, model) only the rows that
    reach the overall maximum are kept and averaged per coalition.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``data_split``, ``random_state``, ``dataset``, ``imp_type``,
        ``current_feature_str``, ``feature_dropped_perc``, ``model``,
        ``mcc_test`` and ``init_mcc_test``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (dataset, model, current_feature_str) with columns
        ``feature_dropped_perc``, ``mcc_test`` and ``init_mcc_test_mean``,
        sorted by dataset and model in descending order.
    """
    # case2 main condition: no accuracy lost
    no_loss = df.query('mcc_test  >= init_mcc_test')

    relevant_columns = ['data_split', 'random_state', 'dataset', 'imp_type',
                        'current_feature_str', 'feature_dropped_perc', 'model', 'mcc_test', 'init_mcc_test']
    no_loss = no_loss[relevant_columns]

    # get features sets that maximize feature dropped perc
    idx = no_loss.groupby(['dataset', 'model', 'data_split', 'random_state'],
                          sort=False)['feature_dropped_perc'].idxmax()
    best_runs = no_loss.loc[idx].sort_values('feature_dropped_perc', ascending=False)

    # Average only the numeric baseline column: a frame-wide mean() would also
    # try to average the string columns, which pandas >= 2 rejects.
    init_mcc = (best_runs.groupby(['dataset', 'model'], sort=False)['init_mcc_test']
                         .mean()
                         .rename('init_mcc_test_mean'))
    best_runs = best_runs.join(init_mcc, on=['dataset', 'model'])

    max_feature_dropped = (best_runs.groupby(['dataset', 'model']).feature_dropped_perc
                                    .max()
                                    .rename('max_feature_dropped'))
    best_runs = best_runs.merge(max_feature_dropped, on=['dataset', 'model'])
    best_runs = best_runs.query('feature_dropped_perc == max_feature_dropped')

    grouping = ['dataset', 'model', 'current_feature_str', 'feature_dropped_perc', 'mcc_test', 'init_mcc_test_mean']
    final_df = best_runs[grouping].groupby(['dataset', 'model', 'current_feature_str']).mean()
    # NOTE(review): drop_duplicates compares column values only, so rows of
    # different (dataset, model, coalition) keys that happen to share identical
    # numbers collapse into one -- confirm this is intended.
    final_df = final_df.drop_duplicates().sort_values(['dataset', 'model'], ascending=False)

    return final_df
    

In [637]:
def case1_clasif(df):
    """Case 1 (classification): best-scoring coalition per (dataset, model).

    Averages ``mcc_test`` per (dataset, model, coalition), keeps the
    coalition(s) with the highest average MCC and, among those, the ones using
    the fewest features.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``model``, ``iteration``, ``mcc_test``, ``mcc_perc``,
        ``init_mcc_test``, ``current_feature_str``, ``imp_type``, ``max_perc``,
        ``random_state``, ``data_split``, ``dataset`` and
        ``current_feature_set``.

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset, model and current_feature_str, with columns
        ``mcc_test``, ``init_mcc_test`` and ``mcc_gain_loss``
        (``(mcc_test / init_mcc_test - 1) * 100``, rounded to 2 places).
    """
    grouping = ['model', 'iteration', 'mcc_test', 'mcc_perc', 'init_mcc_test', 'current_feature_str',
                'imp_type', 'max_perc', 'random_state', 'data_split', 'dataset', 'current_feature_set']
    # copy so that adding a column does not write into a slice of `df`
    case1 = df[grouping].copy()
    case1['current_feature_cnt'] = case1['current_feature_set'].apply(len)

    # avg coalition mcc -- average the numeric columns only: a frame-wide mean()
    # would also hit the string / set columns, which pandas >= 2 rejects
    metric_cols = ['mcc_test', 'init_mcc_test', 'current_feature_cnt']
    mcc_coalition = case1.groupby(['dataset', 'model', 'current_feature_str'])[metric_cols].mean()
    mcc_coalition = mcc_coalition.sort_values(['dataset', 'model', 'mcc_test'], ascending=False)

    # highest average MCC reached per (dataset, model)
    max_mcc_coal = mcc_coalition.groupby(['dataset', 'model']).mcc_test.max().rename('max_coal_mcc')
    mcc_coalition = mcc_coalition.join(max_mcc_coal)
    mcc_coalition = mcc_coalition.query('mcc_test == max_coal_mcc')

    # among equally good coalitions prefer the smallest feature set
    min_f = mcc_coalition.groupby(['dataset', 'model']).current_feature_cnt.min().rename('min_feat_cnt')
    mcc_coalition = mcc_coalition.join(min_f)
    mcc_coalition = mcc_coalition.query('current_feature_cnt == min_feat_cnt').copy()

    mcc_coalition['mcc_gain_loss'] = round(
        (mcc_coalition['mcc_test'] / mcc_coalition['init_mcc_test'] - 1) * 100, 2)

    case1 = mcc_coalition.sort_values(['dataset', 'model', 'mcc_test'], ascending=False)[
        ['mcc_test', 'init_mcc_test', 'mcc_gain_loss']]

    return case1

def parity_analysis_clasif(df_case1):
    """Compare baseline ("all") and optimized ("opt") MCC across the three models.

    Parameters
    ----------
    df_case1 : pandas.DataFrame
        Result of ``case1_clasif``: ``dataset`` and ``model`` available as index
        levels (or columns after ``reset_index``) plus ``mcc_test`` and
        ``init_mcc_test`` columns.

    Returns
    -------
    (pct_diff, res1) : tuple of pandas.DataFrame
        ``res1`` holds one row per dataset with the columns ``DT(opt)``,
        ``DT(all)``, ``RF(all)``, ``RF(opt)``, ``GBDT(all)``, ``GBDT(opt)``.
        ``pct_diff`` holds the ratios between them in percent, rounded to two
        places.  A column named ``XvsY`` is ``X / Y`` here.  Both frames carry
        ``task='classification'`` and ``metric='mcc'``.

    Raises
    ------
    ValueError
        If the data does not contain exactly three models (the six column
        labels are assigned by position).

    NOTE(review): pivot lays the models out in sorted order within each metric,
    so the positional labels assume the model names sort as DT, RF, GBDT --
    confirm against the model names in the data.
    """
    res1 = df_case1.reset_index()
    res1 = res1[['dataset', 'model', 'mcc_test', 'init_mcc_test']]

    # case 1 can return several tied coalitions for one (dataset, model), and
    # pivot rejects duplicate pairs -- keep the first row of each pair
    res1 = res1.groupby(['dataset', 'model']).first().reset_index()

    res1 = res1.pivot(index='dataset', columns='model',
                      values=['init_mcc_test', 'mcc_test']).sort_values('dataset', ascending=False)

    # init_mcc_test block first ("all"), then the mcc_test block ("opt")
    res1.columns = ['DT(all)', 'RF(all)', 'GBDT(all)', 'DT(opt)', 'RF(opt)', 'GBDT(opt)']

    # copy so the label columns added below do not write into a slice
    res1 = res1[['DT(opt)', 'DT(all)', 'RF(all)', 'RF(opt)', 'GBDT(all)', 'GBDT(opt)']].copy()

    pct_diff = res1.copy()
    res1['task'] = 'classification'
    res1['metric'] = 'mcc'

    # compare simple vs complex
    pct_diff['DT(all)vsRF(all)'] = pct_diff['DT(all)'] / pct_diff['RF(all)']
    pct_diff['DT(all)vsGBDT(all)'] = pct_diff['DT(all)'] / pct_diff['GBDT(all)']

    # simple models vs complex unoptimized
    pct_diff['DT(opt)vsRF(all)'] = pct_diff['DT(opt)'] / pct_diff['RF(all)']
    pct_diff['DT(opt)vsGBDT(all)'] = pct_diff['DT(opt)'] / pct_diff['GBDT(all)']

    # reduce complex model complexity
    pct_diff['RF(opt)vsRF(all)'] = pct_diff['RF(opt)'] / pct_diff['RF(all)']
    pct_diff['GBDT(opt)vsGBDT(all)'] = pct_diff['GBDT(opt)'] / pct_diff['GBDT(all)']

    pct_diff = round(pct_diff[['DT(all)vsRF(all)', 'DT(opt)vsRF(all)',
                               'DT(all)vsGBDT(all)', 'DT(opt)vsGBDT(all)',
                               'RF(opt)vsRF(all)', 'GBDT(opt)vsGBDT(all)',
                               ]
                              ] * 100, 2)
    pct_diff['task'] = 'classification'
    pct_diff['metric'] = 'mcc'

    return pct_diff, res1

In [638]:
# parity_clasif, parity_clasif_all = parity_analysis_clasif(res1_clasif)

In [639]:
# res1_clasif;

In [640]:
# res1_clasif.reset_index().groupby(['dataset','model']).first().sort_values(['dataset', 'model', 'mcc_test'], ascending=False);

In [641]:
# df_clasif = lst_dfs[0]
# df_clasif = get_clasif_features(df.iloc[:10000])
# df_clasif = get_clasif_features(df_clasif)

# res2_clasif = case2_clasif(df_clasif)

# res3_clasif = case3_clasif(df_clasif)

# res1_clasif = case1_clasif(df_clasif)

In [642]:
# # df_regr = lst_dfs[1]
# # df_regr = get_regr_features(df_regr.iloc[:10000])
# # df_regr = get_regr_features(df_regr)

# res3_regr = case3_regr(df_regr); res3_regr;

# res2_regr = case2_regr(df_regr); res2_regr;

# res1_regr = case1_regr(df_regr); res1_regr;

In [643]:
# dump data to disk

In [644]:
# res1_clasif.to_csv('results_coalition_parity/case1_coalition_clasif.csv')

# res2_clasif.to_csv('results_coalition_parity/case2_coalition_clasif.csv')

# res3_clasif.to_csv('results_coalition_parity/case3_coalition_clasif.csv')

# res1_regr.to_csv('results_coalition_parity/case1_coalition_regr.csv')

# res2_regr.to_csv('results_coalition_parity/case2_coalition_regr.csv')

# res3_regr.to_csv('results_coalition_parity/case3_coalition_regr.csv')

In [645]:
# No active cell defines res1_regr (the pipeline cells further up are commented
# out), so build it here to keep Restart & Run All working.
# NOTE(review): assumes lst_dfs[1] is the regression results file, as in the
# file listing displayed near the top -- confirm.
df_regr = get_regr_features(lst_dfs[1])
res1_regr = case1_regr(df_regr)
parity_regr, parity_regr_all = parity_analysis_regr(res1_regr)

In [646]:
# No active cell defines res1_clasif (the pipeline cells further up are
# commented out), so build it here to keep Restart & Run All working.
# NOTE(review): assumes lst_dfs[0] is the classification results file, as in
# the file listing displayed near the top -- confirm.
df_clasif = get_clasif_features(lst_dfs[0])
res1_clasif = case1_clasif(df_clasif)
parity_clasif, parity_clasif_all = parity_analysis_clasif(res1_clasif)

In [647]:
# Stack the regression and classification ratio tables (values in percent)
# into a single frame, regression rows first.
parity_small = pd.concat([parity_regr, parity_clasif])

In [648]:
# Stack the underlying per-model metric tables (MSE for regression, MCC for
# classification) into a single frame, regression rows first.
parity_large = pd.concat([parity_regr_all, parity_clasif_all])

In [657]:
parity_large

Unnamed: 0_level_0,DT(opt),DT(all),RF(all),RF(opt),GBDT(all),GBDT(opt),task,metric
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
diabetes,4574.541,6214.88,2904.814,2734.083,3082.448,3352.718,regression,mse
crime,0.02585679,0.03909593,0.01730625,0.01618907,0.01890048,0.01620825,regression,mse
boston,16.14947,17.37671,9.071403,10.742,11.19797,10.29483,regression,mse
ames_housing,949408400.0,1264754000.0,542140100.0,525141200.0,477304800.0,431868200.0,regression,mse
wine,0.9725563,0.9725563,1.0,1.0,1.0,1.0,classification,mcc
phishing,0.9347885,0.9299563,0.9468502,0.9517603,0.9419477,0.9474607,classification,mcc
mushroom,1.0,1.0,1.0,1.0,1.0,1.0,classification,mcc
breast_cancer,0.9625013,0.8871975,0.9506628,0.9754854,0.9754854,0.9754854,classification,mcc


In [656]:
parity_large

Unnamed: 0_level_0,DT(opt),DT(all),RF(all),RF(opt),GBDT(all),GBDT(opt),task,metric
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
diabetes,4574.541,6214.88,2904.814,2734.083,3082.448,3352.718,regression,mse
crime,0.02585679,0.03909593,0.01730625,0.01618907,0.01890048,0.01620825,regression,mse
boston,16.14947,17.37671,9.071403,10.742,11.19797,10.29483,regression,mse
ames_housing,949408400.0,1264754000.0,542140100.0,525141200.0,477304800.0,431868200.0,regression,mse
wine,0.9725563,0.9725563,1.0,1.0,1.0,1.0,classification,mcc
phishing,0.9347885,0.9299563,0.9468502,0.9517603,0.9419477,0.9474607,classification,mcc
mushroom,1.0,1.0,1.0,1.0,1.0,1.0,classification,mcc
breast_cancer,0.9625013,0.8871975,0.9506628,0.9754854,0.9754854,0.9754854,classification,mcc


In [661]:
# Ratio columns that set the decision tree against the unoptimized ensembles,
# followed by the two label columns.
grouping_simple_models = [
    'DT(all)vsRF(all)',
    'DT(opt)vsRF(all)',
    'DT(all)vsGBDT(all)',
    'DT(opt)vsGBDT(all)',
    'task',
    'metric',
]

In [664]:
# Ratio columns that set each ensemble's optimized variant against its
# unoptimized one, followed by the two label columns.
grouping_complex_models = [
    'RF(opt)vsRF(all)',
    'GBDT(opt)vsGBDT(all)',
    'task',
    'metric',
]

In [665]:
parity_small[grouping_simple_models]

Unnamed: 0_level_0,DT(all)vsRF(all),DT(opt)vsRF(all),DT(all)vsGBDT(all),DT(opt)vsGBDT(all),task,metric
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
diabetes,46.74,63.5,49.6,67.38,regression,mse
crime,44.27,66.93,48.34,73.1,regression,mse
boston,52.2,56.17,64.44,69.34,regression,mse
ames_housing,42.87,57.1,37.74,50.27,regression,mse
wine,97.26,97.26,97.26,97.26,classification,mcc
phishing,98.22,98.73,98.73,99.24,classification,mcc
mushroom,100.0,100.0,100.0,100.0,classification,mcc
breast_cancer,93.32,101.25,90.95,98.67,classification,mcc


In [666]:
parity_small[grouping_complex_models]

Unnamed: 0_level_0,RF(opt)vsRF(all),GBDT(opt)vsGBDT(all),task,metric
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
diabetes,106.24,91.94,regression,mse
crime,106.9,116.61,regression,mse
boston,84.45,108.77,regression,mse
ames_housing,103.24,110.52,regression,mse
wine,100.0,100.0,classification,mcc
phishing,100.52,100.59,classification,mcc
mushroom,100.0,100.0,classification,mcc
breast_cancer,102.61,100.0,classification,mcc


In [651]:
# parity_small.to_csv('results_coalition_parity/parity_small.csv')

# parity_large.to_csv('results_coalition_parity/parity_large.csv')