In [888]:
import pandas as pd
from piex import explorer

In [890]:
from mit_d3m.db import get_db

Using TensorFlow backend.


In [891]:
# Obtain the database handle for 'ta2' (the meaning of the argument is defined
# by mit_d3m.db.get_db); it is handed to the pipeline explorer below.
db = get_db('ta2')

In [892]:
# Wrap the database handle in an explorer object used to query test results.
# NOTE(review): this rebinds the name `piex`; the package itself was only
# imported as `from piex import explorer`, so nothing is shadowed here.
piex = explorer.MongoPipelineExplorer(db)

In [893]:
# Fetch the test results, filtered by the given data modality and task type.
df = piex.get_test_results(data_modality='single_table', task_type='regression')

In [894]:
# Inspect which columns the results frame provides.
df.columns

Index(['cv_score', 'cv_time', 'data_modality', 'dataset', 'elapsed', 'error',
       'fit_time', 'insert_ts', 'iterations', 'load_time', 'metric', 'rank',
       'score', 'step', 'task_subtype', 'task_type', 'test_id', 'trivial_time',
       'budget', 'checkpoints', 'commit', 'docker', 'hostname', 'image',
       'insert_ts_results', 'pipeline', 'status', 'timeout', 'update_ts'],
      dtype='object')

In [896]:
import numpy as np

def normalize(metric_type, min_value=None, max_value=None):
    """Build a function that rescales raw metric values for one metric family.

    The "cost" families return one minus the value of the matching "score"
    family, so that a lower raw cost yields a higher normalized value.

    Args:
        metric_type: Name of the metric family. One of ``'zero_one_score'``,
            ``'zero_one_cost'``, ``'ranged_score'``, ``'real_score'``,
            ``'real_cost'``, ``'zero_inf_score'`` or ``'zero_inf_cost'``.
        min_value: Lower bound of the raw range; only read by ``'ranged_score'``.
        max_value: Upper bound of the raw range; only read by ``'ranged_score'``.

    Returns:
        A one-argument function ``f(raw)`` that returns the rescaled value.
        The metric family is only checked when ``f`` is called: an unknown
        family makes ``f`` raise ``ValueError('Unknown metric type')``.
    """
    def _sigmoid(x):
        # Logistic function, used by every unbounded family.
        return 1 / (1 + np.exp(-x))

    # One transform per family; only the selected one is ever evaluated.
    transforms = {
        'zero_one_score': lambda raw: raw,
        'zero_one_cost': lambda raw: 1 - raw,
        'ranged_score': lambda raw: (raw - min_value) / (max_value - min_value),
        'real_score': lambda raw: _sigmoid(raw),
        'real_cost': lambda raw: 1 - _sigmoid(raw),
        # The zero-to-infinity families go through log10 before the logistic.
        'zero_inf_score': lambda raw: _sigmoid(np.log10(raw)),
        'zero_inf_cost': lambda raw: 1 - _sigmoid(np.log10(raw)),
    }

    def f(raw):
        try:
            transform = transforms[metric_type]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which are also "unknown".
            raise ValueError('Unknown metric type') from None
        return transform(raw)

    return f

In [897]:
# Metric name -> normalization family accepted by `normalize`, grouped by family.
METRIC_TYPES = {
    **dict.fromkeys(('f1', 'f1Macro', 'accuracy'), 'zero_one_score'),
    **dict.fromkeys(('meanSquaredError', 'rootMeanSquaredError'), 'zero_inf_cost'),
}

In [898]:
def normalize_df(s):
    """Normalize the cross-validation score of a single results row.

    Args:
        s: A row exposing ``metric`` (a key of ``METRIC_TYPES``) and
            ``cv_score`` (the raw value to rescale).

    Returns:
        The ``cv_score`` rescaled by the transform for the row's metric family.

    Raises:
        KeyError: If ``s.metric`` is not a key of ``METRIC_TYPES``.
    """
    metric_family = METRIC_TYPES[s.metric]
    transform = normalize(metric_family)
    return transform(s.cv_score)

In [878]:
def get_scores(df):
    """Add normalized scores and dataset-relative metrics to a results frame.

    The input frame is copied, not modified. Columns added to the result:

    * ``transcore``: ``normalize_df`` applied to every row.
    * ``mean``, ``std``, ``min``, ``max``, ``median``: statistics of
      ``transcore`` over all rows sharing the same ``dataset``.
    * ``nist-avg``: ``(transcore - mean) / mean``.
    * ``nist-median``: ``(transcore - median) / median``.
    * ``z-score``: ``(transcore - mean) / std``.

    Args:
        df: Results frame with at least the ``dataset`` column plus whatever
            ``normalize_df`` reads from each row.

    Returns:
        A new DataFrame with the columns above appended; the left merge
        against the per-dataset statistics produces a fresh index.
    """
    scored = df.copy()
    scored['transcore'] = scored.apply(normalize_df, axis=1)

    # Per-dataset summary of the normalized score, joined back onto every row.
    summary_names = ['mean', 'std', 'min', 'max', 'median']
    per_dataset = scored.groupby('dataset')['transcore'].agg(summary_names)
    scored = scored.merge(per_dataset, how='left', on='dataset')

    transcore = scored['transcore']
    dataset_mean = scored['mean']
    dataset_median = scored['median']

    # Relative deviation from the dataset's centre, and the standard score.
    scored['nist-avg'] = (transcore - dataset_mean) / dataset_mean
    scored['nist-median'] = (transcore - dataset_median) / dataset_median
    scored['z-score'] = (transcore - dataset_mean) / scored['std']

    return scored

# Scored copy of the results; every later cell works on `scores`, not `df`.
scores = get_scores(df)

In [879]:
def get_order(df):
    """Number the rows of ``df`` by increasing ``elapsed``.

    Args:
        df: Frame with an ``elapsed`` column. It is not modified.

    Returns:
        A Series named ``order`` holding 1, 2, ..., ``len(df)``, indexed by the
        original row labels arranged in ascending ``elapsed`` order.
    """
    by_elapsed = df.sort_values('elapsed')
    positions = list(range(1, len(by_elapsed) + 1))
    return by_elapsed.assign(order=positions)['order']

# Number the rows of every (dataset, test_id) group by increasing `elapsed`.
checkpoint = scores.groupby(['dataset', 'test_id']).apply(get_order)
# Drop the two leading index levels (the group keys) so that what remains is
# presumably the row index of `scores` — verify for the pandas version in use.
checkpoint = checkpoint.reset_index(level=0, drop=True).reset_index(level=0, drop=True)
# Index-aligned assignment; note that this mutates `scores` in place.
scores['checkpoint'] = checkpoint

In [880]:
from scipy.stats import ttest_ind

def ttest(df, column, condition):
    """Run a per-checkpoint Welch's t-test of ``column`` between two row groups.

    Rows where ``condition`` is true form group A and the remaining rows form
    group B. For every distinct ``checkpoint`` value found in group A, the
    ``column`` values of the two groups (missing values dropped) are compared
    with ``scipy.stats.ttest_ind(..., equal_var=False)``.

    Args:
        df: Frame containing a ``checkpoint`` column and ``column``.
        column: Name of the numeric column to compare.
        condition: Boolean mask aligned with ``df`` that selects group A.

    Returns:
        A DataFrame indexed by ``checkpoint`` with the columns
        ``'<column>-t-value'`` and ``'<column>-p-value'``. A checkpoint for
        which either group has no rows gets NaN in both columns. Both columns
        are always present, even when no checkpoint could be compared or when
        group A is empty (the result is then an empty frame).
    """
    tvalue = column + '-t-value'
    pvalue = column + '-p-value'

    test_a = df[condition]
    test_b = df[~condition]

    values = []
    for checkpoint in test_a.checkpoint.unique():
        set_a = test_a[test_a.checkpoint == checkpoint]
        set_b = test_b[test_b.checkpoint == checkpoint]

        # Default to NaN so the result columns exist even when no checkpoint
        # has rows in both groups (this used to raise KeyError).
        value = {'checkpoint': checkpoint, tvalue: float('nan'), pvalue: float('nan')}
        if not (set_a.empty or set_b.empty):
            # Named `result`, not `ttest`, to avoid shadowing this function.
            result = ttest_ind(set_a[column].dropna(), set_b[column].dropna(), equal_var=False)
            value[tvalue] = result.statistic
            value[pvalue] = result.pvalue

        values.append(value)

    # Explicit columns keep the schema stable when `values` is empty.
    return pd.DataFrame(values, columns=['checkpoint', tvalue, pvalue]).set_index('checkpoint')

In [881]:
# Split `scores` into pipelines whose name starts with 'dfs' and all the rest.
# NOTE(review): neither frame is referenced by the visible cells that follow
# (the t-tests below take `scores` plus a mask) — confirm before removing.
df_a = scores[scores.pipeline.str.startswith('dfs')]
df_b = scores[~scores.pipeline.str.startswith('dfs')]

In [882]:
# Per-checkpoint t-test on 'nist-avg': pipelines whose name starts with 'dfs'
# versus all other pipelines.
dfs_avg = ttest(scores, 'nist-avg', scores.pipeline.str.startswith('dfs'))
dfs_avg

Unnamed: 0_level_0,nist-avg-t-value,nist-avg-p-value
checkpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.705782,0.089029
2,-1.25416,0.210687
3,-1.510771,0.131828
4,-0.883829,0.377434


In [883]:
# Same 'dfs' versus non-'dfs' comparison, on the median-relative 'nist-median'.
dfs_median = ttest(scores, 'nist-median', scores.pipeline.str.startswith('dfs'))
dfs_median

Unnamed: 0_level_0,nist-median-t-value,nist-median-p-value
checkpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-1.678676,0.094204
2,-1.256517,0.209838
3,-1.533065,0.126257
4,-0.959977,0.337775


In [884]:
# Per-checkpoint t-test on 'nist-avg': pipelines whose name contains 'xgb'
# versus all other pipelines.
xgb_avg = ttest(scores, 'nist-avg', scores.pipeline.str.contains('xgb'))
xgb_avg

Unnamed: 0_level_0,nist-avg-t-value,nist-avg-p-value
checkpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8.072268,1.800572e-14
2,7.98928,2.517871e-14
3,7.605354,2.945829e-13
4,6.963872,1.792276e-11


In [885]:
# Same 'xgb' versus non-'xgb' comparison, on the median-relative 'nist-median'.
xgb_median = ttest(scores, 'nist-median', scores.pipeline.str.contains('xgb'))
xgb_median

Unnamed: 0_level_0,nist-median-t-value,nist-median-p-value
checkpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7.511824,9.460779e-13
2,7.631549,3.698816e-13
3,7.439453,1.025517e-12
4,7.028937,1.222168e-11


In [886]:
# Write both xgb comparisons side by side (one row per checkpoint) to CSV.
# NOTE(review): the file name says "random_forest", but the comparison group
# is every pipeline whose name does not contain 'xgb' — confirm they match.
pd.concat([xgb_avg, xgb_median], axis=1).to_csv('xgb_vs_random_forest.single_table_regression.ttest.csv')

In [887]:
# Write both dfs comparisons side by side (one row per checkpoint) to CSV.
pd.concat([dfs_avg, dfs_median], axis=1).to_csv('dfs_vs_nodfs.single_table_regression.ttest.csv')