# Effect size extrapolation

In the [last notebook], we calculated empirical and distribution-based power for five types of statistical tests:
* One sample t test
* Independent sample t test
* One way ANOVA, 3 groups
* One way ANOVA, 8 groups
* Linear correlation

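We will now evaluate how well the power extrapolated from the fitted z, t, and f effect sizes matches the empirical power.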

In [1]:
import copy
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import scipy
# import statsmodels.api as sms
import statsmodels.formula.api as smf

import emp_power.traditional as trad
import emp_power.effects as eff
import emp_power.plot as plot

% matplotlib inline
sn.set_style('ticks')

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# --- Notebook-wide settings -------------------------------------------------
# Significance level.
alpha = 0.05
# Number of rounds.
num_rounds = 100
# Sample counts: 5, 15, ..., 95.
counts = np.arange(5, 100, step=10)
# Name of the palette used to colour the sample counts.
colormap = 'Spectral'
# Overwrite flag.
overwrite = True

In [4]:
# Directory holding the power simulations; the rest of the notebook cannot run
# without it, so stop early with an explicit message.
sim_location = './simulations'
if not os.path.exists(sim_location):
    # Adjacent string literals are joined verbatim, so each fragment carries
    # its own trailing space to keep the sentences readable.
    raise ValueError('The power simulations do not exist. '
                     'Please go back to notebooks 2 and 3 and '
                     'calculate power.'
                     )

In [5]:
# One palette entry per sample count, then a count -> colour lookup table.
colors = sn.color_palette(colormap, n_colors=len(counts))
check_color = dict(zip(counts, colors))

In [22]:
# Keys into `distributions` for the tests summarised in this notebook.
tests = [
    'permanova',
    'mantel',
]

In [7]:
# Summary-table columns holding the power extrapolated from the f, t and z
# effect sizes.
extrapolated = ['%s_power' % stat for stat in ('f', 't', 'z')]

In [23]:
# Per-test settings, keyed by the names listed in `tests`:
#   clean_name -- display label for the test
#   num_groups -- number of groups passed to the f effect/power calculations
#   input_dir  -- directory holding the pickled power simulations
#   return_fp  -- path of the tab-separated summary written for the test
#
# NOTE(review): the 'mantel' entry carries the independent-sample t test label
# and the `ttest_ind` input/output paths. This looks like a copy-paste
# leftover -- confirm the intended label and paths before relying on it.
#
# Entries for 'anova_3', 'anova_8' and 'correlation' were previously listed
# here but disabled; they followed the same layout.
distributions = {
    'permanova': dict(
        clean_name='PERMANOVA',
        num_groups=2,
        input_dir='./simulations/emperical_power/permanova',
        return_fp='./simulations/extrapolation/permanova.txt',
    ),
    'mantel': dict(
        clean_name='Independent Sample\n T Test',
        num_groups=2,
        input_dir='./simulations/emperical_power/ttest_ind',
        return_fp='./simulations/extrapolation/ttest_ind.txt',
    ),
}

We'll start by calculating the empirical and extrapolated effect sizes for the parametric tests.

In [24]:
for test_name in tests:
    power_dir = distributions[test_name]['input_dir']
    num_groups = distributions[test_name]['num_groups']
    return_fp = distributions[test_name]['return_fp']
    
    if not os.path.exists(power_dir):
        raise ValueError('%s does not exist' % power_dir)
        
    summaries = []
    
    # Loops through the rounds
    for i in range(10):
        # Loads through the power simulation for the round
        power_fp = os.path.join(power_dir, 'simulation_%i.p' % i)
        
        try:
            with open(power_fp, 'rb') as f_:
                sim = pickle.load(f_)
        except:
            pass
        
        # Pulls the previously calculated power values
        counts = sim['counts']
        emperical = sim['emperical_power']
        empr_shape = emperical.shape
        z_effect = eff.z_effect(counts, emperical)
        t_effect = eff.t_effect(counts, emperical)
        f_effect = eff.f_effect(counts, emperical, groups=num_groups)
        num_obs = (empr_shape[0] * empr_shape[1])
        run_summary = pd.DataFrame({
                    'counts': np.hstack([counts] * empr_shape[0]),
                    'emperical_power': np.hstack(emperical),
                    'sim_pos': np.hstack([np.arange(empr_shape[1]) + (i + 1) * 10 
                                          for i in range(empr_shape[0])]),
                    'z_effect': np.hstack(z_effect),
                    't_effect': np.hstack(t_effect),
                    'f_effect': np.hstack(f_effect),
                    'z_power': np.hstack([eff.z_power(counts, np.nanmean(z_effect))]
                                         * empr_shape[0]),
                    't_power': np.hstack([eff.t_power(counts, np.nanmean(t_effect))]
                                         * empr_shape[0]),
                    'f_power': np.hstack([eff.f_power(counts, np.nanmean(f_effect), groups=num_groups)]
                                         * empr_shape[0]),
                    })
        run_summary['color'] = run_summary['counts'].apply(lambda x: check_color[x])
        run_summary['test'] = test_name
        run_summary['clean_name'] = distributions[test_name]['clean_name']
        run_summary['simulation'] = i
        run_summary['fit_f-mean'] = np.nanmean(f_effect)
        run_summary['fit_f-std'] = np.nanstd(f_effect)
        run_summary['fit_f-count'] = np.sum(np.isnan(f_effect) == False) / num_obs
        run_summary['fit_t-mean'] = np.nanmean(t_effect)
        run_summary['fit_t-std'] = np.nanstd(t_effect)
        run_summary['fit_t-count'] = np.sum(np.isnan(t_effect) == False) / num_obs
        run_summary['fit_z-mean'] = np.nanmean(z_effect)
        run_summary['fit_z-std'] = np.nanstd(z_effect)
        run_summary['fit_z-count'] = np.sum(np.isnan(z_effect) == False) / num_obs
        run_summary['index'] = (run_summary['test'] + '.' +  
                                run_summary['simulation'].apply(lambda x: '%i' % x) + '.' +
                                run_summary['sim_pos'].apply(lambda x: '%i' % x))
        run_summary.set_index('index', inplace=True)
        summaries.append(pd.DataFrame(run_summary))
    summaries = pd.concat(summaries)
    summaries.to_csv(return_fp, sep='\t')
    distributions[test_name]['summary'] = summaries

For most of the effect sizes and fits, we find the behavior of the curve 

Let's also compare the behavior of the empirical power and the fitted power curves.

In [1]:
# Sets up the figure and axes
er_fig, er_axes = plt.subplots(3, 2)
er_fig.set_size_inches(4, 6)
sn.despine()

for idc, test_name in enumerate(tests):
    summary = distributions[test_name]['summary']
    for metric, ax_reg in zip(*[extrapolated, er_axes.T[idc]]):
        plot.gradient_regression(ax=ax_reg, 
                            x='emperical_power', 
                            y=metric, 
                            gradient='color', 
                            alpha=0.25,
                            data=summary
                            )
        plot.format_regression_axis(ax_reg)
        if metric == 'z_power':
            ax_reg.set_xticklabels(ax_reg.get_xticks())
        if test_name == tests[0]:
            ax_reg.set_yticklabels(ax_reg.get_yticks())
            ax_reg.set_ylabel(metric.replace('_', ' ').capitalize())
        
er_axes[-1][2].set_xlabel('Emperical Power')

NameError: name 'plt' is not defined

...