# Simulate Data

We're going to use this notebook to simulate a bunch of distributions and data sets.

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy.stats
import skbio
import seaborn as sn
% matplotlib inline
import statsmodels.api as sm
import matplotlib.pyplot as plt

import emp_power.simulate as sim

sn.set_style('ticks')

In [2]:
# Fix NumPy's global random state so the draws made through `np.random`
# below are repeatable from a fresh kernel.
np.random.seed(seed=25)

We'll perform 100 simulations.

In [3]:
# Directory that receives one sub-folder of pickled simulations per test.
sim_location = './simulations/'
# `exist_ok=True` creates the directory when missing and is a no-op when it
# is already there, avoiding the check-then-create race of an explicit
# `os.path.exists` test.
os.makedirs(sim_location, exist_ok=True)
# Notebook-level switch: whether existing simulation files may be replaced.
overwrite = True

In [4]:
# Registry of simulation setups, keyed by test name. Each entry is a dict
# with the keys 'function', 'test' and 'kwargs'.
distributions = dict()
# Number of simulated data sets generated for every registered test.
num_rounds = 100

In [5]:
def retrieve_test(simulation_type):
    """Look up the registered setup for one simulation type.

    Parameters
    ----------
    simulation_type : str
        Key into the module-level `distributions` registry.

    Returns
    -------
    tuple
        The entry's 'function', 'test' and 'kwargs' values, in that order.

    Raises
    ------
    KeyError
        If `simulation_type` is not registered, or the entry lacks one of
        the three expected keys.
    """
    entry = distributions[simulation_type]
    return entry['function'], entry['test'], entry['kwargs']

# Categorical Tests

## One Sample T test

In [6]:
# One-sample t test. The test unpacks the simulated samples and appends 0 as
# the final positional argument of `ttest_1samp` (presumably the samples are
# a one-element sequence, making 0 the hypothesised mean -- confirm against
# `sim.simulate_ttest_1`).
distributions['ttest_1'] = dict(
    function=sim.simulate_ttest_1,
    test=lambda obs: scipy.stats.ttest_1samp(*obs, 0),
    kwargs=dict(
        mu_lim=[5, 10],
        sigma_lim=[5, 8],
        count_lim=[60, 100],
    ),
)

## Two Sample Independent T test

In [7]:
# Two-sample independent t test.
# The test callable must unpack its own argument `x`. The previous version
# unpacked a global named `samples`, which only worked because the driver
# loop happened to bind a global of that name right before calling the test.
distributions['ttest_ind'] = {'function': sim.simulate_ttest_ind,
                              'test': lambda x: scipy.stats.ttest_ind(*x),
                              'kwargs': {'mu_lim': [0, 10],
                                         'sigma_lim': [5, 15],
                                         'counts_lim': [60, 100]}
                              }

## One way ANOVA

In [8]:
# One-way ANOVA registered for 3 and for 8 populations. The two entries
# differ only in `num_pops`, so both are built from one template. Every
# entry gets its own kwargs dict and its own test callable.
distributions.update({
    'anova_%i' % pops: {
        'function': sim.simulate_anova,
        'test': lambda obs: scipy.stats.f_oneway(*obs),
        'kwargs': {
            'mu_lim': [0, 10],
            'sigma_lim': [5, 15],
            'count_lim': [60, 100],
            'num_pops': pops,
        },
    }
    for pops in (3, 8)
})

## Bimodal distribution

In [9]:
# Bimodal data, compared with a Mann-Whitney U test on the unpacked samples.
distributions['bimodal'] = dict(
    function=sim.simulate_bimodal,
    test=lambda obs: scipy.stats.mannwhitneyu(*obs),
    kwargs=dict(
        mu_lim=[-2, 4],
        sigma_lim=[1, 5],
        count_lim=[60, 100],
        bench_lim=[30, 50],
        diff_lim=[0, 5],
        sep_lim=[4, 8],
    ),
)

## PERMANOVA on distance matrix

In [10]:
def permanova_test(samples):
    """Run scikit-bio's PERMANOVA and return only its p-value.

    Parameters
    ----------
    samples : sequence of length 2
        Unpacked as ``(dm, groups)`` and passed to
        `skbio.stats.distance.permanova` in that order.

    Returns
    -------
    The 'p-value' entry of the PERMANOVA result, computed with 99
    permutations.
    """
    dm, groups = samples
    result = skbio.stats.distance.permanova(dm, groups, permutations=99)
    return result['p-value']

In [11]:
# PERMANOVA on a simulated distance matrix.
# NOTE: `np.random.randint(60, 100)` runs once, when this cell executes, so
# every simulation round for this test receives the same `num_samples`.
distributions['permanova_distance'] = dict(
    function=sim.simulate_permanova,
    test=permanova_test,
    kwargs=dict(
        num_samples=np.random.randint(60, 100),
        num0=None,
        wdist=[0, 0.5],
        wspread=[0, 0.5],
        bdist=[0, 0.5],
        bspread=[0, 0.5],
    ),
)

# Continuous Distributions

## Univariate Correlation

In [24]:
# Univariate correlation, tested with Pearson's r on the unpacked samples.
distributions['correlation'] = dict(
    function=sim.simulate_correlation,
    test=lambda obs: scipy.stats.pearsonr(*obs),
    kwargs=dict(
        slope_lim=[1, 5],
        intercept_lim=[-2, 2],
        sigma_lim=[2, 10],
        count_lim=[60, 100],
    ),
)

## Multivariate Correlation

In [25]:
def multivariate_test(samples):
    """Fit a statsmodels ordinary least squares model and return its p-values.

    Parameters
    ----------
    samples : sequence of length 2
        Unpacked as ``(xs, y)``. `xs` is augmented with a constant column
        via `sm.add_constant` and used as the design matrix; `y` is the
        response.

    Returns
    -------
    The `pvalues` attribute of the fitted OLS result.
    """
    predictors, response = samples
    design = sm.add_constant(predictors)
    fitted = sm.OLS(response, design).fit()
    return fitted.pvalues

In [26]:
# Multivariate regression, tested with the OLS wrapper `multivariate_test`.
distributions['multivariate'] = dict(
    function=sim.simulate_multivariate,
    test=multivariate_test,
    kwargs=dict(
        slope_lim=[-5, 5],
        intercept_lim=[-3, 3],
        sigma_lim=[3, 8],
        count_lim=[60, 100],
        x_lim=[-10, 10],
        num_pops=3,
    ),
)

## Distance Correlation

In [27]:
def mantel_test(samples):
    """Run scikit-bio's Mantel test and return the second item of its result.

    Parameters
    ----------
    samples : sequence of length 2
        Unpacked as ``(x, y)`` and passed to `skbio.stats.distance.mantel`.

    Returns
    -------
    Element 1 of the Mantel result (the p-value, per the scikit-bio
    documentation), computed with the Spearman method and 99 permutations.
    """
    x, y = samples
    outcome = skbio.stats.distance.mantel(
        x, y, method='spearman', permutations=99,
    )
    return outcome[1]

In [28]:
# Distance-matrix correlation, tested with the `mantel_test` wrapper.
distributions['mantel'] = dict(
    function=sim.simulate_mantel,
    test=mantel_test,
    kwargs=dict(
        slope_lim=[1, 5],
        intercept_lim=[-2, 2],
        sigma_lim=[2, 10],
        count_lim=[60, 100],
    ),
)

# Simulate the data

In [29]:
# Generate `num_rounds` simulated data sets for every registered test and
# pickle each one to <sim_location>/<test_name>/simulation_<i>.p.
for test_name in distributions.keys():
    print(test_name)
    # Gets the simulation function, test, and arguments
    simulation, test, kwargs = retrieve_test(test_name)
    # Each test writes into its own sub-directory; `exist_ok=True` makes the
    # call a no-op when the directory is already present.
    test_dir = os.path.join(sim_location, test_name)
    os.makedirs(test_dir, exist_ok=True)
    # Performs the simulations
    for i in range(num_rounds):
        sim_path = os.path.join(test_dir, 'simulation_%i.p' % i)
        # Honour the notebook-level `overwrite` flag: keep an existing file
        # when overwriting is disabled. Skipped rounds draw no random
        # numbers, so later rounds then see a different random stream than
        # in a full run.
        if os.path.exists(sim_path) and not overwrite:
            continue
        params, samples = simulation(**kwargs)
        p = test(samples)
        # Store the raw samples, the generating parameters and the test
        # result together so each round can be reloaded on its own.
        with open(sim_path, 'wb') as f_:
            pickle.dump({'samples': samples, 'params': params, 'p_value': p}, f_)

multivariate
anova_3
mantel
anova_8
bimodal
ttest_ind
correlation
ttest_1
permanova_distance
