In [None]:
## Import packages
import pandas as pd
from math import floor, ceil
from scipy.special import ndtri
import numpy as np


In [None]:
def optimal_sample_size(power, population_size, variance_null, num_tests, min_effect_size, variance_test=None):
    '''
    Calculates the required number of members per test group given a desired power at 95% significance (two-sided).
    It is specifically designed for our use case (multiple tests, known population, solving for a single test group size).

    With m members in each of the k test groups and the remaining n - k*m members as the comparison group,
    the returned m is the smallest solution (rounded up) of

        variance_test / m + variance_null / (n - k*m) = (min_effect_size / (z_power + z_critical))**2

    which rearranges into the quadratic solved below.

    If you wish to verify the results for yourself, you can use this tool: https://ytliu0.github.io/Stat_Med/power2.html
    If comparing results to that website, make sure to choose two-sided test, unlock the group sizes, and note that they ask for standard deviations.
    Note that there is a small amount of rounding error that will prevent the results lining up exactly.

    Args:
        :power: The desired power level (a probability, e.g. 0.8).
        :population_size: Total number of targeted members.
        :variance_null: The variance under the null hypothesis.
        :num_tests: Number of test groups.
        :min_effect_size: The desired minimum detectable change in the outcome at the given power and 95% significance.
        :variance_test: The variance under the alternative hypothesis. This will generally not be
            known, though in the case of rates (e.g. completion rates) it can be calculated and supplied.
            Defaults to variance_null when omitted.

    Returns:
        The number of members needed in each test group, rounded up to a whole number.

    Raises:
        ValueError: If no group size can reach the requested power and effect size with this population
            (the quadratic has no real root, or its smallest root would not leave a comparison group).
    '''
    alpha = 0.05    # 95% significance
    critical_value = ndtri(1 - alpha / 2)    # two-sided critical value

    # If the test group variance is not known and can't be calculated, use the null group variance as a best estimate
    if variance_test is None:
        variance_test = variance_null

    s_t, s_c, n, k = variance_test, variance_null, population_size, num_tests    # For brevity

    # Largest variance of the difference in means that still detects min_effect_size at the requested power
    scaled_effect = min_effect_size / (ndtri(power) + critical_value)
    z = scaled_effect * scaled_effect

    # Coefficients of the polynomial a*m^2 + b*m + c = 0 in the group size m
    a = -z * k
    b = z * n + k * s_t - s_c
    c = -n * s_t

    roots = np.roots([a, b, c])

    # Complex (or missing) roots mean the target variance is unreachable for any split of the population
    if roots.size == 0 or not np.all(np.isreal(roots)):
        raise ValueError(
            'No real solution: the requested power and minimum effect size '
            'cannot be reached with a population of %s.' % (population_size,)
        )

    group_size = float(np.min(roots.real))

    # A valid solution must be positive and leave members over for the comparison group
    if not 0 < group_size or not k * group_size < n:
        raise ValueError(
            'No feasible solution: the requested power and minimum effect size '
            'cannot be reached with %s test group(s) and a population of %s.'
            % (num_tests, population_size)
        )

    return ceil(group_size)

In [18]:
# Worked example: 80% power, 8000 targeted members, null variance of 0.25,
# a single test group and a minimum detectable effect of 0.05.
optimal_sample_size(
    power=0.8,
    population_size=8000,
    variance_null=0.25,
    num_tests=1,
    min_effect_size=0.05,
)

883