### Exploring Power And Sample Sizes

A brief foray into statistical power and the sample size needed to detect a difference between two proportions:

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st

import warnings
# Install a global "ignore" filter: no warning of any category is shown for the rest of the session.
# NOTE(review): this also hides pandas/numpy diagnostics (chained assignment, divide-by-zero, ...);
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
def one_prop_sample_size(p0, p1, alpha, power):
    """
    Sample size for a two-sided, one-sample test of a proportion.

    Input:
    - p0: Historical/known proportion to compare to (as a fraction, e.g. 0.6)
    - p1: Desired proportion to test against (as a fraction, e.g. 0.5)
    - alpha: Significance level of the two-sided test (type I error rate), e.g. 0.05
    - power: Likelihood of avoiding type II error (1 - beta), e.g. 0.9

    Output:
    - return: Sample size as an unrounded float (round up to get a usable n)

    Notes:
    - Created using "Sample Size for One Sample, Dichotomous Outcome" taken from tinyurl.com/jyym9d9f
    - Formula: n = ((z_{1 - alpha/2} + z_{power}) / ES)**2
    - When p1 == p0 the effect size is zero, so the result is not finite.
    """
    # Two-sided critical value: the upper alpha/2 tail, i.e. the (1 - alpha/2) quantile
    # (about 1.96 for alpha = 0.05).
    z_score_a = st.norm.ppf(1 - alpha/2)
    # Quantile matching the requested power (about 1.28 for power = 0.9).
    z_score_b = st.norm.ppf(power)
    # Standardized effect size; its sign is irrelevant because the ratio is squared.
    # NOTE(review): the standard deviation here is built from p1 — confirm against the
    # cited reference whether p0*(1 - p0) is the intended denominator.
    ES = (p1 - p0)/np.sqrt(p1*(1 - p1))
    return ((z_score_a + z_score_b)/ES)**2

In [None]:
# Worked example: known proportion 0.6 tested against 0.5 at alpha = 0.05 with 90% power.
one_prop_sample_size(p0=0.6, p1=0.5, alpha=0.05, power=0.9)

### Using Power To Calculate Sample Sizes For BLS Data

In [None]:
# Load the raw table from the CSV file, relative to the notebook's working directory.
bls = pd.read_csv(filepath_or_buffer='data/blsdata.csv')

In [None]:
# Keep only rows whose "Occupations" label does NOT contain "occupation" (case-insensitive).
# na=False treats missing labels as "no match" and keeps the mask a true boolean Series,
# so `~` is a logical NOT rather than a bitwise inversion of Python objects.
bls = bls[~bls["Occupations"].str.contains("occupation", case=False, na=False)]

In [None]:
# Remove rows in which every column is missing, then discard the final remaining row.
full_bls = (
    bls
    .dropna(how='all')
    .iloc[:-1]
)

In [None]:
# Strip the thousands separators from Count and parse it as an integer.
# astype(str) first keeps this cell re-runnable once the column is already numeric,
# and assign() builds a new frame instead of writing into a slice.
full_bls = full_bls.assign(
    Count=full_bls['Count'].astype(str).str.replace(',', '', regex=False).astype(int)
)
# Looking at occupations that are in the upper half in count in our dataset
# NOTE(review): 85 is presumably the median of Count — confirm, or derive it from the data.
# .copy() gives clean_bls its own data, so later column updates never write into a slice of full_bls.
clean_bls = full_bls[full_bls['Count'] > 85].copy()

In [None]:
# Cast the four demographic columns to float in a single pass. astype() with a
# column -> dtype mapping returns a new frame, so nothing is assigned onto a filtered slice.
clean_bls = clean_bls.astype({
    'Women': float,
    'Black or\nAfrican\nAmerican': float,
    'Asian': float,
    'Hispanic\nor Latino': float,
})

In [None]:
# One required sample size per occupation: the Women value is divided by 100 to get a
# proportion, then compared against 0.5 at alpha = 0.05 with power 0.99.
lst = [
    one_prop_sample_size(pct_women / 100, 0.5, 0.05, 0.99)
    for pct_women in clean_bls['Women']
]

In [None]:
# Renumber the rows 0..n-1 and keep just the label and Women columns; the fresh
# index is what lets the default-indexed Series built from lst align row by row.
bias_w = clean_bls.reset_index().loc[:, ['Occupations', 'Women']]

sample_sizes = {'Sample': pd.Series(lst)}
w_samples = bias_w.assign(**sample_sizes)

In [None]:
# Round every required sample size up to the next whole number.
w_samples["Sample"] = np.ceil(w_samples["Sample"])

In [None]:
# Show the occupations ordered from the largest required sample to the smallest.
w_samples.sort_values("Sample", ascending=False)

In [None]:
# Summary statistics of the required sample sizes.
w_samples.loc[:, "Sample"].describe()

In [None]:
# Shortlist rows with Women between 45 and 65 (inclusive) whose required sample is
# at most 206, ordered from the largest required sample to the smallest.
potential = (
    w_samples[w_samples["Women"].between(45, 65) & w_samples["Sample"].le(206)]
    .sort_values(by="Sample", ascending=False)
)

In [None]:
# Display the shortlisted rows.
potential