### Exploring Power And Sample Sizes

A brief foray into the realm of statistical robustness:

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st

import warnings
warnings.filterwarnings("ignore")

In [2]:
def one_prop_sample_size(p0, p1, alpha, power):
    """
    Input:
    - p0: Historical/known proportion to compare to
    - p1: Desired proportion to test against
    - alpha: Desired p-value
    - power: Likelihood of avoiding type II error
    
    Output:
    - return: Sample size
    
    Notes:
    - Created using "Sample Size for One Sample, Dichotomous Outcome" taken from tinyurl.com/jyym9d9f
    """
    z_score_a = st.norm.ppf((1 - alpha)/2)
    z_score_b = st.norm.ppf(power)
    ES = (p1 - p0)/np.sqrt(p1*(1 - p1))
    return ((z_score_a + z_score_b)/ES)**2

In [3]:
one_prop_sample_size(0.6, 0.5, 0.05, 0.9)

37.13956540657675

### Using Power To Calculate Sample Sizes For BILS Data

In [4]:
bls = pd.read_csv('data/blsdata.csv')

In [5]:
bls = bls[~(bls["Occupations"].str.contains("occupation",case=False).fillna(False))]

In [6]:
full_bls = bls.dropna(how='all').iloc[:-1]

In [7]:
full_bls['Count'] = full_bls['Count'].str.replace(',','').astype(int)
# Looking at occupations that are in the upper half in count in our dataset
clean_bls = full_bls[full_bls['Count'] > 85]

In [8]:
clean_bls['Women'] = clean_bls['Women'].astype(float)
clean_bls['Black or\nAfrican\nAmerican'] = clean_bls['Black or\nAfrican\nAmerican'].astype(float)
clean_bls['Asian'] = clean_bls['Asian'].astype(float)
clean_bls['Hispanic\nor Latino'] = clean_bls['Hispanic\nor Latino'].astype(float)

In [9]:
lst = []
for prop in clean_bls['Women']:
#     print(one_prop_sample_size(prop/100, 0.5, 0.05, 0.9))
    lst += [one_prop_sample_size(prop/100, 0.5, 0.05, 0.99)]

In [10]:
bias_w = clean_bls.reset_index()[['Occupations', 'Women']]

w_samples = bias_w.assign(Sample=pd.Series(lst))

In [11]:
w_samples["Sample"] = w_samples["Sample"].apply(np.ceil)

In [12]:
w_samples.sort_values(by="Sample",ascending=False)

Unnamed: 0,Occupations,Women,Sample
19,"Property, real estate, and community associati...",50.3,142336.0
178,Retail salespersons,49.4,35584.0
245,"Electrical, electronics, and electromechanical...",49.3,26144.0
180,Insurance sales agents,49.2,20016.0
90,Artists and related workers,49.2,20016.0
...,...,...,...
238,"Heating, air conditioning, and refrigeration m...",2.3,6.0
237,Heavy vehicle and mobile equipment service tec...,1.2,6.0
236,Bus and truck mechanics and diesel engine spec...,1.2,6.0
235,Automotive service technicians and mechanics,2.9,6.0


In [13]:
w_samples["Sample"].describe()

count       278.000000
mean       1183.035971
std        9113.023892
min           6.000000
25%          10.000000
50%          23.000000
75%         103.000000
max      142336.000000
Name: Sample, dtype: float64

In [14]:
potential = w_samples[(w_samples["Women"] <= 65) & (w_samples["Women"] >= 45) & (w_samples["Sample"] <= 206)].sort_values(by="Sample",ascending=False)

In [15]:
potential

Unnamed: 0,Occupations,Women,Sample
63,Biological scientists,57.9,206.0
33,Market research analysts and marketing special...,57.9,206.0
249,Food batchmakers,58.1,196.0
83,Secondary school teachers,58.7,170.0
35,Accountants and auditors,58.8,166.0
204,"Dispatchers, except police, fire, and ambulance",58.9,162.0
2,Marketing managers,58.9,162.0
149,Bartenders,59.0,159.0
185,"Sales and related workers, all other",59.2,152.0
107,Pharmacists,59.6,139.0
