# Setup

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st

In [None]:
import random
import decimal

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
import plotly

In [None]:
# The Plotly API key is read from the environment so that no secret is stored
# in the notebook. Export PLOTLY_API_KEY before running this cell; a missing
# variable raises KeyError here instead of failing later during upload.
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as leaked and rotated.
import os

plotly.tools.set_credentials_file(
    username='gasia44',
    api_key=os.environ['PLOTLY_API_KEY'],
)

# Statistical tests

In [None]:
def stats_sig_unpooled(p_1, p_2, n_1, n_2):
    """Return the unpooled two-proportion z statistic, |p_1 - p_2| / SE.

    The standard error adds each sample's own variance estimate,
    p_i * (1 - p_i) / n_i, rather than sharing one pooled proportion.

    p_1, p_2 -- observed proportions of the two samples
    n_1, n_2 -- sizes of the two samples
    """
    variance_1 = p_1 * (1 - p_1) / n_1
    variance_2 = p_2 * (1 - p_2) / n_2
    gap = np.abs(p_1 - p_2)
    return gap / np.sqrt(variance_1 + variance_2)

In [None]:
def stats_sig_pooled(p_1, p_2, n_1, n_2):
    """Return the pooled two-proportion z statistic, |p_1 - p_2| / SE.

    Both samples share one proportion estimate: the sample-size-weighted
    average of p_1 and p_2. The standard error is
    sqrt(p * (1 - p) * (1/n_1 + 1/n_2)).

    p_1, p_2 -- observed proportions of the two samples
    n_1, n_2 -- sizes of the two samples
    """
    weighted_sum = n_1 * p_1 + n_2 * p_2
    pooled = weighted_sum / (n_1 + n_2)
    std_err = np.sqrt((pooled * (1 - pooled)) * (1 / n_1 + 1 / n_2))
    return np.abs(p_1 - p_2) / std_err

# Confidence Interval

In [None]:
def confidence_interval_pooled(p_1, p_2, n_1, n_2, alpha):
    """Return a two-sided normal-approximation interval for |p_1 - p_2|.

    The standard error uses the pooled proportion (the sample-size-weighted
    average of p_1 and p_2).

    p_1, p_2 -- observed proportions of the two samples
    n_1, n_2 -- sizes of the two samples
    alpha    -- total two-sided significance level; the interval has nominal
                coverage 1 - alpha (alpha=0.05 gives a 95% interval)

    Returns a (lower, upper) tuple. The lower bound may be negative because
    the interval is centred on the absolute difference.
    """
    delta = np.abs(p_1 - p_2)
    p = (n_1 * p_1 + n_2 * p_2) / (n_1 + n_2)
    ste = np.sqrt((p * (1 - p)) * (1 / n_1 + 1 / n_2))
    # Put alpha / 2 in each tail; using the alpha quantile directly would
    # yield an interval with only 1 - 2 * alpha coverage.
    margin = st.norm.ppf(1 - alpha / 2) * ste
    return delta - margin, delta + margin

In [None]:
def confidence_interval_unpooled(p_1, p_2, n_1, n_2, alpha):
    """Return a two-sided normal-approximation interval for |p_1 - p_2|.

    The standard error is unpooled: each sample contributes its own variance
    estimate p_i * (1 - p_i) / n_i.

    p_1, p_2 -- observed proportions of the two samples
    n_1, n_2 -- sizes of the two samples
    alpha    -- total two-sided significance level; the interval has nominal
                coverage 1 - alpha (alpha=0.05 gives a 95% interval)

    Returns a (lower, upper) tuple. The lower bound may be negative because
    the interval is centred on the absolute difference.
    """
    delta = np.abs(p_1 - p_2)
    ste = np.sqrt(p_1 * (1 - p_1) / n_1 + p_2 * (1 - p_2) / n_2)
    # Put alpha / 2 in each tail; using the alpha quantile directly would
    # yield an interval with only 1 - 2 * alpha coverage.
    margin = st.norm.ppf(1 - alpha / 2) * ste
    return delta - margin, delta + margin

# Experiments

## Data

In [None]:
# Experiment configuration: significance level and number of simulated
# two-sample experiments.
alpha = 0.05
n_experiments = 100

# Sample sizes for each group are drawn without replacement from 40..999, so
# they are distinct within a group. No seed is set, so each run differs.
size_pool = range(40, 1000)
n_1 = random.sample(size_pool, n_experiments)
n_2 = random.sample(size_pool, n_experiments)

In [None]:
# Per-experiment upper bound for the simulated success count: the sample size
# minus 20.
max_range_1 = np.asarray(n_1) - 20
max_range_2 = np.asarray(n_2) - 20

In [None]:
# Simulated proportions: draw a success count uniformly from [20, upper bound]
# for each experiment, divide by the sample size and round to 3 decimals.
p_1 = [round(random.randint(20, upper) / size, 3) for size, upper in zip(n_1, max_range_1)]
p_2 = [round(random.randint(20, upper) / size, 3) for size, upper in zip(n_2, max_range_2)]

In [None]:
# One row per simulated experiment: both sample sizes and both proportions.
df = pd.DataFrame(dict(n_1=n_1, p_1=p_1, n_2=n_2, p_2=p_2))

In [None]:
# Preview the first rows of the simulated experiments.
df.head()

In [None]:
# Per-sample standard deviation, computed as sqrt(p * (1 - p) * n) and rounded
# to 3 decimals.
# NOTE(review): sqrt(n * p * (1 - p)) is the standard deviation of the success
# count and grows with n -- confirm this is the quantity the rule of thumb
# below is meant to compare.
df['std_1'] = np.sqrt(df['p_1'] * (1 - df['p_1']) * df['n_1']).round(3)
df['std_2'] = np.sqrt(df['p_2'] * (1 - df['p_2']) * df['n_2']).round(3)

In [None]:
# Preview the frame again, now including the std_1 / std_2 columns.
df.head()

**Rule of thumb:** if the larger sample standard deviation is more than twice the smaller sample standard deviation, perform the analysis using unpooled methods.

In [None]:
# Flag experiments where the larger of the two standard deviations is more
# than twice the smaller one.
std_pair = df[['std_1', 'std_2']]
df['std_twice_bigger'] = std_pair.max(axis=1) > 2 * std_pair.min(axis=1)

In [None]:
# Number of experiments flagged by the rule of thumb.
df['std_twice_bigger'].sum()

In [None]:
# Sweep the threshold factor from 1 to 3 in 21 evenly spaced steps and count,
# for each factor, how many experiments have larger std > factor * smaller std.
temp = list(np.linspace(1, 3, 21))
std_pair = df[['std_1', 'std_2']]
larger_std, smaller_std = std_pair.max(axis=1), std_pair.min(axis=1)
std_twice_bigger_exp = [np.sum(larger_std > factor * smaller_std) for factor in temp]


In [None]:
# Line chart of the threshold sweep: threshold factor on x, number of
# experiments exceeding it on y.
axis_font = dict(family='Courier New, monospace', size=18, color='#7f7f7f')

count_trace = go.Scatter(x=temp, y=std_twice_bigger_exp)
data = [count_trace]

layout = go.Layout(
    title='max std > min std',
    xaxis=dict(title='#times bigger', titlefont=axis_font),
    yaxis=dict(title='#data', titlefont=axis_font),
)
fig = go.Figure(data=data, layout=layout)

# NOTE(review): every chart in this notebook is uploaded under the same
# filename -- confirm the charts are not meant to be kept as separate files.
py.iplot(fig, filename='gasia')

In [None]:
# Preview the frame, which now also carries the std_twice_bigger flag.
df.head()

-----------

## Calculations

In [None]:
# Create the result columns up front as floats; the calculation step fills
# them in.
result_columns = [
    'test_stat_pooled',
    'test_stat_unpooled',
    'confid_interval_pooled_min',
    'confid_interval_pooled_max',
    'confid_interval_unpooled_min',
    'confid_interval_unpooled_max',
]
for column in result_columns:
    df[column] = 0.0

In [None]:
# Compute both test statistics and both confidence intervals for every
# experiment in one vectorised pass. The helper functions are built from
# NumPy/pandas arithmetic, so they accept whole columns. This also avoids
# pairing positional `iloc` reads with label-based `loc` writes, which only
# line up while the index is the default 0..n-1 range.
samples = dict(p_1=df['p_1'], p_2=df['p_2'], n_1=df['n_1'], n_2=df['n_2'])

df['test_stat_pooled'] = stats_sig_pooled(**samples)
df['test_stat_unpooled'] = stats_sig_unpooled(**samples)

# Each interval helper returns a (lower, upper) pair of columns.
pooled_lower, pooled_upper = confidence_interval_pooled(alpha=alpha, **samples)
df['confid_interval_pooled_min'] = pooled_lower
df['confid_interval_pooled_max'] = pooled_upper

unpooled_lower, unpooled_upper = confidence_interval_unpooled(alpha=alpha, **samples)
df['confid_interval_unpooled_min'] = unpooled_lower
df['confid_interval_unpooled_max'] = unpooled_upper

In [None]:
# Order the experiments by the first sample size, then by the second.
df = df.sort_values(by=['n_1', 'n_2'])

In [None]:
# Renumber the rows 0..n-1 in sorted order, discarding the old index, so that
# the index can serve as the x axis of the plots.
df = df.reset_index(drop=True)

In [None]:
# Preview the sorted, re-indexed results.
df.head()

## Insights

### Test Statistics

In [None]:
# Plot the pooled and unpooled test statistics per experiment and mark the
# experiments flagged by the std_twice_bigger rule on the unpooled curve.
axis_font = dict(family='Courier New, monospace', size=18, color='#7f7f7f')
flagged = df[df['std_twice_bigger']]

pooled_trace = go.Scatter(
    x=df.index,
    y=df['test_stat_pooled'].values,
    mode='lines',
    name='pooled',
)
unpooled_trace = go.Scatter(
    x=df.index,
    y=df['test_stat_unpooled'].values,
    mode='lines',
    name='unpooled',
)
flagged_trace = go.Scatter(
    x=flagged.index,
    y=flagged['test_stat_unpooled'].values,
    mode='markers',
    name='std_twice_bigger',
)
data = [pooled_trace, unpooled_trace, flagged_trace]

layout = go.Layout(
    title='Test Statistic',
    xaxis=dict(title='index', titlefont=axis_font),
    yaxis=dict(title='test statistic', titlefont=axis_font),
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='gasia')

In [None]:
# Gap between the two test statistics: unpooled minus pooled.
df['diff'] = df['test_stat_unpooled'].sub(df['test_stat_pooled'])

In [None]:
# Plot the unpooled-minus-pooled test statistic gap per experiment and mark
# the experiments flagged by the std_twice_bigger rule.
axis_font = dict(family='Courier New, monospace', size=18, color='#7f7f7f')
flagged = df[df['std_twice_bigger']]

difference_trace = go.Scatter(
    x=df.index,
    y=df['diff'].values,
    mode='lines',
    name='difference',
)
flagged_trace = go.Scatter(
    x=flagged.index,
    y=flagged['diff'].values,
    mode='markers',
    name='std_twice_bigger',
)
data = [difference_trace, flagged_trace]

layout = go.Layout(
    title='Test Statistic difference',
    xaxis=dict(title='index', titlefont=axis_font),
    yaxis=dict(title='unpooled - pooled', titlefont=axis_font),
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='gasia')

### Confidence Interval

In [None]:
# Plot the lower and upper bounds of the pooled and unpooled intervals per
# experiment, and mark both unpooled bounds for the flagged experiments.
axis_font = dict(family='Courier New, monospace', size=18, color='#7f7f7f')
flagged = df[df['std_twice_bigger']]

# (column, legend name) for each bound drawn as a line, in legend order.
bound_lines = [
    ('confid_interval_pooled_min', 'pooled_min'),
    ('confid_interval_pooled_max', 'pooled_max'),
    ('confid_interval_unpooled_min', 'unpooled_min'),
    ('confid_interval_unpooled_max', 'unpooled_max'),
]
data = [
    go.Scatter(x=df.index, y=df[column].values, mode='lines', name=label)
    for column, label in bound_lines
]

# Marker traces for the flagged experiments: upper bound first, then lower.
for column in ('confid_interval_unpooled_max', 'confid_interval_unpooled_min'):
    data.append(
        go.Scatter(
            x=flagged.index,
            y=flagged[column].values,
            mode='markers',
            name='std_twice_bigger',
        )
    )

layout = go.Layout(
    title='Confidence Interval',
    xaxis=dict(title='index', titlefont=axis_font),
    yaxis=dict(title='confidence interval', titlefont=axis_font),
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='gasia')

In [None]:
# Gap between the upper interval bounds: unpooled minus pooled.
df['conf_diff'] = df['confid_interval_unpooled_max'].sub(df['confid_interval_pooled_max'])

In [None]:
# Plot the unpooled-minus-pooled gap of the upper interval bound per
# experiment and mark the experiments flagged by the std_twice_bigger rule.
axis_font = dict(family='Courier New, monospace', size=18, color='#7f7f7f')
flagged = df[df['std_twice_bigger']]

gap_trace = go.Scatter(
    x=df.index,
    y=df['conf_diff'].values,
    mode='lines',
    name='unpooled - pooled',
)
flagged_trace = go.Scatter(
    x=flagged.index,
    y=flagged['conf_diff'].values,
    mode='markers',
    name='std_twice_bigger',
)
data = [gap_trace, flagged_trace]

layout = go.Layout(
    title='Confidence Interval Difference',
    xaxis=dict(title='index', titlefont=axis_font),
    yaxis=dict(title='unpooled - pooled', titlefont=axis_font),
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='gasia')