# Setup

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st

In [2]:
import random
import decimal

In [3]:
import plotly.plotly as py
import plotly.graph_objs as go
import plotly

In [4]:
plotly.tools.set_credentials_file(username='gasia44', api_key='WquddqqfPKg5KJfPJFuk')

# Statistic test

In [5]:
def stats_sig_unpooled(p_1, p_2, n_1, n_2):
    ste = np.sqrt((p_1*(1 - p_1)/n_1 + p_2 * (1- p_2)/n_2))
    return((np.abs(p_1-p_2))/ste)

In [6]:
def stats_sig_pooled(p_1, p_2, n_1, n_2):
    p = (n_1 * p_1 + n_2 * p_2)/ (n_1 + n_2)
    ste = np.sqrt((p*(1-p))*(1/n_1 + 1/n_2))
    return((np.abs(p_1-p_2))/ste)

# Confidence Interval

In [7]:
def confidence_interval_pooled(p_1, p_2, n_1, n_2, alpha):
    delta = np.abs(p_1 - p_2)
    p = (n_1 * p_1 + n_2 * p_2)/ (n_1 + n_2)
    ste = np.sqrt((p*(1-p))*(1/n_1 + 1/n_2))
    return  delta + st.norm.ppf(alpha) * (ste), delta + st.norm.ppf(1-alpha) * (ste)

In [8]:
def confidence_interval_unpooled(p_1, p_2, n_1, n_2, alpha):
    delta = np.abs(p_1 - p_2)
    ste = np.sqrt((p_1*(1 - p_1)/n_1 + p_2 * (1- p_2)/n_2))
    return  delta + st.norm.ppf(alpha) * (ste), delta + st.norm.ppf(1-alpha) * (ste)

# Experiments

## Data

In [9]:
alpha = 0.05
n_experiments = 100
n_1 = random.sample(range(40, 1000), n_experiments)
n_2 = random.sample(range(40, 1000), n_experiments)

In [10]:
max_range_1 = (np.array(n_1) - 20)
max_range_2 = (np.array(n_2) - 20)

In [11]:
p_1 = [round(random.randint(20,max_range_1[i])/n_1[i], 3) for i in range(len(n_1))]
p_2 = [round(random.randint(20,max_range_2[i])/n_2[i], 3) for i in range(len(n_2))]

In [12]:
df = pd.DataFrame({'n_1': n_1, 'p_1': p_1, 'n_2': n_2, 'p_2' : p_2})

In [13]:
df.head()

Unnamed: 0,n_1,p_1,n_2,p_2
0,451,0.18,424,0.276
1,872,0.761,858,0.577
2,919,0.317,260,0.919
3,474,0.608,414,0.157
4,536,0.216,648,0.17


In [14]:
df['std_1'] = round(np.sqrt(df['p_1']*(1-df['p_1']) * df['n_1']), 3)
df['std_2'] = round(np.sqrt(df['p_2']*(1-df['p_2']) * df['n_2']), 3)

In [15]:
df.head()

Unnamed: 0,n_1,p_1,n_2,p_2,std_1,std_2
0,451,0.18,424,0.276,8.159,9.205
1,872,0.761,858,0.577,12.594,14.471
2,919,0.317,260,0.919,14.106,4.399
3,474,0.608,414,0.157,10.629,7.402
4,536,0.216,648,0.17,9.527,9.562


 RULE OF THUMB: If the larger sample standard deviation is MORE THAN twice the smaller sample standard deviation then perform the analysis using unpooled methods.

In [16]:
df['std_twice_bigger'] = df[['std_1', 'std_2']].max(axis=1) > 2* df[['std_1', 'std_2']].min(axis=1) 

In [17]:
np.sum(df['std_twice_bigger'])

22

In [18]:
temp = list(np.linspace(1, 3, 21))
std_twice_bigger_exp = [np.sum(df[['std_1', 'std_2']].max(axis=1) > x* df[['std_1', 'std_2']].min(axis=1)) for x in temp]


In [19]:
data = [
    go.Scatter(
        x = temp,
        y =std_twice_bigger_exp)]

layout = go.Layout(
    title='max std > min std',
    xaxis=dict(
        title='#times bigger',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='#data',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)


py.iplot(fig, filename = 'gasia')


Consider using IPython.display.IFrame instead



In [20]:
df.head()

Unnamed: 0,n_1,p_1,n_2,p_2,std_1,std_2,std_twice_bigger
0,451,0.18,424,0.276,8.159,9.205,False
1,872,0.761,858,0.577,12.594,14.471,False
2,919,0.317,260,0.919,14.106,4.399,True
3,474,0.608,414,0.157,10.629,7.402,False
4,536,0.216,648,0.17,9.527,9.562,False


-----------

## Calculations

In [21]:
df['test_stat_pooled'] = 0.0
df['test_stat_unpooled'] = 0.0

df['confid_interval_pooled_min'] = 0.0
df['confid_interval_pooled_max'] = 0.0
df['confid_interval_unpooled_min'] = 0.0
df['confid_interval_unpooled_max'] = 0.0

In [22]:
for i in range(len(df)):
    d = df.iloc[i]
    df.loc[i,'test_stat_pooled'] =   stats_sig_pooled(p_1= d['p_1'], p_2= d['p_2'], n_1= d['n_1'], n_2 =d['n_2'])
    df.loc[i,'test_stat_unpooled'] = stats_sig_unpooled(p_1= d['p_1'], p_2= d['p_2'], n_1= d['n_1'], n_2 =d['n_2'])
    
    conf_res = confidence_interval_pooled(p_1= d['p_1'], p_2= d['p_2'], n_1= d['n_1'], n_2 =d['n_2'], alpha = alpha)
    df.loc[i,'confid_interval_pooled_min'], df.loc[i,'confid_interval_pooled_max']   =  conf_res[0], conf_res[1]
    
    conf_res = confidence_interval_unpooled(p_1= d['p_1'], p_2= d['p_2'], n_1= d['n_1'], n_2 =d['n_2'], alpha = alpha)
    df.loc[i,'confid_interval_unpooled_min'], df.loc[i,'confid_interval_unpooled_max'] =  conf_res[0], conf_res[1]

In [23]:
df = df.sort_values(['n_1', 'n_2'])

In [24]:
df.reset_index(inplace = True, drop=True)

In [25]:
df.head()

Unnamed: 0,n_1,p_1,n_2,p_2,std_1,std_2,std_twice_bigger,test_stat_pooled,test_stat_unpooled,confid_interval_pooled_min,confid_interval_pooled_max,confid_interval_unpooled_min,confid_interval_unpooled_max
0,40,0.5,909,0.13,3.162,10.139,True,6.493442,4.63427,0.276275,0.463725,0.238675,0.501325
1,57,0.351,910,0.718,3.603,13.574,True,5.845443,5.650184,0.26373,0.47027,0.260161,0.473839
2,83,0.494,591,0.24,4.555,10.383,True,4.873577,4.408072,0.168274,0.339726,0.159221,0.348779
3,88,0.625,502,0.892,4.541,6.954,False,6.50941,4.996762,0.199532,0.334468,0.179108,0.354892
4,92,0.348,843,0.285,4.569,13.107,True,1.262948,1.210646,-0.019051,0.145051,-0.022595,0.148595


## Insights

### Test Statistisc

In [26]:
data = [
    go.Scatter(
        x = df.index ,
        y = df['test_stat_pooled'].values,
        mode = 'lines',
        name = 'pooled'), 
    
    go.Scatter(
        x = df.index ,
        y = df['test_stat_unpooled'].values,
        mode = 'lines',
        name = 'unpooled'), 
    
        
    go.Scatter(
        x = (df[df['std_twice_bigger']]).index ,
        y = df[df['std_twice_bigger']]['test_stat_unpooled'].values,
        mode = 'markers',
        name = 'std_twice_bigger'),

    ]

layout = go.Layout(
    title='Test Statistic',
    xaxis=dict(
        title='index',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='test statistic',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename = 'gasia')

In [27]:
df['diff'] = df['test_stat_unpooled'] - df['test_stat_pooled']

In [28]:
data = [
    go.Scatter(
        x = df.index ,
        y = df['diff'].values,
        mode = 'lines',
        name = 'difference'), 
    
        
    go.Scatter(
        x = (df[df['std_twice_bigger']]).index ,
        y = df[df['std_twice_bigger']]['diff'].values,
        mode = 'markers',
        name = 'std_twice_bigger'),

    ]

layout = go.Layout(
    title='Test Statistic difference',
    xaxis=dict(
        title='index',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='unpooled - pooled',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename = 'gasia')

### Confidence Interval

In [29]:
data = [
    go.Scatter(
        x = df.index ,
        y = df['confid_interval_pooled_min'].values,
        mode = 'lines',
        name = 'pooled_min'), 
    
    go.Scatter(
        x = df.index ,
        y = df['confid_interval_pooled_max'].values,
        mode = 'lines',
        name = 'pooled_max'),
    
    go.Scatter(
        x = df.index ,
        y = df['confid_interval_unpooled_min'].values,
        mode = 'lines',
        name = 'unpooled_min'), 
    
    go.Scatter(
        x = df.index ,
        y = df['confid_interval_unpooled_max'].values,
        mode = 'lines',
        name = 'unpooled_max'), 
    
    go.Scatter(
        x = (df[df['std_twice_bigger']]).index ,
        y = df[df['std_twice_bigger']]['confid_interval_unpooled_max'].values,
        mode = 'markers',
        name = 'std_twice_bigger'),
    
    go.Scatter(
        x = (df[df['std_twice_bigger']]).index ,
        y = df[df['std_twice_bigger']]['confid_interval_unpooled_min'].values,
        mode = 'markers',
        name = 'std_twice_bigger'),
    ]

layout = go.Layout(
    title='Confidence Interval',
    xaxis=dict(
        title='index',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='confidence interval',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename = 'gasia')

In [30]:
df['conf_diff'] = df['confid_interval_unpooled_max'] - df['confid_interval_pooled_max']

In [31]:
data = [
    
    go.Scatter(
        x = df.index ,
        y = df['conf_diff'].values,
        mode = 'lines',
        name = 'unpooled - pooled'), 
    
    go.Scatter(
        x = (df[df['std_twice_bigger']]).index ,
        y = df[df['std_twice_bigger']]['conf_diff'].values,
        mode = 'markers',
        name = 'std_twice_bigger'),  
    ]

layout = go.Layout(
    title='Confidence Interval Difference',
    xaxis=dict(
        title='index',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='unpooled - pooled',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename = 'gasia')