In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import ttest_ind

In [2]:
#Since I'll be running this multiple times...

description = ['Pop1_actual_mean',
               'Pop2_actual_mean',
               'Sample1_mean',
               'Sample2_mean',
               'Difference_of_means',
               'Difference_sample_1',
               'Difference_sample_2',
               'Sample1_StD',
               'Sample2_StD',
               'T-Value',
               'P-Value']

sample_data_1 = pd.DataFrame(index=description)
sample_data_2 = pd.DataFrame(index=description)
sample_data_3 = pd.DataFrame(index=description)
pop_data_1 = pd.DataFrame(index=description)
pop_data_2 = pd.DataFrame(index=description)


def Population(prob, pop_size):
    population = np.random.binomial(10, prob, pop_size)
    return population

def Population_Gamma(prob, pop_size):
    population = np.random.gamma(10, prob, pop_size)
    return population

def Plot_Populations(pop1, pop2):
    # Make histograms for the two populations. 
    plt.hist(pop1, alpha=0.5, label='Population 1') 
    plt.hist(pop2, alpha=0.5, label='Population 2') 
    plt.legend(loc='upper right') 
    plt.show()
    
def Plot_Samples(sample1, sample2):
    plt.hist(sample1, alpha=0.5, label='sample 1') 
    plt.hist(sample2, alpha=0.5, label='sample 2') 
    plt.legend(loc='upper right') 
    plt.show()
    
def Create_Sample(population, sample_size):
    sample = np.random.choice(population, sample_size, replace=True)
    return sample

def Compile(pop1, pop2, sample_size, df, sample_name):
    #Plot_Populations(pop1, pop2)
    
    sample1 = Create_Sample(pop1, sample_size)
    sample2 = Create_Sample(pop2, sample_size)

    #Plot_Samples(sample1, sample2)

    #print("Sample 1/2 mean: " + str(sample1.mean()) + "/ " + str(sample2.mean()))
    #print("Sample 1/2 StD: " + str(sample1.std()) + "/ " + str(sample2.std()) )

    # Compute the difference between the two sample means.
    sample_diff=sample2.mean( ) -sample1.mean()
    actual_diff1=pop1.mean()-sample1.mean()
    actual_diff2=pop2.mean()-sample2.mean()
    #print("Difference of means: " + str(sample_diff))

    size = np.array([len(sample1), len(sample2)])
    sd = np.array([sample1.std(), sample2.std()])

    # The squared standard deviations are divided by the sample size and summed, then we take
    # the square root of the sum. 
    diff_se = (sum(sd ** 2 / size)) ** 0.5  

    #The difference between the means divided by the standard error: T-value.  
    #print("T-value: " + str(sample_diff/diff_se))

    #print(ttest_ind(sample2, sample1, equal_var=False))
    ttest = ttest_ind(sample2, sample1, equal_var=False)
    
    df[sample_name] = [pop1.mean(),
                       pop2.mean(),
                       sample1.mean(), 
                       sample2.mean(),
                       sample_diff,
                       actual_diff1,
                       actual_diff2,
                       sample1.std(), 
                       sample2.std(),  
                       str(sample_diff/diff_se), 
                       ttest[1]]
    

In [3]:
pop_size = 10000
pop1 = Population(0.2, pop_size)
pop2 = Population(0.5, pop_size)

Compile(pop1, pop2, 100, sample_data_1, 'pop_size_100-1')
Compile(pop1, pop2, 100, sample_data_1, 'pop_size_100-2')
Compile(pop1, pop2, 100, sample_data_1, 'pop_size_100-3')


In [4]:
#Increasing sample sizes from 100 to 1000

Compile(pop1, pop2, 1000, sample_data_2, 'pop_size_1000-1')
Compile(pop1, pop2, 1000, sample_data_2, 'pop_size_1000-2')
Compile(pop1, pop2, 1000, sample_data_2, 'pop_size_1000-3')

In [5]:
#Decreasing sample sizes from 100 to 20

Compile(pop1, pop2, 20, sample_data_3, 'pop_size_20-1')
Compile(pop1, pop2, 20, sample_data_3, 'pop_size_20-2')
Compile(pop1, pop2, 20, sample_data_3, 'pop_size_20-3')

In [6]:
#Changing pop1 probaiblity vailue to 0.3

pop1 = Population(0.3, pop_size)

Compile(pop1, pop2, 1000, pop_data_1, 'prob_0.3-1')
Compile(pop1, pop2, 1000, pop_data_1, 'prob_0.3-2')
Compile(pop1, pop2, 1000, pop_data_1, 'prob_0.3-3')

In [7]:
#Changing pop1 probaiblity vailue to 0.4

pop1 = Population(0.4, pop_size)

Compile(pop1, pop2, 1000, pop_data_2, 'prob_0.4-1')
Compile(pop1, pop2, 1000, pop_data_2, 'prob_0.4-2')
Compile(pop1, pop2, 1000, pop_data_2, 'prob_0.4-3')

In [8]:
sample_data_1

Unnamed: 0,pop_size_100-1,pop_size_100-2,pop_size_100-3
Pop1_actual_mean,2.0244,2.0244,2.0244
Pop2_actual_mean,5.0045,5.0045,5.0045
Sample1_mean,1.92,2.35,2.08
Sample2_mean,5.14,5.01,5.05
Difference_of_means,3.22,2.66,2.97
Difference_sample_1,0.1044,-0.3256,-0.0556
Difference_sample_2,-0.1355,-0.0055,-0.0455
Sample1_StD,1.27812,1.45172,1.42604
Sample2_StD,1.54932,1.51324,1.73997
T-Value,16.032008142364663,12.684800772826152,13.201825563829372


In [9]:
sample_data_2

Unnamed: 0,pop_size_1000-1,pop_size_1000-2,pop_size_1000-3
Pop1_actual_mean,2.0244,2.0244,2.0244
Pop2_actual_mean,5.0045,5.0045,5.0045
Sample1_mean,1.986,2.056,2.052
Sample2_mean,4.998,4.95,4.95
Difference_of_means,3.012,2.894,2.898
Difference_sample_1,0.0384,-0.0316,-0.0276
Difference_sample_2,0.0065,0.0545,0.0545
Sample1_StD,1.28445,1.29417,1.29665
Sample2_StD,1.56397,1.55612,1.56764
T-Value,47.06364902889528,45.21674090233877,45.04653295896167


In [10]:
sample_data_3

Unnamed: 0,pop_size_20-1,pop_size_20-2,pop_size_20-3
Pop1_actual_mean,2.0244,2.0244,2.0244
Pop2_actual_mean,5.0045,5.0045,5.0045
Sample1_mean,2.15,2.4,2.15
Sample2_mean,4.85,4.75,4.75
Difference_of_means,2.7,2.35,2.6
Difference_sample_1,-0.1256,-0.3756,-0.1256
Difference_sample_2,0.1545,0.2545,0.2545
Sample1_StD,1.49248,1.31909,1.27574
Sample2_StD,1.23592,1.60857,1.08972
T-Value,6.231230137618311,5.052011399399741,6.930253904652302


In [11]:
pop_data_1

Unnamed: 0,prob_0.3-1,prob_0.3-2,prob_0.3-3
Pop1_actual_mean,3.0072,3.0072,3.0072
Pop2_actual_mean,5.0045,5.0045,5.0045
Sample1_mean,3.01,2.918,3.07
Sample2_mean,4.956,5.022,5.052
Difference_of_means,1.946,2.104,1.982
Difference_sample_1,-0.0028,0.0892,-0.0628
Difference_sample_2,0.0485,-0.0175,-0.0475
Sample1_StD,1.45805,1.37742,1.48765
Sample2_StD,1.59313,1.57845,1.60975
T-Value,28.49477559725672,31.75946925668111,28.59461488421066


In [12]:
pop_data_2

Unnamed: 0,prob_0.4-1,prob_0.4-2,prob_0.4-3
Pop1_actual_mean,3.9945,3.9945,3.9945
Pop2_actual_mean,5.0045,5.0045,5.0045
Sample1_mean,4.013,4.015,4.021
Sample2_mean,4.951,4.934,4.947
Difference_of_means,0.938,0.919,0.926
Difference_sample_1,-0.0185,-0.0205,-0.0265
Difference_sample_2,0.0535,0.0705,0.0575
Sample1_StD,1.52998,1.5706,1.55967
Sample2_StD,1.59518,1.57849,1.54538
T-Value,13.419966783913488,13.051003311255768,13.336862745719053


# Changing Sample Size from 100 to 1000 to 20

What values change when changing population size samples:
Changing population


# Changed probability value p from 0.2 to 0.3 to 0.4

In [15]:
pop_size = 10000
pop1 = Population_Gamma(0.2, pop_size)
pop2 = Population_Gamma(0.5, pop_size)

gamma_data_1 = pd.DataFrame(index=description)

Compile(pop1, pop2, 100, gamma_data_1, 'pop_size_100-1')
Compile(pop1, pop2, 100, gamma_data_1, 'pop_size_100-2')
Compile(pop1, pop2, 100, gamma_data_1, 'pop_size_100-3')

In [19]:
gamma_data_2 = pd.DataFrame(index=description)

Compile(pop1, pop2, 100, gamma_data_2, 'pop_size_100-1')
Compile(pop1, pop2, 100, gamma_data_2, 'pop_size_100-2')
Compile(pop1, pop2, 100, gamma_data_2, 'pop_size_100-3')

In [17]:
gamma_data_1

Unnamed: 0,pop_size_100-1,pop_size_100-2,pop_size_100-3
Pop1_actual_mean,2.00701,2.00701,2.00701
Pop2_actual_mean,4.96586,4.96586,4.96586
Sample1_mean,1.99723,1.96552,2.07775
Sample2_mean,5.03955,4.97307,4.78765
Difference_of_means,3.04232,3.00755,2.70989
Difference_sample_1,0.00977779,0.0414884,-0.0707447
Difference_sample_2,-0.073695,-0.00721821,0.178209
Sample1_StD,0.581692,0.68722,0.579178
Sample2_StD,1.577,1.32509,1.37111
T-Value,18.099757655655207,20.148474551057166,18.20657164994752


In [18]:
gamma_data_2

Unnamed: 0,pop_size_100-1,pop_size_100-2,pop_size_100-3
Pop1_actual_mean,2.00701,2.00701,2.00701
Pop2_actual_mean,4.96586,4.96586,4.96586
Sample1_mean,1.99862,2.04593,1.96485
Sample2_mean,4.72228,5.11505,5.02289
Difference_of_means,2.72366,3.06912,3.05804
Difference_sample_1,0.00838554,-0.0389246,0.0421618
Difference_sample_2,0.243578,-0.149198,-0.0570329
Sample1_StD,0.660236,0.606039,0.586193
Sample2_StD,1.38706,1.62561,1.5414
T-Value,17.73008281679263,17.690448200159764,18.54366644876306
