 ## SAMPLE T TEST ##

In [1]:
# Import library
import scipy.stats as stats
from statsmodels.stats.weightstats import ttest_ind
import numpy as np

# Creating data arrays
data_array1 = np.array([14, 15, 15, 16, 13, 8, 14, 17, 16, 14, 19, 20, 21, 15, 15, 16, 16, 13, 14, 12])
data_array2 = np.array([15, 17, 14, 17, 14, 8, 12, 19, 19, 14, 17, 22, 24, 16, 13, 16, 13, 18, 15, 13])

# Function to check variance
def variance_Check():
    # Print the variance of the arrays (each data group) 
    print(np.var(data_array1), np.var(data_array2))
    
# Function to perform T-Test using SciPy Library
def TTest_SciPy():
    # Perform the two sample t-test with equal variances using SciPy (Scientific Python Library)
    result1 = stats.ttest_ind(a=data_array1, b=data_array2, equal_var=True)
    # Print the result
    print(result1)

# Function to perform T-Test using Statsmodel package
def TTest_Statsmodels():
    # Perform the two sample t-test with equal variances using Statsmodels
    result2 = ttest_ind(data_array1, data_array2)
    # Print the result
    print(result2)

In [2]:
# Method/Function call
variance_Check()
# Inference: 
# Ratio of large dataset variance to small dataset variance less than 4:1---> Same Variance (One of the assumptions for T-Test satisfied (Here: 12.26/7.7275 ~ 1.58:1 which is less than 4:1) ). 
# Note: There are 3 asumptions for T-test/Anova: 1. Independent samples 2. Data elements in each dataset follows Normal Distribution or atleast 30 samples needed 3. Same Variance (Homegenity Test))
# Perform the two sample t-test with equal variances


7.727500000000001 12.260000000000002


In [3]:
# Method/Function call
TTest_SciPy()
# Inference p(0.53) > P(0.5), So Null Hypothesis cannot be rejected; There is no significant evidence that the data groups are of Independent samples

TtestResult(statistic=-0.6337397070250238, pvalue=0.5300471010405257, df=38.0)


In [4]:
# Method/Function call
TTest_Statsmodels()
# Inference p(0.53) > P(0.5), So Null Hypothesis cannot be rejected; There is no significant evidence that the data groups are of Independent samples

(-0.6337397070250238, 0.5300471010405257, 38.0)


## Class Task 1 ##
**T Test:** (Two Sample Method)
**Method 1:** Using Scipy library
**Method 2:** Using Statistical Model Library- Compute Statistical models and conduct statistical tests- R based Modules and DataFrames
**Step 1:** 1. Load Data
**Step 2:** 2. Perform T-Tests

In [5]:
# Steps to follow (Time to complete the task: 15 mins
# 1. Import necessary libraries (scipy.stats and ttest_ind from statsmodels)
import pandas as pd

# 2. Create a method "read_csv()" to read the dataset V1_InferentialStats.csv file
df = pd.DataFrame(pd.read_csv('../Dataset/V1_InferentialStats.csv'))
df_clean = df.drop(index = df.index[8:10],columns=df.columns[3:10])
df_clean

# Ratio of large dataset variance to small dataset variance less than 4:1---> Same Variance (One of the assumptions for T-Test satisfied. 
# Note: There are 3 asumptions for T-test/Anova: 1. Independent samples 2. Data elements in each dataset follows Normal Distribution or atleast 30 samples needed 3. Same Variance (Homegenity Test))

# 3. Create a method "variance_Check()" to check variance
# Function to check variance
def variance_Check():
    # Print the variance of the arrays (each data group)
    print(np.var(df_clean['Caffeine'].values), np.var(df_clean['No_Caffeine'].values))


# 4. Perform the two sample t-test with equal variances through both the methods 1. scipy.stats and 2. ttest_ind from statsmodels. 
# Note: Create one function for each with function names TTest_SciPy() and TTest_Statsmodels() respectively
variance_Check()
# Function to perform T-Test using SciPy Library
def TTest_SciPy():
    # Perform the two sample t-test with equal variances using SciPy (Scientific Python Library)
    result1 = stats.ttest_ind(a=df_clean['Caffeine'].values, b=df_clean['No_Caffeine'].values, equal_var=True)
    # Print the result
    print(result1)

# Function to perform T-Test using Statsmodel package
def TTest_Statsmodels():
    # Perform the two sample t-test with equal variances using Statsmodels
    result2 = ttest_ind(df_clean['Caffeine'].values, df_clean['No_Caffeine'].values)
    # Print the result
    print(result2)

TTest_SciPy()
TTest_Statsmodels()
# 4. Print the results

# 5. Infer the results

41465.234375 26080.859375
TtestResult(statistic=-0.44537615142807846, pvalue=0.6628572422578224, df=14.0)
(-0.44537615142807846, 0.6628572422578224, 14.0)


## SAMPLE ANOVA ##

In [6]:
# Importing library
from scipy.stats import f_oneway

# Performance of students based on attendance among 4 disciplines
BSCDSAI = [89, 89, 88, 78, 79]
BSCCS = [93, 92, 94, 89, 88]
MSCDSAI= [89, 88, 89, 93, 90]
MSCCS = [81, 78, 81, 92, 82]

# Conduct the one-way ANOVA
f_oneway(BSCDSAI,BSCCS,MSCDSAI,MSCCS)
# Inference pvalue .01 < 0.5, Hence Reject Null Hypothesis, 
# Thus there is evidence of improvement in student performance on regular attendence in class


F_onewayResult(statistic=4.625000000000002, pvalue=0.01633645983978022)

## Class Task 2 ##
### ANOVA ###

In [7]:
# F statistic calculation: Step I #
# Model sum of squares (SSM (21.13): how much variance is explained by the differences between groups; 
# Difference between each group mean and the grand mean, Square, multiply count, sum)  #
# F statistic calculation: Step II #
# SSR (9.95): difference between the group means and the individual data points, squared and summed #
# F statistic calculation: Step III #
# F (2,10)= MSM / MSR (variance); 10.6 F>4.10 Hence Significant p<.05

### Class Task ###
### 1. Create Data for the values in PPT (GroupA, GroupB and GroupC) using List ###
### 2. Perform One way ANOVA ###


In [8]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import statistics

GroupA = [3,5,4,4,2]
GroupB = [2,3,2,1]
GroupC = [5,6,4,6]

a_mean = statistics.mean(GroupA)
b_mean = statistics.mean(GroupB)
c_mean = statistics.mean(GroupC)
grand_mean = (a_mean + b_mean + c_mean) / 3


SSM = ((a_mean -grand_mean)**2 * len(GroupA)) + (b_mean - grand_mean)**2 * len(GroupB) + (c_mean - grand_mean)**2 * len(GroupC)
print(SSM)

A_SSr = (GroupA[0]-a_mean)**2 + (GroupA[1]-a_mean)**2 + (GroupA[2]-a_mean)**2   + (GroupA[3]-a_mean)**2 + (GroupA[4]-a_mean)**2

B_SSr = (GroupB[0]-b_mean)**2 + (GroupB[1]-b_mean)**2 + (GroupB[2]-b_mean)**2   + (GroupB[3]-b_mean)**2

C_SSr = (GroupC[0]-c_mean)**2 + (GroupC[1]-c_mean)**2 + (GroupC[2]-c_mean)**2   + (GroupC[3]-c_mean)**2

SSR = A_SSr + B_SSr + C_SSr

print(SSR)

MSM = SSM / (3-1)
MSR = SSR / (13-1 - (3-1))

F = MSM / MSR

print(F)
print(f'F(2,10) = {F} > 4.10 -> Signifiant influence of groups to each other  p <.05')


f_oneway(GroupA, GroupB, GroupC)

21.126944444444444
9.95
10.616554997208265
F(2,10) = 10.616554997208265 > 4.10 -> Signifiant influence of groups to each other  p <.05


F_onewayResult(statistic=10.616544259760337, pvalue=0.003364537240814761)

## SAMPLE: TWO WAY ANOVA ##

**Story line:** Steps to perform a two-way ANOVA to determine if drinking water habits and intensity of exercises have a significant effect on child growth, and to determine if there is any interaction effect between drinking water habits and intensity of exercises

In [9]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

#create data
df = pd.DataFrame({'Drink_Water': np.repeat(['once_3hours', 'once_6hours'], 15),
                   'Exercises': np.tile(np.repeat(['low', 'med', 'high'], 5), 2),
                   'Growth': [6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
                              6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
                              4, 4, 4, 4, 4, 5, 6, 6, 7, 8]})

#view first ten rows of data

FactorA = [df[(df['Drink_Water']=='once_3hours') & (df['Exercises']=='low')]['Growth'],
           df[(df['Drink_Water']=='once_3hours') & (df['Exercises']=='med')]['Growth'],
           df[(df['Drink_Water']=='once_3hours') & (df['Exercises']=='high')]['Growth']]

FactorB = [df[(df['Drink_Water']=='once_6hours') & (df['Exercises']=='low')]['Growth'],
           df[(df['Drink_Water']=='once_6hours') & (df['Exercises']=='med')]['Growth'],
           df[(df['Drink_Water']=='once_6hours') & (df['Exercises']=='high')]['Growth']]


mean_A = [group.mean() for group in FactorA]
mean_B = [group.mean() for group in FactorB]

grand_mean = (sum(mean_A) + sum(mean_B)) / (len(mean_A) + len(mean_B))

SSA = sum([(group - mean_A[i]).pow(2).sum() for i, group in enumerate(FactorA)])
SSB = sum([(group - mean_B[i]).pow(2).sum() for i, group in enumerate(FactorB)])


SSAB = 0
SSR = 0

for i in range(len(FactorA)):
    for j in range(len(FactorB)):
        SSAB += (FactorA[i] - mean_A[i]).pow(2).sum() + (FactorB[j] - mean_B[j]).pow(2).sum() - (FactorA[i] - mean_A[i]).pow(2).sum() - (FactorB[j] - mean_B[j]).pow(2).sum()

SSR = SSA + SSB + SSAB

df_A = len(mean_A) - 1

df_B = len(mean_B) - 1

df_AB = df_A * df_B

df_R = len(mean_A) * len(mean_B) - df_AB

MSA = SSA / df_A

MSB = SSB / df_B

MSAB = SSAB / df_AB

MSR = SSR / df_R

F_A = MSA / MSR
F_B = MSB / MSR
F_AB = MSAB / MSR

# Calculate p-values

p_value_A = 1 - stats.f.cdf(F_A, df_A, df_R)
p_value_B = 1 - stats.f.cdf(F_B, df_B, df_R)
p_value_AB = 1 - stats.f.cdf(F_AB, df_AB, df_R)

# Print results
print("Factor A F-statistic:", F_A)
print("Factor A p-value:", p_value_A)
print("Factor B F-statistic:", F_B)
print("Factor B p-value:", p_value_B)
print("Interaction F-statistic:", F_AB)
print("Interaction p-value:", p_value_AB)


#perform two-way ANOVA
model = ols('Growth ~ C(Drink_Water) + C(Exercises) + C(Drink_Water):C(Exercises)', data=df).fit()
sm.stats.anova_lm(model, typ=2)
# Since the p-values for Drink_Water and Exercises are both less than .05 (0.000527 and 0.000002 respectively), 
# this means that both factors have a statistically significant effect on Child Growth
# But, p-value for the interaction effect (.120667) is not less than .05, 
# this tells us that there is no significant interaction effect between Drinking water habits and Exercises.
# Note: Although the ANOVA results tell us that Drinking water habits and Exercises have a statistically significant effect on Child Growth, 
# we would need to perform post-hoc tests to determine exactly how different Drinking water habits and Exercise intensity affect Child Growth.

Factor A F-statistic: 1.09375
Factor A p-value: 0.4036271455471542
Factor B F-statistic: 1.40625
Factor B p-value: 0.32767999999999997
Interaction F-statistic: 0.0
Interaction p-value: 1.0


Unnamed: 0,sum_sq,df,F,PR(>F)
C(Drink_Water),8.533333,1.0,16.0,0.000527
C(Exercises),24.866667,2.0,23.3125,2e-06
C(Drink_Water):C(Exercises),2.466667,2.0,2.3125,0.120667
Residual,12.8,24.0,,


In [10]:
df

Unnamed: 0,Drink_Water,Exercises,Growth
0,once_3hours,low,6
1,once_3hours,low,6
2,once_3hours,low,6
3,once_3hours,low,5
4,once_3hours,low,6
5,once_3hours,med,5
6,once_3hours,med,5
7,once_3hours,med,6
8,once_3hours,med,4
9,once_3hours,med,5


### Class Task ###

## Which ANOVA? ##
## TRY ##
Suppose a researcher wants to determine if two training programs lead to different mean improvements in jumping height among college basketball players.

The researcher suspects that gender and division (Division I or II) may also affect jumping height so he collects data for these factors as well.

His goal is to perform a ?????? ANOVA to determine how training program, gender, and division affect jumping height.


Which Anova?: One or Two or Three ?
We are interested to find if two different training styles lead to different mean improvements in success ratio of A-Level exams among high school students
There is also a thought that may be gender and birth year (Before 2010 or After 2010) may also affect success ratio of A-Level exams. so we are collecting data for these factors as well.
Our goal is to perform a ???????(Find which ANOVA?) ANOVA to determine how training style, gender, and birth year affect success ratio.

In [11]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

#create DataFrame
df = pd.DataFrame({'Training_Style': np.repeat([1, 2], 20),
                   'gender': np.tile(np.repeat(['M', 'F'], 10), 2),
                   'Birth_Year': np.tile(np.repeat([1, 2], 5), 4),
                   'Grades': [7, 7, 8, 8, 7, 6, 6, 5, 6, 5,
                              5, 5, 4, 5, 4, 3, 3, 4, 3, 3,
                              6, 6, 5, 4, 5, 4, 5, 4, 4, 3,
                              2, 2, 1, 4, 4, 2, 1, 1, 2, 1]})
#perform ????? ANOVA. Note: In Birth year Before 2010 is denoted by 1 and After 2010 is denoted by 2
# Fill the "?" marks appropriately
# model = ols('Growth ~ C(Drink_Water) + C(Exercises) + C(Drink_Water):C(Exercises)', data=df).fit()
model = ols('Grades ~ C(Training_Style) + C(gender) + C(Birth_Year) + C(Training_Style):C(gender) + C(Training_Style):C(Birth_Year) + C(Birth_Year):C(gender) + C(Training_Style):C(gender):C(Birth_Year)', data=df).fit()
sm.stats.anova_lm(model, typ=3)


Unnamed: 0,sum_sq,df,F,PR(>F)
Intercept,105.8,1.0,192.363636,4.41008e-15
C(Training_Style),10.0,1.0,18.181818,0.0001658997
C(gender),19.6,1.0,35.636364,1.185218e-06
C(Birth_Year),4.9,1.0,8.909091,0.005399881
C(Training_Style):C(gender),0.05,1.0,0.090909,0.7649753
C(Training_Style):C(Birth_Year),0.05,1.0,0.090909,0.7649753
C(Birth_Year):C(gender),0.2,1.0,0.363636,0.5507439
C(Training_Style):C(gender):C(Birth_Year),0.1,1.0,0.181818,0.6726702
Residual,17.6,32.0,,
