In [1]:
'''Is gender independent of education level? 
   A random sample of 395 people were surveyed and each person was asked to report the highest education level 
   they obtained. The data that resulted from the survey is summarized in the following table:

            High School Bachelors Masters Ph.d. Total
    Female           60     54     46     41     201 
    Male             40     44     53     57     194
    Total           100     98     99     98     395

Question: Are gender and education level dependent at 5% level of significance? 
In other words, given the data collected above, is there a relationship between the gender of an individual 
and the level of education that they have obtained?'''

# This is a chi-square test of independence between two categorical variables
# (gender x education level), done twice: by hand and with scipy.

import pandas as pd
import numpy as np
from scipy import stats

# State the Hypothesis
# Null Hypothesis - There is no relationship between the gender of an individual and the level of education
# Alternate Hypothesis - There is a relationship between the gender of an individual and the level of education

# Given level of significance, alpha = 5%
# Rows of df1 are the education levels in table order (High School, Bachelors,
# Masters, Ph.d.); columns are the two genders.
alpha = 0.05
df1 = pd.DataFrame({'Female': [60, 54, 46, 41], 'Male': [40, 44, 53, 57]})

# Sample size 'n' and every marginal total are derived from the observed
# counts instead of being typed in a second time, so they cannot drift away
# from the data in df1.
n = int(df1.to_numpy().sum())
print('Given Data in a dataframe : \n', '*'*24,'\n', df1,'\n')

# Solution 1 - Find the chisquare value using calculations
# Expected count for a cell = (gender total * education-level total) / sample size
#   Expected values Females = (Total number of Females * Total in each category) / Sample Size
#   Expected values Males   = (Total number of Males   * Total in each category) / Sample Size
# Degree of Freedom = (number of rows - 1) * (number of columns - 1) = (4-1)*(2-1) = 3

tot_F = int(df1['Female'].sum())
tot_M = int(df1['Male'].sum())
total = df1.sum(axis=1).to_numpy()          # per-education-level totals
chi_dof = (df1.shape[0] - 1) * (df1.shape[1] - 1)

EVal_F = (tot_F * total) / n
EVal_M = (tot_M * total) / n
print('Expected Values of the Female and Male Population \n', EVal_F,'\n', EVal_M,'\n')

# Get the chisquare value
# Sum over all cells of (Observed Value - Expected Value)**2 / Expected Value
Chisq = np.sum((df1['Female'] - EVal_F)**2 / EVal_F)
Chisq_M = np.sum((df1['Male'] - EVal_M)**2 / EVal_M)
chisq_T = (Chisq + Chisq_M)

# Critical value: the point of the chi-square distribution with chi_dof degrees
# of freedom that leaves probability alpha in the upper tail.
chisq_D = stats.chi2.isf(q=alpha, df=chi_dof)
print('Chisquare Value of the distribution       = ', chisq_T)
print('P Value of the calculated chisquare value = ', stats.chi2.sf(chisq_T, chi_dof))
print('Chisquare value for degree of freedom 3   = ', chisq_D)

# Decision rule: reject the Null Hypothesis when the calculated statistic
# exceeds the critical value (equivalently, when the p-value is below alpha).
# For this data the statistic (about 8.006) is greater than the critical value
# (about 7.815), so the Null Hypothesis is rejected at the 5% level.
if chisq_T > chisq_D:
    print('\nChisquare Value is greater than the chisquare of degree of freedom 3. So reject the Null Hypothesis')
    print('There is a relationship between the gender and the level of eductaion')
else:
    print('Chisquare Value is less than the chisquare of degree of freedom 3. So fail to reject the Null Hypothesis')
    print('There is no relationship between the gender and the level of eductaion')


# Solution 2 - Find the chisquare value and p-value using chisquare test from scipy stats module.
# It gives the chisquare value with the P value, degree of freedom and the expected Values.
# Left as the last expression of the cell so the notebook displays the result.

stats.chi2_contingency(df1)


Given Data in a dataframe : 
 ************************ 
    Female  Male
0      60    40
1      54    44
2      46    53
3      41    57 

Expected Values of the Female and Male Population 
 [50.88607595 49.86835443 50.37721519 49.86835443] 
 [49.11392405 48.13164557 48.62278481 48.13164557] 

Chisquare Value of the distribution       =  8.006066246262538
P Value of the calculated chisquare value =  0.045886500891747214
Chisquare value for degree of freedom 3   =  7.814727903251178

Chisquare Value is greater than the chisquare of degree of freedom 3. So reject the Null Hypothesis
There is a relationship between the gender and the level of eductaion


(8.006066246262538, 0.045886500891747214, 3, array([[50.88607595, 49.11392405],
        [49.86835443, 48.13164557],
        [50.37721519, 48.62278481],
        [49.86835443, 48.13164557]]))

In [2]:
'''Using the following data, perform a oneway analysis of variance using α=.05. Write up the results in APA format.

[Group1: 51, 45, 33, 45, 67] 
[Group2: 23, 43, 23, 43, 45] 
[Group3: 56, 76, 74, 87, 56]
'''

# One-way ANOVA worked out by hand for three equally sized groups.
# Relies on `pd` and `stats` imported in the first cell of the notebook.

# Number of observations per group - n = 5; number of groups = 3
n = 5
n_groups = 3
alpha = 0.05
df2=pd.DataFrame({'Group1': [51,45,33,45,67]})
df3=pd.DataFrame({'Group2': [23,43,23,43,45]})                  
df4=pd.DataFrame({'Group3': [56,76,74,87,56]})

# For every group add the group mean, each observation's deviation from that
# mean, and the squared deviation, then print the resulting table.
for group_frame, group_col in ((df2, 'Group1'), (df3, 'Group2'), (df4, 'Group3')):
    group_frame['mean'] = group_frame[group_col].mean()
    group_frame['deviations'] = group_frame[group_col] - group_frame['mean']
    group_frame['sq deviations'] = group_frame['deviations']**2
    print('Dataframe for ' + group_col + '\n', '*'*19, '\n', group_frame,'\n')

# Get the Sum of the squares of the groups 
SS_Group1 = sum(df2['sq deviations'])
SS_Group2 = sum(df3['sq deviations'])
SS_Group3 = sum(df4['sq deviations'])

# Get the (sample) Variance of the groups 
Var1 = SS_Group1/(n-1)
Var2 = SS_Group2/(n-1)
Var3 = SS_Group3/(n-1)

# Within-group (error) term. MS_Err is the mean square error: with equal group
# sizes it is the plain average of the group variances. The printed label
# ("Mean Standard Error") is kept as-is to match the recorded output.
MS_Err = (Var1+Var2+Var3)/n_groups
print('Mean Standard Error = ', MS_Err)
dof = n_groups*n - n_groups          # error df = total observations - number of groups
print('Degree of freedom, Error = ', dof)
SS_Err = MS_Err * dof
print('Sum of Squares of Error = ', SS_Err)

# Get the mean, deviations and square of deviations of the group mean.
# The grand mean is the mean of the group means, which equals the mean of all
# observations only because every group has the same size n.
df5 = pd.DataFrame({'Group_mean': [df2['mean'][0], df3['mean'][0], df4['mean'][0]]})
df5['grand_mean'] = df5['Group_mean'].mean()
df5['deviations'] = df5['Group_mean'] - df5['grand_mean']
df5['sq deviations'] = df5['deviations']**2

print('Dataframe for group mean\n', '*'*23, '\n', df5,'\n')

# Between-group term: variance of the group means scaled by the group size.
dof_groups = (n_groups-1)
print ('Degree of freedom of groups = ', dof_groups)
Var_means = sum(df5['sq deviations'])/dof_groups
print('Variance of means = ', Var_means)
MS_Between = Var_means * n
print('Mean Standard Between = ', MS_Between)
SS_group = MS_Between * dof_groups
print('Sum of Squares group = ', SS_group)

# Get the F Value and its upper-tail probability under F(dof_groups, dof)
F = MS_Between / MS_Err
print('F Value = ' , F)
p_value = stats.f.sf(F, dof_groups, dof)

SS_Total = SS_Err + SS_group
print('Sum of Squares Total = ', SS_Total)

# Effect size (eta squared) = share of total variation explained by group
effect_size = SS_group / SS_Total
print ('Effect Size = ', effect_size)

print('\nAnova Table\n')
# The df column must follow the same row order as 'source':
# group -> dof_groups, error -> dof, total -> their sum.
df6 = pd.DataFrame({'source':['group', 'error', 'total'], 'SS':[SS_group,SS_Err,SS_Total], 
                    'df':[dof_groups, dof, dof_groups + dof], 'F': [F, ' ', ' ']})
print (df6)

print('\nAPA writeup\n', '*' * 9)
# Report the computed p-value and how it compares with alpha instead of
# asserting "p < alpha" unconditionally.
p_relation = '<' if p_value < alpha else '>='
print('F({},{}) = '.format(dof_groups, dof), round(F,2), ', p', p_relation, alpha,
      '(p = ', round(p_value,4), '), Effect Size = ', round(effect_size,2))


Dataframe for Group1
 ******************* 
    Group1  mean  deviations  sq deviations
0      51  48.2         2.8           7.84
1      45  48.2        -3.2          10.24
2      33  48.2       -15.2         231.04
3      45  48.2        -3.2          10.24
4      67  48.2        18.8         353.44 

Dataframe for Group2
 ******************* 
    Group2  mean  deviations  sq deviations
0      23  35.4       -12.4         153.76
1      43  35.4         7.6          57.76
2      23  35.4       -12.4         153.76
3      43  35.4         7.6          57.76
4      45  35.4         9.6          92.16 

Dataframe for Group3
 ******************* 
    Group3  mean  deviations  sq deviations
0      56  69.8       -13.8         190.44
1      76  69.8         6.2          38.44
2      74  69.8         4.2          17.64
3      87  69.8        17.2         295.84
4      56  69.8       -13.8         190.44 

Mean Standard Error =  155.06666666666666
Degree of freedom, Error =  12
Sum of Squares 

In [3]:
'''Calculate F Test for given 10, 20, 30, 40, 50 and 5,10,15, 20, 25. '''


def _report_sample(label, values):
    """Print the array, its mean, sample standard deviation and sample variance.

    Both spread measures use ddof=1 (divide by len(values) - 1).
    Returns the pair (standard deviation, variance).
    """
    spread = values.std(ddof=1)
    variance = values.var(ddof=1)
    print(label + ' Array: ', values)
    print('Mean of ' + label + ' Array = ', values.mean())
    print('Standard Deviation of ' + label + ' Array = ', spread)
    print('Variance of ' + label + ' Array = ', variance, '\n')
    return spread, variance


# First sample and its summary statistics
N1 = np.array([10,20,30,40,50])
Sd_N1, Var_N1 = _report_sample('First', N1)

# Second sample and its summary statistics
N2 = np.array([5,10,15,20,25])
Sd_N2, Var_N2 = _report_sample('Second', N2)


# The F value is the ratio of the two sample variances (first over second)
F_Test = Var_N1 / Var_N2
print('F Value of the series = ', F_Test)

First Array:  [10 20 30 40 50]
Mean of First Array =  30.0
Standard Deviation of First Array =  15.811388300841896
Variance of First Array =  250.0 

Second Array:  [ 5 10 15 20 25]
Mean of Second Array =  15.0
Standard Deviation of Second Array =  7.905694150420948
Variance of Second Array =  62.5 

F Value of the series =  4.0
