# Assignment 4

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

from scipy import stats
from math import sqrt
from statsmodels.formula.api import ols

# Question 1
Is gender independent of education level? A random sample of 395 people were
surveyed and each person was asked to report the highest education level they
obtained. The data that resulted from the survey is summarized in the following table:

        High School Bachelors Masters Ph.d. Total
Female   60   54   46   41   201

Male     40   44   53   57   194

Total   100   98   99   98   395

Question: Are gender and education level dependent at 5% level of significance? In
other words, given the data collected above, is there a relationship between the
gender of an individual and the level of education that they have obtained?

In [2]:
## H0: there exist a relationship between gender and level of education
## H1: there is no realtion between gender and level of education
## Significance level = 5%

In [3]:
data = {'High school': [60,40],'Bachelors': [54,44],'Masters':[46,53], 'Ph.d':[41,57]}
indexes = ['Female', 'Male']
df = pd.DataFrame(data,index=indexes)
df

Unnamed: 0,High school,Bachelors,Masters,Ph.d
Female,60,54,46,41
Male,40,44,53,57


In [4]:
chi_stat,p,dof,expected = stats.chi2_contingency(df)

print(f"The chi_score for the data is {chi_stat}")
print(f"The p value for the data is {p}")
print(f"The degree of freedom for the data is {dof}")

The chi_score for the data is 8.006066246262538
The p value for the data is 0.045886500891747214
The degree of freedom for the data is 3


In [5]:
## Evaluating the chi_critical
prob = 0.95
chi_critical = stats.chi2.ppf(prob,dof)

print(f"The critical value is {chi_critical}")

if chi_stat < chi_critical:
    print("The null hypothesis is accepted...")
    print("There exist a relationship between gender and level of education")
else:
    print("The null hypothesis is rejected...")
    print("There is no relationship between gender and level of education")

The critical value is 7.814727903251179
The null hypothesis is rejected...
There is no relationship between gender and level of education


# Question 2
Using the following data, perform a oneway analysis of variance using α=.05. Write
up the results in APA format.
[Group1: 51, 45, 33, 45, 67]
[Group2: 23, 43, 23, 43, 45]
[Group3: 56, 76, 74, 87, 56]

In [6]:
group1 = [51, 45, 33, 45, 67]
group2 = [23, 43, 23, 43, 45]
group3 = [56, 76, 74, 87, 56]

In [7]:
## H0: All groups have equal means
## H1: The means of each group is significantly different
## Signifcance level = 0.05

In [8]:
# Creating a dataframe from the given data

groups = (['group1'] * len(group1)) + (['group2'] * len(group2)) + (['group3'] * len(group3))
scores = group1 + group2 + group3

df = pd.DataFrame({'group':groups,'score':scores})
df

Unnamed: 0,group,score
0,group1,51
1,group1,45
2,group1,33
3,group1,45
4,group1,67
5,group2,23
6,group2,43
7,group2,23
8,group2,43
9,group2,45


In [9]:
df.groupby('group').mean()

Unnamed: 0_level_0,score
group,Unnamed: 1_level_1
group1,48.2
group2,35.4
group3,69.8


### Method 1
### One-way Anova using statsmodels

In [10]:
lm = ols('score~group',df).fit()
output = sm.stats.anova_lm(lm)
output

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
group,2.0,3022.933333,1511.466667,9.747206,0.00306
Residual,12.0,1860.8,155.066667,,


## OR

In [11]:
stats.f_oneway(df['score'][df['group'] == 'group1'],
               df['score'][df['group'] == 'group2'],
               df['score'][df['group'] == 'group3'])

F_onewayResult(statistic=9.747205503009463, pvalue=0.0030597541434430556)

### Method 2
### All calculations from scratch

In [12]:
k = len(set(df['group']))  #3 ## Number of groups
n = df.shape[0]   #15 ## Number of observations
DFbetween = k - 1
DFwithin = n - k
#DFtotal = 

In [13]:
grand_mean = df['score'].mean()
grand_mean

51.13333333333333

In [14]:
df['grand_mean'] = round(grand_mean,2)
df

Unnamed: 0,group,score,grand_mean
0,group1,51,51.13
1,group1,45,51.13
2,group1,33,51.13
3,group1,45,51.13
4,group1,67,51.13
5,group2,23,51.13
6,group2,43,51.13
7,group2,23,51.13
8,group2,43,51.13
9,group2,45,51.13


In [15]:
group_means = df.groupby('group').mean()
new_columns = ['group_mean','grand_mean']
group_means.columns = new_columns

In [16]:
group_means

Unnamed: 0_level_0,group_mean,grand_mean
group,Unnamed: 1_level_1,Unnamed: 2_level_1
group1,48.2,51.13
group2,35.4,51.13
group3,69.8,51.13


In [18]:
df = df.merge(group_means,left_on='group',right_index=True)
df.drop('grand_mean_y', axis=1, inplace=True)
df = df.rename(columns = {'grand_mean_x':'grand_mean'})
df

Unnamed: 0,group,score,grand_mean,group_mean
0,group1,51,51.13,48.2
1,group1,45,51.13,48.2
2,group1,33,51.13,48.2
3,group1,45,51.13,48.2
4,group1,67,51.13,48.2
5,group2,23,51.13,35.4
6,group2,43,51.13,35.4
7,group2,23,51.13,35.4
8,group2,43,51.13,35.4
9,group2,45,51.13,35.4


In [19]:
SSbetween = sum((df['group_mean'] - df['grand_mean']) ** 2)
MSbetween = SSbetween / (k - 1)

In [20]:
SSwithin = sum((df['score'] - df['group_mean']) ** 2)
MSwithin= SSwithin / (n - k)

In [21]:
f = MSbetween / MSwithin
f

9.747206040412724

In [22]:
p_value = 1 - stats.f.cdf(f,DFbetween,DFwithin)
p_value

0.003059753516923669

In [23]:
## Effect size, eta_square
eta_square = SSbetween / (SSbetween + SSwithin)
eta_square

0.6189800282918796

In [24]:
f_critical = stats.f.ppf(1 - 0.05,DFbetween,DFwithin)

In [25]:
if f < f_critical:
    print("Retain the null hyothesis.")
    print("All groups have equal means")
else:
    print("Null hypothesis is rejected.")
    print("The means of each group is significantly different")

Null hypothesis is rejected.
The means of each group is significantly different


## APA Format
### F(2,12)=9.75, p<0.05, eta_square = 0.62

# Question 3
Calculate F Test for given 10, 20, 30, 40, 50 and 5,10,15, 20, 25.

In [26]:
## H0: The means are equal
## H1: The means are not equal

In [27]:
data1 = [10,20,30,40,50]
data2 = [5,10,15,20,25]

In [28]:
var1 = np.var([10,20,30,40,50])
var2 = np.var([5,10,15,20,25])
df1 = 4
df2 = 4

In [29]:
print(var1, var2)

200.0 50.0


In [30]:
f_score = var1/var2
f_score

4.0

In [31]:
f_critical = stats.f.ppf(1-0.05,df1,df2)
f_critical

6.388232908695868

In [32]:
if f_score < f_critical:
    print("Retain the null hyothesis.")
else:
    print("Null hypothesis is rejected.")

Retain the null hyothesis.
