In [10]:
from scipy.stats import ttest_1samp
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

# One sample t-test

In [7]:
df = pd.read_csv("data/PlantGrowth.csv")
weight = df['weight']
print(weight[1:10])
weight_mean = np.mean(weight)
print(weight_mean)

1    5.58
2    5.18
3    6.11
4    4.50
5    4.61
6    5.17
7    4.53
8    5.33
9    5.14
Name: weight, dtype: float64
5.073000000000001


In [9]:
tset, pval = ttest_1samp(weight, 4.6)
print("p-values",pval)
if pval < 0.05:    # alpha value is 0.05 or 5%
   print(" we are rejecting null hypothesis")
else:
  print("we are accepting null hypothesis")

p-values 0.0009101351370639487
 we are rejecting null hypothesis


# Two sampled T-test

In [11]:
df1 = pd.read_csv("data/PlantGrowth1.csv")
df2 = pd.read_csv("data/PlantGrowth2.csv")
pg1 = df1['weight']
pg2 = df2['weight']
pg1_mean = np.mean(pg1)
pg2_mean = np.mean(pg2)
print("pg1 mean value:",pg1_mean)
print("pg2 mean value:",pg2_mean)
pg1_std = np.std(pg1)
pg2_std = np.std(pg2)
print("week1 std value:",pg1_std)
print("week2 std value:",pg2_std)
ttest,pval = ttest_ind(pg1,pg2)
print("p-value",pval)
if pval <0.05:
  print("we reject null hypothesis")
else:
  print("we accept null hypothesis")

pg1 mean value: 4.807142857142858
pg2 mean value: 5.268
week1 std value: 0.6330586586613617
week2 std value: 0.655878037442938
p-value 0.07449555858109919
we accept null hypothesis


# Paired(dependent ) sampled t-test·1
It’s an uni variate test that tests for a significant difference between 2 related variables. 

In [12]:
import pandas as pd
from scipy import stats
df = pd.read_csv("data/blood_pressure.csv")
df[['bp_before','bp_after']].describe()
ttest,pval = stats.ttest_rel(df['bp_before'], df['bp_after'])
print(pval)
if pval<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")

0.0011297914644840823
reject null hypothesis


### One-sample z test

In [14]:
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests
ztest ,pval = stests.ztest(df['bp_before'], x2=None, value=156)
print(float(pval))
if pval<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")

0.6651614730255063
accept null hypothesis


### Two-sample Z test
H0 : mean of two group is 0

H1 : mean of two group is not 0

In [15]:
ztest ,pval1 = stests.ztest(df['bp_before'], x2=df['bp_after'], value=0,alternative='two-sided')
print(float(pval1))
if pval<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")

0.002162306611369422
accept null hypothesis


### ANOVA (F-TEST)
comparing more than two groups at the same time

F = Between group variability / Within group variability

In [17]:
df_anova = pd.read_csv('data/PlantGrowth.csv')
df_anova = df_anova[['weight','group']]
grps = pd.unique(df_anova.group.values)
d_data = {grp:df_anova['weight'][df_anova.group == grp] for grp in grps}
 
F, p = stats.f_oneway(d_data['ctrl'], d_data['trt1'], d_data['trt2'])
print("p-value for significance is: ", p)
if p<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")

p-value for significance is:  0.0159099583256229
reject null hypothesis


### Two Way F-test 

In [18]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
df_anova2 = pd.read_csv("https://raw.githubusercontent.com/Opensourcefordatascience/Data-sets/master/crop_yield.csv")
model = ols('Yield ~ C(Fert)*C(Water)', df_anova2).fit()
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}")
res = sm.stats.anova_lm(model, typ= 2)
res

Overall model F( 3, 16) =  4.112, p =  0.0243


Unnamed: 0,sum_sq,df,F,PR(>F)
C(Fert),69.192,1.0,5.766,0.028847
C(Water),63.368,1.0,5.280667,0.035386
C(Fert):C(Water),15.488,1.0,1.290667,0.272656
Residual,192.0,16.0,,


### Chi-Square Test (categorical variables)
The test is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables.

In [22]:
df_chi = pd.read_csv('data/chi-test.csv')
df_chi

Unnamed: 0,Gender,Shopping?
0,Male,No
1,Female,Yes
2,Male,Yes
3,Female,Yes
4,Female,Yes
5,Male,Yes
6,Male,No
7,Female,No
8,Female,No


In [23]:
contingency_table=pd.crosstab(df_chi["Gender"],df_chi["Shopping?"])
print('contingency_table :-\n',contingency_table)
#Observed Values
Observed_Values = contingency_table.values 
print("Observed Values :-\n",Observed_Values)
b=stats.chi2_contingency(contingency_table)
Expected_Values = b[3]
print("Expected Values :-\n",Expected_Values)
no_of_rows=len(contingency_table.iloc[0:2,0])
no_of_columns=len(contingency_table.iloc[0,0:2])
ddof=(no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:-",ddof)
alpha = 0.05
from scipy.stats import chi2
chi_square=sum([(o-e)**2./e for o,e in zip(Observed_Values,Expected_Values)])
chi_square_statistic=chi_square[0]+chi_square[1]
print("chi-square statistic:-",chi_square_statistic)
critical_value=chi2.ppf(q=1-alpha,df=ddof)
print('critical_value:',critical_value)
#p-value
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)
print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',ddof)
print('chi-square statistic:',chi_square_statistic)
print('critical_value:',critical_value)
print('p-value:',p_value)
if chi_square_statistic>=critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")
    
if p_value<=alpha:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")

contingency_table :-
 Shopping?  No  Yes
Gender            
Female      2    3
Male        2    2
Observed Values :-
 [[2 3]
 [2 2]]
Expected Values :-
 [[2.22222222 2.77777778]
 [1.77777778 2.22222222]]
Degree of Freedom:- 1
chi-square statistic:- 0.09000000000000008
critical_value: 3.841458820694124
p-value: 0.7641771556220945
Significance level:  0.05
Degree of Freedom:  1
chi-square statistic: 0.09000000000000008
critical_value: 3.841458820694124
p-value: 0.7641771556220945
Retain H0,There is no relationship between 2 categorical variables
Retain H0,There is no relationship between 2 categorical variables
