## Chi-Square Test:

The chi-square test of independence is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables.

In [1]:
import scipy.stats as stats

In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
# Load the 'tips' example dataset through seaborn; the cells below cross-tabulate its columns.
dataset=sns.load_dataset('tips')

In [3]:
# Preview the first rows. The goal of this notebook is to test whether the
# categorical columns `sex` and `smoker` are related.
dataset.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Contingency table of counts: one row per `sex` level, one column per `smoker` level.
dataset_table = pd.crosstab(index=dataset['sex'], columns=dataset['smoker'])
print(dataset_table)

smoker  Yes  No
sex            
Male     60  97
Female   33  54


In [5]:
# Raw counts of the contingency table as a NumPy array (row/column labels dropped).
dataset_table.values

array([[60, 97],
       [33, 54]])

In [24]:
# Observed cell counts as a plain NumPy array. `to_numpy()` is the accessor
# pandas recommends over the legacy `.values` attribute; the result is the same
# 2-D array of counts.
Observed_values = dataset_table.to_numpy()

In [25]:
# Show the observed counts that will be compared with the expected frequencies.
Observed_values

array([[60, 97],
       [33, 54]])

In [7]:
# Run SciPy's chi-square contingency test on the cross-tabulated counts.
# NOTE(review): the statistic=0.0 / pvalue=1.0 reported by this call differs from
# the hand-computed statistic (~0.0019) further down — presumably because
# chi2_contingency applies Yates' continuity correction by default when dof == 1.
# Confirm in the SciPy docs; passing correction=False should make the two agree.
val = stats.chi2_contingency(dataset_table)

In [9]:
# Inspect the full result object: statistic, p-value, dof and expected frequencies.
val

Chi2ContingencyResult(statistic=0.0, pvalue=1.0, dof=1, expected_freq=array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]]))

In [20]:
# Index 3 is the fourth field of the result, `expected_freq` (see the repr of `val`):
# the array of expected cell frequencies, same shape as the observed table.
Expected_values=val[3]

In [21]:
# Show the expected frequencies for comparison with the observed counts.
print(Expected_values)

[[59.84016393 97.15983607]
 [33.15983607 53.84016393]]


In [27]:
# Take the table dimensions from its shape rather than from hard-coded 0:2
# slices, so the degrees of freedom stay correct for an r x c table of any size.
no_of_rows, no_of_columns = dataset_table.shape

# Degrees of freedom for a test of independence: (rows - 1) * (columns - 1).
ddof = (no_of_rows - 1) * (no_of_columns - 1)
print("Degree of Freedom:-", ddof)

# Significance level used by the critical-value and p-value decision rules.
alpha = 0.05

Degree of Freedom:- 1


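# Chi-square statistic: $\chi^2 = \sum \frac{(O - E)^2}{E}$, where $O$ and $E$ are the observed and expected counts in each cell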

In [29]:
from scipy.stats import chi2  # kept in this cell: later cells call chi2.ppf / chi2.cdf

# Pearson's chi-square statistic: the sum over every cell of
# (observed - expected)^2 / expected.
# Computed with array arithmetic so it works for any number of rows and columns;
# the previous form added exactly two column totals and so only suited two-column tables.
cell_contributions = (Observed_values - Expected_values) ** 2 / Expected_values
chi_square = cell_contributions.sum(axis=0)  # per-column totals, same array as before
chi_square_statistic = chi_square.sum()

In [30]:
# Show the hand-computed chi-square statistic.
print(chi_square_statistic)

0.001934818536627623


In [31]:
# Critical value: the chi-square quantile at probability 1 - alpha for `ddof`
# degrees of freedom. A statistic at or above it falls in the rejection region.
critical_value = chi2.ppf(1 - alpha, ddof)
print('critical_value:', critical_value)

critical_value: 3.841458820694124


In [33]:
# Either the critical-value technique or the p-value technique can be used.
# The p-value is the upper-tail probability of the chi-square distribution at the
# observed statistic. `sf` (survival function) computes 1 - cdf directly, which
# avoids the floating-point cancellation of `1 - chi2.cdf(...)` when the
# p-value is very small.
p_value = chi2.sf(chi_square_statistic, ddof)
print('p-value:',p_value)
print('Significance level:',alpha)
print('Degree of Freedom:',ddof)

p-value: 0.964915107315732
Significance level: 0.05
Degree of Freedom: 1
p-value: 0.964915107315732


In [34]:
# Both decision rules report one of the same two verdicts (message text kept verbatim).
_reject_h0_msg = "Reject H0, there is a relationship b/w 2 categorical variables"
_retain_h0_msg = "Retain H0, there is no relationship b/w 2 cateforical variables"

# Critical-value technique: reject H0 when the statistic reaches the critical value.
print(_reject_h0_msg if chi_square_statistic >= critical_value else _retain_h0_msg)

# Equivalent p-value technique: reject H0 when the p-value does not exceed alpha.
print(_reject_h0_msg if p_value <= alpha else _retain_h0_msg)

Retain H0, there is no relationship b/w 2 cateforical variables
Retain H0, there is no relationship b/w 2 cateforical variables
