# Chi-Square Test

### Chi-Square Tests for Categorical Variables: Goodness of Fit and Independence

In [47]:
import numpy as np
import pandas as pd
from scipy.stats import chisquare,chi2_contingency

In [48]:
# Read the first six columns of Car_Dataset.csv. header=None tells pandas the
# file has no header row, so the columns are labelled by position (0-5).
car = pd.read_csv("Car_Dataset.csv", usecols=list(range(6)), header=None)
car.head()

Unnamed: 0,0,1,2,3,4,5
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [49]:
# Replace the positional column labels (0-5) with descriptive names.
car = car.rename(
    columns=dict(
        enumerate(['buying', 'maintenance', 'doors', 'persons', 'lug_boot', 'safety'])
    )
)

In [50]:
# Preview the first five rows to confirm the new column names.
car.head(n=5)

Unnamed: 0,buying,maintenance,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [51]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes.
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
buying         1728 non-null object
maintenance    1728 non-null object
doors          1728 non-null object
persons        1728 non-null object
lug_boot       1728 non-null object
safety         1728 non-null object
dtypes: object(6)
memory usage: 81.1+ KB


In [52]:
# Per-column summary: count, number of unique values, the most frequent
# value (top) and how often it occurs (freq).
car.describe()

Unnamed: 0,buying,maintenance,doors,persons,lug_boot,safety
count,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3
top,vhigh,vhigh,2,2,big,low
freq,432,432,432,576,576,576


In [53]:
# Observed frequency of each `doors` category.
car["doors"].value_counts()

2        432
5more    432
3        432
4        432
Name: doors, dtype: int64

In [54]:
# Goodness-of-fit test for a single categorical variable (`doors`)
#
# Let p_i denote the proportion of observations in the i-th category.
# H0: all p_i are the same
# Ha: at least one p_i differs from the others
#
# No expected frequencies are passed, so chisquare compares the observed
# counts against equal expected frequencies.
#
# Interpretation: the p-value is above 0.05, so we fail to reject H0 -- the
# data give no evidence that the category proportions differ.
chisquare(f_obs=car["doors"].value_counts())

Power_divergenceResult(statistic=0.0, pvalue=1.0)

In [55]:
# Same goodness-of-fit test for `lug_boot` (H0: all category proportions are equal).
#
# Interpretation: the p-value is above 0.05, so we fail to reject H0 -- the
# data give no evidence that the category proportions differ.
chisquare(f_obs=car["lug_boot"].value_counts())

Power_divergenceResult(statistic=0.0, pvalue=1.0)

In [56]:
# Observed frequency of each `lug_boot` category.
car["lug_boot"].value_counts()

big      576
med      576
small    576
Name: lug_boot, dtype: int64

In [57]:
# Chi-square test of independence between two categorical variables
#
# H0: the two categorical variables are independent
# Ha: the two categorical variables are dependent


# Contingency table: one row per `doors` category, one column per `lug_boot` category.
contingency_table = pd.crosstab(index=car["doors"], columns=car["lug_boot"])
print('contingency_table :-\n', contingency_table)

contingency_table :-
 lug_boot  big  med  small
doors                    
2         144  144    144
3         144  144    144
4         144  144    144
5more     144  144    144


In [58]:
# Observed cell counts as a plain NumPy array (row/column labels dropped).
Observed_Values = np.asarray(contingency_table)
print("Observed Values :-\n", Observed_Values)

Observed Values :-
 [[144 144 144]
 [144 144 144]
 [144 144 144]
 [144 144 144]]


In [59]:
# Expected cell counts under H0 (independence).
# chi2_contingency returns (statistic, p-value, degrees of freedom, expected
# frequencies); the expected frequencies are the last element. The full
# result is kept in `b` because a later cell reads the degrees of freedom
# from it.
import scipy.stats
b = scipy.stats.chi2_contingency(contingency_table)
Expected_Values = b[-1]
print("Expected Values :-\n", Expected_Values)

Expected Values :-
 [[144. 144. 144.]
 [144. 144. 144.]
 [144. 144. 144.]
 [144. 144. 144.]]


In [60]:
# Degrees of freedom for a test of independence: (rows - 1) * (columns - 1).
# The counts are taken from the table's shape rather than hard-coded slice
# bounds, so the result stays correct when the variables have a different
# number of categories.
no_of_rows, no_of_columns = contingency_table.shape
df = (no_of_rows - 1) * (no_of_columns - 1)
print("Degree of Freedom:-", df)

Degree of Freedom:- 6


In [61]:
# Alternatively, read the degrees of freedom from the chi2_contingency
# result `b` (element at index 2) instead of computing them from the
# table's row and column counts.
df=b[2]
print("Degree of Freedom:-",df)

Degree of Freedom:- 6


In [62]:
# Significance level (5%): H0 is rejected when the p-value is at or below alpha.
alpha=0.05

In [63]:
# Chi-square statistic (χ2): sum of (observed - expected)^2 / expected over
# every cell of the contingency table.
from scipy.stats import chi2

# Per-column contributions (summed over rows). Every column must then be
# added to obtain the statistic; adding only the first two would drop the
# remaining columns of the table.
chi_square = ((Observed_Values - Expected_Values) ** 2 / Expected_Values).sum(axis=0)
chi_square_statistic = chi_square.sum()
print("chi-square statistic:-", chi_square_statistic)

chi-square statistic:- 0.0


In [64]:
# Critical value: the chi-square quantile at probability 1 - alpha for the
# given degrees of freedom. The decision cell rejects H0 when the statistic
# is at or above it.
confidence_level = 1 - alpha
critical_value = chi2.ppf(confidence_level, df)
print('critical_value:', critical_value)

critical_value: 12.591587243743977


In [65]:
# p-value: probability, under H0, of a chi-square statistic at least as large
# as the observed one. The survival function (sf) is 1 - cdf evaluated
# directly, which avoids the precision loss of the explicit subtraction when
# the p-value is very small.
p_value = chi2.sf(chi_square_statistic, df)
print('p-value:', p_value)

p-value: 1.0


In [66]:
# Recap of every quantity that feeds the decision rule.
summary_rows = (
    ('Significance level: ', alpha),
    ('Degree of Freedom: ', df),
    ('chi-square statistic:', chi_square_statistic),
    ('critical_value:', critical_value),
    ('p-value:', p_value),
)
for label, value in summary_rows:
    print(label, value)

Significance level:  0.05
Degree of Freedom:  6
chi-square statistic: 0.0
critical_value: 12.591587243743977
p-value: 1.0


In [67]:
# Decision rule, applied in two equivalent ways:
#   1. compare the chi-square statistic with the critical value;
#   2. compare the p-value (the probability, under H0, of a chi-square value
#      at least as large as chi_square_statistic) with alpha.

reject_message = "Reject H0,There is a relationship between 2 categorical variables"
retain_message = "Retain H0,There is no relationship between 2 categorical variables"

# Critical-value approach
print(reject_message if chi_square_statistic >= critical_value else retain_message)

# p-value approach
print(reject_message if p_value <= alpha else retain_message)

Retain H0,There is no relationship between 2 categorical variables
Retain H0,There is no relationship between 2 categorical variables
