In [1]:
# Turn off the notebook's periodic autosave (an interval of 0 disables it).
%autosave 0

Autosave disabled


The chi^2 test of independence determines whether membership in one categorical group is associated with membership in another.

In [1]:
import numpy as np
import pandas as pd

from pydataset import data
from scipy import stats

Let's read in the mpg dataset from pydataset!

In [2]:
# Fetch the 'mpg' dataset that ships with pydataset, then preview its first rows.
mpg = data('mpg')
mpg.head(n=5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


Let's do some feature engineering.

Our goal is to compare low/high mpg (split at the median of the combined city/highway mpg) to automatic/manual transmission.

In [5]:
# Combined fuel economy per car: the midpoint of the city and highway figures.
mpg['mean_mpg'] = mpg['cty'].add(mpg['hwy']).div(2)
mpg.head(n=5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,mean_mpg
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0


In [7]:
# Split mean_mpg into two quantile bins (so the cut point is the median);
# the lower bin is labelled 'low_mpg' and the upper bin 'high_mpg'.
mpg['mpg_cat'] = pd.qcut(
    x=mpg['mean_mpg'],
    q=2,
    labels=['low_mpg', 'high_mpg'],
)
mpg.head(n=5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,mean_mpg,mpg_cat
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5,high_mpg
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0,high_mpg
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5,high_mpg
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5,high_mpg
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0,high_mpg


In [9]:
# How many cars landed in each mpg bin?
mpg['mpg_cat'].value_counts()

low_mpg     121
high_mpg    113
Name: mpg_cat, dtype: int64

In [12]:
# Collapse the detailed transmission codes into two groups: values that begin
# with 'a' (e.g. 'auto(l5)') become 'auto'; everything else becomes 'manual'.
mpg['trans_bins'] = np.where(
    mpg['trans'].str.startswith('a'),
    'auto',
    'manual',
)
mpg.head(n=5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class,mean_mpg,mpg_cat,trans_bins
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,23.5,high_mpg,auto
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,25.0,high_mpg,manual
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,25.5,high_mpg,manual
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,25.5,high_mpg,auto
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,21.0,high_mpg,auto


Now that we have our categorical features, we can prepare to run a chi^2 contingency test!

First, we need to define our null and alternative hypotheses.

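H0: transmission type and mpg category are independent (there is no association between them)
H1: transmission type and mpg category are not independent (there is an association between them)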

We need to generate a crosstab of our two categorical features.

Once we have the crosstab, we can run the test!

In [15]:
# Observed counts: mpg category down the rows, transmission group across the columns.
ct = pd.crosstab(index=mpg['mpg_cat'], columns=mpg['trans_bins'])
ct

trans_bins,auto,manual
mpg_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
low_mpg,95,26
high_mpg,62,51


The test will return four values:
- The test statistic (chi^2)
- The p-value
- The degrees of freedom: (number of rows - 1) * (number of columns - 1) of the crosstab, which is 1 for our 2x2 table
- The table of expected values, if the two features were independent of each other

In [None]:
# Run the chi^2 test of independence on the observed contingency table `ct`.
# chi2_contingency returns, in order: the test statistic, the p-value, the
# degrees of freedom, and the table of expected frequencies under independence.
# Note: with 1 degree of freedom (a 2x2 table) SciPy applies Yates' continuity
# correction by default (correction=True).
chi, p, degf, exp = stats.chi2_contingency(ct)

# Show all four results so the reader can inspect them.
chi, p, degf, exp

Let's evaluate our result at a 95% confidence level (alpha = 0.05): if the p-value is less than alpha, we reject the null hypothesis.