In [3]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Chi-squared Goodeness-of-fit test

This test is analogus to one-way t-test (if sample mean differs from populaton mean).
Chi-squared goodness of fit test tests whether the dstribution of sample categorical data matches the population distrubution. 

We generate a fake US and Minesota demographic data and test whether the demographic proportions match

In [4]:
national = pd.DataFrame(["white"]*100000 + ["hispanic"]*60000 +\
                        ["black"]*50000 + ["asian"]*15000 + ["other"]*35000)

minnesota = pd.DataFrame(["white"]*600 + ["hispanic"]*300 + \
                         ["black"]*250 +["asian"]*75 + ["other"]*150)

national.rename(columns = {national.columns[0] : 'demograph'}, inplace = True)
minnesota.rename(columns = {minnesota.columns[0] : 'demograph'}, inplace = True)


national_table = pd.crosstab(national['demograph'], columns = 'counts')
minnesota_table = pd.crosstab(minnesota['demograph'], columns = 'counts')

print(national_table)
print(minnesota_table)

col_0      counts
demograph        
asian       15000
black       50000
hispanic    60000
other       35000
white      100000
col_0      counts
demograph        
asian          75
black         250
hispanic      300
other         150
white         600


In [5]:
national['demograph'].value_counts()

white       100000
hispanic     60000
black        50000
other        35000
asian        15000
Name: demograph, dtype: int64

In [6]:
observed = minnesota_table

expected = len(minnesota) * national_table/len(national)

In [17]:
chi_squared_stat = (((observed - expected)**2)/expected).sum()
chi_squared_stat[0]

18.194805194805205

In [18]:
crit = stats.chi2.ppf(q = 0.95, df = 4)
crit

9.487729036781154

In [61]:
1-stats.chi2.cdf(x= chi_squared_stat[0], df = 4)

0.001130467092803511

In [62]:
stats.chisquare(f_obs = observed,f_exp = expected)

Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

# Chi-square Test of independence

In [20]:
voter_race = np.random.choice(a = ["asian","black","hispanic","other","white"], 
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                             size = 1000)

voter_party = np.random.choice(a = ["democrat","independent","republican"],
                              p = [0.4,0.2,0.4],
                              size = 1000)

In [21]:
voter_table = pd.crosstab(voter_race,voter_party, margins = True)
voter_table.columns = ["democrat","independent","republican","row_totals"]
voter_table.index = ["asian","black","hispanic","other","white","col_totals"]

In [22]:
voter_table

Unnamed: 0,democrat,independent,republican,row_totals
asian,17,10,20,47
black,67,27,56,150
hispanic,107,76,81,264
other,24,12,25,61
white,196,98,184,478
col_totals,411,223,366,1000


In [47]:
voter_table.loc['col_totals']

democrat       17
independent    10
republican     20
row_totals     47
Name: asian, dtype: int64

In [67]:
expected = np.outer(voter_table.row_totals[0:5],voter_table.loc['col_totals'][0:3])/1000

In [63]:
np.outer(voter_table.row_totals[0:4],voter_table.loc['col_totals'])/1000

array([[ 19.317,  10.481,  17.202,  47.   ],
       [ 61.65 ,  33.45 ,  54.9  , 150.   ],
       [108.504,  58.872,  96.624, 264.   ],
       [ 25.071,  13.603,  22.326,  61.   ]])

In [58]:
expected = pd.DataFrame(expected)

In [59]:
observed = voter_table.iloc[0:5,0:3]

In [62]:
observed

Unnamed: 0,democrat,independent,republican
asian,17,10,20
black,67,27,56
hispanic,107,76,81
other,24,12,25
white,196,98,184


In [60]:
stats.chi2_contingency(observed= observed)

(11.732750647008402,
 0.16353210433156662,
 8,
 array([[ 19.317,  10.481,  17.202],
        [ 61.65 ,  33.45 ,  54.9  ],
        [108.504,  58.872,  96.624],
        [ 25.071,  13.603,  22.326],
        [196.458, 106.594, 174.948]]))

In [68]:
expected

array([[ 19.317,  10.481,  17.202],
       [ 61.65 ,  33.45 ,  54.9  ],
       [108.504,  58.872,  96.624],
       [ 25.071,  13.603,  22.326],
       [196.458, 106.594, 174.948]])