**Import packages**

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [4]:
import io
# Load the raw observations (one City/Brand pair per row) from the CSV file.
df2 = pd.read_csv('chiSquare.csv')

In [5]:
df2.head()

Unnamed: 0,City,Brand
0,Mumbai,A
1,Chennai,C
2,Mumbai,A
3,Mumbai,C
4,Chennai,C


In [6]:
# Cross-tabulate observed counts with City as rows and Brand as columns;
# margins=True appends the 'All' row and column that hold the totals.
contingTab = pd.crosstab(index=df2['City'], columns=df2['Brand'], margins=True)

In [7]:
contingTab

Brand,A,B,C,All
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chennai,165,47,191,403
Mumbai,279,73,225,577
All,444,120,416,980


**Calculate the expected frequency**

In [8]:
contingTab['A']

City
Chennai    165
Mumbai     279
All        444
Name: A, dtype: int64

In [9]:
# Chained lookup: select the brand column first ('A'), then the city row
# ('Chennai') -- the observed count for that single cell.
contingTab['A']['Chennai']

165

In [10]:
# Grand total of observations: the 'All' row of the 'All' column.
contingTab['All']['All']

980

In [11]:
# Transposed view: brands become the rows and cities become the columns.
contingTab.transpose()

City,Chennai,Mumbai,All
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,165,279,444
B,47,73,120
C,191,225,416
All,403,577,980


In [12]:
cities = list(df2['City'].unique())
brands = list(df2['Brand'].unique())

# Total number of observations (bottom-right margin cell); it is the same for
# every cell, so look it up once instead of inside the loops.
grand_total = contingTab.loc['All', 'All']

# Expected count under independence for each (city, brand) cell:
#   row total (city) * column total (brand) / grand total.
# Label-based .loc lookups replace the repeated transpose + chained indexing.
# Result shape: {city: {brand: expected_count}}.
exp1 = {
  city: {
    brand: contingTab.loc[city, 'All'] * contingTab.loc['All', brand] / grand_total
    for brand in brands
  }
  for city in cities
}


In [13]:
# Manual sanity check of one expected count, (Chennai, A):
# Chennai row total (403) * brand A column total (444) / grand total (980).
403*444/980

182.58367346938775

In [14]:
exp1

{'Mumbai': {'A': 261.41632653061225,
  'C': 244.93061224489796,
  'B': 70.65306122448979},
 'Chennai': {'A': 182.58367346938775,
  'C': 171.06938775510204,
  'B': 49.3469387755102}}

**Chi-squared statistic calculation**

In [15]:
# Pearson chi-squared statistic: the sum over every (city, brand) cell of
# (observed - expected)^2 / expected.
chiSquareCal = 0
for i in cities:
  for j in brands:
    # Label-based lookup of the observed count; this avoids transposing the
    # whole table again on every iteration.
    observed = contingTab.loc[i, j]
    expected = exp1[i][j]
    val = (observed - expected)**2 / expected
    chiSquareCal = chiSquareCal + val

In [16]:
chiSquareCal

7.009543616823935

**Degrees of freedom**

In [17]:
# Degrees of freedom for an r x c contingency table: (r - 1) * (c - 1),
# with r the number of distinct cities and c the number of distinct brands.
n_cities, n_brands = len(cities), len(brands)
dof = (n_cities - 1) * (n_brands - 1)

In [18]:
dof

2

In [19]:
# Critical value: the (1 - alpha) quantile of the chi-squared distribution
# with `dof` degrees of freedom, at significance level alpha = 0.05.
alpha = 0.05
stats.chi2.ppf(1 - alpha, df=dof)

5.991464547107979

**Shortcut to the chi-squared test**

In [20]:
# Observed counts only: drop the 'All' margin row and column by label, so the
# array has one row per city and one column per brand regardless of how many
# cities or brands the data contains (no hard-coded names or slice bounds).
contab = contingTab.drop(index='All', columns='All').to_numpy()
stats.chi2_contingency(contab)

Chi2ContingencyResult(statistic=7.009543616823934, pvalue=0.03005363054744611, dof=2, expected_freq=array([[182.58367347,  49.34693878, 171.06938776],
       [261.41632653,  70.65306122, 244.93061224]]))

In [21]:
# Upper-tail p-value of the hand-computed statistic. The survival function
# (sf) is used instead of 1 - cdf because the subtraction loses precision
# when the tail probability is small.
stats.chi2.sf(chiSquareCal, dof)

0.030053630547446142