In [4]:
import numpy as np
import pandas as pd
from scipy.stats import chi2
class CHI2:

  """
  Methods:


  build_expected_table(df_observed : pd.DataFrame) :
  Takes observed data and returns expected values table under null hypothesis (H0) conditions

  calc_sum_of_errors_exp_obs(df_expected : pd.DataFrame , df_observed : pd.DataFrame):
  Takes observed values and expected values table and returns sigma which is close to 0 if H0 is correct

  chi2_test(df_observed : pd.DataFrame):
  Takes observed values table and returns p_value.



  """
  @staticmethod
  def build_expected_table(df_observed : pd.DataFrame):
    sum_of_cols = df_observed.sum().values
    sum_of_rows = df_observed.sum(axis = 1).values
    df_expected = pd.DataFrame()
    all_data = sum(sum_of_cols)
    for c , col in enumerate(df_observed.columns):
      values = []
      for r in range(len(df_observed)):
        values.append((sum_of_cols[c]/all_data)*sum_of_rows[r])
      df_expected[col] = values

    return df_expected

  @staticmethod
  def calc_sum_of_errors_exp_obs(df_expected : pd.DataFrame , df_observed : pd.DataFrame):
    flattened_observed = df_observed.to_numpy().flatten()
    flattened_expected = df_expected.to_numpy().flatten()
    sigma = 0
    for i in range(len(flattened_observed)):
      sigma += (np.square(flattened_expected[i] - flattened_observed[i]))/ flattened_expected[i]

    return sigma


  @staticmethod
  def chi2_test(df_observed : pd.DataFrame) -> int :
    df_expected = CHI2.build_expected_table(df_observed)
    sigma = CHI2.calc_sum_of_errors_exp_obs(df_expected= df_expected , df_observed = df_observed)
    degree_of_freedom = (len(df_expected) - 1) * (len(df_expected.columns) - 1)
    return 1 - chi2(degree_of_freedom).cdf(sigma)


In [2]:
### Test case:

df_observed = pd.DataFrame(data= {"group1" : [8,11,45] , "group2" : [4,12,54], "group3" : [5,42,15]})
df_observed.index = ["mild","moderate" , "severe"]
df_observed

Unnamed: 0,group1,group2,group3
mild,8,4,5
moderate,11,12,42
severe,45,54,15


In [5]:
CHI2.build_expected_table(df_observed)

Unnamed: 0,group1,group2,group3
0,5.55102,6.071429,5.377551
1,21.22449,23.214286,20.561224
2,37.22449,40.714286,36.061224


In [6]:
CHI2.chi2_test(df_observed)

9.518652532847227e-11

In [7]:
# Note that if observed values are close to expected values, sigma should be most likely from chi2 distribution
df_observed = CHI2.build_expected_table(df_observed=df_observed)

In [8]:
CHI2.chi2_test(df_observed)

1.0