In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

In [71]:
ns = np.array([20, 100])
distributions = {
  'Normal': [np.random.normal(loc=0, scale=1, size=n) for n in ns],
  'Student\'s t': [np.random.standard_t(df=10, size=n) for n in ns],
  'Uniform': [np.random.uniform(low=0, high=1, size=n) for n in ns],
}

In [72]:

Ps = {
  'Normal': {
    5: [3, 4, 6, 4, 3],
    8: [14, 10, 12, 14, 14, 12, 10, 14]
  },
  'Student\'s t': {
    5: [4, 5, 5, 3, 3],
    8: [20, 7, 8, 9, 10, 9, 8, 29]
  },
  'Uniform': {
    5: [0, 6, 7, 7, 0],
    8: [0, 14, 18, 18, 18, 18, 14, 0]
  },
}

In [95]:
bounds = {
  'Normal': {
    5: [-np.inf, -0.674, 0, 0.674, np.inf],
    8: [-np.inf, -1.1, -0.733, -0.367, 0, 0.367, 0.733, 1.1, np.inf]
  },
  'Student\'s t': {
    5: [-np.inf, -0.674, 0, 0.674, np.inf],
    8: [-np.inf, -1.1, -0.733, -0.367, 0, 0.367, 0.733, 1.1, np.inf]
  },
  'Uniform': {
    5: [-np.inf, 0.25, 0.5, 0.75, np.inf],
    8: [-np.inf, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, np.inf]
  }
}

In [96]:
batches = {
  'Normal': sp.stats.norm,
  'Student\'s t': sp.stats.t,
  'Uniform': sp.stats.uniform
}

In [110]:
def chi2_test(data, bins, dist_name='Normal', alpha=0.05):
  observed_values, _ = np.histogram(data, bins=bounds[dist_name][bins])
  edges = np.histogram_bin_edges(data, bins=bounds[dist_name][bins])
  if (dist_name == 'Student\'s t'):
    Fxs = batches[dist_name].cdf(edges, df=10)
  else:
    Fxs = batches[dist_name].cdf(edges)
  Fxs[0] = 0
  Fxs[-1] = 1
  Pis = Fxs - np.roll(Fxs, 1)
  exp_values = [round(len(data) * pi) for pi in Pis[1:]]
  P_vals = Pis[1:]
  diff = abs(np.sum(observed_values) - sum(exp_values))
  obs_sum = np.sum(observed_values)
  exp_sum = sum(exp_values)
  #print(obs_sum, exp_sum)
  #print("diff = ", diff)
  if (sum(exp_values) != len(data)):
    for i in range(0, len(exp_values)):
      if (observed_values[i] != exp_values[i]):
        if (observed_values[i] < exp_values[i] and obs_sum < exp_sum):
          observed_values[i] = observed_values[i] + diff
          break
        elif (observed_values[i] > exp_values[i] and obs_sum > exp_sum):
          exp_values[i] = exp_values[i] + diff
          break
  #print(observed_values)
  #print(exp_values)
  # print(np.sum(observed_values), sum(exp_values))
  # expected_values = [len(data)/bins] * bins
  expected_values = Ps[dist_name][bins]
  #chi2, p_val = sp.stats.chisquare(f_obs=observed_values, f_exp=exp_values)
  p_val = 0
  chi2_c = 0
  for i in range(len(P_vals)):
    chi_i = ((observed_values[i] - len(data) * P_vals[i]) ** 2) / (len(data) * P_vals[i])
    print(f"{i+1} & [{edges[i]}, {edges[i+1]}] & {observed_values[i]} & {P_vals[i]:.3f} & {len(data) * P_vals[i]:.3f} & {observed_values[i] - len(data) * P_vals[i]:.3f} & {chi_i:.3f} \\\\")
    print("\\hline")
    chi2_c = chi2_c + chi_i
  chi2_critical = sp.stats.chi2.ppf(q = 1 - alpha, df = bins - 1)
  print(f"$\\sum$ & - & {len(data)} & {1:.3f} & {len(data):.3f} & {0:.3f} & {chi2_c:.3f} \\\\")
  print("\\hline")

  return chi2_c, p_val, chi2_critical

In [111]:
alpha = 0.05
bins = 10

for dist_name, datasets in distributions.items():
  for i, data in enumerate(datasets):
    if (len(data) == 20):
      bins = 5
    else:
      bins = 8
    chi2, p_val, chi2_critical = chi2_test(data, bins, dist_name)
    print(f"\nDistribution: {dist_name}, n={ns[i]}")
    print(f"Chi-squared: {chi2}, chi2-critical: {chi2_critical}")
    if chi2 < chi2_critical:
      print(f"Do not reject null hypothesis, data follows the expected distribution")
    else:
      print(f"Reject null hypothesis, data does not follow the expected distribution")

1 & [-inf, -0.674] & 4 & 0.250 & 5.003 & -1.003 & 0.201 \\
\hline
2 & [-0.674, 0.0] & 5 & 0.250 & 4.997 & 0.003 & 0.000 \\
\hline
3 & [0.0, 0.674] & 5 & 0.250 & 4.997 & 0.003 & 0.000 \\
\hline
4 & [0.674, inf] & 6 & 0.250 & 5.003 & 0.997 & 0.199 \\
\hline
$\sum$ & - & 20 & 1.000 & 20.000 & 0.000 & 0.400 \\
\hline

Distribution: Normal, n=20
Chi-squared: 0.3997588573245543, chi2-critical: 9.487729036781154
Do not reject null hypothesis, data follows the expected distribution
1 & [-inf, -1.1] & 15 & 0.136 & 13.567 & 1.433 & 0.151 \\
\hline
2 & [-1.1, -0.733] & 11 & 0.096 & 9.611 & 1.389 & 0.201 \\
\hline
3 & [-0.733, -0.367] & 6 & 0.125 & 12.503 & -6.503 & 3.382 \\
\hline
4 & [-0.367, 0.0] & 21 & 0.143 & 14.319 & 6.681 & 3.117 \\
\hline
5 & [0.0, 0.367] & 12 & 0.143 & 14.319 & -2.319 & 0.376 \\
\hline
6 & [0.367, 0.733] & 13 & 0.125 & 12.503 & 0.497 & 0.020 \\
\hline
7 & [0.733, 1.1] & 10 & 0.096 & 9.611 & 0.389 & 0.016 \\
\hline
8 & [1.1, inf] & 14 & 0.136 & 13.567 & 0.433 & 0.014 \\
\h