In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

# Chi-Squared Goodness-Of-Fit Test

#### Preparing data

In [2]:
# Observed sample: 100 colour labels in a single column (named 0), grouped by
# colour in the order listed below.
data_1 = pd.DataFrame(
    [
        colour
        for colour, n_rows in (
            ("Red", 30),
            ("Blue", 25),
            ("Green", 20),
            ("Yellow", 15),
            ("Orange", 10),
        )
        for _ in range(n_rows)
    ]
)

In [3]:
# Reference sample: 100 colour labels in a single column (named 0), grouped by
# colour in the order listed below.
data_2 = pd.DataFrame(
    [
        colour
        for colour, n_rows in (
            ("Red", 35),
            ("Blue", 30),
            ("Green", 15),
            ("Yellow", 10),
            ("Orange", 10),
        )
        for _ in range(n_rows)
    ]
)

In [4]:
# Tally how often each colour occurs in data_1; the single result column is
# labelled "count".
observed_1 = pd.crosstab(data_1[0], "count")

In [5]:
# Tally how often each colour occurs in data_2; these counts serve as the
# expected frequencies for the goodness-of-fit test.
expected_1 = pd.crosstab(data_2[0], "count")

In [6]:
# Display the observed frequency table (one row per colour).
observed_1

col_0,count
0,Unnamed: 1_level_1
Blue,25
Green,20
Orange,10
Red,30
Yellow,15


In [7]:
# Display the expected frequency table (one row per colour).
expected_1

col_0,count
0,Unnamed: 1_level_1
Blue,30
Green,15
Orange,10
Red,35
Yellow,10


#### Calculating the chi-squared test statistic

In [8]:
# Pearson chi-squared statistic: sum over categories of
# (observed - expected)^2 / expected. The frames are combined label-by-label,
# and the column-wise sum yields a one-element Series keyed by "count".
chi_squared_test_1 = (
    observed_1.sub(expected_1)
    .pow(2)
    .div(expected_1)
    .sum()
)

#### Concluding: critical value approach

In [9]:
# 95th percentile of the chi-squared distribution with
# (number of categories - 1) degrees of freedom, i.e. the rejection threshold
# at a 5% significance level.
critical_value_1 = stats.chi2(df=len(expected_1) - 1).ppf(0.95)

In [10]:
# Show the test statistic (a one-element Series) followed by a blank line,
# then the critical value it is compared against.
print(chi_squared_test_1,"\n")
print(critical_value_1)

col_0
count    5.714286
dtype: float64 

9.487729036781154


The Goodness of Fit Test is primarily a right-tailed hypothesis test, assessing if the test score exceeds the critical value.

Since the test statistic (5.71) is less than the critical value (9.49), we fail to reject the null hypothesis at the 5% significance level. Based on the observed data, there isn't enough evidence to indicate a significant difference from the expected distribution.

#### Concluding: p-value approach

In [11]:
# Right-tail probability of the test statistic under a chi-squared distribution
# with (number of categories - 1) degrees of freedom.
# chi2.sf evaluates the survival function (1 - cdf) directly, which avoids the
# loss of precision that the explicit subtraction suffers when the cdf is close
# to 1 (i.e. for very small p-values).
p_value_1 = stats.chi2.sf(chi_squared_test_1, df=len(expected_1) - 1)

In [12]:
# Show the p-value; it is a one-element array because the statistic was a Series.
print(p_value_1)

[0.22152582]


Because the p-value (0.22) is larger than the chosen significance level (0.05), we fail to reject the null hypothesis. This agrees with the conclusion from the critical value approach.

#### Method for returning the chi-squared statistic and p-value

In [13]:
# SciPy's one-call goodness-of-fit test: returns the chi-squared statistic and
# its p-value together, for comparison with the manual calculation.
stats.chisquare(observed_1, f_exp=expected_1)

Power_divergenceResult(statistic=array([5.71428571]), pvalue=array([0.22152582]))

# Chi-Squared Test of Independence

In [149]:
# Seed NumPy's legacy global generator so the simulated survey is reproducible.
np.random.seed(49)

# Category labels for the two simulated variables.
gender_options = ['Male', 'Female']
smoking_status_options = ['Non-Smoker', 'Smoker']

# Draw 1000 labels per variable, independently of each other. The draw order
# (Gender first, then Smoking_Status) is part of what the seed reproduces.
data = dict(
    Gender=np.random.choice(gender_options, size=1000),
    Smoking_Status=np.random.choice(smoking_status_options, size=1000),
)

data_3 = pd.DataFrame(data)

In [150]:
# Cross-tabulate the two categorical columns: rows are Gender values, columns
# are Smoking_Status values, cells are joint counts.
contingency_table = pd.crosstab(
    index=data_3['Gender'],
    columns=data_3['Smoking_Status'],
)

In [151]:
# chi2_contingency returns (statistic, p-value, dof, expected frequencies);
# only the p-value is needed here. It is picked by position instead of
# unpacking into `_`, because assigning to `_` in an IPython/Jupyter session
# shadows the built-in "last output" variable for the rest of the session.
# NOTE(review): with SciPy's defaults, Yates' continuity correction is applied
# when the table has one degree of freedom (e.g. 2x2) -- confirm that is intended.
p = stats.chi2_contingency(contingency_table)[1]

In [152]:
# Report the p-value of the independence test.
print(f"P-value: {p}")

P-value: 0.08932055231733187


Given that the p-value (≈ 0.089) is greater than 0.05, we fail to reject the null hypothesis of independence. Consequently, we conclude that there is not enough evidence of a significant association between the variables 'Gender' and 'Smoking_Status'.