Hypothesis Testing

In [None]:
# ============================================================
# Hypothesis Testing on Unbalanced Dataset (bin_df)
# Uses real (unbalanced) data for statistical validity
# Tests included:
# 1) BMI vs Diabetes (Welch's t-test, numeric vs binary)
# 2) PhysActivity vs Diabetes (Chi-square, categorical vs categorical)
# 3) Smoker vs Diabetes (Chi-square, categorical vs categorical)
# 4) HighBP vs Diabetes (Fisher's Exact Test, 2x2 categorical)
# 5) GenHlth vs Diabetes (Mann–Whitney U, ordinal vs binary)
# Alpha = 0.05 for significance
# ============================================================

import pandas as pd
import numpy as np
from pathlib import Path

from scipy.stats import ttest_ind, chi2_contingency, fisher_exact, mannwhitneyu

# ------------------------------------------------------------
# Data source: prefer a `bin_df` that already exists in this
# session; otherwise fall back to the cleaned CSV on disk.
# ------------------------------------------------------------
if "bin_df" not in globals():
    csv_path = "/kaggle/working/diabetes_binary_health_indicators_BRFSS2015.csv"
    if Path(csv_path).exists():
        df = pd.read_csv(csv_path)
    else:
        # Stop early with an actionable message instead of a raw read error.
        raise FileNotFoundError(
            "Could not find '/kaggle/working/diabetes_binary_health_indicators_BRFSS2015.csv'. "
            "Please save your cleaned binary CSV before running hypothesis tests."
        )
else:
    # Work on a copy so nothing below can modify the in-memory original.
    df = bin_df.copy()

# Fail fast if any column used by the tests below is absent from df.
required_cols = [
    "Diabetes_binary",
    "BMI",
    "PhysActivity",
    "Smoker",
    "HighBP",
    "GenHlth",
]
missing = [name for name in required_cols if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {missing}")

# ------------------------------------------------------------
# Overview: frame shape plus class counts and class shares of
# the target column, each sorted by label.
# ------------------------------------------------------------
print("=== DATA SUMMARY ===", f"Shape: {df.shape}", sep="\n")
print(
    "Target distribution (Diabetes_binary):",
    df["Diabetes_binary"].value_counts().sort_index(),
    sep="\n",
)
print(
    "Proportions:",
    df["Diabetes_binary"].value_counts(normalize=True).sort_index(),
    sep="\n",
)
print("-" * 60)

alpha = 0.05  # significance level handed to conclusion() by the test cells

def conclusion(p, alpha=0.05):
    """Translate a p-value into a hypothesis-test decision string.

    Parameters
    ----------
    p : float
        P-value produced by a statistical test.
    alpha : float, default 0.05
        Significance level. H0 is rejected only when ``p < alpha`` (strict
        inequality, so ``p == alpha`` is not significant).

    Returns
    -------
    str
        ``"Reject H0 (Significant)"`` when ``p < alpha``;
        ``"Undetermined (p-value is NaN)"`` when ``p`` is NaN;
        ``"Fail to reject H0 (Not significant)"`` otherwise.
    """
    # NaN compares False against every threshold, so without this guard a
    # test that could not be computed would be reported as "not significant".
    if np.isnan(p):
        return "Undetermined (p-value is NaN)"
    return "Reject H0 (Significant)" if p < alpha else "Fail to reject H0 (Not significant)"

# ------------------------------------------------------------
# Test 1 — BMI by diabetes status (Welch's t-test)
#   H0: the diabetic and non-diabetic groups have the same mean BMI
#   H1: the two group means are different
# ------------------------------------------------------------
print("TEST 1: BMI vs Diabetes (Welch's t-test)")

# BMI values for rows with Diabetes_binary == 0 and == 1, NaNs removed.
g0, g1 = (
    df.loc[df["Diabetes_binary"].eq(label), "BMI"].dropna()
    for label in (0, 1)
)

# equal_var=False requests the Welch variant (no pooled variance).
t_stat, p_val = ttest_ind(g0, g1, equal_var=False)

print(f"n0={len(g0)}, mean0={g0.mean():.3f}, std0={g0.std(ddof=1):.3f}")
print(f"n1={len(g1)}, mean1={g1.mean():.3f}, std1={g1.std(ddof=1):.3f}")
print(f"T-statistic={t_stat:.4f}, P-value={p_val:.4e}")
print(f"Conclusion: {conclusion(p_val, alpha)}")
print("-" * 60)

=== DATA SUMMARY ===
Shape: (253680, 22)
Target distribution (Diabetes_binary):
Diabetes_binary
0.0    213703
1.0     39977
Name: count, dtype: int64
Proportions:
Diabetes_binary
0.0    0.842412
1.0    0.157588
Name: proportion, dtype: float64
------------------------------------------------------------
TEST 1: BMI vs Diabetes (Welch's t-test)
n0=213703, mean0=27.743, std0=6.261
n1=39977, mean1=31.803, std1=7.329
T-statistic=-103.9055, P-value=0.0000e+00
Conclusion: Reject H0 (Significant)
------------------------------------------------------------


In [None]:

# ------------------------------------------------------------
# Test 2 — PhysActivity x Diabetes (chi-square test of independence)
#   H0: physical activity and diabetes status are independent
#   H1: the two variables are associated
# ------------------------------------------------------------
print("TEST 2: PhysActivity vs Diabetes (Chi-square test)")

# Observed counts: one row per PhysActivity level, one column per outcome.
ct_pa = pd.crosstab(index=df["PhysActivity"], columns=df["Diabetes_binary"])

# correction=True is scipy's default, spelled out here: Yates' continuity
# correction is applied when the table has one degree of freedom.
chi2, p_pa, dof_pa, exp_pa = chi2_contingency(observed=ct_pa, correction=True)

print("Contingency table (rows=PhysActivity, cols=Diabetes_binary):")
print(ct_pa)
print(f"Chi-square={chi2:.4f}, dof={dof_pa}, P-value={p_pa:.4e}")
print(f"Conclusion: {conclusion(p_pa, alpha)}")
print("-" * 60)

TEST 2: PhysActivity vs Diabetes (Chi-square test)
Contingency table (rows=PhysActivity, cols=Diabetes_binary):
Diabetes_binary     0.0    1.0
PhysActivity                  
0.0               47212  14548
1.0              166491  25429
Chi-square=3737.4488, dof=1, P-value=0.0000e+00
Conclusion: Reject H0 (Significant)
------------------------------------------------------------


In [None]:

# ------------------------------------------------------------
# Test 3 — Smoker x Diabetes (chi-square test of independence)
#   H0: smoking status and diabetes status are independent
#   H1: the two variables are associated
# ------------------------------------------------------------
print("TEST 3: Smoker vs Diabetes (Chi-square test)")

# Observed counts: one row per Smoker level, one column per outcome.
ct_sm = pd.crosstab(index=df["Smoker"], columns=df["Diabetes_binary"])

# correction=True is scipy's default, spelled out here: Yates' continuity
# correction is applied when the table has one degree of freedom.
chi2_sm, p_sm, dof_sm, exp_sm = chi2_contingency(observed=ct_sm, correction=True)

print("Contingency table (rows=Smoker, cols=Diabetes_binary):")
print(ct_sm)
print(f"Chi-square={chi2_sm:.4f}, dof={dof_sm}, P-value={p_sm:.4e}")
print(f"Conclusion: {conclusion(p_sm, alpha)}")
print("-" * 60)

TEST 3: Smoker vs Diabetes (Chi-square test)
Contingency table (rows=Smoker, cols=Diabetes_binary):
Diabetes_binary     0.0    1.0
Smoker                        
0.0              121879  19378
1.0               91824  20599
Chi-square=999.4148, dof=1, P-value=2.4070e-219
Conclusion: Reject H0 (Significant)
------------------------------------------------------------


In [None]:

# ------------------------------------------------------------
# Test 4 — HighBP x Diabetes (Fisher's exact test)
#   H0: the odds of diabetes do not depend on HighBP
#   H1: the odds differ between the HighBP groups
# Fisher's exact test is only run on a 2x2 table, so both
# variables must take exactly two values.
# ------------------------------------------------------------
print("TEST 4: HighBP vs Diabetes (Fisher's Exact Test)")

ct_bp = pd.crosstab(index=df["HighBP"], columns=df["Diabetes_binary"])

if ct_bp.shape == (2, 2):
    # Pass the raw 2x2 count array to scipy.
    oddsratio, p_bp = fisher_exact(ct_bp.to_numpy())
else:
    raise ValueError("HighBP or Diabetes_binary is not strictly binary for Fisher's test.")

print("2x2 table (rows=HighBP 0/1, cols=Diabetes_binary 0/1):")
print(ct_bp)
print(f"Odds ratio={oddsratio:.4f}, P-value={p_bp:.4e}")
print(f"Conclusion: {conclusion(p_bp, alpha)}")
print("-" * 60)

TEST 4: HighBP vs Diabetes (Fisher's Exact Test)
2x2 table (rows=HighBP 0/1, cols=Diabetes_binary 0/1):
Diabetes_binary     0.0    1.0
HighBP                        
0.0              134391  10460
1.0               79312  29517
Odds ratio=4.7816, P-value=0.0000e+00
Conclusion: Reject H0 (Significant)
------------------------------------------------------------


In [None]:

# ------------------------------------------------------------
# Test 5 — GenHlth by diabetes status (Mann–Whitney U)
#   H0: GenHlth has the same distribution in both groups
#   H1: the distributions differ
# GenHlth is an ordinal score: 1=Excellent ... 5=Poor.
# ------------------------------------------------------------
print("TEST 5: GenHlth vs Diabetes (Mann–Whitney U)")

# GenHlth values for rows with Diabetes_binary == 0 and == 1, NaNs removed.
gh0, gh1 = (
    df.loc[df["Diabetes_binary"].eq(label), "GenHlth"].dropna()
    for label in (0, 1)
)

# Two-sided alternative: a shift in either direction counts as evidence.
u_stat, p_mw = mannwhitneyu(gh0, gh1, alternative="two-sided")

print(f"n0={len(gh0)}, median0={np.median(gh0):.3f}")
print(f"n1={len(gh1)}, median1={np.median(gh1):.3f}")
print(f"U-statistic={u_stat:.4f}, P-value={p_mw:.4e}")
print(f"Conclusion: {conclusion(p_mw, alpha)}")
print("-" * 60)


TEST 5: GenHlth vs Diabetes (Mann–Whitney U)
n0=213703, median0=2.000
n1=39977, median1=3.000
U-statistic=2349291361.5000, P-value=0.0000e+00
Conclusion: Reject H0 (Significant)
------------------------------------------------------------


In [None]:
# One row per test: hypothesis label, test name, and the p-value computed
# in the corresponding cell above.
summary_rows = [
    ["BMI vs Diabetes",           "Welch t-test",          p_val],
    ["PhysActivity vs Diabetes",  "Chi-square",            p_pa],
    ["Smoker vs Diabetes",        "Chi-square",            p_sm],
    ["HighBP vs Diabetes",        "Fisher's Exact",        p_bp],
    ["GenHlth vs Diabetes",       "Mann–Whitney U",        p_mw],
]
summary = pd.DataFrame(summary_rows, columns=["Hypothesis", "Test", "P-value"])

# Build the column label from `alpha` so the header always states the
# threshold that conclusion() actually applied (renders as "alpha=0.05"
# for the current setting).
decision_col = f"Decision (alpha={alpha})"
summary[decision_col] = summary["P-value"].apply(lambda p: conclusion(p, alpha))

print("=== SUMMARY ===")
# to_string(index=False) prints the full table without the row index.
print(summary.to_string(index=False))

=== SUMMARY ===
              Hypothesis           Test       P-value   Decision (alpha=0.05)
         BMI vs Diabetes   Welch t-test  0.000000e+00 Reject H0 (Significant)
PhysActivity vs Diabetes     Chi-square  0.000000e+00 Reject H0 (Significant)
      Smoker vs Diabetes     Chi-square 2.406995e-219 Reject H0 (Significant)
      HighBP vs Diabetes Fisher's Exact  0.000000e+00 Reject H0 (Significant)
     GenHlth vs Diabetes Mann–Whitney U  0.000000e+00 Reject H0 (Significant)
