WASFG

In [2]:
import pandas as pd

In [3]:
# Read the Alzheimer's disease dataset (path is relative to the working
# directory), then preview the first five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="alzheimers_disease_data.csv")
df.head(n=5)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [4]:
from scipy import stats

### One Sample Test

* Goal: Test if the mean of SystolicBP differs significantly from a reference value (e.g., 120 mmHg).

* Hypotheses:

    * H_0: μ=120 (The average SBP is 120 mmHg)

    * H_a: μ ≠ 120 (The average SBP is not 120 mmHg)

In [5]:
# One-sample t-test: does the mean of SystolicBP differ from a reference value?
reference_sbp = 120  # hypothesised mean under H_0 (reported as mmHg below)
alpha = 0.05         # significance level used for the decision

# Missing readings are dropped so the test only sees observed values.
sbp_data = df['SystolicBP'].dropna()

# Test of H_0: mean(SystolicBP) == reference_sbp (scipy's default is two-sided).
t_stat, p_value = stats.ttest_1samp(sbp_data, reference_sbp)

# Results
print(f"One-Sample T-Test Results (vs. {reference_sbp} mmHg):")
print(f"Mean Systolic BP: {sbp_data.mean():.2f}")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.5f}")

# Decision at significance level alpha.
# NaN has to be handled first: `nan < alpha` evaluates to False, so an
# undefined p-value would otherwise be reported as "Fail to reject ... (p > alpha)".
if pd.isna(p_value):
    print("\nConclusion: The test could not be evaluated (p-value is NaN). Check that SystolicBP has enough non-missing values.")
elif p_value < alpha:
    print(f"\nConclusion: Reject the null hypothesis (p < {alpha}). The mean SBP is significantly different from {reference_sbp} mmHg.")
else:
    print(f"\nConclusion: Fail to reject the null hypothesis (p > {alpha}). The mean SBP is NOT significantly different from {reference_sbp} mmHg.")

One-Sample T-Test Results (vs. 120 mmHg):
Mean Systolic BP: 134.26
T-Statistic: 25.483
P-Value: 0.00000

Conclusion: Reject the null hypothesis (p < 0.05). The mean SBP is significantly different from 120 mmHg.


### Two Sample Test

* Goal: Test if the mean BMI differs significantly between smokers and non-smokers.

* Grouping Variable: Smoking (assumed 0 = Non-smoker, 1 = Smoker)

* Hypotheses:

    * H_0: μ_Smokers=μ_Non-smokers (Mean BMI is the same for both groups)

    * H_a: μ_Smokers ≠ μ_Non-smokers (Mean BMI is different)

In [6]:
# Two-sample t-test: compare mean BMI between the Smoking == 0 and
# Smoking == 1 groups (labelled non-smokers / smokers in the output).
alpha = 0.05  # significance level used for the final decision

# Select each group's BMI in a single .loc step (no chained indexing) and
# drop missing values so both tests only see observed measurements.
bmi_non_smokers = df.loc[df['Smoking'] == 0, 'BMI'].dropna()
bmi_smokers = df.loc[df['Smoking'] == 1, 'BMI'].dropna()

# Levene's test decides which t-test variant is run: when equal variances
# are not rejected at 0.05, equal_var=True is passed; otherwise equal_var=False.
stat_levene, p_levene = stats.levene(bmi_non_smokers, bmi_smokers)
equal_variance = p_levene >= 0.05

t_stat, p_value = stats.ttest_ind(
    bmi_non_smokers,
    bmi_smokers,
    equal_var=equal_variance,
)

# Results
print("Two-Sample T-Test (BMI: Non-smokers vs. Smokers):")
print(f"Mean BMI (Non-smokers): {bmi_non_smokers.mean():.2f}")
print(f"Mean BMI (Smokers): {bmi_smokers.mean():.2f}")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.5f}")

# Decision at significance level alpha.
# NaN has to be handled first: `nan < alpha` evaluates to False, so an
# undefined p-value would otherwise be reported as "Fail to reject ... (p > alpha)".
if pd.isna(p_value):
    print("\nConclusion: The test could not be evaluated (p-value is NaN). Check that both Smoking groups contain enough non-missing BMI values.")
elif p_value < alpha:
    print(f"\nConclusion: Reject the null hypothesis (p < {alpha}). The mean BMI is significantly different between the two groups.")
else:
    print(f"\nConclusion: Fail to reject the null hypothesis (p > {alpha}). The mean BMI is NOT significantly different between the two groups.")

Two-Sample T-Test (BMI: Non-smokers vs. Smokers):
Mean BMI (Non-smokers): 27.56
Mean BMI (Smokers): 27.89
T-Statistic: -0.947
P-Value: 0.34366

Conclusion: Fail to reject the null hypothesis (p > 0.05). The mean BMI is NOT significantly different between the two groups.


### One-Way ANOVA (MMSE by Ethnicity)
* Goal: Test if the mean MMSE (Mini-Mental State Exam) score differs significantly across different Ethnicity groups.

* Hypotheses:

    * H_0: μ_Eth1=μ_Eth2=μ_Eth3=… (Mean MMSE is the same across all ethnic groups)
    * H_a: At least one group mean is different.

In [7]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Analysis frame: only the two columns the model needs, complete cases only.
df_anova_new = df[['MMSE', 'Ethnicity']].dropna()

# Encode the grouping variable as a categorical factor.
df_anova_new['Ethnicity'] = df_anova_new['Ethnicity'].astype('category')

# Number of distinct groups left after filtering; it drives both the
# "can ANOVA run at all" guard and the post-hoc decision below.
n_ethnicity_groups = df_anova_new['Ethnicity'].nunique()

if n_ethnicity_groups >= 2:
    # Fit MMSE on Ethnicity as a factor and build the ANOVA table from the fit.
    formula = 'MMSE ~ C(Ethnicity)'
    lm = ols(formula, data=df_anova_new).fit()
    anova_table = sm.stats.anova_lm(lm, typ=2)

    print("One-Way ANOVA Results (MMSE by Ethnicity Level):")
    print(anova_table)

    # P-value of the Ethnicity term, compared against the significance level.
    alpha = 0.05
    p_anova = anova_table.loc['C(Ethnicity)', 'PR(>F)']
    is_significant = p_anova < alpha

    if not is_significant:
        print(f"\nConclusion: The ANOVA P-Value is NOT significant (p = {p_anova:.5f}). We conclude that the mean MMSE scores are not significantly different across Ethnicity groups.")
    else:
        print(f"\nConclusion: The ANOVA P-Value is significant (p = {p_anova:.5f}). Proceeding with Post-Hoc Tukey HSD Test.")

        # Conditional post-hoc test (Tukey HSD): only run with more than two
        # groups. Inside this branch the count is at least 2, so "exactly 2"
        # is the only case that skips it.
        if n_ethnicity_groups == 2:
            print("\nNOTE: Post-Hoc test skipped. Only 2 unique Ethnicity groups found in the filtered data. The ANOVA result is equivalent to a two-sample t-test.")
        else:
            tukey_results = pairwise_tukeyhsd(
                endog=df_anova_new['MMSE'],
                groups=df_anova_new['Ethnicity'],
                alpha=alpha,
            )

            print("\nTukey HSD Post-Hoc Test:")
            print(tukey_results)
            print("\nInterpretation: The 'reject' column indicates which specific pairs of Ethnicity groups have a statistically significant difference in mean MMSE score.")
else:
    print("ERROR: Not enough unique Ethnicity groups (fewer than 2) found in the filtered data to perform ANOVA.")

One-Way ANOVA Results (MMSE by Ethnicity Level):
                     sum_sq      df         F    PR(>F)
C(Ethnicity)      46.362471     3.0  0.208085  0.890845
Residual      159305.971047  2145.0       NaN       NaN

Conclusion: The ANOVA P-Value is NOT significant (p = 0.89085). We conclude that the mean MMSE scores are not significantly different across Ethnicity groups.
