In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols # linear ANOVA
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import f_oneway
import statsmodels.formula.api as smf

##### Data loading

The educational level is 'isced1997_r' in wave 2

In [None]:
# Read the analysis table from disk and report its (rows, columns) shape
# as the cell output.
csv_path = "data/maxchair.csv"
maxchair = pd.read_csv(csv_path)
maxchair.shape

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
maxchair

## Statistical analysis

##### Statistics on df_features

- **0**: No disease  
- **1**:  
  - Diabetes only  
  - Hypertension only  
  - OA only  
- **2**:  
  - Diabetes + Hypertension  
  - Diabetes + OA  
  - Hypertension + OA  
- **3**: Diabetes + Hypertension + OA 


In [None]:
# Label every row with its disease combination.
# Keys are (OA_conserv, Hypertension, Diabetes) indicator patterns; the
# all-zero pattern is not listed because 'No disease' is the default label.
# NOTE(review): rows whose indicators are missing or not exactly 0/1 match no
# pattern and therefore also keep the 'No disease' default - confirm that this
# is intended.
label_by_pattern = {
    (1, 0, 0): 'Only OA',
    (0, 1, 0): 'Only HT',
    (0, 0, 1): 'Only Diabetes',
    (1, 1, 0): 'OA and HT',
    (1, 0, 1): 'OA and Diab',
    (0, 1, 1): 'HT and Diab',
    (1, 1, 1): 'All three diseases',
}

maxchair['disease_category'] = 'No disease'  # Default group
for (oa_flag, ht_flag, diab_flag), label in label_by_pattern.items():
    matches = (
        (maxchair['OA_conserv'] == oa_flag)
        & (maxchair['Hypertension'] == ht_flag)
        & (maxchair['Diabetes'] == diab_flag)
    )
    maxchair.loc[matches, 'disease_category'] = label


### Maxgrip

#### No confounding

##### Step1: One-Way ANOVA

Checks if any group differs

In [None]:
# Distinct disease groups, in order of first appearance in the data.
groups = maxchair['disease_category'].unique()

# One sample of non-missing grip-strength values per disease group.
grip_samples = [
    maxchair.loc[maxchair['disease_category'] == label, 'maxgrip'].dropna()
    for label in groups
]

# One-way ANOVA across all groups at once
anova_grip = f_oneway(*grip_samples)
print("ANOVA for Grip Strength")
print("F-statistic: {:.3f}, p-value: {:.5f}".format(anova_grip.statistic, anova_grip.pvalue))


##### Step2: Tukey HSD Post-hoc Comparison

Identifies which groups differ

In [None]:
# Complete cases only: rows missing either the outcome or the group label
# are dropped before the pairwise comparison.
df_grip = maxchair[['maxgrip', 'disease_category']].dropna()

# Run Tukey HSD on every pair of disease groups at a 5% family-wise level
tukey_grip = pairwise_tukeyhsd(endog=df_grip['maxgrip'],
                               groups=df_grip['disease_category'],
                               alpha=0.05)

# Label fixed: the test is Tukey's HSD (the original printed "Turkey").
print("Tukey Maxgrip in age under 75")
print(tukey_grip.summary())


##### Number of samples in each group

In [None]:
# Number of rows per disease group, counting only rows with a grip measurement.
has_grip = maxchair['maxgrip'].notna()
group_counts = maxchair.loc[has_grip].groupby('disease_category').size()
print(group_counts)

##### Step3: Compute Cohen's d and the 95% CI of the mean difference

In [None]:
# Every disease group is contrasted against the disease-free group.
comparisons = [
    ("No disease", other)
    for other in (
        "Only OA",
        "Only HT",
        "Only Diabetes",
        "HT and Diab",
        "OA and Diab",
        "OA and HT",
        "All three diseases",
    )
]

results = []

for g1, g2 in comparisons:

    # Non-missing grip-strength values of the two groups being compared.
    group1 = maxchair.loc[maxchair['disease_category'] == g1, 'maxgrip'].dropna()
    group2 = maxchair.loc[maxchair['disease_category'] == g2, 'maxgrip'].dropna()

    n1, n2 = len(group1), len(group2)
    mean1, mean2 = group1.mean(), group2.mean()
    std1, std2 = group1.std(), group2.std()

    # Raw mean difference (group 1 minus group 2).
    diff = mean1 - mean2

    # Cohen's d: mean difference divided by the pooled standard deviation.
    pooled_var = ((n1 - 1)*std1**2 + (n2 - 1)*std2**2) / (n1 + n2 - 2)
    pooled_sd = np.sqrt(pooled_var)
    cohen_d = (mean1 - mean2) / pooled_sd

    # The interval below is for the raw mean difference (not for Cohen's d),
    # using an unpooled standard error and the normal critical value 1.96.
    se_diff = np.sqrt(std1**2 / n1 + std2**2 / n2)
    ci_low, ci_high = diff - 1.96 * se_diff, diff + 1.96 * se_diff

    row = {
        'Comparison': f"{g1} vs {g2}",
        'Mean1': round(mean1, 2),
        'Mean2': round(mean2, 2),
        'Cohen_d': round(cohen_d, 3),
        '95% CI Lower': round(ci_low, 3),
        '95% CI Upper': round(ci_high, 3),
        'n1': n1,
        'n2': n2
    }
    results.append(row)


effect_df = pd.DataFrame(results)
effect_df


#### With confounding: age, BMI, female, educational_level:

Run OLS model (adjusted for age, bmi, female, educational_level)

In [None]:
# Set 'No disease' as reference group (it is listed first among the categories)
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=[
        'No disease', 'Only OA', 'Only HT', 'Only Diabetes',
        'HT and Diab', 'OA and Diab', 'OA and HT', 'All three diseases'
    ],
    ordered=False
)

# Linear model of grip strength on disease group, adjusted for age, bmi,
# female and educational_level.
model = smf.ols('maxgrip ~ C(disease_category) + age + bmi + female + educational_level', data=maxchair).fit()

# Compute the confidence intervals once and reuse both bounds.
conf_int = model.conf_int()

results_df = pd.DataFrame({
    'Disease group': model.params.index,
    'Coefficient (Adj. Mean Diff)': model.params.values,
    '95% CI Lower': conf_int.iloc[:, 0],
    '95% CI Upper': conf_int.iloc[:, 1],
    'p-value': model.pvalues.values
})

# Keep only the disease-group terms. The pattern is a raw string: in a normal
# string literal '\(' is an invalid escape sequence and triggers a warning.
is_disease_term = results_df['Disease group'].str.contains(r'C\(disease_category\)')
results_df = results_df[is_disease_term].copy()

# Strip the formula wrapper so only the group label remains.
results_df['Disease group'] = results_df['Disease group'].str.replace(r'C\(disease_category\)\[T\.', '', regex=True).str.rstrip(']')

# Decide significance on the unrounded p-values: rounding first would turn
# e.g. p = 0.04996 into 0.05 and wrongly report it as not significant.
results_df['Significant'] = (results_df['p-value'] < 0.05).map({True: 'Yes', False: 'No'})

# Round for presentation only.
results_df['Coefficient (Adj. Mean Diff)'] = results_df['Coefficient (Adj. Mean Diff)'].round(3)
results_df['95% CI Lower'] = results_df['95% CI Lower'].round(3)
results_df['95% CI Upper'] = results_df['95% CI Upper'].round(3)
results_df['p-value'] = results_df['p-value'].round(4)

print("OLS results for maxgrip in age under 75 (No disease as reference):")
results_df.reset_index(drop=True)


##### Interaction Effects: Does OA Get Worse With Comorbidities?

In [None]:
# Re-declare the disease groups as an unordered categorical with an explicit
# level order; 'No disease' is listed first.
category_order = [
    'No disease',
    'Only OA', 'Only HT', 'Only Diabetes',
    'OA and HT', 'OA and Diab', 'HT and Diab',
    'All three diseases',
]
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'], categories=category_order, ordered=False
)


In [None]:
# Interaction model: does the OA effect on grip strength change when diabetes
# or hypertension is also present?
# C(disease_category) is deliberately NOT included here: that variable is
# built from the same three indicators (OA_conserv, Diabetes, Hypertension),
# so entering it together with their main effects and interactions makes the
# design matrix rank-deficient and the individual coefficients unidentifiable.
model = smf.ols(
    'maxgrip ~ age + bmi + female + educational_level'
    ' + C(OA_conserv)*C(Diabetes) + C(OA_conserv)*C(Hypertension)',
    data=maxchair,
).fit()
print(model.summary())

##### Logistic Regression: Predicting "High Risk" Patients

In [None]:
# Outcome: 1 if grip strength is below the sample's 25th percentile, else 0.
# Rows with a missing maxgrip stay missing: a bare `(NaN < cutoff).astype(int)`
# would code them as 0 ("not low") and feed them into the model as real
# observations.
grip_cutoff = maxchair['maxgrip'].quantile(0.25)
maxchair['low_grip'] = (
    (maxchair['maxgrip'] < grip_cutoff)
    .astype(int)
    .where(maxchair['maxgrip'].notna())
)

X = maxchair[['age', 'bmi', 'female', 'educational_level','OA_conserv', 'Diabetes', 'Hypertension']]
y = maxchair['low_grip']

# missing='drop' restricts the fit to rows where the outcome and every
# predictor are present.
logit_model = sm.Logit(y, sm.add_constant(X), missing='drop').fit()
print(logit_model.summary())


### Chair

#### No confounding

##### Step1: One-Way ANOVA

Checks if any group differs

In [None]:
# ANOVA for Chair Stand
# `groups` (the disease-group labels) is defined in the grip-strength ANOVA
# cell earlier in the notebook and is reused here.
chair_samples = [
    maxchair.loc[maxchair['disease_category'] == label, 'chair'].dropna()
    for label in groups
]

anova_chair = f_oneway(*chair_samples)
print("\nANOVA for Chair Stand")
print("F-statistic: {:.3f}, p-value: {:.5f}".format(anova_chair.statistic, anova_chair.pvalue))


##### Step2: Tukey HSD Post-hoc Comparison

Identifies which groups differ

In [None]:
# Tukey HSD for Chair Stand
# Complete cases only: rows missing either the outcome or the group label
# are dropped before the pairwise comparison.
df_chair = maxchair[['chair', 'disease_category']].dropna()

tukey_chair = pairwise_tukeyhsd(endog=df_chair['chair'],
                                groups=df_chair['disease_category'],
                                alpha=0.05)

# Label fixed: the test is Tukey's HSD (the original printed "Turkey").
print("Tukey Chair Stand in age under 75")
print(tukey_chair.summary())


##### Number of samples in each group

In [None]:
# Number of rows per disease group, counting only rows with a chair value.
# observed=False is stated explicitly: when disease_category is categorical,
# it keeps every declared group in the table (empty ones show 0) and avoids
# relying on the pandas default for `observed`, which differs between versions.
group_counts = (
    maxchair[maxchair['chair'].notna()]
    .groupby('disease_category', observed=False)
    .size()
)
print(group_counts)

##### Step3: Compute Cohen's d and the 95% CI of the mean difference

In [None]:
# Each disease group is compared with the disease-free group.
reference_label = "No disease"
contrast_labels = [
    "Only OA",
    "Only HT",
    "Only Diabetes",
    "HT and Diab",
    "OA and Diab",
    "OA and HT",
    "All three diseases",
]
comparisons = [(reference_label, contrast) for contrast in contrast_labels]

results = []

for g1, g2 in comparisons:

    # Non-missing chair values of the two groups being compared.
    in_g1 = maxchair['disease_category'] == g1
    in_g2 = maxchair['disease_category'] == g2
    group1 = maxchair.loc[in_g1, 'chair'].dropna()
    group2 = maxchair.loc[in_g2, 'chair'].dropna()

    mean1, std1, n1 = group1.mean(), group1.std(), len(group1)
    mean2, std2, n2 = group2.mean(), group2.std(), len(group2)

    # Cohen's d: mean difference scaled by the pooled standard deviation.
    weighted_ss = (n1 - 1)*std1**2 + (n2 - 1)*std2**2
    pooled_sd = np.sqrt(weighted_ss / (n1 + n2 - 2))
    cohen_d = (mean1 - mean2) / pooled_sd

    # 95% interval for the raw mean difference (not for Cohen's d), from an
    # unpooled standard error and the normal critical value 1.96.
    diff = mean1 - mean2
    se_diff = np.sqrt(std1**2 / n1 + std2**2 / n2)
    ci_low = diff - 1.96 * se_diff
    ci_high = diff + 1.96 * se_diff

    results.append(dict([
        ('Comparison', f"{g1} vs {g2}"),
        ('Mean1', round(mean1, 2)),
        ('Mean2', round(mean2, 2)),
        ('Cohen_d', round(cohen_d, 3)),
        ('95% CI Lower', round(ci_low, 3)),
        ('95% CI Upper', round(ci_high, 3)),
        ('n1', n1),
        ('n2', n2),
    ]))

effect_df = pd.DataFrame(results)
effect_df

#### With confounding: age, BMI, female, educational_level:

Run OLS model (adjusted for age, bmi, female, educational_level)

In [None]:
# Declare the disease groups as an unordered categorical with 'No disease'
# listed first, so results are reported relative to it.
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=[
        'No disease', 'Only OA', 'Only HT', 'Only Diabetes',
        'HT and Diab', 'OA and Diab', 'OA and HT', 'All three diseases'
    ],
    ordered=False
)

# Linear model of the chair outcome on disease group, adjusted for age, bmi,
# female and educational_level.
model = smf.ols('chair ~ C(disease_category) + age + bmi + female + educational_level', data=maxchair).fit()

# Compute the confidence intervals once and reuse both bounds.
conf_int = model.conf_int()

results_df = pd.DataFrame({
    'Disease group': model.params.index,
    'Coefficient (Adj. Mean Diff)': model.params.values,
    '95% CI Lower': conf_int.iloc[:, 0],
    '95% CI Upper': conf_int.iloc[:, 1],
    'p-value': model.pvalues.values
})

# Keep only the disease-group terms. The pattern is a raw string: in a normal
# string literal '\(' is an invalid escape sequence and triggers a warning.
is_disease_term = results_df['Disease group'].str.contains(r'C\(disease_category\)')
results_df = results_df[is_disease_term].copy()

# Strip the formula wrapper so only the group label remains.
results_df['Disease group'] = results_df['Disease group'].str.replace(r'C\(disease_category\)\[T\.', '', regex=True).str.rstrip(']')

# Decide significance on the unrounded p-values: rounding first would turn
# e.g. p = 0.04996 into 0.05 and wrongly report it as not significant.
results_df['Significant'] = (results_df['p-value'] < 0.05).map({True: 'Yes', False: 'No'})

# Round for presentation only.
results_df['Coefficient (Adj. Mean Diff)'] = results_df['Coefficient (Adj. Mean Diff)'].round(3)
results_df['95% CI Lower'] = results_df['95% CI Lower'].round(3)
results_df['95% CI Upper'] = results_df['95% CI Upper'].round(3)
results_df['p-value'] = results_df['p-value'].round(4)

print("OLS results for chair in age under 75 (No disease as reference):")
results_df.reset_index(drop=True)


In [None]:
# Number of rows per disease group, counting only rows with a chair value.
# observed=False is stated explicitly: when disease_category is categorical,
# it keeps every declared group in the table (empty ones show 0) and avoids
# relying on the pandas default for `observed`, which differs between versions.
group_counts = (
    maxchair[maxchair['chair'].notna()]
    .groupby('disease_category', observed=False)
    .size()
)
print(group_counts)

##### Interaction Effects: Does OA Get Worse With Comorbidities?

In [None]:
# Re-declare the disease groups as an unordered categorical. Levels run from
# no disease, through single and paired conditions, to all three.
single_condition = ['Only OA', 'Only HT', 'Only Diabetes']
paired_conditions = ['OA and HT', 'OA and Diab', 'HT and Diab']
disease_levels = ['No disease', *single_condition, *paired_conditions, 'All three diseases']

maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=disease_levels,
    ordered=False,
)

In [None]:
# Interaction model: does the OA effect on the chair outcome change when
# diabetes or hypertension is also present?
# C(disease_category) is deliberately NOT included here: that variable is
# built from the same three indicators (OA_conserv, Diabetes, Hypertension),
# so entering it together with their main effects and interactions makes the
# design matrix rank-deficient and the individual coefficients unidentifiable.
model = smf.ols(
    'chair ~ age + bmi + female + educational_level'
    ' + C(OA_conserv)*C(Diabetes) + C(OA_conserv)*C(Hypertension)',
    data=maxchair,
).fit()
print(model.summary())

##### Logistic Regression: Predicting "High Risk" Patients

In [None]:
# Outcome: 1 if the chair value is below the sample's 25th percentile, else 0.
# Stored as 'low_chair': the original wrote it to 'low_grip', which overwrote
# the grip-strength flag with a chair-based one.
# NOTE(review): whether a LOWER chair value means worse performance depends on
# how `chair` is measured (time vs. repetitions) - confirm the direction.
# Rows with a missing chair value stay missing instead of being coded 0.
chair_cutoff = maxchair['chair'].quantile(0.25)
maxchair['low_chair'] = (
    (maxchair['chair'] < chair_cutoff)
    .astype(int)
    .where(maxchair['chair'].notna())
)

X = maxchair[['age', 'bmi', 'female', 'educational_level','OA_conserv', 'Diabetes', 'Hypertension']]
y = maxchair['low_chair']

# missing='drop' restricts the fit to rows where the outcome and every
# predictor are present.
logit_model = sm.Logit(y, sm.add_constant(X), missing='drop').fit()
print(logit_model.summary())