In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols # linear ANOVA
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import f_oneway
import statsmodels.formula.api as smf

  from pandas.core import (


##### Data loading

The educational level is 'isced1997_r' in wave 2

In [2]:
maxchair = pd.read_csv("data/maxchair.csv")
maxchair.shape

(10359, 11)

In [3]:
maxchair

Unnamed: 0,mergeid,age,bmi,female,educational_level,OA_conserv,Hypertension,Diabetes,maxgrip,chair,disease_category
0,AT-004234-02,53.0,30.717400,1.0,1.0,0.0,0.0,1.0,37.0,6.00,Only Diabetes
1,AT-016392-01,61.0,23.588329,1.0,1.0,0.0,0.0,0.0,34.0,13.20,No disease
2,AT-017298-01,64.0,25.381469,0.0,2.0,0.0,1.0,0.0,55.0,9.40,Only HT
3,AT-026212-02,59.0,24.337480,0.0,1.0,0.0,0.0,0.0,52.0,22.00,No disease
4,AT-117118-02,75.0,28.393726,0.0,1.0,0.0,0.0,0.0,36.0,14.44,No disease
...,...,...,...,...,...,...,...,...,...,...,...
10354,SE-985630-02,63.0,21.107266,1.0,1.0,0.0,0.0,0.0,25.0,7.94,No disease
10355,SE-994435-01,54.0,25.484765,0.0,1.0,0.0,0.0,0.0,61.0,7.41,No disease
10356,SE-996850-01,61.0,26.827421,0.0,1.0,0.0,0.0,0.0,61.0,7.03,No disease
10357,SE-996850-02,61.0,23.323418,1.0,1.0,0.0,1.0,0.0,32.0,13.94,Only HT


## statistical

##### Statistics on df_features

- **0**: No disease  
- **1**:  
  - Diabetes only  
  - Hypertension only  
  - OA only  
- **2**:  
  - Diabetes + Hypertension  
  - Diabetes + OA  
  - Hypertension + OA  
- **3**: Diabetes + Hypertension + OA 


In [4]:
maxchair['disease_category'] = 'No disease'  # Default group
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 0) & (maxchair['Diabetes'] == 0), 'disease_category'] = 'Only OA'
maxchair.loc[(maxchair['OA_conserv'] == 0) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 0), 'disease_category'] = 'Only HT'
maxchair.loc[(maxchair['OA_conserv'] == 0) & (maxchair['Hypertension'] == 0) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'Only Diabetes'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 0), 'disease_category'] = 'OA and HT'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 0) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'OA and Diab'
maxchair.loc[(maxchair['OA_conserv'] == 0) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'HT and Diab'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'All three diseases'


### Maxgrip

#### No confounding

##### Step1: One-Way ANOVA

Checks if any group differs

In [5]:
groups = maxchair['disease_category'].unique()
grip_samples = [maxchair[maxchair['disease_category'] == group]['maxgrip'].dropna() for group in groups]

# Run ANOVA
anova_grip = f_oneway(*grip_samples)
print("ANOVA for Grip Strength")
print(f"F-statistic: {anova_grip.statistic:.3f}, p-value: {anova_grip.pvalue:.5f}")


ANOVA for Grip Strength
F-statistic: 32.537, p-value: 0.00000


##### Step2: Tukey HSD Post-hoc Comparison

Identifies which groups differ

In [6]:
df_grip = maxchair[['maxgrip', 'disease_category']].dropna()

# Run Tukey HSD
tukey_grip = pairwise_tukeyhsd(endog=df_grip['maxgrip'],
                               groups=df_grip['disease_category'],
                               alpha=0.05)

print("Turkey Maxgrip in age under 75")
print(tukey_grip.summary())


Turkey Maxgrip in age under 75
          Multiple Comparison of Means - Tukey HSD, FWER=0.05          
      group1           group2    meandiff p-adj   lower   upper  reject
-----------------------------------------------------------------------
All three diseases   HT and Diab   4.7938 0.0077  0.7558  8.8318   True
All three diseases    No disease   6.4219    0.0  2.7524 10.0914   True
All three diseases   OA and Diab   1.6516   0.99 -4.2297  7.5329  False
All three diseases     OA and HT   0.9421 0.9963 -3.0012  4.8855  False
All three diseases Only Diabetes   6.2111 0.0001  2.0721 10.3501   True
All three diseases       Only HT   6.0537    0.0  2.3379  9.7695   True
All three diseases       Only OA   1.9848 0.7745 -1.8733   5.843  False
       HT and Diab    No disease   1.6281 0.1098 -0.1708   3.427  False
       HT and Diab   OA and Diab  -3.1422 0.5301 -8.0779  1.7934  False
       HT and Diab     OA and HT  -3.8517    0.0 -6.1585 -1.5448   True
       HT and Diab Only Diabetes 

##### Number of samples in each group

In [7]:
group_counts = maxchair[maxchair['maxgrip'].notna()].groupby('disease_category').size()
print(group_counts)

disease_category
All three diseases      90
HT and Diab            393
No disease            6039
OA and Diab             56
OA and HT              523
Only Diabetes          309
Only HT               2211
Only OA                738
dtype: int64


##### Step3: Code to Compute Cohen's d and 95% CI

In [8]:
comparisons = [
    ("No disease", "Only OA"),
    ("No disease", "Only HT"),
    ("No disease", "Only Diabetes"),
    ("No disease", "HT and Diab"),
    ("No disease", "OA and Diab"),
    ("No disease", "OA and HT"),
    ("No disease", "All three diseases")
]

results = []

for g1, g2 in comparisons:

    group1 = maxchair[(maxchair['disease_category'] == g1)]['maxgrip'].dropna()
    group2 = maxchair[(maxchair['disease_category'] == g2)]['maxgrip'].dropna()

    mean1, mean2 = group1.mean(), group2.mean()
    std1, std2 = group1.std(), group2.std()
    n1, n2 = len(group1), len(group2)

    pooled_sd = np.sqrt(((n1 - 1)*std1**2 + (n2 - 1)*std2**2) / (n1 + n2 - 2))

    cohen_d = (mean1 - mean2) / pooled_sd

    diff = mean1 - mean2
    se_diff = np.sqrt(std1**2 / n1 + std2**2 / n2)
    ci_low, ci_high = diff - 1.96 * se_diff, diff + 1.96 * se_diff

    results.append({
        'Comparison': f"{g1} vs {g2}",
        'Mean1': round(mean1, 2),
        'Mean2': round(mean2, 2),
        'Cohen_d': round(cohen_d, 3),
        '95% CI Lower': round(ci_low, 3),
        '95% CI Upper': round(ci_high, 3),
        'n1': n1,
        'n2': n2
    })


effect_df = pd.DataFrame(results)
effect_df


Unnamed: 0,Comparison,Mean1,Mean2,Cohen_d,95% CI Lower,95% CI Upper,n1,n2
0,No disease vs Only OA,37.88,33.44,0.391,3.564,5.31,6039,738
1,No disease vs Only HT,37.88,37.51,0.032,-0.195,0.931,6039,2211
2,No disease vs Only Diabetes,37.88,37.67,0.019,-1.046,1.468,6039,309
3,No disease vs HT and Diab,37.88,36.25,0.143,0.472,2.784,6039,393
4,No disease vs OA and Diab,37.88,33.11,0.42,1.695,7.845,6039,56
5,No disease vs OA and HT,37.88,32.4,0.483,4.478,6.482,6039,523
6,No disease vs All three diseases,37.88,31.46,0.566,4.012,8.831,6039,90


#### With confounding: age, BMI, female, educational_level:

Run OLS model (adjusted for age, bmi, female)

In [9]:
# Set 'No disease' as reference group 
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=[
        'No disease', 'Only OA', 'Only HT', 'Only Diabetes',
        'HT and Diab', 'OA and Diab', 'OA and HT', 'All three diseases'
    ],
    ordered=False
)

model = smf.ols('maxgrip ~ C(disease_category) + age + bmi + female + educational_level', data=maxchair).fit()

results_df = pd.DataFrame({
    'Disease group': model.params.index,
    'Coefficient (Adj. Mean Diff)': model.params.values,
    '95% CI Lower': model.conf_int().iloc[:, 0],
    '95% CI Upper': model.conf_int().iloc[:, 1],
    'p-value': model.pvalues.values
})

results_df = results_df[results_df['Disease group'].str.contains('C\(disease_category\)')].copy()

results_df['Disease group'] = results_df['Disease group'].str.replace(r'C\(disease_category\)\[T\.', '', regex=True).str.rstrip(']')

results_df['Coefficient (Adj. Mean Diff)'] = results_df['Coefficient (Adj. Mean Diff)'].round(3)
results_df['95% CI Lower'] = results_df['95% CI Lower'].round(3)
results_df['95% CI Upper'] = results_df['95% CI Upper'].round(3)
results_df['p-value'] = results_df['p-value'].round(4)
results_df['Significant'] = results_df['p-value'].apply(lambda p: 'Yes' if p < 0.05 else 'No')

print("OLS results for maxgrip in age under 75 (No disease as reference):")
results_df.reset_index(drop=True)


OLS results for maxgrip in age under 75 (No disease as reference):


Unnamed: 0,Disease group,Coefficient (Adj. Mean Diff),95% CI Lower,95% CI Upper,p-value,Significant
0,Only OA,-1.508,-2.062,-0.954,0.0,Yes
1,Only HT,0.091,-0.271,0.453,0.6231,No
2,Only Diabetes,-1.598,-2.429,-0.766,0.0002,Yes
3,HT and Diab,-1.267,-2.02,-0.514,0.001,Yes
4,OA and Diab,-2.727,-4.628,-0.826,0.0049,Yes
5,OA and HT,-2.161,-2.819,-1.504,0.0,Yes
6,All three diseases,-3.367,-4.886,-1.848,0.0,Yes


##### Interaction Effects: Does OA Get Worse With Comorbidities?

In [10]:
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=[
        'No disease',            
        'Only OA',
        'Only HT',
        'Only Diabetes',
        'OA and HT',
        'OA and Diab',
        'HT and Diab',
        'All three diseases'
    ],
    ordered=False
)


In [11]:
model = smf.ols('maxgrip ~ C(disease_category) + age + bmi + female + educational_level + C(OA_conserv)*C(Diabetes) + C(OA_conserv)*C(Hypertension)', 
                data=maxchair).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                maxgrip   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.609
Method:                 Least Squares   F-statistic:                     1465.
Date:                Fri, 27 Jun 2025   Prob (F-statistic):               0.00
Time:                        11:06:59   Log-Likelihood:                -35152.
No. Observations:               10359   AIC:                         7.033e+04
Df Residuals:                   10347   BIC:                         7.041e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

##### Logistic Regression: Predicting "High Risk" Patients

In [12]:
maxchair['low_grip'] = (maxchair['maxgrip'] < maxchair['maxgrip'].quantile(0.25)).astype(int)

X = maxchair[['age', 'bmi', 'female', 'educational_level','OA_conserv', 'Diabetes', 'Hypertension']]
y = maxchair['low_grip']

logit_model = sm.Logit(y, sm.add_constant(X)).fit()
print(logit_model.summary())


Optimization terminated successfully.
         Current function value: 0.383795
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               low_grip   No. Observations:                10359
Model:                          Logit   Df Residuals:                    10351
Method:                           MLE   Df Model:                            7
Date:                Fri, 27 Jun 2025   Pseudo R-squ.:                  0.2696
Time:                        11:06:59   Log-Likelihood:                -3975.7
converged:                       True   LL-Null:                       -5443.0
Covariance Type:            nonrobust   LLR p-value:                     0.000
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -7.7764      0.316    -24.646      0.000      -8.395      -7.158
age     

### Chair

#### No confounding

##### Step1: One-Way ANOVA

Checks if any group differs

In [13]:
# ANOVA for Chair Stand
chair_samples = [maxchair[maxchair['disease_category'] == group]['chair'].dropna() for group in groups]

anova_chair = f_oneway(*chair_samples)
print("\nANOVA for Chair Stand")
print(f"F-statistic: {anova_chair.statistic:.3f}, p-value: {anova_chair.pvalue:.5f}")



ANOVA for Chair Stand
F-statistic: 9.105, p-value: 0.00000


##### Step2: Tukey HSD Post-hoc Comparison

Identifies which groups differ

In [14]:
# Tukey HSD for Chair Stand
df_chair = maxchair[['chair', 'disease_category']].dropna()

tukey_chair = pairwise_tukeyhsd(endog=df_chair['chair'],
                                groups=df_chair['disease_category'],
                                alpha=0.05)

print("Turkey Chair Stand in age under 75")
print(tukey_chair.summary())


Turkey Chair Stand in age under 75
          Multiple Comparison of Means - Tukey HSD, FWER=0.05          
      group1           group2    meandiff p-adj   lower   upper  reject
-----------------------------------------------------------------------
All three diseases   HT and Diab  -0.0612    1.0 -2.5195  2.3971  False
All three diseases    No disease  -1.9054 0.1611 -4.1393  0.3286  False
All three diseases   OA and Diab   0.4922 0.9999 -3.0883  4.0727  False
All three diseases     OA and HT  -0.6216 0.9939 -3.0223  1.7791  False
All three diseases Only Diabetes  -1.6915 0.4581 -4.2113  0.8283  False
All three diseases       Only HT   -1.607 0.3804 -3.8692  0.6551  False
All three diseases       Only OA  -0.6991 0.9859 -3.0479  1.6497  False
       HT and Diab    No disease  -1.8442    0.0 -2.9393  -0.749   True
       HT and Diab   OA and Diab   0.5534 0.9993 -2.4514  3.5582  False
       HT and Diab     OA and HT  -0.5604 0.9294 -1.9647   0.844  False
       HT and Diab Only Diabe

##### Number of samples in each group

In [15]:
group_counts = maxchair[maxchair['chair'].notna()].groupby('disease_category').size()
print(group_counts)

disease_category
No disease            6039
Only OA                738
Only HT               2211
Only Diabetes          309
OA and HT              523
OA and Diab             56
HT and Diab            393
All three diseases      90
dtype: int64


  group_counts = maxchair[maxchair['chair'].notna()].groupby('disease_category').size()


##### Step3: Compute Cohen's d and 95% CI

In [16]:
comparisons = [
    ("No disease", "Only OA"),
    ("No disease", "Only HT"),
    ("No disease", "Only Diabetes"),
    ("No disease", "HT and Diab"),
    ("No disease", "OA and Diab"),
    ("No disease", "OA and HT"),
    ("No disease", "All three diseases")
]

results = []

for g1, g2 in comparisons:
   
    group1 = maxchair[(maxchair['disease_category'] == g1)]['chair'].dropna()
    group2 = maxchair[(maxchair['disease_category'] == g2)]['chair'].dropna()

    mean1, mean2 = group1.mean(), group2.mean()
    std1, std2 = group1.std(), group2.std()
    n1, n2 = len(group1), len(group2)

    pooled_sd = np.sqrt(((n1 - 1)*std1**2 + (n2 - 1)*std2**2) / (n1 + n2 - 2))

    cohen_d = (mean1 - mean2) / pooled_sd

    diff = mean1 - mean2
    se_diff = np.sqrt(std1**2 / n1 + std2**2 / n2)
    ci_low, ci_high = diff - 1.96 * se_diff, diff + 1.96 * se_diff

    results.append({
        'Comparison': f"{g1} vs {g2}",
        'Mean1': round(mean1, 2),
        'Mean2': round(mean2, 2),
        'Cohen_d': round(cohen_d, 3),
        '95% CI Lower': round(ci_low, 3),
        '95% CI Upper': round(ci_high, 3),
        'n1': n1,
        'n2': n2
    })

effect_df = pd.DataFrame(results)
effect_df

Unnamed: 0,Comparison,Mean1,Mean2,Cohen_d,95% CI Lower,95% CI Upper,n1,n2
0,No disease vs Only OA,10.85,12.06,-0.168,-1.711,-0.701,6039,738
1,No disease vs Only HT,10.85,11.15,-0.043,-0.611,0.014,6039,2211
2,No disease vs Only Diabetes,10.85,11.07,-0.03,-0.851,0.423,6039,309
3,No disease vs HT and Diab,10.85,12.7,-0.249,-2.783,-0.906,6039,393
4,No disease vs OA and Diab,10.85,13.25,-0.33,-4.184,-0.611,6039,56
5,No disease vs OA and HT,10.85,12.14,-0.179,-1.82,-0.748,6039,523
6,No disease vs All three diseases,10.85,12.76,-0.263,-3.054,-0.757,6039,90


#### With confounding: age, BMI, female, educational_level:

Run OLS model (adjusted for age, bmi, female)

In [17]:
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=[
        'No disease', 'Only OA', 'Only HT', 'Only Diabetes',
        'HT and Diab', 'OA and Diab', 'OA and HT', 'All three diseases'
    ],
    ordered=False
)

model = smf.ols('chair ~ C(disease_category) + age + bmi + female + educational_level', data=maxchair).fit()


results_df = pd.DataFrame({
    'Disease group': model.params.index,
    'Coefficient (Adj. Mean Diff)': model.params.values,
    '95% CI Lower': model.conf_int().iloc[:, 0],
    '95% CI Upper': model.conf_int().iloc[:, 1],
    'p-value': model.pvalues.values
})

results_df = results_df[results_df['Disease group'].str.contains('C\(disease_category\)')].copy()

results_df['Disease group'] = results_df['Disease group'].str.replace(r'C\(disease_category\)\[T\.', '', regex=True).str.rstrip(']')


results_df['Coefficient (Adj. Mean Diff)'] = results_df['Coefficient (Adj. Mean Diff)'].round(3)
results_df['95% CI Lower'] = results_df['95% CI Lower'].round(3)
results_df['95% CI Upper'] = results_df['95% CI Upper'].round(3)
results_df['p-value'] = results_df['p-value'].round(4)
results_df['Significant'] = results_df['p-value'].apply(lambda p: 'Yes' if p < 0.05 else 'No')

print("OLS results for chair in age under 75 (No disease as reference):")
results_df.reset_index(drop=True)


OLS results for chair in age under 75 (No disease as reference):


Unnamed: 0,Disease group,Coefficient (Adj. Mean Diff),95% CI Lower,95% CI Upper,p-value,Significant
0,Only OA,0.802,0.274,1.329,0.0029,Yes
1,Only HT,-0.146,-0.492,0.199,0.4057,No
2,Only Diabetes,-0.251,-1.043,0.542,0.5353,No
3,HT and Diab,1.086,0.369,1.803,0.003,Yes
4,OA and Diab,1.671,-0.14,3.483,0.0705,No
5,OA and HT,0.433,-0.193,1.06,0.1752,No
6,All three diseases,0.796,-0.651,2.243,0.281,No


In [18]:
group_counts = maxchair[maxchair['chair'].notna()].groupby('disease_category').size()
print(group_counts)

disease_category
No disease            6039
Only OA                738
Only HT               2211
Only Diabetes          309
HT and Diab            393
OA and Diab             56
OA and HT              523
All three diseases      90
dtype: int64


  group_counts = maxchair[maxchair['chair'].notna()].groupby('disease_category').size()


##### Interaction Effects: Does OA Get Worse With Comorbidities?

In [19]:
maxchair['disease_category'] = pd.Categorical(
    maxchair['disease_category'],
    categories=[
        'No disease',         
        'Only OA',
        'Only HT',
        'Only Diabetes',
        'OA and HT',
        'OA and Diab',
        'HT and Diab',
        'All three diseases'
    ],
    ordered=False
)

In [20]:
model = smf.ols('chair ~ C(disease_category) + age + bmi + female + educational_level + C(OA_conserv)*C(Diabetes) + C(OA_conserv)*C(Hypertension)', 
                data=maxchair).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  chair   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     26.17
Date:                Fri, 27 Jun 2025   Prob (F-statistic):           1.95e-54
Time:                        11:07:00   Log-Likelihood:                -34652.
No. Observations:               10359   AIC:                         6.933e+04
Df Residuals:                   10347   BIC:                         6.942e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

##### Logistic Regression: Predicting "High Risk" Patients

In [21]:
maxchair['low_grip'] = (maxchair['chair'] < maxchair['chair'].quantile(0.25)).astype(int)

X = maxchair[['age', 'bmi', 'female', 'educational_level','OA_conserv', 'Diabetes', 'Hypertension']]
y = maxchair['low_grip']

logit_model = sm.Logit(y, sm.add_constant(X)).fit()
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.542243
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               low_grip   No. Observations:                10359
Model:                          Logit   Df Residuals:                    10351
Method:                           MLE   Df Model:                            7
Date:                Fri, 27 Jun 2025   Pseudo R-squ.:                 0.03413
Time:                        11:07:00   Log-Likelihood:                -5617.1
converged:                       True   LL-Null:                       -5815.6
Covariance Type:            nonrobust   LLR p-value:                 1.047e-81
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 2.3407      0.255      9.178      0.000       1.841       2.841
age     