In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

  from pandas.core import (


#### Data loading

In [2]:
df = pd.read_csv("data/main_df_after_merging.csv")
df.shape

  df = pd.read_csv("data/main_df_after_merging.csv")


(37143, 315)

All variables
| variable name  | meaning |
|---|---|
|ws010_ |	first walking speed result|
|ws011_ |	first walking speed time|
|ws012_ |	second walking speed result|
|ws013_ |	second walking speed time|
|cs004_ |	chair rise result|
|cs008_ |	time 5 chair rise|
|gs006_|	maxgrip left one|
|gs007_|	maxgrip left two|
|gs008_|	maxgrip right one|
|gs009_|	maxgrip right two|
|dn002_|    MOB|
|dn003_|    YOB|
|dn042_|    female|
|ph006d8| OA|
|ph010d1| pain joint|
|ph012_ | weight|
|ph013_ | height|
|ph044_ | eyesight_close |
|ph046_ | hearing |

##### Create variable "age"

In [3]:
# Wave 2 is 2007
reference_year = 2007
df['age'] = reference_year - df['YOB']

##### Plausible Height and Weight

Consider weight > 10 and height > 100

In [4]:
def clean_height_weight(df):
    print("Initial dataset size:", len(df))
    
    df = df.loc[df['weight'] > 10.0]
    print("After filtering plausible weight:", len(df))
    

    df = df.loc[df['height'] > 100.0]
    
    df.loc[(df['height'] > 1.0) & (df['height'] < 2.0), 'height'] *= 100
    print("After filtering and correcting height:", len(df))
    
    return df
df_raw = df.copy()
df = clean_height_weight(df)

Initial dataset size: 37143
After filtering plausible weight: 36504
After filtering and correcting height: 15864


In [5]:
excluded_hw = df_raw[~df_raw['mergeid'].isin(df['mergeid'])]
print("Excluded due to implausible/missing weight or height:", excluded_hw.shape)

Excluded due to implausible/missing weight or height: (21279, 316)


##### Create variable "bmi"

In [6]:
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

##### Compute the maximum grip strength across all measurements

In [7]:
df['maxgrip'] = df[['maxgrip_left_one', 'maxgrip_left_two', 'maxgrip_right_one', 'maxgrip_right_two']].max(axis=1)

##### Compute the average walking speed across two attempts

In [8]:
df['walking'] = df[['first_walking_speed_time', 'second_walking_speed_time']].mean(axis=1)

##### Other variables

In [9]:
df['chair'] = df['time_5_chair_rise']
df['OA_conserv'] = np.logical_and(df['OA'], df['pain_joint'].astype(bool)).astype(np.int8)
df['activity'] = 2 * (4 - df['br015_']) + (4 - df['br016_'])
df['female'] = df['female'].astype(int)

In [10]:
df['chair'].sort_values(ascending=False).shape

(15864,)

##### Create df_anno

In [11]:
df_anno = df.drop_duplicates(subset='mergeid', keep='first')
df_anno.set_index('mergeid', inplace=True, drop=True)

df_anno_raw = df_anno.copy()

In [12]:
print("df:", df.shape, "df_anno:", df_anno.shape)

df: (15864, 322) df_anno: (15864, 321)


##### Categorizing & Identifying Control Groups; OA, diabetes, hypertension

In [13]:
df_anno['Hypertension'] = pd.to_numeric(df_anno['ph006d2'])
df_anno['Diabetes'] = pd.to_numeric(df_anno['ph006d5'])

In [14]:
df_anno["control"] = 0
df_anno.loc[(df_anno["OA_conserv"] == 0) & (df_anno["Hypertension"] == 0) & (df_anno["Diabetes"] == 0), "control"] = 1

In [15]:
display(df_anno['Hypertension'].value_counts())
display(df_anno['Diabetes'].value_counts())
display(df_anno['OA_conserv'].value_counts())

Hypertension
0.0    10340
1.0     5496
Name: count, dtype: int64

Diabetes
0.0    14169
1.0     1667
Name: count, dtype: int64

OA_conserv
0    12988
1     2876
Name: count, dtype: int64

In [16]:
display(df_anno[['Hypertension', 'OA_conserv', 'Diabetes']].value_counts(sort=False))

Hypertension  OA_conserv  Diabetes
0.0           0           0.0         8294
                          1.0          535
              1           0.0         1367
                          1.0          144
1.0           0           0.0         3450
                          1.0          702
              1           0.0         1058
                          1.0          286
Name: count, dtype: int64

## data exploration

##### Select features

In [17]:
df_anno.shape

(15864, 324)

In [18]:
df_anno = df_anno.drop_duplicates()
print(df_anno.shape)
_df_anno = df_anno.copy()

(15864, 324)


##### Create df_feature

In [19]:
# features = ['bmi', 'weight', 'age', 'OA_conserv', 'female', 'hearing', 'eyesight_close', 'Hypertension', 'Diabetes']
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
df_features = df_anno.loc[:, features].copy()
df_features = df_features.dropna(axis=0)

In [20]:
excluded_features = df_anno[~df_anno.index.isin(df_features.index)]
print("Excluded due to missing features:", excluded_features.shape)


Excluded due to missing features: (43, 324)


In [21]:
df_features.shape

(15821, 7)

In [22]:
df_features.isna().sum()

age                  0
bmi                  0
female               0
educational_level    0
OA_conserv           0
Hypertension         0
Diabetes             0
dtype: int64

In [23]:
excluded_all = df_raw[~df_raw['mergeid'].isin(df_features.index)]
print("Total excluded cases:", excluded_all.shape)

Total excluded cases: (21322, 316)


In [24]:
excluded_all.to_csv("data/excluded_cases.csv")
df_features.to_csv("data/included_cases.csv")

##### Create data with outcomes and features

In [25]:
data = pd.concat([df_features, df_anno.loc[:, ['maxgrip', 'chair', 'walking',]]], axis=1)

In [26]:
data.shape

(15864, 10)

In [27]:
data = data.dropna(subset=features)

In [28]:
data.shape

(15821, 10)

In [29]:
data.isna().sum()

age                      0
bmi                      0
female                   0
educational_level        0
OA_conserv               0
Hypertension             0
Diabetes                 0
maxgrip               1407
chair                 5308
walking              14396
dtype: int64

In [30]:
print("df:", df.shape)
print("df_anno:", df_anno.shape)
print("df_features:", df_features.shape)
print("data:", data.shape)

df: (15864, 322)
df_anno: (15864, 324)
df_features: (15821, 7)
data: (15821, 10)


In [31]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']
print(data.shape)

(15821, 10)


In [32]:
print(data['female'].value_counts())
print("------------------------------------------------------------")
print(data['OA_conserv'].value_counts())
print("------------------------------------------------------------")
print(data['Hypertension'].value_counts())
print("------------------------------------------------------------")
print(data['Diabetes'].value_counts())

female
1.0    8716
0.0    7105
Name: count, dtype: int64
------------------------------------------------------------
OA_conserv
0.0    12970
1.0     2851
Name: count, dtype: int64
------------------------------------------------------------
Hypertension
0.0    10335
1.0     5486
Name: count, dtype: int64
------------------------------------------------------------
Diabetes
0.0    14158
1.0     1663
Name: count, dtype: int64


In [33]:
data.chair.value_counts()

chair
10.000000    272
9.000000     212
8.000000     208
12.000000    207
11.000000    176
            ... 
9.140000       1
16.040001      1
15.320000      1
24.580000      1
2.650000       1
Name: count, Length: 2552, dtype: int64

In [34]:
mean_age_per_chair = data.groupby('chair')['age'].mean()
print(mean_age_per_chair)


chair
0.000000     55.000000
0.050000     46.500000
0.060000     61.666667
0.060000     57.000000
0.070000     61.000000
               ...    
89.000000    56.000000
90.000000    73.000000
91.000000    65.000000
91.150002    72.000000
98.000000    65.000000
Name: age, Length: 2552, dtype: float64


#### Summery table

In [35]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']

# Create separate datasets for each outcome
datasets = {outcome: data[features + [outcome]].dropna() for outcome in outcomes}

for outcome, df_outcome in datasets.items():
    print(f"Dataset for {outcome}: {df_outcome.shape}")


Dataset for maxgrip: (14414, 8)
Dataset for chair: (10513, 8)
Dataset for walking: (1425, 8)


In [36]:
df_maxgrip = datasets['maxgrip']
df_chair = datasets['chair']
df_walking = datasets['walking']

print("Valid samples per dataset:")
print("data Maxgrip:", df_maxgrip.shape)
print("data Chair Stand:", df_chair.shape)
print("data Walking Speed:", df_walking.shape)

Valid samples per dataset:
data Maxgrip: (14414, 8)
data Chair Stand: (10513, 8)
data Walking Speed: (1425, 8)


In [37]:
df_maxgrip.age.describe()

count    14414.000000
mean        62.993270
std         10.068992
min         15.000000
25%         55.000000
50%         61.000000
75%         70.000000
max         99.000000
Name: age, dtype: float64

In [38]:
df_chair.age.describe()

count    10513.000000
mean        59.708266
std          7.457453
min         15.000000
25%         54.000000
50%         59.000000
75%         65.000000
max         96.000000
Name: age, dtype: float64

In [39]:
df_walking.age.describe()

count    1425.000000
mean       80.066667
std         3.985813
min        60.000000
25%        77.000000
50%        79.000000
75%        82.000000
max        98.000000
Name: age, dtype: float64

##### disease_group

- **0**: No disease  
- **1**:  
  - Diabetes only  
  - Hypertension only  
  - OA only  
- **2**:  
  - Diabetes + Hypertension  
  - Diabetes + OA  
  - Hypertension + OA  
- **3**: Diabetes + Hypertension + OA 


In [40]:
# Create a new categorical variable 'disease_group' based on conditions in 'data'
data['disease_group'] = 0  # Default: No disease

# Assign groups based on disease conditions
data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 0) & (data['OA_conserv'] == 0), 'disease_group'] = 1
data.loc[(data['Diabetes'] == 0) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 0), 'disease_group'] = 1
data.loc[(data['Diabetes'] == 0) & (data['Hypertension'] == 0) & (data['OA_conserv'] == 1), 'disease_group'] = 1

data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 0), 'disease_group'] = 2
data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 0) & (data['OA_conserv'] == 1), 'disease_group'] = 2
data.loc[(data['Diabetes'] == 0) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 1), 'disease_group'] = 2

data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 1), 'disease_group'] = 3

# Convert disease_group to categorical
data['disease_group'] = data['disease_group'].astype(int)

In [41]:
disease_counts = data['disease_group'].value_counts()
print(disease_counts)

disease_group
0    8290
1    5347
2    1899
3     285
Name: count, dtype: int64


In [42]:
valid_samples = {
    "Maxgrip": data.dropna(subset=['maxgrip'])['disease_group'].value_counts(),
    "Chair Stand": data.dropna(subset=['chair'])['disease_group'].value_counts(),
    "Walking Speed": data.dropna(subset=['walking'])['disease_group'].value_counts()
}

valid_samples_df = pd.DataFrame(valid_samples)
valid_samples_df

Unnamed: 0_level_0,Maxgrip,Chair Stand,Walking Speed
disease_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7612,6117,533
1,4896,3314,619
2,1681,991,231
3,225,91,42


##### Summery

In [43]:
print("data: ", data.shape)
print("maxgrip: ", df_maxgrip.shape)
print("chair: ", df_chair.shape)
print("walking: ", df_walking.shape)

data:  (15821, 11)
maxgrip:  (14414, 8)
chair:  (10513, 8)
walking:  (1425, 8)


In [44]:
data

Unnamed: 0_level_0,age,bmi,female,educational_level,OA_conserv,Hypertension,Diabetes,maxgrip,chair,walking,disease_group
mergeid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AT-004234-02,53.0,30.717400,1.0,1.0,0.0,0.0,1.0,37.0,6.00,,1
AT-016392-01,61.0,23.588329,1.0,1.0,0.0,0.0,0.0,34.0,13.20,,0
AT-017298-01,64.0,25.381469,0.0,2.0,0.0,1.0,0.0,55.0,9.40,,1
AT-026212-02,59.0,24.337480,0.0,1.0,0.0,0.0,0.0,52.0,22.00,,0
AT-057442-01,66.0,31.221304,1.0,1.0,0.0,1.0,0.0,,10.84,,1
...,...,...,...,...,...,...,...,...,...,...,...
SE-994503-02,77.0,24.056935,0.0,2.0,0.0,0.0,0.0,43.0,,2.105,0
SE-996850-01,61.0,26.827421,0.0,1.0,0.0,0.0,0.0,61.0,7.03,,0
SE-996850-02,61.0,23.323418,1.0,1.0,0.0,1.0,0.0,32.0,13.94,,1
SE-996868-01,58.0,35.492158,0.0,1.0,0.0,1.0,0.0,58.0,10.38,,1


In [45]:
data.isna().sum()

age                      0
bmi                      0
female                   0
educational_level        0
OA_conserv               0
Hypertension             0
Diabetes                 0
maxgrip               1407
chair                 5308
walking              14396
disease_group            0
dtype: int64

In [46]:
# value count of ech columns in data
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
for i in features: 
    print(data[i].value_counts())
    print("------------------------------------------------------------")


age
51.0     924
52.0     915
57.0     611
59.0     599
61.0     597
        ... 
15.0       1
100.0      1
30.0       1
98.0       1
31.0       1
Name: count, Length: 72, dtype: int64
------------------------------------------------------------
bmi
24.221453    110
25.711662    104
27.681661    101
25.951557     90
27.343750     85
            ... 
28.577961      1
17.715421      1
24.382717      1
27.776912      1
22.591438      1
Name: count, Length: 2821, dtype: int64
------------------------------------------------------------
female
1.0    8716
0.0    7105
Name: count, dtype: int64
------------------------------------------------------------
educational_level
0.0    6569
1.0    5654
2.0    3022
3.0     576
Name: count, dtype: int64
------------------------------------------------------------
OA_conserv
0.0    12970
1.0     2851
Name: count, dtype: int64
------------------------------------------------------------
Hypertension
0.0    10335
1.0     5486
Name: count, dtype: int64
--

In [47]:
# value count of ech columns in data
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
for i in features:     
    print(data[i].describe())
    print("------------------------------------------------------------")

count    15821.00000
mean        63.48739
std         10.38545
min         15.00000
25%         55.00000
50%         62.00000
75%         71.00000
max        100.00000
Name: age, dtype: float64
------------------------------------------------------------
count    15821.000000
mean        26.769211
std          4.703205
min         10.816658
25%         23.808690
50%         26.218821
75%         29.097746
max        157.394159
Name: bmi, dtype: float64
------------------------------------------------------------
count    15821.000000
mean         0.550913
std          0.497417
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: female, dtype: float64
------------------------------------------------------------
count    15821.000000
mean         0.848619
std          0.853802
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          3.000000
Name: educational_level, dtype: float64


##### Create maxchair and maxwalk

In [48]:
data.isna().sum()

age                      0
bmi                      0
female                   0
educational_level        0
OA_conserv               0
Hypertension             0
Diabetes                 0
maxgrip               1407
chair                 5308
walking              14396
disease_group            0
dtype: int64

In [49]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']

maxchair = data[features + ['maxgrip', 'chair']].dropna()

maxwalk = data[features + ['maxgrip', 'walking']].dropna()

print("Dataset maxchair (Maxgrip + Chair):", maxchair.shape)
print("Dataset maxwalk (Maxgrip + Walking):", maxwalk.shape)


Dataset maxchair (Maxgrip + Chair): (10359, 9)
Dataset maxwalk (Maxgrip + Walking): (1361, 9)


##### Overlap between maxchair and maxwalk:

In [50]:
overlap_cases_max = maxchair.index.intersection(maxwalk.index)
print(f"Number of overlapping cases between maxchair and maxwalk: {len(overlap_cases_max)}")

Number of overlapping cases between maxchair and maxwalk: 0


##### Overlap between Chair and Walk:

In [51]:
overlap_cases = df_chair.index.intersection(df_walking.index)
print(f"Number of overlapping cases: {len(overlap_cases)}")

Number of overlapping cases: 0


##### Describe of Age:

In [52]:
datasets = {
    "Maxgrip": df_maxgrip,
    "Chair": df_chair,
    "Walking": df_walking,
    "Maxchair": maxchair,
    "Maxwalk": maxwalk
}

for name, df in datasets.items():
    print(f"\nAge Statistics for {name}:\n{df.age.describe()}")
    print("-" * 50)



Age Statistics for Maxgrip:
count    14414.000000
mean        62.993270
std         10.068992
min         15.000000
25%         55.000000
50%         61.000000
75%         70.000000
max         99.000000
Name: age, dtype: float64
--------------------------------------------------

Age Statistics for Chair:
count    10513.000000
mean        59.708266
std          7.457453
min         15.000000
25%         54.000000
50%         59.000000
75%         65.000000
max         96.000000
Name: age, dtype: float64
--------------------------------------------------

Age Statistics for Walking:
count    1425.000000
mean       80.066667
std         3.985813
min        60.000000
25%        77.000000
50%        79.000000
75%        82.000000
max        98.000000
Name: age, dtype: float64
--------------------------------------------------

Age Statistics for Maxchair:
count    10359.000000
mean        59.711072
std          7.458557
min         15.000000
25%         54.000000
50%         59.000000
75

#### Summery for paper:

##### maxchair:

In [53]:
maxchair.shape

(10359, 9)

In [54]:
maxchair['disease_category'] = 'No disease'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 0) & (maxchair['Diabetes'] == 0), 'disease_category'] = 'Only OA'
maxchair.loc[(maxchair['OA_conserv'] == 0) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 0), 'disease_category'] = 'Only HT'
maxchair.loc[(maxchair['OA_conserv'] == 0) & (maxchair['Hypertension'] == 0) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'Only Diabetes'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 0), 'disease_category'] = 'OA and HT'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 0) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'OA and Diab'
maxchair.loc[(maxchair['OA_conserv'] == 0) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'HT and Diab'
maxchair.loc[(maxchair['OA_conserv'] == 1) & (maxchair['Hypertension'] == 1) & (maxchair['Diabetes'] == 1), 'disease_category'] = 'All three diseases'


summary_dict = {
    'Variable': ['Number of cases', 'Age; mean (SD)', 'Age; n (%)', 'Sex; n, (%)', 'BMI; mean (SD)',
                 'Educational Level - Low (%)', 'Educational Level - Medium (%)',
                 'Educational Level - High (%)', 'Educational Level - Other (%)',
                 'Grip strength (kg); mean (SD)', 'Maxgrip; n (%)', 'Missing grip strength (n)',
                 '5 Chairs stands (sec); mean (SD)', 'Chair stand; n (%)', 'Missing 5 Chairs stands (n)',
                 'Total missing values (n)']
}


disease_groups = maxchair['disease_category'].unique()


for group in disease_groups:
    subset = maxchair[maxchair['disease_category'] == group]
    num_cases = len(subset)

    # Age 
    age_mean_sd = f"{subset['age'].mean():.1f} ± {subset['age'].std():.1f}" if num_cases > 0 else "NaN"
    age_n = f"{num_cases - subset['age'].isna().sum()} ({((num_cases - subset['age'].isna().sum()) / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    # Female
    female_n = f"{subset['female'].sum()} ({(subset['female'].sum() / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    # BMI
    bmi_mean_sd = f"{subset['bmi'].mean():.1f} ± {subset['bmi'].std():.1f}" if num_cases > 0 else "NaN"

    # Educational level counts (%)
    def edu_percent(level):
        count = (subset['educational_level'] == level).sum()
        return f"{count} ({(count / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    edu_low = edu_percent(0)
    edu_medium = edu_percent(1)
    edu_high = edu_percent(2)
    edu_other = edu_percent(3)

    # Grip strength
    maxgrip_mean_sd = f"{subset['maxgrip'].mean():.1f} ± {subset['maxgrip'].std():.1f}" if num_cases > 0 else "NaN"
    missing_maxgrip = subset['maxgrip'].isna().sum()
    maxgrip_n = f"{num_cases - missing_maxgrip} ({((num_cases - missing_maxgrip) / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    # Chair stand
    chair_mean_sd = f"{subset['chair'].mean():.1f} ± {subset['chair'].std():.1f}" if num_cases > 0 else "NaN"
    missing_chair = subset['chair'].isna().sum()
    chair_n = f"{num_cases - missing_chair} ({((num_cases - missing_chair) / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    total_missing = missing_maxgrip + missing_chair

    summary_dict[group] = [
        num_cases, age_mean_sd, age_n, female_n, bmi_mean_sd,
        edu_low, edu_medium, edu_high, edu_other,
        maxgrip_mean_sd, maxgrip_n, missing_maxgrip,
        chair_mean_sd, chair_n, missing_chair,
        total_missing
    ]


max_len = max(len(v) for v in summary_dict.values())
for k in summary_dict:
    summary_dict[k] += [""] * (max_len - len(summary_dict[k]))

summary_table_maxchair = pd.DataFrame(summary_dict)
summary_table_maxchair


Unnamed: 0,Variable,Only Diabetes,No disease,Only HT,HT and Diab,All three diseases,Only OA,OA and Diab,OA and HT
0,Number of cases,309,6039,2211,393,90,738,56,523
1,Age; mean (SD),62.9 ± 7.3,58.3 ± 7.3,61.4 ± 7.1,63.5 ± 6.6,63.7 ± 7.0,60.1 ± 7.0,61.7 ± 6.9,63.0 ± 7.2
2,Age; n (%),309 (100.0%),6039 (100.0%),2211 (100.0%),393 (100.0%),90 (100.0%),738 (100.0%),56 (100.0%),523 (100.0%)
3,"Sex; n, (%)",118.0 (38.2%),3242.0 (53.7%),1144.0 (51.7%),191.0 (48.6%),59.0 (65.6%),494.0 (66.9%),34.0 (60.7%),341.0 (65.2%)
4,BMI; mean (SD),27.8 ± 4.4,25.8 ± 4.0,27.9 ± 5.2,29.8 ± 4.7,32.3 ± 5.1,26.1 ± 4.0,28.9 ± 4.9,28.4 ± 4.5
5,Educational Level - Low (%),135 (43.7%),2036 (33.7%),832 (37.6%),164 (41.7%),42 (46.7%),307 (41.6%),31 (55.4%),241 (46.1%)
6,Educational Level - Medium (%),113 (36.6%),2416 (40.0%),857 (38.8%),154 (39.2%),29 (32.2%),273 (37.0%),14 (25.0%),189 (36.1%)
7,Educational Level - High (%),48 (15.5%),1472 (24.4%),469 (21.2%),65 (16.5%),12 (13.3%),129 (17.5%),10 (17.9%),75 (14.3%)
8,Educational Level - Other (%),13 (4.2%),115 (1.9%),53 (2.4%),10 (2.5%),7 (7.8%),29 (3.9%),1 (1.8%),18 (3.4%)
9,Grip strength (kg); mean (SD),37.7 ± 11.0,37.9 ± 11.3,37.5 ± 11.6,36.2 ± 11.3,31.5 ± 11.6,33.4 ± 11.4,33.1 ± 11.7,32.4 ± 11.2


##### maxwalk

In [55]:
maxwalk.shape

(1361, 9)

In [56]:
only_maxgrip = maxwalk[(maxwalk['maxgrip'].notna()) & (maxwalk['walking'].isna())]
only_walking = maxwalk[(maxwalk['walking'].notna()) & (maxwalk['maxgrip'].isna())]

print(f"Cases with Maxgrip but missing Walking speed: {len(only_maxgrip)}")
print(f"Cases with Walking speed but missing Maxgrip: {len(only_walking)}")


Cases with Maxgrip but missing Walking speed: 0
Cases with Walking speed but missing Maxgrip: 0


In [57]:
maxwalk['disease_category'] = 'No disease'
maxwalk.loc[(maxwalk['OA_conserv'] == 1) & (maxwalk['Hypertension'] == 0) & (maxwalk['Diabetes'] == 0), 'disease_category'] = 'Only OA'
maxwalk.loc[(maxwalk['OA_conserv'] == 0) & (maxwalk['Hypertension'] == 1) & (maxwalk['Diabetes'] == 0), 'disease_category'] = 'Only HT'
maxwalk.loc[(maxwalk['OA_conserv'] == 0) & (maxwalk['Hypertension'] == 0) & (maxwalk['Diabetes'] == 1), 'disease_category'] = 'Only Diabetes'
maxwalk.loc[(maxwalk['OA_conserv'] == 1) & (maxwalk['Hypertension'] == 1) & (maxwalk['Diabetes'] == 0), 'disease_category'] = 'OA and HT'
maxwalk.loc[(maxwalk['OA_conserv'] == 1) & (maxwalk['Hypertension'] == 0) & (maxwalk['Diabetes'] == 1), 'disease_category'] = 'OA and Diab'
maxwalk.loc[(maxwalk['OA_conserv'] == 0) & (maxwalk['Hypertension'] == 1) & (maxwalk['Diabetes'] == 1), 'disease_category'] = 'HT and Diab'
maxwalk.loc[(maxwalk['OA_conserv'] == 1) & (maxwalk['Hypertension'] == 1) & (maxwalk['Diabetes'] == 1), 'disease_category'] = 'All three diseases'


summary_dict = {
    'Variable': ['Number of cases', 'Age; mean (SD)', 'Age; n (%)', 'Sex; n, (%)', 'BMI; mean (SD)',
                 'Educational Level - Low (%)', 'Educational Level - Medium (%)',
                 'Educational Level - High (%)', 'Educational Level - Other (%)',
                 'Grip strength (kg); mean (SD)', 'Maxgrip; n (%)', 'Missing grip strength (n)',
                 'Walking speed (sec); mean (SD)', 'Walking speed; n (%)', 'Missing walking speed (n)',
                 'Total missing values (n)']
}


disease_groups = maxwalk['disease_category'].unique()


for group in disease_groups:
    subset = maxwalk[maxwalk['disease_category'] == group]
    num_cases = len(subset)

    # Age 
    age_mean_sd = f"{subset['age'].mean():.1f} ± {subset['age'].std():.1f}" if num_cases > 0 else "NaN"
    age_n = f"{num_cases - subset['age'].isna().sum()} ({((num_cases - subset['age'].isna().sum()) / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    # Female
    female_n = f"{subset['female'].sum()} ({(subset['female'].sum() / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    # BMI
    bmi_mean_sd = f"{subset['bmi'].mean():.1f} ± {subset['bmi'].std():.1f}" if num_cases > 0 else "NaN"

    # Educational level counts (%)
    def edu_percent(level):
        count = (subset['educational_level'] == level).sum()
        return f"{count} ({(count / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    edu_low = edu_percent(0)
    edu_medium = edu_percent(1)
    edu_high = edu_percent(2)
    edu_other = edu_percent(3)

    # Grip strength
    maxgrip_mean_sd = f"{subset['maxgrip'].mean():.1f} ± {subset['maxgrip'].std():.1f}" if num_cases > 0 else "NaN"
    missing_maxgrip = subset['maxgrip'].isna().sum()
    maxgrip_n = f"{num_cases - missing_maxgrip} ({((num_cases - missing_maxgrip) / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    # Walking speed
    walking_mean_sd = f"{subset['walking'].mean():.1f} ± {subset['walking'].std():.1f}" if num_cases > 0 else "NaN"
    missing_walking = subset['walking'].isna().sum()
    valid_walking = num_cases - missing_walking
    walking_n = f"{valid_walking} ({(valid_walking / num_cases) * 100:.1f}%)" if num_cases > 0 else "NaN"

    total_missing = missing_maxgrip + missing_walking


    summary_dict[group] = [
        num_cases, age_mean_sd, age_n, female_n, bmi_mean_sd,
        edu_low, edu_medium, edu_high, edu_other,
        maxgrip_mean_sd, maxgrip_n, missing_maxgrip,
        walking_mean_sd, walking_n, missing_walking,
        total_missing
    ]


max_len = max(len(v) for v in summary_dict.values())
for k in summary_dict:
    summary_dict[k] += [""] * (max_len - len(summary_dict[k]))

summary_table_maxwalk = pd.DataFrame(summary_dict)
summary_table_maxwalk


Unnamed: 0,Variable,No disease,Only HT,Only OA,OA and HT,Only Diabetes,HT and Diab,All three diseases,OA and Diab
0,Number of cases,510,390,124,117,79,79,38,24
1,Age; mean (SD),80.1 ± 4.1,80.0 ± 4.1,80.6 ± 3.8,80.1 ± 4.2,79.6 ± 3.4,79.9 ± 3.6,79.1 ± 3.7,78.8 ± 3.2
2,Age; n (%),510 (100.0%),390 (100.0%),124 (100.0%),117 (100.0%),79 (100.0%),79 (100.0%),38 (100.0%),24 (100.0%)
3,"Sex; n, (%)",228.0 (44.7%),217.0 (55.6%),74.0 (59.7%),68.0 (58.1%),39.0 (49.4%),48.0 (60.8%),25.0 (65.8%),10.0 (41.7%)
4,BMI; mean (SD),25.2 ± 4.4,26.2 ± 3.8,26.4 ± 6.5,27.6 ± 4.1,27.1 ± 3.9,27.4 ± 4.0,29.2 ± 4.5,27.6 ± 5.5
5,Educational Level - Low (%),256 (50.2%),203 (52.1%),76 (61.3%),61 (52.1%),48 (60.8%),38 (48.1%),13 (34.2%),14 (58.3%)
6,Educational Level - Medium (%),144 (28.2%),106 (27.2%),17 (13.7%),29 (24.8%),13 (16.5%),19 (24.1%),12 (31.6%),7 (29.2%)
7,Educational Level - High (%),80 (15.7%),59 (15.1%),22 (17.7%),18 (15.4%),10 (12.7%),17 (21.5%),7 (18.4%),0 (0.0%)
8,Educational Level - Other (%),30 (5.9%),22 (5.6%),9 (7.3%),9 (7.7%),8 (10.1%),5 (6.3%),6 (15.8%),3 (12.5%)
9,Grip strength (kg); mean (SD),29.2 ± 9.3,27.5 ± 9.4,24.2 ± 9.1,26.5 ± 10.8,26.8 ± 8.6,26.0 ± 8.1,24.3 ± 8.7,26.3 ± 7.1


##### Create csv from maxchair and maxwalk

In [58]:
maxchair.to_csv("data/maxchair.csv")
maxwalk.to_csv("data/maxwalk.csv")

In [59]:
maxchair.describe()

Unnamed: 0,age,bmi,female,educational_level,OA_conserv,Hypertension,Diabetes,maxgrip,chair
count,10359.0,10359.0,10359.0,10359.0,10359.0,10359.0,10359.0,10359.0,10359.0
mean,59.711072,26.706177,0.542813,0.901921,0.135824,0.310551,0.081861,37.056473,11.173458
std,7.458557,4.554358,0.498188,0.819272,0.342618,0.462742,0.274166,11.519487,6.958415
min,15.0,13.061224,0.0,0.0,0.0,0.0,0.0,1.0,0.05
25%,54.0,23.833005,0.0,0.0,0.0,0.0,0.0,28.0,7.9
50%,59.0,26.196187,1.0,1.0,0.0,0.0,0.0,35.0,10.0
75%,65.0,29.03179,1.0,1.0,0.0,1.0,0.0,46.0,12.68
max,96.0,157.394159,1.0,3.0,1.0,1.0,1.0,84.0,98.0


In [60]:
maxwalk.describe()

Unnamed: 0,age,bmi,female,educational_level,OA_conserv,Hypertension,Diabetes,maxgrip,walking
count,1361.0,1361.0,1361.0,1361.0,1361.0,1361.0,1361.0,1361.0,1361.0
mean,80.044085,26.180041,0.52094,0.770757,0.22263,0.458486,0.161646,27.529023,5.013295
std,3.957992,4.523825,0.499745,0.94654,0.416165,0.498457,0.368261,9.424198,3.85986
min,60.0,12.802768,0.0,0.0,0.0,0.0,0.0,0.0,0.5
25%,77.0,23.463385,0.0,0.0,0.0,0.0,0.0,20.0,2.815
50%,79.0,25.765713,1.0,0.0,0.0,0.0,0.0,26.0,3.8
75%,82.0,28.393726,1.0,1.0,0.0,1.0,0.0,34.0,5.69
max,98.0,80.329225,1.0,3.0,1.0,1.0,1.0,80.0,30.0


In [61]:
data.describe()

Unnamed: 0,age,bmi,female,educational_level,OA_conserv,Hypertension,Diabetes,maxgrip,chair,walking,disease_group
count,15821.0,15821.0,15821.0,15821.0,15821.0,15821.0,15821.0,14414.0,10513.0,1425.0,15821.0
mean,63.48739,26.769211,0.550913,0.848619,0.180204,0.346754,0.105113,34.973845,11.194354,5.124547,0.632071
std,10.38545,4.703205,0.497417,0.853802,0.384369,0.475952,0.306709,12.022814,7.009656,4.043442,0.762062
min,15.0,10.816658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
25%,55.0,23.80869,0.0,0.0,0.0,0.0,0.0,26.0,7.9,2.825,0.0
50%,62.0,26.218821,1.0,1.0,0.0,0.0,0.0,33.0,10.0,3.84,0.0
75%,71.0,29.097746,1.0,1.0,0.0,1.0,0.0,44.0,12.72,5.77,1.0
max,100.0,157.394159,1.0,3.0,1.0,1.0,1.0,84.0,98.0,30.0,3.0


##### Comparison between disease:

In [62]:
maxchair['disease_count'] = maxchair[['OA_conserv', 'Diabetes', 'Hypertension']].sum(axis=1)

summary_chair = maxchair.groupby('disease_count').agg(
    n=('disease_count', 'count'),
    grip_strength_mean=('maxgrip', 'mean'),
    chair_stand_mean=('chair', 'mean')
).round(1)

summary_chair


Unnamed: 0_level_0,n,grip_strength_mean,chair_stand_mean
disease_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,6039,37.9,10.9
1.0,3258,36.6,11.3
2.0,972,34.0,12.4
3.0,90,31.5,12.8


In [63]:
maxwalk['disease_count'] = maxwalk[['OA_conserv', 'Diabetes', 'Hypertension']].sum(axis=1)

summary_walk = maxwalk.groupby('disease_count').agg(
    n=('disease_count', 'count'),
    grip_strength_mean=('maxgrip', 'mean'),
    walking_mean=('walking', 'mean')
).round(1)

summary_walk 

Unnamed: 0_level_0,n,grip_strength_mean,walking_mean
disease_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,510,29.2,4.8
1.0,593,26.7,5.0
2.0,220,26.3,5.4
3.0,38,24.3,5.6
