In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

#### Data loading

In [None]:
df = pd.read_csv("data/main_df_after_merging.csv")
df.shape

**All variables**

| variable name  | meaning |
|---|---|
|ws010_ |	first walking speed result|
|ws011_ |	first walking speed time|
|ws012_ |	second walking speed result|
|ws013_ |	second walking speed time|
|cs004_ |	chair rise result|
|cs008_ |	time 5 chair rise|
|gs006_|	maxgrip left one|
|gs007_|	maxgrip left two|
|gs008_|	maxgrip right one|
|gs009_|	maxgrip right two|
|dn002_|    MOB|
|dn003_|    YOB|
|dn042_|    female|
|ph006d8| OA|
|ph010d1| pain joint|
|ph012_ | weight|
|ph013_ | height|
|ph044_ | eyesight_close |
|ph046_ | hearing |

##### Create variable "age"

In [None]:
# Age is approximated as the Wave 2 reference year (2007) minus the year of birth.
reference_year = 2007
df['age'] = df['YOB'].rsub(reference_year)

##### Plausible Height and Weight

Keep only rows with a plausible weight (> 10 kg) and height (> 100 cm).

In [None]:
def clean_height_weight(df):
    """Keep rows with plausible weight and height, converting metre heights to centimetres.

    Steps, in order:
      1. keep rows with ``weight > 10``;
      2. multiply heights strictly between 1.0 and 2.0 by 100 (treated as metres);
      3. keep rows with ``height > 100``.

    Rows with a missing weight or height fail the comparisons and are dropped.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``weight`` and ``height`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the retained rows.
    """
    print("Initial dataset size:", len(df))

    # Copy so that the height correction below never writes into a slice of the caller's frame.
    df = df.loc[df['weight'] > 10.0].copy()
    print("After filtering plausible weight:", len(df))

    # The conversion must happen BEFORE the plausibility filter: a height of 1.75
    # would otherwise be removed by `height > 100` and never be corrected.
    in_metres = (df['height'] > 1.0) & (df['height'] < 2.0)
    df.loc[in_metres, 'height'] = df.loc[in_metres, 'height'] * 100

    # Return a copy so that adding columns to the result does not raise SettingWithCopyWarning.
    df = df.loc[df['height'] > 100.0].copy()
    print("After filtering and correcting height:", len(df))

    return df
# Keep an untouched snapshot so excluded cases can be identified later by mergeid.
# NOTE: re-running this cell overwrites df_raw with the already-cleaned frame;
# restart the kernel and run all cells to regenerate the snapshot.
df_raw = df.copy()
df = clean_height_weight(df)

In [None]:
# Participants present in the raw snapshot but no longer in the cleaned frame.
excluded_hw = df_raw.loc[~df_raw['mergeid'].isin(df['mergeid'])]
print("Excluded due to implausible/missing weight or height:", excluded_hw.shape)

##### Create variable "bmi"

In [None]:
# BMI = weight / (height / 100)^2, i.e. height is divided by 100 before squaring.
df['bmi'] = df['weight'].div(df['height'].div(100).pow(2))

##### Compute the maximum grip strength across all measurements

In [None]:
# Row-wise maximum over the four grip measurements (NaNs are skipped by max()).
df['maxgrip'] = df[
    ['maxgrip_left_one', 'maxgrip_left_two', 'maxgrip_right_one', 'maxgrip_right_two']
].max(axis='columns')

##### Compute the average walking speed across two attempts

In [None]:
# Row-wise mean of the two walking *time* columns; if only one attempt is present,
# mean() skips the NaN and returns that single value.
df['walking'] = df[
    ['first_walking_speed_time', 'second_walking_speed_time']
].mean(axis='columns')

##### Other variables

In [None]:
# 'chair' is an alias for the time needed for five chair rises.
df['chair'] = df['time_5_chair_rise']
# Conservative OA flag: 1 only when both the OA flag and the joint-pain flag are truthy.
# NOTE(review): NaN is truthy under astype(bool) / np.logical_and, so a missing value in
# either column is counted as "yes" here — confirm both columns are complete 0/1 flags.
df['OA_conserv'] = np.logical_and(df['OA'], df['pain_joint'].astype(bool)).astype(np.int8)
# Combined score from br015_ and br016_, with br015_ weighted double.
# NOTE(review): presumably both items are coded 1-4 with lower = more frequent — confirm against the codebook.
df['activity'] = 2 * (4 - df['br015_']) + (4 - df['br016_'])
# astype(int) raises if 'female' still contains missing values.
df['female'] = df['female'].astype(int)

In [None]:
df['chair'].sort_values(ascending=False).shape

##### Create df_anno

In [None]:
# One row per participant, indexed by mergeid (first occurrence wins).
# set_index is chained rather than applied in place: it returns a fresh frame, so the
# column assignments made on df_anno later do not raise SettingWithCopyWarning.
df_anno = df.drop_duplicates(subset='mergeid', keep='first').set_index('mergeid')

# Snapshot taken before any disease/control columns are added.
df_anno_raw = df_anno.copy()

In [None]:
print("df:", df.shape, "df_anno:", df_anno.shape)

##### Categorizing & Identifying Control Groups; OA, diabetes, hypertension

In [None]:
# Numeric copies of the ph006d2 / ph006d5 items under readable names.
df_anno = df_anno.assign(
    Hypertension=pd.to_numeric(df_anno['ph006d2']),
    Diabetes=pd.to_numeric(df_anno['ph006d5']),
)

In [None]:
# control = 1 only when all three disease flags equal 0; anything else (including NaN) gives 0.
df_anno["control"] = (
    (df_anno[["OA_conserv", "Hypertension", "Diabetes"]] == 0).all(axis=1).astype("int64")
)

In [None]:
display(df_anno['Hypertension'].value_counts())
display(df_anno['Diabetes'].value_counts())
display(df_anno['OA_conserv'].value_counts())

In [None]:
display(df_anno[['Hypertension', 'OA_conserv', 'Diabetes']].value_counts(sort=False))

## Data exploration

##### Select features

In [None]:
df_anno.shape

In [None]:
# De-duplicate on the participant id (the index), not on column values:
# DataFrame.drop_duplicates() ignores the index, so it would silently drop *different*
# participants who happen to share identical values in every column.
df_anno = df_anno[~df_anno.index.duplicated(keep='first')].copy()
print(df_anno.shape)
_df_anno = df_anno.copy()

##### Create df_features

In [None]:
# Earlier candidate set, kept for reference:
# ['bmi', 'weight', 'age', 'OA_conserv', 'female', 'hearing', 'eyesight_close', 'Hypertension', 'Diabetes']
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']

# Complete cases only: any row with a missing feature is removed.
df_features = df_anno[features].dropna(axis='index')

In [None]:
# Participants of df_anno whose id did not survive the complete-case filter.
excluded_features = df_anno.loc[~df_anno.index.isin(df_features.index)]
print("Excluded due to missing features:", excluded_features.shape)


In [None]:
df_features.shape

In [None]:
df_features.isna().sum()

In [None]:
# Everything in the raw snapshot whose mergeid is not among the included cases.
excluded_all = df_raw.loc[~df_raw['mergeid'].isin(df_features.index)]
print("Total excluded cases:", excluded_all.shape)

In [None]:
excluded_all.to_csv("data/excluded_cases.csv")
df_features.to_csv("data/included_cases.csv")

##### Create data with outcomes and features

In [None]:
# Attach the three outcome columns to the feature table, aligned on the mergeid index.
# concat(axis=1) aligns with an outer join, so participants that are in df_anno but not in
# df_features appear here with NaN features; the dropna(subset=features) step below removes them.
data = pd.concat([df_features, df_anno.loc[:, ['maxgrip', 'chair', 'walking',]]], axis=1)

In [None]:
data.shape

In [None]:
data = data.dropna(subset=features)

In [None]:
data.shape

In [None]:
data.isna().sum()

In [None]:
print("df:", df.shape)
print("df_anno:", df_anno.shape)
print("df_features:", df_features.shape)
print("data:", data.shape)

In [None]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']
print(data.shape)

In [None]:
print(data['female'].value_counts())
print("------------------------------------------------------------")
print(data['OA_conserv'].value_counts())
print("------------------------------------------------------------")
print(data['Hypertension'].value_counts())
print("------------------------------------------------------------")
print(data['Diabetes'].value_counts())

In [None]:
data.chair.value_counts()

In [None]:
mean_age_per_chair = data.groupby('chair')['age'].mean()
print(mean_age_per_chair)


#### Summary table

In [None]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']

# One complete-case dataset per outcome: all features plus that single outcome.
datasets = {}
for outcome in outcomes:
    df_outcome = data.loc[:, features + [outcome]].dropna()
    datasets[outcome] = df_outcome
    print(f"Dataset for {outcome}: {df_outcome.shape}")


In [None]:
# Pull the per-outcome frames out of the dict under their own names.
df_maxgrip, df_chair, df_walking = (datasets[key] for key in ('maxgrip', 'chair', 'walking'))

print("Valid samples per dataset:")
for label, frame_shape in (
    ("data Maxgrip:", df_maxgrip.shape),
    ("data Chair Stand:", df_chair.shape),
    ("data Walking Speed:", df_walking.shape),
):
    print(label, frame_shape)

In [None]:
df_maxgrip.age.describe()

In [None]:
df_chair.age.describe()

In [None]:
df_walking.age.describe()

##### disease_group

- **0**: No disease  
- **1**:  
  - Diabetes only  
  - Hypertension only  
  - OA only  
- **2**:  
  - Diabetes + Hypertension  
  - Diabetes + OA  
  - Hypertension + OA  
- **3**: Diabetes + Hypertension + OA 


In [None]:
# disease_group = number of conditions (Diabetes, Hypertension, OA_conserv) a participant has.
# Only exact 0/1 combinations are counted; a row with any other value in one of the three
# flags keeps the default group 0.
disease_flags = data[['Diabetes', 'Hypertension', 'OA_conserv']]
all_flags_binary = disease_flags.isin([0, 1]).all(axis=1)

# Stored as a plain integer column (0-3).
data['disease_group'] = disease_flags.sum(axis=1).where(all_flags_binary, 0).astype(int)

In [None]:
disease_counts = data['disease_group'].value_counts()
print(disease_counts)

In [None]:
# Per outcome: how many participants with a non-missing measurement fall in each disease group.
valid_samples = {
    label: data.loc[data[outcome_col].notna(), 'disease_group'].value_counts()
    for label, outcome_col in (
        ("Maxgrip", 'maxgrip'),
        ("Chair Stand", 'chair'),
        ("Walking Speed", 'walking'),
    )
}

valid_samples_df = pd.DataFrame(valid_samples)
valid_samples_df

##### Summary

In [None]:
print("data: ", data.shape)
print("maxgrip: ", df_maxgrip.shape)
print("chair: ", df_chair.shape)
print("walking: ", df_walking.shape)

In [None]:
data

In [None]:
data.isna().sum()

In [None]:
# Value counts of each feature column in data
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
for column in features:
    counts = data[column].value_counts()
    print(counts)
    print("------------------------------------------------------------")


In [None]:
# Descriptive statistics (describe) of each feature column in data
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
for column in features:
    stats = data[column].describe()
    print(stats)
    print("------------------------------------------------------------")

##### Create maxchair and maxwalk

In [None]:
data.isna().sum()

In [None]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']

# Complete cases for each outcome pair. The explicit .copy() makes both frames independent
# of `data`, so the columns added to them later (disease_category, disease_count) are plain
# assignments rather than writes into a derived slice (no SettingWithCopyWarning).
maxchair = data[features + ['maxgrip', 'chair']].dropna().copy()

maxwalk = data[features + ['maxgrip', 'walking']].dropna().copy()

print("Dataset maxchair (Maxgrip + Chair):", maxchair.shape)
print("Dataset maxwalk (Maxgrip + Walking):", maxwalk.shape)


##### Overlap between maxchair and maxwalk:

In [None]:
overlap_cases_max = maxchair.index.intersection(maxwalk.index)
print(f"Number of overlapping cases between maxchair and maxwalk: {len(overlap_cases_max)}")

##### Overlap between Chair and Walk:

In [None]:
overlap_cases = df_chair.index.intersection(df_walking.index)
print(f"Number of overlapping cases: {len(overlap_cases)}")

##### Age summary statistics:

In [None]:
datasets = {
    "Maxgrip": df_maxgrip,
    "Chair": df_chair,
    "Walking": df_walking,
    "Maxchair": maxchair,
    "Maxwalk": maxwalk
}

# The loop variable must not be called `df`: loop variables stay bound after the loop,
# which would replace the notebook's main `df` with the last dataset (maxwalk).
for name, frame in datasets.items():
    print(f"\nAge Statistics for {name}:\n{frame.age.describe()}")
    print("-" * 50)


#### Summary for paper:

##### maxchair:

In [None]:
maxchair.shape

In [None]:
# Label every participant with the exact combination of conditions they have.
# Keys are (OA_conserv, Hypertension, Diabetes); rows matching none keep 'No disease'.
disease_labels = {
    (1, 0, 0): 'Only OA',
    (0, 1, 0): 'Only HT',
    (0, 0, 1): 'Only Diabetes',
    (1, 1, 0): 'OA and HT',
    (1, 0, 1): 'OA and Diab',
    (0, 1, 1): 'HT and Diab',
    (1, 1, 1): 'All three diseases',
}
maxchair['disease_category'] = 'No disease'
for (oa, ht, diab), label in disease_labels.items():
    matches = (
        (maxchair['OA_conserv'] == oa)
        & (maxchair['Hypertension'] == ht)
        & (maxchair['Diabetes'] == diab)
    )
    maxchair.loc[matches, 'disease_category'] = label


def summarize_maxchair_group(subset):
    """Build one column of the maxchair summary table.

    Parameters
    ----------
    subset : pandas.DataFrame
        Rows of ``maxchair`` that belong to a single disease category.

    Returns
    -------
    list
        Sixteen values, in the same order as ``summary_dict['Variable']``.
        String cells are "NaN" when ``subset`` is empty.
    """
    num_cases = len(subset)

    def mean_sd(column):
        # "mean ± SD", one decimal each.
        if num_cases == 0:
            return "NaN"
        return f"{subset[column].mean():.1f} ± {subset[column].std():.1f}"

    def count_pct(count):
        # "count (share of the group in %)".
        if num_cases == 0:
            return "NaN"
        return f"{count} ({(count / num_cases) * 100:.1f}%)"

    missing_maxgrip = subset['maxgrip'].isna().sum()
    missing_chair = subset['chair'].isna().sum()

    return [
        num_cases,
        mean_sd('age'),
        count_pct(num_cases - subset['age'].isna().sum()),
        # int(): the 0/1 column may be stored as float, which would render as e.g. "123.0".
        count_pct(int(subset['female'].sum())),
        mean_sd('bmi'),
        # Educational level codes 0-3 -> Low, Medium, High, Other.
        *(count_pct((subset['educational_level'] == level).sum()) for level in (0, 1, 2, 3)),
        mean_sd('maxgrip'),
        count_pct(num_cases - missing_maxgrip),
        missing_maxgrip,
        mean_sd('chair'),
        count_pct(num_cases - missing_chair),
        missing_chair,
        missing_maxgrip + missing_chair,
    ]


summary_dict = {
    'Variable': ['Number of cases', 'Age; mean (SD)', 'Age; n (%)', 'Sex; n, (%)', 'BMI; mean (SD)',
                 'Educational Level - Low (%)', 'Educational Level - Medium (%)',
                 'Educational Level - High (%)', 'Educational Level - Other (%)',
                 'Grip strength (kg); mean (SD)', 'Maxgrip; n (%)', 'Missing grip strength (n)',
                 '5 Chairs stands (sec); mean (SD)', 'Chair stand; n (%)', 'Missing 5 Chairs stands (n)',
                 'Total missing values (n)']
}

# Fixed column order (instead of order of first appearance in the data), restricted to the
# categories that actually occur, so the table layout does not depend on row order.
present_categories = set(maxchair['disease_category'])
disease_groups = [
    category
    for category in ['No disease', *disease_labels.values()]
    if category in present_categories
]

for group in disease_groups:
    summary_dict[group] = summarize_maxchair_group(maxchair[maxchair['disease_category'] == group])

summary_table_maxchair = pd.DataFrame(summary_dict)
summary_table_maxchair


##### maxwalk

In [None]:
maxwalk.shape

In [None]:
# maxwalk was built with dropna() over both 'maxgrip' and 'walking', so it cannot contain a
# row where one of them is missing and the check would always print 0. Count the partially
# measured participants in `data`, which still holds the missing outcomes.
only_maxgrip = data[data['maxgrip'].notna() & data['walking'].isna()]
only_walking = data[data['walking'].notna() & data['maxgrip'].isna()]

print(f"Cases with Maxgrip but missing Walking speed: {len(only_maxgrip)}")
print(f"Cases with Walking speed but missing Maxgrip: {len(only_walking)}")


In [None]:
# Label every participant with the exact combination of conditions they have.
# Keys are (OA_conserv, Hypertension, Diabetes); rows matching none keep 'No disease'.
disease_labels = {
    (1, 0, 0): 'Only OA',
    (0, 1, 0): 'Only HT',
    (0, 0, 1): 'Only Diabetes',
    (1, 1, 0): 'OA and HT',
    (1, 0, 1): 'OA and Diab',
    (0, 1, 1): 'HT and Diab',
    (1, 1, 1): 'All three diseases',
}
maxwalk['disease_category'] = 'No disease'
for (oa, ht, diab), label in disease_labels.items():
    matches = (
        (maxwalk['OA_conserv'] == oa)
        & (maxwalk['Hypertension'] == ht)
        & (maxwalk['Diabetes'] == diab)
    )
    maxwalk.loc[matches, 'disease_category'] = label


def summarize_maxwalk_group(subset):
    """Build one column of the maxwalk summary table.

    Parameters
    ----------
    subset : pandas.DataFrame
        Rows of ``maxwalk`` that belong to a single disease category.

    Returns
    -------
    list
        Sixteen values, in the same order as ``summary_dict['Variable']``.
        String cells are "NaN" when ``subset`` is empty.
    """
    num_cases = len(subset)

    def mean_sd(column):
        # "mean ± SD", one decimal each.
        if num_cases == 0:
            return "NaN"
        return f"{subset[column].mean():.1f} ± {subset[column].std():.1f}"

    def count_pct(count):
        # "count (share of the group in %)".
        if num_cases == 0:
            return "NaN"
        return f"{count} ({(count / num_cases) * 100:.1f}%)"

    missing_maxgrip = subset['maxgrip'].isna().sum()
    missing_walking = subset['walking'].isna().sum()

    return [
        num_cases,
        mean_sd('age'),
        count_pct(num_cases - subset['age'].isna().sum()),
        # int(): the 0/1 column may be stored as float, which would render as e.g. "123.0".
        count_pct(int(subset['female'].sum())),
        mean_sd('bmi'),
        # Educational level codes 0-3 -> Low, Medium, High, Other.
        *(count_pct((subset['educational_level'] == level).sum()) for level in (0, 1, 2, 3)),
        mean_sd('maxgrip'),
        count_pct(num_cases - missing_maxgrip),
        missing_maxgrip,
        mean_sd('walking'),
        count_pct(num_cases - missing_walking),
        missing_walking,
        missing_maxgrip + missing_walking,
    ]


summary_dict = {
    'Variable': ['Number of cases', 'Age; mean (SD)', 'Age; n (%)', 'Sex; n, (%)', 'BMI; mean (SD)',
                 'Educational Level - Low (%)', 'Educational Level - Medium (%)',
                 'Educational Level - High (%)', 'Educational Level - Other (%)',
                 'Grip strength (kg); mean (SD)', 'Maxgrip; n (%)', 'Missing grip strength (n)',
                 'Walking speed (sec); mean (SD)', 'Walking speed; n (%)', 'Missing walking speed (n)',
                 'Total missing values (n)']
}

# Fixed column order (instead of order of first appearance in the data), restricted to the
# categories that actually occur, so the table layout does not depend on row order.
present_categories = set(maxwalk['disease_category'])
disease_groups = [
    category
    for category in ['No disease', *disease_labels.values()]
    if category in present_categories
]

for group in disease_groups:
    summary_dict[group] = summarize_maxwalk_group(maxwalk[maxwalk['disease_category'] == group])

summary_table_maxwalk = pd.DataFrame(summary_dict)
summary_table_maxwalk


##### Create csv from maxchair and maxwalk

In [None]:
maxchair.to_csv("data/maxchair.csv")
maxwalk.to_csv("data/maxwalk.csv")

In [None]:
maxchair.describe()

In [None]:
maxwalk.describe()

In [None]:
data.describe()

##### Comparison by number of diseases:

In [None]:
# Number of conditions per participant: row-wise sum of the three disease flags.
maxchair['disease_count'] = maxchair.loc[:, ['OA_conserv', 'Diabetes', 'Hypertension']].sum(axis='columns')

# Group size and mean outcomes per disease count, rounded to one decimal.
chair_aggregations = {
    'n': ('disease_count', 'count'),
    'grip_strength_mean': ('maxgrip', 'mean'),
    'chair_stand_mean': ('chair', 'mean'),
}
summary_chair = maxchair.groupby('disease_count').agg(**chair_aggregations).round(1)

summary_chair


In [None]:
# Number of conditions per participant: row-wise sum of the three disease flags.
maxwalk['disease_count'] = maxwalk.loc[:, ['OA_conserv', 'Diabetes', 'Hypertension']].sum(axis='columns')

# Group size and mean outcomes per disease count, rounded to one decimal.
walk_aggregations = {
    'n': ('disease_count', 'count'),
    'grip_strength_mean': ('maxgrip', 'mean'),
    'walking_mean': ('walking', 'mean'),
}
summary_walk = maxwalk.groupby('disease_count').agg(**walk_aggregations).round(1)

summary_walk