In [52]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

#### Data loading

In [None]:
df = pd.read_csv("main_df_after_merging.csv")
df.shape

  df = pd.read_csv("/Users/mahdie/Documents/1.PhysioAi/6. Functional test/2.My_script/data/main_df_after_merging.csv")


(37143, 315)

All variables
| variable name  | meaning |
|---|---|
|ws010_ |	first walking speed result|
|ws011_ |	first walking speed time|
|ws012_ |	second walking speed result|
|ws013_ |	second walking speed time|
|cs004_ |	chair rise result|
|cs008_ |	time 5 chair rise|
|gs006_|	maxgrip left one|
|gs007_|	maxgrip left two|
|gs008_|	maxgrip right one|
|gs009_|	maxgrip right two|
|dn002_|    MOB|
|dn003_|    YOB|
|dn042_|    female|
|ph006d8| OA|
|ph010d1| pain joint|
|ph012_ | weight|
|ph013_ | height|
|ph044_ | eyesight_close |
|ph046_ | hearing |

##### Create variable "age"

In [54]:
# Wave 2 is 2007
reference_year = 2007
df['age'] = reference_year - df['YOB']

##### Plausible Height and Weight

Consider weight > 10 and height > 100

In [55]:
def clean_height_weight(df):
    print("Initial dataset size:", len(df))
    
    df = df.loc[df['weight'] > 10.0]
    print("After filtering plausible weight:", len(df))
    

    df = df.loc[df['height'] > 100.0]
    
    df.loc[(df['height'] > 1.0) & (df['height'] < 2.0), 'height'] *= 100
    print("After filtering and correcting height:", len(df))
    
    return df
df_raw = df.copy()
df = clean_height_weight(df)

Initial dataset size: 37143
After filtering plausible weight: 36504
After filtering and correcting height: 15864


In [56]:
excluded_hw = df_raw[~df_raw['mergeid'].isin(df['mergeid'])]
print("Excluded due to implausible/missing weight or height:", excluded_hw.shape)

Excluded due to implausible/missing weight or height: (21279, 316)


##### Create variable "bmi"

In [57]:
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

##### Compute the maximum grip strength across all measurements

In [58]:
df['maxgrip'] = df[['maxgrip_left_one', 'maxgrip_left_two', 'maxgrip_right_one', 'maxgrip_right_two']].max(axis=1)

##### Compute the average walking speed across two attempts

In [59]:
df['walking'] = df[['first_walking_speed_time', 'second_walking_speed_time']].mean(axis=1)

##### Other variables

In [60]:
df['chair'] = df['time_5_chair_rise']
df['OA_conserv'] = np.logical_and(df['OA'], df['pain_joint'].astype(bool)).astype(np.int8)
df['activity'] = 2 * (4 - df['br015_']) + (4 - df['br016_'])
df['female'] = df['female'].astype(int)

In [61]:
df['chair'].sort_values(ascending=False).shape

(15864,)

##### Create df_anno

In [62]:
df_anno = df.drop_duplicates(subset='mergeid', keep='first')
df_anno.set_index('mergeid', inplace=True, drop=True)

df_anno_raw = df_anno.copy()

In [63]:
print("df:", df.shape, "df_anno:", df_anno.shape)

df: (15864, 322) df_anno: (15864, 321)


##### Categorizing & Identifying Control Groups; OA, diabetes, hypertension

In [64]:
df_anno['Hypertension'] = pd.to_numeric(df_anno['ph006d2'])
df_anno['Diabetes'] = pd.to_numeric(df_anno['ph006d5'])

In [65]:
df_anno["control"] = 0
df_anno.loc[(df_anno["OA_conserv"] == 0) & (df_anno["Hypertension"] == 0) & (df_anno["Diabetes"] == 0), "control"] = 1

In [66]:
display(df_anno['Hypertension'].value_counts())
display(df_anno['Diabetes'].value_counts())
display(df_anno['OA_conserv'].value_counts())

Hypertension
0.0    10340
1.0     5496
Name: count, dtype: int64

Diabetes
0.0    14169
1.0     1667
Name: count, dtype: int64

OA_conserv
0    12988
1     2876
Name: count, dtype: int64

In [67]:
display(df_anno[['Hypertension', 'OA_conserv', 'Diabetes']].value_counts(sort=False))

Hypertension  OA_conserv  Diabetes
0.0           0           0.0         8294
                          1.0          535
              1           0.0         1367
                          1.0          144
1.0           0           0.0         3450
                          1.0          702
              1           0.0         1058
                          1.0          286
Name: count, dtype: int64

## data exploration

##### Select features

In [68]:
df_anno.shape

(15864, 324)

In [69]:
df_anno = df_anno.drop_duplicates()
print(df_anno.shape)
_df_anno = df_anno.copy()

(15864, 324)


##### Create df_feature

In [70]:
# features = ['bmi', 'weight', 'age', 'OA_conserv', 'female', 'hearing', 'eyesight_close', 'Hypertension', 'Diabetes']
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
df_features = df_anno.loc[:, features].copy()
df_features = df_features.dropna(axis=0)

In [71]:
excluded_features = df_anno[~df_anno.index.isin(df_features.index)]
print("Excluded due to missing features:", excluded_features.shape)


Excluded due to missing features: (43, 324)


In [72]:
df_features.shape

(15821, 7)

In [73]:
df_features.isna().sum()

age                  0
bmi                  0
female               0
educational_level    0
OA_conserv           0
Hypertension         0
Diabetes             0
dtype: int64

In [74]:
excluded_all = df_raw[~df_raw['mergeid'].isin(df_features.index)]
print("Total excluded cases:", excluded_all.shape)

Total excluded cases: (21322, 316)


In [75]:
excluded_all.to_csv("excluded_cases.csv")
df_features.to_csv("included_cases.csv")

##### Create data with outcomes and features

In [76]:
data = pd.concat([df_features, df_anno.loc[:, ['maxgrip', 'chair', 'walking',]]], axis=1)

In [77]:
data.shape

(15864, 10)

In [78]:
data = data.dropna(subset=features)

In [79]:
data.shape

(15821, 10)

In [80]:
data.isna().sum()

age                      0
bmi                      0
female                   0
educational_level        0
OA_conserv               0
Hypertension             0
Diabetes                 0
maxgrip               1407
chair                 5308
walking              14396
dtype: int64

In [81]:
print("df:", df.shape)
print("df_anno:", df_anno.shape)
print("df_features:", df_features.shape)
print("data:", data.shape)

df: (15864, 322)
df_anno: (15864, 324)
df_features: (15821, 7)
data: (15821, 10)


In [82]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']
print(data.shape)

(15821, 10)


In [83]:
print(data['female'].value_counts())
print("------------------------------------------------------------")
print(data['OA_conserv'].value_counts())
print("------------------------------------------------------------")
print(data['Hypertension'].value_counts())
print("------------------------------------------------------------")
print(data['Diabetes'].value_counts())

female
1.0    8716
0.0    7105
Name: count, dtype: int64
------------------------------------------------------------
OA_conserv
0.0    12970
1.0     2851
Name: count, dtype: int64
------------------------------------------------------------
Hypertension
0.0    10335
1.0     5486
Name: count, dtype: int64
------------------------------------------------------------
Diabetes
0.0    14158
1.0     1663
Name: count, dtype: int64


In [84]:
data.chair.value_counts()

chair
10.000000    272
9.000000     212
8.000000     208
12.000000    207
11.000000    176
            ... 
9.140000       1
16.040001      1
15.320000      1
24.580000      1
2.650000       1
Name: count, Length: 2552, dtype: int64

In [85]:
mean_age_per_chair = data.groupby('chair')['age'].mean()
print(mean_age_per_chair)


chair
0.000000     55.000000
0.050000     46.500000
0.060000     61.666667
0.060000     57.000000
0.070000     61.000000
               ...    
89.000000    56.000000
90.000000    73.000000
91.000000    65.000000
91.150002    72.000000
98.000000    65.000000
Name: age, Length: 2552, dtype: float64


#### Summery table

In [86]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']

# Create separate datasets for each outcome
datasets = {outcome: data[features + [outcome]].dropna() for outcome in outcomes}

for outcome, df_outcome in datasets.items():
    print(f"Dataset for {outcome}: {df_outcome.shape}")


Dataset for maxgrip: (14414, 8)
Dataset for chair: (10513, 8)
Dataset for walking: (1425, 8)


In [87]:
df_maxgrip = datasets['maxgrip']
df_chair = datasets['chair']
df_walking = datasets['walking']

print("Valid samples per dataset:")
print("data Maxgrip:", df_maxgrip.shape)
print("data Chair Stand:", df_chair.shape)
print("data Walking Speed:", df_walking.shape)

Valid samples per dataset:
data Maxgrip: (14414, 8)
data Chair Stand: (10513, 8)
data Walking Speed: (1425, 8)


In [88]:
df_maxgrip.age.describe()

count    14414.000000
mean        62.993270
std         10.068992
min         15.000000
25%         55.000000
50%         61.000000
75%         70.000000
max         99.000000
Name: age, dtype: float64

In [89]:
df_chair.age.describe()

count    10513.000000
mean        59.708266
std          7.457453
min         15.000000
25%         54.000000
50%         59.000000
75%         65.000000
max         96.000000
Name: age, dtype: float64

In [90]:
df_walking.age.describe()

count    1425.000000
mean       80.066667
std         3.985813
min        60.000000
25%        77.000000
50%        79.000000
75%        82.000000
max        98.000000
Name: age, dtype: float64

##### disease_group

- **0**: No disease  
- **1**:  
  - Diabetes only  
  - Hypertension only  
  - OA only  
- **2**:  
  - Diabetes + Hypertension  
  - Diabetes + OA  
  - Hypertension + OA  
- **3**: Diabetes + Hypertension + OA 


In [91]:
# Create a new categorical variable 'disease_group' based on conditions in 'data'
data['disease_group'] = 0  # Default: No disease

# Assign groups based on disease conditions
data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 0) & (data['OA_conserv'] == 0), 'disease_group'] = 1
data.loc[(data['Diabetes'] == 0) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 0), 'disease_group'] = 1
data.loc[(data['Diabetes'] == 0) & (data['Hypertension'] == 0) & (data['OA_conserv'] == 1), 'disease_group'] = 1

data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 0), 'disease_group'] = 2
data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 0) & (data['OA_conserv'] == 1), 'disease_group'] = 2
data.loc[(data['Diabetes'] == 0) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 1), 'disease_group'] = 2

data.loc[(data['Diabetes'] == 1) & (data['Hypertension'] == 1) & (data['OA_conserv'] == 1), 'disease_group'] = 3

# Convert disease_group to categorical
data['disease_group'] = data['disease_group'].astype(int)

In [92]:
disease_counts = data['disease_group'].value_counts()
print(disease_counts)

disease_group
0    8290
1    5347
2    1899
3     285
Name: count, dtype: int64


In [93]:
valid_samples = {
    "Maxgrip": data.dropna(subset=['maxgrip'])['disease_group'].value_counts(),
    "Chair Stand": data.dropna(subset=['chair'])['disease_group'].value_counts(),
    "Walking Speed": data.dropna(subset=['walking'])['disease_group'].value_counts()
}

valid_samples_df = pd.DataFrame(valid_samples)
valid_samples_df

Unnamed: 0_level_0,Maxgrip,Chair Stand,Walking Speed
disease_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7612,6117,533
1,4896,3314,619
2,1681,991,231
3,225,91,42


##### Summery

In [94]:
print("data: ", data.shape)
print("maxgrip: ", df_maxgrip.shape)
print("chair: ", df_chair.shape)
print("walking: ", df_walking.shape)

data:  (15821, 11)
maxgrip:  (14414, 8)
chair:  (10513, 8)
walking:  (1425, 8)


In [95]:
data

Unnamed: 0_level_0,age,bmi,female,educational_level,OA_conserv,Hypertension,Diabetes,maxgrip,chair,walking,disease_group
mergeid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AT-004234-02,53.0,30.717400,1.0,1.0,0.0,0.0,1.0,37.0,6.00,,1
AT-016392-01,61.0,23.588329,1.0,1.0,0.0,0.0,0.0,34.0,13.20,,0
AT-017298-01,64.0,25.381469,0.0,2.0,0.0,1.0,0.0,55.0,9.40,,1
AT-026212-02,59.0,24.337480,0.0,1.0,0.0,0.0,0.0,52.0,22.00,,0
AT-057442-01,66.0,31.221304,1.0,1.0,0.0,1.0,0.0,,10.84,,1
...,...,...,...,...,...,...,...,...,...,...,...
SE-994503-02,77.0,24.056935,0.0,2.0,0.0,0.0,0.0,43.0,,2.105,0
SE-996850-01,61.0,26.827421,0.0,1.0,0.0,0.0,0.0,61.0,7.03,,0
SE-996850-02,61.0,23.323418,1.0,1.0,0.0,1.0,0.0,32.0,13.94,,1
SE-996868-01,58.0,35.492158,0.0,1.0,0.0,1.0,0.0,58.0,10.38,,1


##### Remove ages under 50

In [96]:
data = data[data['age'] >= 50].copy()
data.shape

(15285, 11)

##### Create maxchair and maxwalk

In [97]:
data.isna().sum()

age                      0
bmi                      0
female                   0
educational_level        0
OA_conserv               0
Hypertension             0
Diabetes                 0
maxgrip               1363
chair                 5224
walking              13860
disease_group            0
dtype: int64

In [98]:
features = ['age', 'bmi', 'female', 'educational_level', 'OA_conserv', 'Hypertension', 'Diabetes']
outcomes = ['maxgrip', 'chair', 'walking']

# Create maxchair dataset: Includes maxgrip and chair outcomes, restricted to age < 75
maxchair = data[
    (data['age'] < 75) & 
    data[['maxgrip', 'chair'] + features].notnull().all(axis=1)
][features + ['maxgrip', 'chair']]

# Create maxwalk dataset: Includes maxgrip and walking outcomes, restricted to age ≥ 75
maxwalk = data[
    (data['age'] >= 75) & 
    data[['maxgrip', 'walking'] + features].notnull().all(axis=1)
][features + ['maxgrip', 'walking']]

print("Dataset maxchair (Age 50–74, Maxgrip + Chair):", maxchair.shape)
print("Dataset maxwalk (Age ≥75, Maxgrip + Walking):", maxwalk.shape)

Dataset maxchair (Age 50–74, Maxgrip + Chair): (9753, 9)
Dataset maxwalk (Age ≥75, Maxgrip + Walking): (1331, 9)


In [99]:
maxchair.shape

(9753, 9)

##### Overlap between maxchair and maxwalk:

In [100]:
overlap_cases_max = maxchair.index.intersection(maxwalk.index)
print(f"Number of overlapping cases between maxchair and maxwalk: {len(overlap_cases_max)}")

Number of overlapping cases between maxchair and maxwalk: 0


##### Overlap between Chair and Walk:

In [101]:
overlap_cases = df_chair.index.intersection(df_walking.index)
print(f"Number of overlapping cases: {len(overlap_cases)}")

Number of overlapping cases: 0


##### Create csv from maxchair and maxwalk

In [102]:
maxchair.to_csv("maxchair.csv")
maxwalk.to_csv("maxwalk.csv")

##### Comparison between disease:

In [103]:
maxchair['disease_count'] = maxchair[['OA_conserv', 'Diabetes', 'Hypertension']].sum(axis=1)

summary_chair = maxchair.groupby('disease_count').agg(
    n=('disease_count', 'count'),
    grip_strength_mean=('maxgrip', 'mean'),
    chair_stand_mean=('chair', 'mean')
).round(1)

summary_chair


Unnamed: 0_level_0,n,grip_strength_mean,chair_stand_mean
disease_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,5617,38.2,10.9
1.0,3125,36.8,11.3
2.0,923,34.3,12.4
3.0,88,31.7,12.7


In [104]:
maxwalk['disease_count'] = maxwalk[['OA_conserv', 'Diabetes', 'Hypertension']].sum(axis=1)

summary_walk = maxwalk.groupby('disease_count').agg(
    n=('disease_count', 'count'),
    grip_strength_mean=('maxgrip', 'mean'),
    walking_mean=('walking', 'mean')
).round(1)

summary_walk 

Unnamed: 0_level_0,n,grip_strength_mean,walking_mean
disease_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,507,29.2,4.8
1.0,578,26.7,5.1
2.0,211,26.3,5.4
3.0,35,24.9,5.3
