## Contents

- Compute the weekly mean of each feature for every user
- Column selection - keep only the columns that have values in at least 50% of the rows
- Fill the remaining missing values with KNN imputation
    

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Show every column when displaying wide DataFrames (None removes the column cap)
pd.options.display.max_columns = None

In [None]:
# Presumably a switch to restrict the analysis to male users.
# NOTE(review): no visible cell reads this flag - confirm where (or whether) it is applied.
KEEP_ONLY_MALES: bool = True

In [None]:
# Load the output of the cleaning process and discard the 'MeasureProvidedBy'
# column in the same expression, so re-running the cell always starts from the file.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
biometrics = (
    pd.read_pickle('../data_processed/filtered_biometrics.pkl')
    .drop(columns=['MeasureProvidedBy'])
)
biometrics.shape

(4555157, 11)

### Pivot table to put features as columns

In [None]:
# Key columns shared by the steps below
user_keys = ['CloudId', 'gender_m', 'gender_f', 'Age']
weekly_keys = user_keys + ['MeasuredOnWeek']
daily_keys = user_keys + ['MeasuredOnDate']

# One-hot encode 'Gender' into the integer indicator columns gender_m / gender_f.
# NOTE(review): this consumes the 'Gender' column, so the cell can only be re-run
# after reloading `biometrics` in the cell above.
biometrics = (
    pd.get_dummies(biometrics, columns=['Gender'], prefix='', prefix_sep='')
    .rename(columns={'M': 'gender_m', 'F': 'gender_f'})
    .astype({'gender_m': int, 'gender_f': int})
)

# Wide table: one row per user-week, one column per 'BiometricName'; when a
# biometric occurs several times in a group the last value is kept.
# NOTE(review): the week key carries no year component - if the data spans more
# than one year, equal week numbers from different years fall into the same
# group. Confirm this is intended.
biometrics_pivot = biometrics.pivot_table(
    index=weekly_keys,
    columns='BiometricName',
    values='Value',
    aggfunc='last',
).reset_index()

# Re-attach the remaining (timestamp) columns of the long table to the wide
# weekly values, then drop the exact duplicate rows created by the join
biometrics_merged = (
    biometrics.drop(columns=['BiometricName', 'Value'])
    .merge(biometrics_pivot, on=weekly_keys, how='left')
    .drop_duplicates()
)

# Collapse to one row per user and calendar date; first() keeps, per column,
# the first non-null value found in each group
biometrics_final = biometrics_merged.groupby(daily_keys).first().reset_index()
biometrics_final.head()

Unnamed: 0,CloudId,gender_m,gender_f,Age,MeasuredOnDate,MeasuredOnUTC,MeasuredOn,MeasuredOnDay,MeasuredOnWeek,MeasuredOnYear,BFM Control,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Body cell mass,Bone Mass,Bone Mineral Content,Chest Circumference,Degree Of Obesity Perc,Diastolic Blood Pressure,Dry Lean Mass,ECW/TBW,ECW/TBW-LA,ECW/TBW-LLL,ECW/TBW-RA,ECW/TBW-RL,ECW/TBW-TR,Extra Cellular Water,Extra Cellular Water Perc,FFM Control,Fat Free Mass,Fat Free Mass Perc of Ideal Left Arm,Fat Free Mass Perc of Ideal Left Leg,Fat Free Mass Perc of Ideal Right Arm,Fat Free Mass Perc of Ideal Right Leg,Fat Free Mass Perc of Ideal Trunk,Fat Mass,Fat Mass Perc of Ideal Left Arm,Fat Mass Perc of Ideal Left Leg,Fat Mass Perc of Ideal Right Arm,Fat Mass Perc of Ideal Right Leg,Fat Mass Perc of Ideal Trunk,Fat mass Perc,Growth Score,HR At Rest,Height,Hip Circumference,InBody Score,Intra Cellular Water,Left Arm Circumference,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Fat Perc Score,Left Arm Muscle Circumference,Left Arm Muscle Mass,Left Arm Muscle Mass Score,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Fat Perc Score,Left Leg Muscle Mass,Left Leg Muscle Mass Score,Left Thigh Circumference,Leg Muscle Score,Metabolic Age,Minerals,Muscle Mass,Muscle Mass Balance Arm,Muscle Mass Balance Leg,Muscle Score,Neck Circumference,Obesity Degree,Obesity Degree of a Child,Power Threshold,Protein,Right Arm Circumference,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Fat Perc Score,Right Arm Muscle Mass,Right Arm Muscle Mass Score,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Fat Perc Score,Right Leg Muscle Mass,Right Leg Muscle Mass Score,Right Thigh Circumference,Segmental ECW LA,Segmental ECW LL,Segmental ECW RA,Segmental ECW RL,Segmental ECW TR,Segmental ICW LA,Segmental ICW LL,Segmental ICW RA,Segmental ICW RL,Segmental ICW TR,Segmental body water LA,Segmental body water LL,Segmental body 
water RA,Segmental body water RL,Segmental body water TR,Skeletal Muscle Mass,Soft Lean Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Systolic Blood Pressure,TBW/FFM,Target Weight,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,VFA (Visceral Fat Area),Visceral Fat Rating,Waist Circumference,Waist-Hip ratio,Weight,Weight Control
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1,0,56,2022-03-07,2022-03-07 19:57:09.961000+00:00,2022-03-07 19:57:09+00:00,66,10,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1,0,56,2022-05-08,2022-05-08 10:33:57.662000+00:00,2022-05-08 10:33:57+00:00,128,18,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
2,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1,0,56,2022-06-11,2022-06-11 16:18:22.175000+00:00,2022-06-11 16:18:22+00:00,162,23,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
3,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1,0,56,2022-06-19,2022-06-19 13:07:47.928000+00:00,2022-06-19 13:07:47+00:00,170,24,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87.0,
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1,0,56,2022-06-26,2022-06-26 14:06:34.011000+00:00,2022-06-26 14:06:34+00:00,177,25,2022,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86.0,


In [6]:
# For every (CloudId, MeasuredOnWeek) pair, gather the values of each remaining
# column into a Python list - a quick look at what falls inside one week
weekly_groups = biometrics_final.groupby(['CloudId', 'MeasuredOnWeek'])
grouped_measurements = weekly_groups.agg(list).reset_index()
grouped_measurements.head()

Unnamed: 0,CloudId,MeasuredOnWeek,gender_m,gender_f,Age,MeasuredOnDate,MeasuredOnUTC,MeasuredOn,MeasuredOnDay,MeasuredOnYear,BFM Control,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Body cell mass,Bone Mass,Bone Mineral Content,Chest Circumference,Degree Of Obesity Perc,Diastolic Blood Pressure,Dry Lean Mass,ECW/TBW,ECW/TBW-LA,ECW/TBW-LLL,ECW/TBW-RA,ECW/TBW-RL,ECW/TBW-TR,Extra Cellular Water,Extra Cellular Water Perc,FFM Control,Fat Free Mass,Fat Free Mass Perc of Ideal Left Arm,Fat Free Mass Perc of Ideal Left Leg,Fat Free Mass Perc of Ideal Right Arm,Fat Free Mass Perc of Ideal Right Leg,Fat Free Mass Perc of Ideal Trunk,Fat Mass,Fat Mass Perc of Ideal Left Arm,Fat Mass Perc of Ideal Left Leg,Fat Mass Perc of Ideal Right Arm,Fat Mass Perc of Ideal Right Leg,Fat Mass Perc of Ideal Trunk,Fat mass Perc,Growth Score,HR At Rest,Height,Hip Circumference,InBody Score,Intra Cellular Water,Left Arm Circumference,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Fat Perc Score,Left Arm Muscle Circumference,Left Arm Muscle Mass,Left Arm Muscle Mass Score,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Fat Perc Score,Left Leg Muscle Mass,Left Leg Muscle Mass Score,Left Thigh Circumference,Leg Muscle Score,Metabolic Age,Minerals,Muscle Mass,Muscle Mass Balance Arm,Muscle Mass Balance Leg,Muscle Score,Neck Circumference,Obesity Degree,Obesity Degree of a Child,Power Threshold,Protein,Right Arm Circumference,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Fat Perc Score,Right Arm Muscle Mass,Right Arm Muscle Mass Score,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Fat Perc Score,Right Leg Muscle Mass,Right Leg Muscle Mass Score,Right Thigh Circumference,Segmental ECW LA,Segmental ECW LL,Segmental ECW RA,Segmental ECW RL,Segmental ECW TR,Segmental ICW LA,Segmental ICW LL,Segmental ICW RA,Segmental ICW RL,Segmental ICW TR,Segmental body water LA,Segmental body water LL,Segmental body 
water RA,Segmental body water RL,Segmental body water TR,Skeletal Muscle Mass,Soft Lean Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Systolic Blood Pressure,TBW/FFM,Target Weight,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,VFA (Visceral Fat Area),Visceral Fat Rating,Waist Circumference,Waist-Hip ratio,Weight,Weight Control
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,10,[1],[0],[56],[2022-03-07],[2022-03-07 19:57:09.961000+00:00],[2022-03-07 19:57:09+00:00],[66],[2022],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[88.0],[nan]
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,18,[1],[0],[56],[2022-05-08],[2022-05-08 10:33:57.662000+00:00],[2022-05-08 10:33:57+00:00],[128],[2022],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[88.0],[nan]
2,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,23,[1],[0],[56],[2022-06-11],[2022-06-11 16:18:22.175000+00:00],[2022-06-11 16:18:22+00:00],[162],[2022],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[88.0],[nan]
3,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,24,[1],[0],[56],[2022-06-19],[2022-06-19 13:07:47.928000+00:00],[2022-06-19 13:07:47+00:00],[170],[2022],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[87.0],[nan]
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,25,[1],[0],[56],[2022-06-26],[2022-06-26 14:06:34.011000+00:00],[2022-06-26 14:06:34+00:00],[177],[2022],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[86.0],[nan]


### Select mean for each week

In [None]:
# Only float64 / int64 columns can be averaged; everything else (ids, dates,
# timestamps, other dtypes) is left out of the weekly table
arithmetic_columns = biometrics_final.select_dtypes(include=['float64', 'int64']).columns

# One row per user-week holding the mean of every arithmetic column
mean_values = (
    biometrics_final
    .groupby(['CloudId', 'MeasuredOnWeek'])[arithmetic_columns]
    .mean()
    .reset_index()
)
print(mean_values.shape)
mean_values.head()

(90425, 122)


Unnamed: 0,CloudId,MeasuredOnWeek,gender_m,gender_f,Age,BFM Control,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Body cell mass,Bone Mass,Bone Mineral Content,Chest Circumference,Degree Of Obesity Perc,Diastolic Blood Pressure,Dry Lean Mass,ECW/TBW,ECW/TBW-LA,ECW/TBW-LLL,ECW/TBW-RA,ECW/TBW-RL,ECW/TBW-TR,Extra Cellular Water,Extra Cellular Water Perc,FFM Control,Fat Free Mass,Fat Free Mass Perc of Ideal Left Arm,Fat Free Mass Perc of Ideal Left Leg,Fat Free Mass Perc of Ideal Right Arm,Fat Free Mass Perc of Ideal Right Leg,Fat Free Mass Perc of Ideal Trunk,Fat Mass,Fat Mass Perc of Ideal Left Arm,Fat Mass Perc of Ideal Left Leg,Fat Mass Perc of Ideal Right Arm,Fat Mass Perc of Ideal Right Leg,Fat Mass Perc of Ideal Trunk,Fat mass Perc,Growth Score,HR At Rest,Height,Hip Circumference,InBody Score,Intra Cellular Water,Left Arm Circumference,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Fat Perc Score,Left Arm Muscle Circumference,Left Arm Muscle Mass,Left Arm Muscle Mass Score,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Fat Perc Score,Left Leg Muscle Mass,Left Leg Muscle Mass Score,Left Thigh Circumference,Leg Muscle Score,Metabolic Age,Minerals,Muscle Mass,Muscle Mass Balance Arm,Muscle Mass Balance Leg,Muscle Score,Neck Circumference,Obesity Degree,Obesity Degree of a Child,Power Threshold,Protein,Right Arm Circumference,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Fat Perc Score,Right Arm Muscle Mass,Right Arm Muscle Mass Score,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Fat Perc Score,Right Leg Muscle Mass,Right Leg Muscle Mass Score,Right Thigh Circumference,Segmental ECW LA,Segmental ECW LL,Segmental ECW RA,Segmental ECW RL,Segmental ECW TR,Segmental ICW LA,Segmental ICW LL,Segmental ICW RA,Segmental ICW RL,Segmental ICW TR,Segmental body water LA,Segmental body water LL,Segmental body water RA,Segmental body water RL,Segmental body water TR,Skeletal 
Muscle Mass,Soft Lean Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Systolic Blood Pressure,TBW/FFM,Target Weight,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,VFA (Visceral Fat Area),Visceral Fat Rating,Waist Circumference,Waist-Hip ratio,Weight,Weight Control
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,10,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,18,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
2,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,23,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
3,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,24,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87.0,
4,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,25,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86.0,


In [9]:
# Cache the weekly means on disk; the next cell reloads this file, so the
# notebook can resume from here without repeating the pivot
mean_values.to_pickle('../data_processed/biometrics_mean_per_week.pkl')

In [40]:
# Reload the cached weekly means and work on a copy, so `mean_values` keeps the
# full column set while `all_users` is filtered further down
mean_values = pd.read_pickle('../data_processed/biometrics_mean_per_week.pkl')
all_users = mean_values.copy()
print(mean_values.shape)

(90425, 122)


In [41]:
# Preview the first two user-week rows of the working copy
all_users.head(2)

Unnamed: 0,CloudId,MeasuredOnWeek,gender_m,gender_f,Age,BFM Control,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Body cell mass,Bone Mass,Bone Mineral Content,Chest Circumference,Degree Of Obesity Perc,Diastolic Blood Pressure,Dry Lean Mass,ECW/TBW,ECW/TBW-LA,ECW/TBW-LLL,ECW/TBW-RA,ECW/TBW-RL,ECW/TBW-TR,Extra Cellular Water,Extra Cellular Water Perc,FFM Control,Fat Free Mass,Fat Free Mass Perc of Ideal Left Arm,Fat Free Mass Perc of Ideal Left Leg,Fat Free Mass Perc of Ideal Right Arm,Fat Free Mass Perc of Ideal Right Leg,Fat Free Mass Perc of Ideal Trunk,Fat Mass,Fat Mass Perc of Ideal Left Arm,Fat Mass Perc of Ideal Left Leg,Fat Mass Perc of Ideal Right Arm,Fat Mass Perc of Ideal Right Leg,Fat Mass Perc of Ideal Trunk,Fat mass Perc,Growth Score,HR At Rest,Height,Hip Circumference,InBody Score,Intra Cellular Water,Left Arm Circumference,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Fat Perc Score,Left Arm Muscle Circumference,Left Arm Muscle Mass,Left Arm Muscle Mass Score,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Fat Perc Score,Left Leg Muscle Mass,Left Leg Muscle Mass Score,Left Thigh Circumference,Leg Muscle Score,Metabolic Age,Minerals,Muscle Mass,Muscle Mass Balance Arm,Muscle Mass Balance Leg,Muscle Score,Neck Circumference,Obesity Degree,Obesity Degree of a Child,Power Threshold,Protein,Right Arm Circumference,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Fat Perc Score,Right Arm Muscle Mass,Right Arm Muscle Mass Score,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Fat Perc Score,Right Leg Muscle Mass,Right Leg Muscle Mass Score,Right Thigh Circumference,Segmental ECW LA,Segmental ECW LL,Segmental ECW RA,Segmental ECW RL,Segmental ECW TR,Segmental ICW LA,Segmental ICW LL,Segmental ICW RA,Segmental ICW RL,Segmental ICW TR,Segmental body water LA,Segmental body water LL,Segmental body water RA,Segmental body water RL,Segmental body water TR,Skeletal 
Muscle Mass,Soft Lean Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Systolic Blood Pressure,TBW/FFM,Target Weight,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,VFA (Visceral Fat Area),Visceral Fat Rating,Waist Circumference,Waist-Hip ratio,Weight,Weight Control
0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,10,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,
1,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,18,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0,


### Keep only columns where at least 50% of the values are present

## KNN


Fill each missing value with the mean of that feature over the 10 most similar rows (user-weeks), as found by KNN

In [42]:
# Keep only the columns that are populated in at least half of the rows.
# `thresh` is the minimum number of non-NaN values a column needs to be kept,
# so columns with fewer than 50% present values are dropped.
MIN_NON_NULL_FRACTION = 0.5
min_non_null = int(MIN_NON_NULL_FRACTION * len(all_users))

# Rebind instead of mutating in place: the result is the same and the cell
# stays safe to re-run
all_users = all_users.dropna(axis=1, thresh=min_non_null)
all_users.shape

(90425, 48)

In [44]:
# Impute the remaining NaNs from the 10 nearest rows, using every column except
# the user id and the week number.
# NOTE(review): the features are not scaled before the neighbour search
# (standardisation only happens later), so large-valued columns presumably
# dominate the distance - confirm this is acceptable.
knn_input = all_users.drop(columns=['MeasuredOnWeek', 'CloudId'])
imputer = KNNImputer(n_neighbors=10)
imputed_data = imputer.fit_transform(knn_input)

In [46]:
# Wrap the imputed array in a DataFrame again, labelled with the feature
# columns that were passed to the imputer (everything except the two keys)
feature_columns = all_users.columns.drop(['MeasuredOnWeek', 'CloudId'])
imputed_df = pd.DataFrame(imputed_data, columns=feature_columns)
print(imputed_df.shape)
imputed_df.head(2)

(90425, 46)


Unnamed: 0,gender_m,gender_f,Age,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Bone Mass,Degree Of Obesity Perc,Extra Cellular Water,Fat Free Mass,Fat Mass,Fat mass Perc,Height,Intra Cellular Water,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Muscle Mass,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Muscle Mass,Leg Muscle Score,Metabolic Age,Muscle Mass,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Muscle Mass,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Muscle Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,Visceral Fat Rating,Weight
0,1.0,0.0,56.0,29.19,1938.7,9.5,3.62,23.93,18.81,67.98,20.17,22.88,174.0,30.12,4.114,1.13,17.4,4.09,10.749,3.04,20.69,10.5,85.4,46.0,63.19,4.189,1.1,16.6,4.07,10.897,3.06,20.7,10.59,71.17,17.0,56.08,48.93,56.47,35.161,11.33,21.69,-1.7,36.98,2.0,10.1,88.0
1,1.0,0.0,56.0,29.19,1938.7,9.5,3.62,23.93,18.81,67.98,20.17,22.88,174.0,30.12,4.114,1.13,17.4,4.09,10.749,3.04,20.69,10.5,85.4,46.0,63.19,4.189,1.1,16.6,4.07,10.897,3.06,20.7,10.59,71.17,17.0,56.08,48.93,56.47,35.161,11.33,21.69,-1.7,36.98,2.0,10.1,88.0


In [1]:
# List the feature columns of the imputed frame.
# NOTE(review): the saved NameError output belongs to a run with execution
# count 1, i.e. before `imputed_df` existed; run top to bottom, the frame is
# already defined by the cell above.
imputed_df.columns

NameError: name 'imputed_df' is not defined

In [47]:
# Persist the KNN-imputed features for later reuse
imputed_df.to_pickle('../data_processed/biometrics_m50_imputed_knn.pkl')

In [48]:
# Store the same imputed frame under a second file name; this is the file the
# following cell reads back.
# NOTE(review): identical content to 'biometrics_m50_imputed_knn.pkl' - confirm
# both files are needed.
imputed_df.to_pickle('../data_processed/biometrics_m50_imputed.pkl')

In [52]:
# Reload the imputed features from disk and use them as-is. The pickle is the
# only input of this cell, so the notebook can resume here on a fresh kernel
# without repeating the KNN imputation (no dependency on the in-memory
# `imputed_data` array).
imputed_df = pd.read_pickle('../data_processed/biometrics_m50_imputed.pkl')

# Sanity check: shape and per-column NaN counts (expected to be 0 after imputation)
print(imputed_df.shape)
print(imputed_df.isnull().sum())

(90425, 46)
gender_m                      0
gender_f                      0
Age                           0
BMI                           0
Basal Metabolic Rate          0
Basal Metabolic Rate Score    0
Bone Mass                     0
Degree Of Obesity Perc        0
Extra Cellular Water          0
Fat Free Mass                 0
Fat Mass                      0
Fat mass Perc                 0
Height                        0
Intra Cellular Water          0
Left Arm Fat Free Mass        0
Left Arm Fat Mass             0
Left Arm Fat Perc             0
Left Arm Muscle Mass          0
Left Leg Fat Free Mass        0
Left Leg Fat Mass             0
Left Leg Fat Perc             0
Left Leg Muscle Mass          0
Leg Muscle Score              0
Metabolic Age                 0
Muscle Mass                   0
Right Arm Fat Free Mass       0
Right Arm Fat Mass            0
Right Arm Fat Perc            0
Right Arm Muscle Mass         0
Right Leg Fat Free Mass       0
Right Leg Fat Mass          

In [55]:
# Bring every feature to zero mean and unit variance before PCA, then display
# summary statistics of the scaled values as a check
scaler = StandardScaler()
scaled_data = scaler.fit_transform(imputed_df)
scaled_summary = pd.DataFrame(scaled_data, columns=imputed_df.columns).describe()
scaled_summary

Unnamed: 0,gender_m,gender_f,Age,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Bone Mass,Degree Of Obesity Perc,Extra Cellular Water,Fat Free Mass,Fat Mass,Fat mass Perc,Height,Intra Cellular Water,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Muscle Mass,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Muscle Mass,Leg Muscle Score,Metabolic Age,Muscle Mass,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Muscle Mass,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Muscle Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,Visceral Fat Rating,Weight
count,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0,90425.0
mean,-1.404977e-16,1.404977e-16,-1.253321e-16,-2.571076e-16,-6.963594e-16,2.152255e-16,-1.101548e-15,2.599365e-16,5.752705e-16,1.728719e-16,1.304397e-16,-1.0922360000000001e-17,-2.115638e-15,-5.397532e-16,-2.344771e-16,-1.219533e-16,2.0076710000000002e-17,-3.413434e-16,-7.458636e-16,1.310683e-16,7.16554e-16,3.67117e-16,-6.700750000000001e-17,-3.815754e-16,2.841385e-16,-1.360973e-16,-7.009169e-17,-2.884603e-16,-3.0645470000000005e-17,6.902303e-16,-2.923106e-17,1.822227e-16,-1.077149e-15,-2.170328e-16,1.88116e-16,-6.679141e-17,7.727373e-16,-9.588103e-16,7.151396e-16,4.230646e-16,-2.806811e-16,-1.552704e-16,-8.769319e-17,-4.3375130000000003e-17,5.720488000000001e-17,1.643854e-16
std,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,-1.214924,-0.8230967,-2.411656,-6.10722,-2.244213,-3.032205,-3.986481,-2.334413,-2.329027,-2.364726,-1.899771,-2.563399,-10.03757,-1.967503,-1.956453,-1.430087,-2.240567,-1.993352,-2.826006,-2.06392,-2.473147,-2.240323,-3.801997,-2.008317,-3.5614,-2.093363,-1.439468,-2.18584,-2.04176,-2.82126,-2.041666,-2.444165,-2.15223,-2.911189,-1.294842,-2.151,-2.304076,-8.34774,-2.971606,-1.961751,-2.472051,-2.152407,-2.66028,-4.231314,-1.599506,-2.801733
25%,-1.214924,-0.8230967,-0.7556577,-0.6670063,-0.7705706,-0.6134957,-0.7721879,-0.6319198,-0.824924,-0.8167634,-0.6652687,-0.7215919,-0.7367228,-0.8625051,-0.8538843,-0.5343404,-0.7129614,-0.8084371,-0.8694025,-0.6899076,-0.7984082,-0.8911056,-0.6926235,-0.710135,-0.8134509,-0.8665997,-0.6135529,-0.7052603,-0.8259541,-0.8530242,-0.6923987,-0.812262,-0.9143011,-0.7220904,-0.7941926,-0.9081997,-0.8563061,-0.6534716,-0.7132481,-0.6997549,-0.663922,-0.7440961,-0.7420556,-0.623204,-0.6854359,-0.6984633
50%,0.8230967,-0.8230967,0.03922167,-0.1614996,-0.02548312,-0.1988598,-0.03448142,-0.1690747,0.09061721,0.0312224,-0.2057594,-0.09181252,0.06193725,-0.01649077,-0.0520163,-0.2357581,-0.2204205,-0.06517209,0.02855569,-0.2319035,-0.2493471,0.02515427,-0.04930485,-0.01059669,-0.03939184,-0.0487575,-0.1928038,-0.2187841,-0.05225934,0.03105354,-0.2426428,-0.2261419,0.05623552,-0.01097334,-0.443738,0.114179,0.03287525,0.02771114,-0.04020632,-0.1411664,-0.002073646,-0.03994044,0.01670868,-0.03583728,-0.1562376,-0.07434771
75%,0.8230967,1.214924,0.7678611,0.4884375,0.7059226,0.5267531,0.4924518,0.4021992,0.7118773,0.7508247,0.4629295,0.6567931,0.6786241,0.7431956,0.7498517,0.2121152,0.5293247,0.7427246,0.73747,0.4812743,0.819179,0.7299695,0.6905116,0.7225732,0.7511032,0.7384156,0.2746952,0.5172756,0.7214354,0.7290096,0.4705416,0.819641,0.7395725,0.7055825,1.208405,0.8094435,0.7396604,0.6624496,0.7588314,0.4794874,0.647192,0.6642152,0.7328458,0.8032581,0.5653963,0.5684914
max,0.8230967,1.214924,2.29138,17.96452,3.986702,4.914984,5.630051,7.305496,3.850876,3.653766,7.530457,4.043342,4.115895,4.973267,7.335192,14.39477,4.580031,5.654737,5.274522,8.208458,4.487071,5.210581,4.668365,3.036431,3.61567,4.858295,14.923,4.741158,4.921493,9.290604,9.780489,4.400594,5.077277,4.050416,1.375288,3.078139,4.011392,3.681328,3.739673,6.851533,5.115298,3.480838,3.870208,2.481449,6.097923,11.24087


In [56]:
# Project the standardised features onto their first two principal components,
# so every user-week row becomes a 2-D point
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

In [59]:
# Label the two components and put the week / user keys back next to them.
# The rows of `pca_result` are in the same order as `all_users`, so the keys
# are attached positionally via `.values`.
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2']).assign(
    MeasuredOnWeek=all_users['MeasuredOnWeek'].values,
    CloudId=all_users['CloudId'].values,
)
pca_df.head()

Unnamed: 0,PCA1,PCA2,MeasuredOnWeek,CloudId
0,4.503959,0.922746,10,0015d65e3205deb6bb6a8f0d57cc48547918f0f2
1,4.503959,0.922746,18,0015d65e3205deb6bb6a8f0d57cc48547918f0f2
2,4.503959,0.922746,23,0015d65e3205deb6bb6a8f0d57cc48547918f0f2
3,4.276904,0.936682,24,0015d65e3205deb6bb6a8f0d57cc48547918f0f2
4,4.206436,0.645258,25,0015d65e3205deb6bb6a8f0d57cc48547918f0f2


### Plot the PCA trajectories of multiple users

In [60]:
# Join each PCA point with the feature columns of `all_users` on the user /
# week keys. These are the values before imputation, so they may contain NaN.
# NOTE(review): the name `all` shadows Python's builtin all(); it is kept
# because the plotting cell below reads it.
all = pca_df.merge(all_users, on=['MeasuredOnWeek', 'CloudId'])
all.head()

Unnamed: 0,PCA1,PCA2,MeasuredOnWeek,CloudId,gender_m,gender_f,Age,BMI,Basal Metabolic Rate,Basal Metabolic Rate Score,Bone Mass,Degree Of Obesity Perc,Extra Cellular Water,Fat Free Mass,Fat Mass,Fat mass Perc,Height,Intra Cellular Water,Left Arm Fat Free Mass,Left Arm Fat Mass,Left Arm Fat Perc,Left Arm Muscle Mass,Left Leg Fat Free Mass,Left Leg Fat Mass,Left Leg Fat Perc,Left Leg Muscle Mass,Leg Muscle Score,Metabolic Age,Muscle Mass,Right Arm Fat Free Mass,Right Arm Fat Mass,Right Arm Fat Perc,Right Arm Muscle Mass,Right Leg Fat Free Mass,Right Leg Fat Mass,Right Leg Fat Perc,Right Leg Muscle Mass,Standard Body Weight,Standard Fat Perc,Standard Muscle Mass Perc,Total Body Water,Total Body Water Perc,Trunk Fat Free Mass,Trunk Fat Mass,Trunk Fat Perc,Trunk Fat Perc Score,Trunk Muscle Mass,Trunk Muscle Mass Score,Visceral Fat Rating,Weight
0,4.503959,0.922746,10,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0
1,4.503959,0.922746,18,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0
2,4.503959,0.922746,23,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,88.0
3,4.276904,0.936682,24,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,87.0
4,4.206436,0.645258,25,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,1.0,0.0,56.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,86.0


In [61]:
# Columns shown in the hover label of every point
hover_columns = ['Weight', 'Basal Metabolic Rate', 'Basal Metabolic Rate Score', 'Degree Of Obesity Perc']


def _hover_label(row):
    """Build the hover text for one row: one 'column: value' entry per hover column, joined by '<br>'."""
    return '<br>'.join([f'{col}: {row[col]}' for col in hover_columns])


# Limit the figure to the first 30 users (in order of appearance)
unique_cloud_ids = all['CloudId'].unique()[:30]

fig = go.Figure()

# One trace per user: the user's weekly points joined by a smoothed (spline)
# line, labelled in the legend with the first four characters of the CloudId
for cloud_id in unique_cloud_ids:
    user_data = all[all['CloudId'] == cloud_id]
    trajectory = go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        name=cloud_id[:4],
        text=user_data.apply(_hover_label, axis=1),
        hoverinfo='text',
        marker=dict(size=2, showscale=True),  # small markers so the lines stay readable
        line=dict(shape='spline'),
    )
    fig.add_trace(trajectory)

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
)

fig.show()