In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Display every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

### Contents
- We don't ignore biometrics with fewer measurements
- We delete redundant columns

In [None]:
# Load the filtered biometrics table and discard the 'MeasureProvidedBy' column.
biometrics = (
    pd.read_pickle('../data_cleaned/filtered_biometrics.pkl')
    .drop(columns=['MeasureProvidedBy'])
)
biometrics.shape

In [None]:
# Load the pickled biometrics table (presumably weekly means per user, judging
# by the file name and the CloudId / MeasuredOnWeek columns used later -- confirm).
mean_values = pd.read_pickle('../data_cleaned/biometrics_mean_per_week.pkl')

In [None]:
# Work on a copy so that column filtering of all_users leaves mean_values untouched.
all_users = mean_values.copy()

#### Keep only columns with at least 20% non-missing values
-> After this, only 58 features are kept

In [None]:
# Keep only the columns that have at least 20% non-NaN values
# (`thresh` is the minimum number of non-missing entries a column must have).
min_non_null = int(0.2 * len(all_users))
all_users = all_users.dropna(axis=1, thresh=min_non_null)
all_users.shape

In [None]:
# Load the imputed biometrics and attach the gender_m / gender_f columns taken
# from mean_values. `.values` copies by position (no index alignment), so this
# relies on both tables having the same rows in the same order.
imputed_df = pd.read_pickle('../data_cleaned/biometrics_m10_imputed.pkl').assign(
    gender_m=mean_values['gender_m'].values,
    gender_f=mean_values['gender_f'].values,
)

In [None]:
# Number of rows and columns of the imputed table after adding the gender columns.
imputed_df.shape

In [None]:
# List the available columns before choosing which ones to keep.
imputed_df.columns

In [None]:
# Feature columns retained for scaling, PCA and clustering.
columns_to_keep = [
    'Age', 'BMI', 'Basal Metabolic Rate', 'Bone Mass',
    'Degree Of Obesity Perc', 'Extra Cellular Water Perc',
    'Fat Free Mass', 'Fat mass Perc', 'Height', 'Intra Cellular Water',
    'Left Arm Fat Perc', 'Left Leg Fat Perc',
    'Metabolic Age', 'Muscle Mass',
    'Muscle Mass Balance Arm', 'Muscle Mass Balance Leg',
    'Right Arm Fat Perc', 'Right Leg Fat Perc',
    'Standard Body Weight', 'Total Body Water Perc',
    'Trunk Fat Perc', 'Trunk Muscle Mass',
    'Visceral Fat Rating', 'Weight',
    'gender_m', 'gender_f',
]

# Restrict the imputed table to those columns, in the order listed above.
imputed_df = imputed_df.loc[:, columns_to_keep]
imputed_df.head()

In [None]:
# Standardise every feature with StandardScaler, then show summary statistics
# of the scaled values as a quick check.
scaler = StandardScaler()
scaled_data = scaler.fit(imputed_df).transform(imputed_df)
pd.DataFrame(data=scaled_data, columns=imputed_df.columns).describe()

In [None]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Wrap the projection in a DataFrame and attach week and user identifiers.
# `.values` assigns by position, so rows must be in the same order as all_users.
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2']).assign(
    MeasuredOnWeek=all_users['MeasuredOnWeek'].values,
    CloudId=all_users['CloudId'].values,
)
pca_df

In [None]:
# Plot the PCA trajectory of the first 30 users, one trace per user.
unique_cloud_ids = all_users['CloudId'].unique()[:30]

# Columns shown in the hover tooltip of each point.
hover_columns = ['Weight', 'Basal Metabolic Rate', 'Basal Metabolic Rate Score', 'Degree Of Obesity Perc']

# Initialize a new figure
fig = go.Figure()

# Iterate over each selected CloudId and add a trace for each
for cloud_id in unique_cloud_ids:
    # pca_df was filled row-for-row from all_users (via `.values`), so a
    # positional boolean mask selects the same rows in both frames.
    user_mask = (pca_df['CloudId'] == cloud_id).to_numpy()
    user_data = pca_df[user_mask]
    user_rows = all_users[user_mask]

    # Build the hover text from THIS user's rows only. Passing text built from
    # the whole of all_users would pair each point with the first rows of the
    # table instead of its own measurements.
    hover_text = user_rows.apply(
        lambda row: '<br>'.join([f'{col}: {row[col]}' for col in hover_columns]),
        axis=1,
    )

    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        name=f'{cloud_id[:4]}',
        text=hover_text,
        hoverinfo='text',
        marker=dict(size=2, showscale=True),  # Adjusted marker size
        line=dict(shape='spline')
    ))

# NOTE(review): points are connected in row order; this shows a temporal
# evolution only if rows are already sorted by MeasuredOnWeek -- confirm.
fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True
)

fig.show()

### Cluster biometrics

In [None]:
# Per-feature mean and standard deviation of the selected biometrics, taken
# before any identifier columns are added.
bio_mean = imputed_df.mean()
bio_std = imputed_df.std()

# Copy of the feature table with user and week identifiers attached by position.
bio = imputed_df.assign(
    CloudId=all_users['CloudId'].values,
    MeasuredOnWeek=all_users['MeasuredOnWeek'].values,
)

#### PCA

In [None]:
from sklearn.cluster import KMeans

# Select exactly the biometric feature columns. Dropping only CloudId and
# MeasuredOnWeek is not idempotent: once this or a later cell has added
# 'Cluster' (or 'PCA1'/'PCA2') to `bio`, a re-run would feed those columns into
# the scaler and the clustering.
bio_features = bio[columns_to_keep]

scaler = StandardScaler()
bio_scaled = scaler.fit_transform(bio_features)

num_clusters = 6  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# fit_predict returns the cluster label of every row; store it on bio.
bio['Cluster'] = kmeans.fit_predict(bio_scaled)

# Optional: Visualize the clusters using PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(bio_scaled)
pca_df_bio = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2'])

# Assign by position: pca_df_bio has a fresh 0..n-1 index while bio keeps the
# index of the imputed table, so index-aligned assignment could yield NaN.
pca_df_bio['Cluster'] = bio['Cluster'].values
pca_df_bio['CloudId'] = bio['CloudId'].values


fig = px.scatter(
    pca_df_bio, 
    x='PCA1', 
    y='PCA2', 
    color='Cluster', 
    title='PCA of Biometric Profiles',
    hover_data=['CloudId'],
)

fig.update_traces(marker=dict(size=1))

fig.show()

In [None]:
# Number of rows and columns of the PCA projection table.
pca_df_bio.shape

In [None]:
# Columns of bio whose dtype is int64 or float64; only these are averaged.
numeric_dtypes = ['int64', 'float64']
numerical_features = [name for name, dtype in bio.dtypes.items() if dtype in numeric_dtypes]

# Number of rows that fall into each cluster, ordered by cluster label.
cluster_counts = bio['Cluster'].value_counts().sort_index()

# Per-cluster mean of every numeric feature, plus the cluster size as 'Count'.
cluster_means = (
    bio.groupby('Cluster')[numerical_features]
    .mean()
    .assign(Count=cluster_counts)
)

cluster_means

In [None]:
# Express each cluster mean as a z-score relative to the overall feature mean
# and standard deviation. Columns that are missing from bio_mean / bio_std
# (such as 'Count') come out as NaN.
cluster_means_scaled_df = cluster_means.sub(bio_mean).div(bio_std)

In [None]:
def find_quantile(value, feature_values):
    """Return the fraction of entries in `feature_values` strictly below `value`.

    Parameters
    ----------
    value : scalar
        The value to locate within the distribution.
    feature_values : numpy.ndarray
        All observed values of the feature.

    Returns
    -------
    float
        Count of entries less than `value`, divided by the array size.
    """
    below = np.count_nonzero(feature_values < value)
    total = feature_values.size
    return below / total

# For every (cluster, feature) pair, store the share of all rows in `bio` whose
# value is strictly below that cluster's mean. 'Count' is skipped and stays NaN.
cluster_quantiles = pd.DataFrame(index=cluster_means.index, columns=cluster_means.columns)

feature_names = [name for name in cluster_means.columns if name != 'Count']
for feature_name in feature_names:
    all_values = bio[feature_name].values
    for cluster_label, mean_value in cluster_means[feature_name].items():
        cluster_quantiles.at[cluster_label, feature_name] = find_quantile(mean_value, all_values)

In [None]:
# Each cell is the fraction of all rows in `bio` whose value for that feature
# is strictly less than the cluster's mean for that feature.
cluster_quantiles

In [None]:
# cluster_quantiles holds Python objects; coerce every column to a numeric dtype
# so that it can be drawn as a heatmap.
cluster_quantiles_numeric = cluster_quantiles.apply(pd.to_numeric, errors='coerce')

# Annotated heatmap of the quantiles (the 'Count' column is left out).
quantile_fig, quantile_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    cluster_quantiles_numeric.drop(columns=['Count']),
    annot=True,
    cmap='viridis',
    ax=quantile_ax,
)
quantile_ax.set_title('Heatmap of Quantiles for Cluster Means')
plt.show()

In [None]:
# Heatmap of the z-scored cluster means, without the 'Count' and
# 'Basal Metabolic Rate' columns and without cell annotations.
means_fig, means_ax = plt.subplots(figsize=(12, 8))
scaled_means_to_plot = cluster_means_scaled_df.drop(columns=['Count', 'Basal Metabolic Rate'])
sns.heatmap(scaled_means_to_plot, annot=False, cmap='coolwarm', fmt='.2f', ax=means_ax)
means_ax.set_title('Cluster Means for Biometric Features')
plt.show()


#### TSNE

In [None]:
from sklearn.manifold import TSNE

# Use exactly the biometric feature columns. By this point `bio` also carries
# the 'Cluster' column written by the earlier K-means cell; dropping only
# CloudId and MeasuredOnWeek would feed those arbitrary integer labels into the
# scaler, t-SNE and any clustering that reuses bio_scaled.
bio_features = bio[columns_to_keep]

# Standardize the Data
scaler = StandardScaler()
bio_scaled = scaler.fit_transform(bio_features)

# Apply t-SNE for visualization
tsne = TSNE(n_components=2, perplexity=40, max_iter=2000, random_state=42)
tsne_result = tsne.fit_transform(bio_scaled)
tsne_df_bio = pd.DataFrame(tsne_result, columns=['TSNE1', 'TSNE2'])

In [None]:
# Assign by position: tsne_df_bio has a fresh 0..n-1 index, whereas bio keeps
# the index of the imputed table, so index-aligned assignment could produce NaN.
tsne_df_bio['CloudId'] = bio['CloudId'].values

In [None]:
from sklearn.cluster import KMeans
# Apply Clustering
num_clusters = 4 # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# NOTE: this overwrites any 'Cluster' labels already stored on bio, so every
# later cell that reads bio['Cluster'] sees the labels from this fit.
bio['Cluster'] = kmeans.fit_predict(bio_scaled)

# Assign by position so that a non-default index on bio cannot misalign rows.
tsne_df_bio['Cluster'] = bio['Cluster'].values

In [None]:
# Scatter plot of the t-SNE embedding, one colour per cluster.
tsne_fig, tsne_ax = plt.subplots(figsize=(12, 8))

# One viridis colour for each cluster label.
palette = sns.color_palette("viridis", num_clusters)

sns.scatterplot(
    data=tsne_df_bio,
    x='TSNE1',
    y='TSNE2',
    hue='Cluster',
    palette=palette,
    legend='full',
    s=5,  # Marker size
    ax=tsne_ax,
)

#### UMAP

In [None]:
import umap

# Apply UMAP for visualization
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_result = umap_reducer.fit_transform(bio_scaled)
umap_df_bio = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])

# bio_scaled was computed from bio, so the embedding rows correspond to bio's
# rows by position. (`bio_sample` is not defined anywhere in this notebook and
# would raise a NameError on a fresh kernel.)
umap_df_bio['Cluster'] = bio['Cluster'].values
umap_df_bio['CloudId'] = bio['CloudId'].values



In [None]:
# Scatter plot of the UMAP embedding, one colour per cluster.
umap_fig, umap_ax = plt.subplots(figsize=(12, 8))

# One viridis colour for each cluster label.
palette = sns.color_palette("viridis", num_clusters)

sns.scatterplot(
    data=umap_df_bio,
    x='UMAP1',
    y='UMAP2',
    hue='Cluster',
    palette=palette,
    legend='full',
    s=3,  # Marker size
    ax=umap_ax,
)

In [None]:
# Join bio with the PCA coordinates in pca_df, matching rows on both the user
# (CloudId) and the week (MeasuredOnWeek); all columns of both frames are kept.
bio_pca = bio.merge(pca_df, on=['CloudId', 'MeasuredOnWeek'])

In [None]:
# Select a subset of unique CloudId values (the first 30 unique CloudIds)
unique_cloud_ids = pca_df_bio['CloudId'].unique()[:30]

# Initialize a new figure
fig = go.Figure()

# Iterate over each selected CloudId and add a trace for each
for cloud_id in unique_cloud_ids:
    user_data = pca_df_bio[pca_df_bio['CloudId'] == cloud_id]
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        # Name the trace after the user (same prefix convention as the first
        # trajectory plot); unnamed traces appear as "trace N" in the legend.
        name=f'{cloud_id[:4]}',
        # Hover text: the cluster label of each point.
        text='Cluster: ' + user_data['Cluster'].astype(str),
        marker_color=user_data['Cluster'],  # Marker colour encodes the cluster
        line=dict(shape='spline'),
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    # Legend entries are users (one trace each), not clusters.
    legend_title_text='CloudId (first 4 chars)',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

In [None]:
# Number of distinct cluster labels each user appears in.
# NOTE(review): bio['Cluster'] holds whichever K-means labels were written to
# it most recently -- confirm that this is the intended clustering.
user_cluster_counts = bio.groupby('CloudId')['Cluster'].nunique()

# Users whose rows span more than one cluster.
users_with_multiple_clusters = user_cluster_counts.loc[user_cluster_counts.gt(1)]
num_users_with_multiple_clusters = len(users_with_multiple_clusters)

# Share of all users, in percent, that appear in more than one cluster.
total_users = bio['CloudId'].nunique()
percentage_users_with_multiple_clusters = num_users_with_multiple_clusters / total_users * 100

percentage_users_with_multiple_clusters

Keep only those users and restrict the subsequent analysis to them.

In [None]:
# Attach the two PCA coordinates to bio by position (row order of pca_df_bio).
bio = bio.assign(
    PCA1=pca_df_bio['PCA1'].values,
    PCA2=pca_df_bio['PCA2'].values,
)

In [None]:
# Keep only the rows of users that were found in more than one cluster.
multi_cluster_ids = users_with_multiple_clusters.index
bio_filtered = bio.loc[bio['CloudId'].isin(multi_cluster_ids)]

# Rows and columns that remain after filtering.
bio_filtered.shape

In [None]:
# Persist the filtered table (users appearing in more than one cluster) as a pickle.
bio_filtered.to_pickle('../data_cleaned/biometrics_m10_imputed_clustered_changes.pkl')