In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.spatial import Voronoi, voronoi_plot_2d

pd.set_option('display.max_columns', None)

In [None]:

# Load the biometrics table that the rest of this notebook works on.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
bio = pd.read_pickle('../data_cleaned/biometrics_m10_imputed_clustered_changes.pkl')

### Keep only 4 states
#### Methodology

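1. For each user, sort the measurements by week and calculate the absolute difference between consecutive measurements, summed over all feature columns (everything except the gender flags, `CloudId`, `MeasuredOnWeek` and `Cluster`).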
2. Select the top N measurements with the largest changes.
3. Ensure the first and last measurements are always included.



In [None]:

def select_top_states(group, n_states=4):
    """Reduce one user's measurement history to at most ``n_states`` rows.

    The first and the last measurement (ordered by ``MeasuredOnWeek``) are
    always kept. The remaining ``n_states - 2`` slots are given to the
    intermediate measurements whose summed absolute change with respect to
    the previous measurement is largest.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single user; must contain ``MeasuredOnWeek``.
    n_states : int, default 4
        Maximum number of rows to keep; must be at least 2 so that both
        end points fit.

    Returns
    -------
    pandas.DataFrame
        The selected rows in chronological order with the same columns as
        ``group``. The index holds each row's position in the sorted history.

    Raises
    ------
    ValueError
        If ``n_states`` is smaller than 2.
    """
    if n_states < 2:
        raise ValueError("n_states must be at least 2 (first and last measurement)")

    # Ensure chronological order
    group = group.sort_values(by='MeasuredOnWeek').reset_index(drop=True)

    # Short histories already fit: nothing to drop
    if len(group) <= n_states:
        return group

    # Total absolute change per row, summed over every column that is not an
    # identifier, a time stamp, a gender flag or the cluster label
    excluded = {'gender_m', 'gender_f', 'CloudId', 'MeasuredOnWeek', 'Cluster'}
    diff_cols = [col for col in group.columns if col not in excluded]
    change = group[diff_cols].diff().abs().sum(axis=1)

    # Always include the first and the last measurement
    last = len(group) - 1
    chosen = {0, last}

    # Fill the remaining slots with the largest changes among the rows in between
    n_extra = n_states - len(chosen)
    if n_extra > 0:
        chosen.update(change.iloc[1:last].nlargest(n_extra).index)

    # Sorted positions keep the result in chronological order
    return group.loc[sorted(chosen)]

# Reduce every user's history independently, then flatten the result back
# into one frame with a fresh 0..n-1 index
per_user = bio.groupby('CloudId')
reduced_bio = (
    per_user
    .apply(select_top_states)
    .reset_index(drop=True)
)

# Preview the first rows of the reduced table
reduced_bio.head(10)





Unnamed: 0,Age,BMI,Basal Metabolic Rate,Bone Mass,Degree Of Obesity Perc,Extra Cellular Water Perc,Fat Free Mass,Fat mass Perc,Height,Intra Cellular Water,Left Arm Fat Perc,Left Leg Fat Perc,Metabolic Age,Muscle Mass,Muscle Mass Balance Arm,Muscle Mass Balance Leg,Right Arm Fat Perc,Right Leg Fat Perc,Standard Body Weight,Total Body Water Perc,Trunk Fat Perc,Trunk Muscle Mass,Visceral Fat Rating,Weight,gender_m,gender_f,CloudId,MeasuredOnWeek,Cluster,PCA1,PCA2
0,56.0,29.6,1887.2,3.8,26.46,39.42,66.44,23.78,175.8,29.0,16.94,20.74,43.4,59.54,0.0,0.8,16.2,20.6,69.92,55.68,20.26,37.68,10.2,88.0,1.0,0.0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,10,0,-2.790102,1.563701
1,56.0,28.6,1956.8,3.6,28.72,38.42,67.36,22.24,176.6,27.44,16.02,21.0,44.2,58.7,0.6,0.2,14.88,21.32,67.92,53.36,23.94,35.22,11.0,87.0,1.0,0.0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,24,0,-2.506614,1.41827
2,56.0,31.72,1744.4,3.46,52.9,42.4,60.04,28.68,166.4,25.78,39.78,42.6,48.4,48.48,-0.2,0.4,38.9,42.86,55.76,48.18,34.1,28.96,9.6,85.2,1.0,0.0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,41,2,2.293639,3.70722
3,56.0,27.62,1917.4,3.78,20.74,39.4,66.78,22.32,176.2,28.62,17.14,20.94,47.0,58.44,0.6,0.8,16.06,20.68,72.58,55.22,24.58,35.0,11.0,87.5,1.0,0.0,0015d65e3205deb6bb6a8f0d57cc48547918f0f2,45,0,-2.641949,1.361404
4,63.0,24.7,1846.0,3.2,13.5,40.3,65.2,14.0,174.0,27.5,12.6,17.6,47.0,62.0,-1.0,1.0,13.0,17.0,66.6,61.9,10.7,35.8,9.0,74.8,1.0,0.0,0024a5d2cf20efc70369e4736d130e270618ce0a,3,5,-3.012908,-0.576501
5,63.0,25.0,1848.0,3.2,13.5,42.8,65.2,13.8,174.0,25.3,12.0,13.4,48.0,62.0,1.0,-1.0,10.1,15.0,66.6,58.5,14.0,33.9,8.0,75.6,1.0,0.0,0024a5d2cf20efc70369e4736d130e270618ce0a,31,4,-2.502473,-0.883911
6,63.0,24.4,1776.0,3.1,11.0,42.4,62.6,15.3,174.0,24.9,13.0,15.2,48.0,59.5,1.0,-2.0,10.4,17.2,66.6,58.5,15.3,32.7,8.0,73.9,1.0,0.0,0024a5d2cf20efc70369e4736d130e270618ce0a,37,4,-2.017589,-1.014583
7,63.0,24.4,1839.0,3.2,11.0,43.2,65.1,11.9,174.0,24.8,10.4,10.9,48.0,61.9,0.2,-2.0,9.6,13.5,66.6,59.1,12.3,33.9,7.0,73.9,1.0,0.0,0024a5d2cf20efc70369e4736d130e270618ce0a,45,4,-2.659034,-1.161355
8,38.0,27.0,1770.0,3.0,20.9,40.2,62.6,23.3,172.0,27.3,19.5,20.0,45.0,59.5,0.8,1.0,20.2,21.5,65.1,56.3,25.3,31.2,8.0,79.9,1.0,0.0,009f25b3ae8c7605c65c431bc94edd625d57386e,2,0,-1.556801,0.440451
9,38.0,27.0,1825.0,3.1,22.7,40.4,62.5,21.8,172.0,27.0,17.9,20.3,41.0,59.4,0.4,1.0,17.2,19.2,65.1,56.7,23.9,32.2,8.0,79.9,1.0,0.0,009f25b3ae8c7605c65c431bc94edd625d57386e,3,5,-1.789687,0.286895


In [25]:
import plotly.graph_objects as go

# Plot the reduced PCA trajectories of a small sample of users
# (the first 30 unique CloudIds, in order of appearance)
unique_cloud_ids = reduced_bio['CloudId'].unique()[:30]
sample_bio = reduced_bio[reduced_bio['CloudId'].isin(unique_cloud_ids)]

# Numeric marker colours are auto-scaled per trace unless a range is pinned,
# which would give the same cluster a different colour for different users.
# Use one range derived from the whole table for every trace.
cluster_min = float(reduced_bio['Cluster'].min())
cluster_max = float(reduced_bio['Cluster'].max())

# Initialize a new figure
fig = go.Figure()

# One trace per user; sort=False keeps the order of first appearance
for cloud_id, user_data in sample_bio.groupby('CloudId', sort=False):
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        line=dict(shape='spline'),
        # Hover text shows the cluster of each individual state
        text='Cluster: ' + user_data['Cluster'].astype(str),
        marker=dict(color=user_data['Cluster'], cmin=cluster_min, cmax=cluster_max),
        # Each legend entry is a user, so label it with a short id prefix
        name=str(cloud_id)[:8],
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    # Legend entries are users (marker colour encodes the cluster)
    legend_title_text='User (CloudId prefix)',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

### Algorithm for drawing cluster regions as the plot background

In [26]:
from scipy.spatial import Voronoi, ConvexHull

# Draw the convex hull of every cluster as a background region and overlay
# the trajectories of the first 10 unique CloudIds
unique_cloud_ids = reduced_bio['CloudId'].unique()[:10]
sample_bio = reduced_bio[reduced_bio['CloudId'].isin(unique_cloud_ids)]


def cluster_color(cluster, alpha=1.0):
    """Return the colour string used for ``cluster`` (hulls and markers share it)."""
    return f'hsla({int(cluster) * 40 % 360}, 70%, 50%, {alpha})'


# Initialize a new figure
fig = go.Figure()

# All PCA points of the sampled users and their corresponding clusters
points = sample_bio[['PCA1', 'PCA2']].to_numpy()
clusters = sample_bio['Cluster'].to_numpy()

# Compute the Convex Hull for each cluster
unique_clusters = np.unique(clusters)
for cluster in unique_clusters:
    cluster_points = points[clusters == cluster]
    # ConvexHull requires at least 3 points.
    # NOTE(review): it can still fail when all points are collinear - confirm
    # whether that can happen in this data.
    if len(cluster_points) < 3:
        continue
    hull = ConvexHull(cluster_points)
    hull_points = cluster_points[hull.vertices]
    fig.add_trace(go.Scatter(
        x=hull_points[:, 0],
        y=hull_points[:, 1],
        fill='toself',
        # Translucent fill so overlapping regions and the points on top stay visible
        fillcolor=cluster_color(cluster, alpha=0.25),
        line=dict(color='rgba(0,0,0,0)'),
        # The hull regions are the entries of the "Cluster" legend
        name=f'Cluster {cluster}',
        showlegend=True,
        hoverinfo='skip'
    ))

# Plot the user trajectories; markers use the same colour as their cluster's hull
for cloud_id, user_data in sample_bio.groupby('CloudId', sort=False):
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        line=dict(shape='spline'),
        text='Cluster: ' + user_data['Cluster'].astype(str),
        marker=dict(size=2, color=[cluster_color(c) for c in user_data['Cluster']]),
        # Keep per-user traces out of the legend, which lists clusters
        showlegend=False,
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    legend_title_text='Cluster',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

In [27]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Cluster the trajectories of the first 50 unique CloudIds by their shape
unique_cloud_ids = reduced_bio['CloudId'].unique()[:50]
sample_bio = reduced_bio[reduced_bio['CloudId'].isin(unique_cloud_ids)]

# Collect one trajectory (array of [PCA1, PCA2] rows) per user;
# sort=False keeps the users in order of first appearance
trajectories = []
valid_cloud_ids = []
for cloud_id, user_data in sample_bio.groupby('CloudId', sort=False):
    trajectories.append(user_data[['PCA1', 'PCA2']].to_numpy())
    valid_cloud_ids.append(cloud_id)

# Compute pairwise DTW distances (symmetric, zero diagonal)
n = len(trajectories)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        distance, _ = fastdtw(trajectories[i], trajectories[j], dist=euclidean)
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance

# Perform hierarchical clustering.
# DTW distances are not Euclidean, and scipy documents 'ward' (like 'centroid'
# and 'median') as correct only for Euclidean distances, so use average linkage.
Z = linkage(squareform(distance_matrix), method='average')
cluster_labels = fcluster(Z, t=5, criterion='maxclust')  # Adjust the number of clusters as needed

# Initialize a new figure
fig = go.Figure()

# Plot the user trajectories with cluster colors
legend_shown = set()
for (cloud_id, user_data), label in zip(sample_bio.groupby('CloudId', sort=False), cluster_labels):
    label = int(label)
    # One colour string for both line and markers of the trajectory
    color = f'hsl({label * 40}, 70%, 50%)'
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        text='Cluster: ' + user_data['Cluster'].astype(str),
        marker=dict(size=6, color=color),
        line=dict(shape='spline', color=color),
        name=f'Cluster {label}',
        # Group all users of a cluster under a single legend entry
        legendgroup=f'Cluster {label}',
        showlegend=label not in legend_shown,
    ))
    legend_shown.add(label)

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Trajectory Clustering',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    legend_title_text='Cluster',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

In [28]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Find one representative (medoid) trajectory per trajectory cluster,
# using the first 100 unique CloudIds
unique_cloud_ids = reduced_bio['CloudId'].unique()[:100]
sample_bio = reduced_bio[reduced_bio['CloudId'].isin(unique_cloud_ids)]

# Collect one trajectory (array of [PCA1, PCA2] rows) per user;
# sort=False keeps the users in order of first appearance
trajectories = []
valid_cloud_ids = []
for cloud_id, user_data in sample_bio.groupby('CloudId', sort=False):
    trajectories.append(user_data[['PCA1', 'PCA2']].to_numpy())
    valid_cloud_ids.append(cloud_id)

# Compute pairwise DTW distances (symmetric, zero diagonal)
n = len(trajectories)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        distance, _ = fastdtw(trajectories[i], trajectories[j], dist=euclidean)
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance

# Perform hierarchical clustering.
# DTW distances are not Euclidean, and scipy documents 'ward' (like 'centroid'
# and 'median') as correct only for Euclidean distances, so use average linkage.
Z = linkage(squareform(distance_matrix), method='average')
cluster_labels = fcluster(Z, t=5, criterion='maxclust')  # Adjust the number of clusters as needed

# Compute representative trajectories (medoids): within each cluster, the
# trajectory with the smallest total DTW distance to the other members.
# The distances are already in distance_matrix, so no DTW is recomputed.
representative_trajectories = []
representative_labels = []
for cluster in np.unique(cluster_labels):
    cluster_indices = np.where(cluster_labels == cluster)[0]
    within_cluster = distance_matrix[np.ix_(cluster_indices, cluster_indices)]
    medoid_index = cluster_indices[within_cluster.sum(axis=1).argmin()]
    representative_trajectories.append(trajectories[medoid_index])
    representative_labels.append(int(cluster))

# Initialize a new figure
fig = go.Figure()

# Plot the representative trajectories, coloured and named by cluster label
for label, trajectory in zip(representative_labels, representative_trajectories):
    # One colour string for both line and markers of the trajectory
    color = f'hsl({label * 40}, 70%, 50%)'
    fig.add_trace(go.Scatter(
        x=trajectory[:, 0],
        y=trajectory[:, 1],
        mode='lines+markers',
        marker=dict(size=6, color=color),
        line=dict(shape='spline', color=color),
        name=f'Cluster {label}'
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Representative Trajectories',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    legend_title_text='Cluster',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()