## Contents

This notebook doesn't produce any file that is used later.

- Keep, for each user, only the x states with the largest differences in biometrics
- Implement the convex hull algorithm to plot the cluster areas

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.spatial import Voronoi, voronoi_plot_2d
from scipy.spatial import Voronoi, ConvexHull
import plotly.graph_objects as go

pd.set_option('display.max_columns', None)

In [21]:

# Load the clustered biometrics table produced by biometrics_clustering.ipynb.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# only load pickle files written by this project's own pipeline.
bio = pd.read_pickle('../data_processed/biometrics_clustered_males.pkl')
# Display the first two rows as the cell output.
bio.head(2)

Unnamed: 0,BMI,Degree Of Obesity Perc,Fat mass Perc,Intra Cellular Water,Left Arm Fat Perc,Left Leg Fat Perc,Right Arm Fat Perc,Right Leg Fat Perc,Standard Body Weight,Total Body Water Perc,Trunk Fat Perc,Visceral Fat Rating,Weight_per_height,Muscle Mass_per_height,Fat Free Mass_per_height,Bone Mass_per_height,Trunk Muscle Mass_per_height,Basal Metabolic Rate_per_age,Metabolic Age_per_age,Cluster,CloudId,MeasuredOnWeek,PCA1,PCA2
20,25.9,17.7,16.4,26.1,13.1,18.2,14.4,19.1,59.2,60.8,15.3,5.0,0.42561,0.338415,0.356098,0.017683,0.185976,45.621622,0.675676,4,0022b439d0d622b758a0e9d0a8c8ea47c9d4f871,4,-1.617988,-0.211255
21,26.1,18.8,19.2,24.7,16.4,18.3,16.7,19.3,59.2,59.3,19.8,6.0,0.428659,0.328659,0.346341,0.017683,0.178049,44.621622,0.891892,3,0022b439d0d622b758a0e9d0a8c8ea47c9d4f871,12,-0.770443,-0.953107


### Keep only N states per user (N = `NUMBER_OF_MEASUREMENTS`)
#### Methodology

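1. Calculate the absolute differences between consecutive measurements and sum them over all feature columns.
2. Select the N − 1 measurements with the largest summed changes (N = `NUMBER_OF_MEASUREMENTS`).
3. Ensure the first measurement is always included, so that at most N states are kept per user.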



In [22]:
# Maximum number of states kept per user by select_top_states: the first
# measurement plus the (NUMBER_OF_MEASUREMENTS - 1) largest changes.
NUMBER_OF_MEASUREMENTS = 5

In [23]:

def select_top_states(group, n_states=None):
    """Reduce one user's measurements to the states with the largest changes.

    The rows are put in chronological order (``MeasuredOnWeek``). For every
    row, the absolute differences to the previous row are summed over all
    feature columns. The first measurement is always kept, together with the
    ``n_states - 1`` later measurements that have the largest summed change.

    Parameters
    ----------
    group : pandas.DataFrame
        Measurements of a single user; must contain ``MeasuredOnWeek``.
    n_states : int, optional
        Maximum number of rows to keep, including the first measurement.
        When omitted, the notebook-level ``NUMBER_OF_MEASUREMENTS`` is used.

    Returns
    -------
    pandas.DataFrame
        At most ``n_states`` rows in chronological order, with the same
        columns as ``group``. The index holds each row's chronological
        position within the group.
    """
    if n_states is None:
        n_states = NUMBER_OF_MEASUREMENTS

    # Ensure chronological order; reset_index makes label == position and
    # returns a new frame, so the caller's data is never modified.
    ordered = group.sort_values(by='MeasuredOnWeek').reset_index(drop=True)
    if ordered.empty:
        return ordered

    # Every column except identifiers / labels contributes to the change score.
    # NOTE: derived columns such as PCA1 / PCA2 are not excluded, so they count
    # towards the score as well when present.
    excluded = ['gender_m', 'gender_f', 'CloudId', 'MeasuredOnWeek', 'Cluster']
    diff_cols = [col for col in ordered.columns if col not in excluded]
    # The first row has no predecessor: its NaN differences sum to 0.
    ordered['Diff'] = ordered[diff_cols].diff().abs().sum(axis=1)

    # Always include the first measurement.
    indices = [0]

    # Add the (n_states - 1) later measurements with the largest changes.
    if len(ordered) > 1 and n_states > 1:
        largest_changes = ordered.iloc[1:].nlargest(n_states - 1, 'Diff').index
        indices.extend(largest_changes)

    # Sort the selected positions to maintain chronological order.
    return ordered.loc[sorted(indices)].drop(columns='Diff')

# Reduce every user's history independently, then flatten the per-user
# results back into one table with a fresh 0..n-1 index.
reduced_bio = (
    bio
    .groupby('CloudId')
    .apply(select_top_states)
    .reset_index(drop=True)
)

# Preview the first three reduced rows as the cell output.
reduced_bio.head(3)





Unnamed: 0,BMI,Degree Of Obesity Perc,Fat mass Perc,Intra Cellular Water,Left Arm Fat Perc,Left Leg Fat Perc,Right Arm Fat Perc,Right Leg Fat Perc,Standard Body Weight,Total Body Water Perc,Trunk Fat Perc,Visceral Fat Rating,Weight_per_height,Muscle Mass_per_height,Fat Free Mass_per_height,Bone Mass_per_height,Trunk Muscle Mass_per_height,Basal Metabolic Rate_per_age,Metabolic Age_per_age,Cluster,CloudId,MeasuredOnWeek,PCA1,PCA2
0,25.9,17.7,16.4,26.1,13.1,18.2,14.4,19.1,59.2,60.8,15.3,5.0,0.42561,0.338415,0.356098,0.017683,0.185976,45.621622,0.675676,4,0022b439d0d622b758a0e9d0a8c8ea47c9d4f871,4,-1.617988,-0.211255
1,26.1,18.8,19.2,24.7,16.4,18.3,16.7,19.3,59.2,59.3,19.8,6.0,0.428659,0.328659,0.346341,0.017683,0.178049,44.621622,0.891892,3,0022b439d0d622b758a0e9d0a8c8ea47c9d4f871,12,-0.770443,-0.953107
2,25.2,14.5,17.4,24.5,15.4,16.8,15.7,17.0,59.2,60.8,18.1,6.0,0.413415,0.32439,0.341463,0.017073,0.17622,43.864865,0.756757,4,0022b439d0d622b758a0e9d0a8c8ea47c9d4f871,16,-1.671386,-0.945742


In [24]:
# store reduced_bio
# reduced_bio.to_pickle('../data_cleaned/biometrics_m10_imputed_clustered_changes_reduced.pkl')

In [25]:
# Plot the reduced PCA trajectories of a subset of users
# (the first 30 unique CloudIds).
unique_cloud_ids = reduced_bio['CloudId'].unique()[:30]

# Use ONE colour range for all traces. Without explicit cmin/cmax plotly
# auto-scales marker.color per trace, so the same cluster id would be drawn in
# different colours for different users.
cluster_min = int(reduced_bio['Cluster'].min())
cluster_max = int(reduced_bio['Cluster'].max())

# Initialize a new figure
fig = go.Figure()

# Add one trace (= one trajectory) per selected user
for cloud_id in unique_cloud_ids:
    user_data = reduced_bio[reduced_bio['CloudId'] == cloud_id]
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        # Each trace is a user, so label it with a short CloudId prefix
        # instead of plotly's default "trace N".
        name=str(cloud_id)[:8],
        # Hover text: the cluster of each measurement
        text='Cluster: ' + user_data['Cluster'].astype(str),
        # Marker colour encodes the cluster of each measurement
        marker=dict(color=user_data['Cluster'], cmin=cluster_min, cmax=cluster_max),
        line=dict(shape='spline'),
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    # Legend entries are users (one trace each), not clusters.
    legend_title_text='User (CloudId prefix)',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

### Algorithm for the cluster backgrounds (convex hulls)

In [26]:


# Select a subset of unique CloudId values (the first 10 unique CloudIds)
unique_cloud_ids = reduced_bio['CloudId'].unique()[:10]


def cluster_color(cluster, alpha=1.0):
    """Return the hsla colour string used for ``cluster`` throughout this figure.

    Hulls and markers both go through this helper, so a cluster always has
    the same hue; ``alpha`` only controls the opacity.
    """
    return f'hsla({int(cluster) * 40}, 70%, 50%, {alpha})'


# Initialize a new figure
fig = go.Figure()

# Collect all PCA points and their corresponding clusters
subset = reduced_bio[reduced_bio['CloudId'].isin(unique_cloud_ids)]
points = subset[['PCA1', 'PCA2']].to_numpy()
clusters = subset['Cluster'].to_numpy()

# Draw the convex hull of each cluster as a translucent background area
unique_clusters = np.unique(clusters)
for cluster in unique_clusters:
    cluster_points = points[clusters == cluster]
    # ConvexHull needs at least 3 points that are not all on one line.
    # The rank check skips collinear / duplicate-only clusters, for which
    # Qhull would raise an error.
    if len(cluster_points) < 3:
        continue
    if np.linalg.matrix_rank(cluster_points - cluster_points.mean(axis=0)) < 2:
        continue
    hull = ConvexHull(cluster_points)
    hull_points = cluster_points[hull.vertices]
    fig.add_trace(go.Scatter(
        x=hull_points[:, 0],
        y=hull_points[:, 1],
        # Explicit 'lines': with fewer than 20 points plotly defaults to
        # 'lines+markers' and would draw markers on the hull vertices.
        mode='lines',
        fill='toself',
        # Translucent, so overlapping hulls and the trajectories stay visible
        fillcolor=cluster_color(cluster, alpha=0.25),
        line=dict(color='rgba(0,0,0,0)'),
        # The hulls carry the legend, so the "Cluster" legend lists clusters.
        name=f'Cluster {cluster}',
        showlegend=True,
        hoverinfo='skip'
    ))

# Plot the user trajectories on top of the hulls
for cloud_id in unique_cloud_ids:
    user_data = reduced_bio[reduced_bio['CloudId'] == cloud_id]
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        text='Cluster: ' + user_data['Cluster'].astype(str),
        # Small markers (size 2), coloured with the same per-cluster colour
        # as the hull of that cluster
        marker=dict(size=2, color=[cluster_color(c) for c in user_data['Cluster']]),
        line=dict(shape='spline'),
        name=str(cloud_id)[:8],
        # Users are kept out of the legend; it is reserved for clusters.
        showlegend=False,
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Temporal Evolution',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    legend_title_text='Cluster',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

In [14]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Select a subset of unique CloudId values (the first 50 unique CloudIds)
unique_cloud_ids = reduced_bio['CloudId'].unique()[:50]

# Collect all user trajectories (one (n_states, 2) array of PCA points per user)
trajectories = []
valid_cloud_ids = []
for cloud_id in unique_cloud_ids:
    user_data = reduced_bio[reduced_bio['CloudId'] == cloud_id]
    trajectory = user_data[['PCA1', 'PCA2']].values
    if len(trajectory) > 0:  # Filter out empty trajectories
        trajectories.append(trajectory)
        valid_cloud_ids.append(cloud_id)

# Compute pairwise DTW distances (symmetric matrix with a zero diagonal)
n = len(trajectories)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        distance, _ = fastdtw(trajectories[i], trajectories[j], dist=euclidean)
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance

# Perform hierarchical clustering.
# 'average' linkage is used because the input is a precomputed DTW distance
# matrix: SciPy documents 'ward' (like 'centroid' and 'median') as correctly
# defined only for Euclidean pairwise distances, which DTW distances are not.
Z = linkage(squareform(distance_matrix), method='average')
cluster_labels = fcluster(Z, t=5, criterion='maxclust')  # Adjust the number of clusters as needed

# Initialize a new figure
fig = go.Figure()

# Plot the user trajectories, one colour and one legend entry per trajectory
# cluster. Traces are added in cluster order so the legend is sorted.
seen_clusters = set()
for i in np.argsort(cluster_labels, kind='stable'):
    cloud_id = valid_cloud_ids[i]
    label = int(cluster_labels[i])
    user_data = reduced_bio[reduced_bio['CloudId'] == cloud_id]
    # One explicit colour string for markers AND line: a scalar number passed
    # as marker.color is not mapped through the colorscale, so the markers
    # would not match the line colour.
    color = f'hsl({label * 40}, 70%, 50%)'
    fig.add_trace(go.Scatter(
        x=user_data['PCA1'],
        y=user_data['PCA2'],
        mode='lines+markers',
        # Hover text: the biometrics cluster of each measurement
        text='Cluster: ' + user_data['Cluster'].astype(str),
        marker=dict(size=6, color=color),
        line=dict(shape='spline', color=color),
        name=f'Cluster {label}',
        # Group all users of a cluster under a single legend entry
        legendgroup=f'Cluster {label}',
        showlegend=label not in seen_clusters,
    ))
    seen_clusters.add(label)

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Trajectory Clustering',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    legend_title_text='Cluster',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()

In [15]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Select a subset of unique CloudId values (the first 500 unique CloudIds)
unique_cloud_ids = reduced_bio['CloudId'].unique()[:500]

# Collect all user trajectories (one (n_states, 2) array of PCA points per user)
trajectories = []
valid_cloud_ids = []
for cloud_id in unique_cloud_ids:
    user_data = reduced_bio[reduced_bio['CloudId'] == cloud_id]
    trajectory = user_data[['PCA1', 'PCA2']].values
    if len(trajectory) > 0:  # Filter out empty trajectories
        trajectories.append(trajectory)
        valid_cloud_ids.append(cloud_id)

# Compute pairwise DTW distances (symmetric matrix with a zero diagonal).
# This is the expensive step: n * (n - 1) / 2 fastdtw calls.
n = len(trajectories)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        distance, _ = fastdtw(trajectories[i], trajectories[j], dist=euclidean)
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance

# Perform hierarchical clustering.
# 'average' linkage is used because the input is a precomputed DTW distance
# matrix: SciPy documents 'ward' (like 'centroid' and 'median') as correctly
# defined only for Euclidean pairwise distances, which DTW distances are not.
Z = linkage(squareform(distance_matrix), method='average')
cluster_labels = fcluster(Z, t=30, criterion='maxclust')  # Adjust the number of clusters as needed

# Compute representative trajectories (medoids): for each cluster, the member
# with the smallest summed DTW distance to all members of that cluster.
# The distances are read from distance_matrix instead of being recomputed.
cluster_ids = np.unique(cluster_labels)
representative_trajectories = []
for cluster in cluster_ids:
    cluster_indices = np.where(cluster_labels == cluster)[0]
    within_cluster = distance_matrix[cluster_indices][:, cluster_indices]
    medoid_index = cluster_indices[within_cluster.sum(axis=1).argmin()]
    representative_trajectories.append(trajectories[medoid_index])

# Initialize a new figure
fig = go.Figure()

# Plot the representative trajectories with one distinct colour per cluster
n_clusters = len(cluster_ids)
for position, (cluster, trajectory) in enumerate(zip(cluster_ids, representative_trajectories)):
    # Spread the hues evenly over the colour wheel; a fixed 40-degree step
    # wraps around after 9 clusters and would reuse colours.
    color = f'hsl({int(360 * position / n_clusters)}, 70%, 50%)'
    fig.add_trace(go.Scatter(
        x=trajectory[:, 0],
        y=trajectory[:, 1],
        mode='lines+markers',
        # Same explicit colour for markers and line (a scalar number passed as
        # marker.color is not mapped through the colorscale).
        marker=dict(size=6, color=color),
        line=dict(shape='spline', color=color),
        # Label with the actual cluster id returned by fcluster
        name=f'Cluster {cluster}'
    ))

fig.update_layout(
    title='2D PCA Plot of User Data by Week with Representative Trajectories',
    xaxis_title='PCA1',
    yaxis_title='PCA2',
    showlegend=True,
    legend_title_text='Cluster',
    legend_title_font_size=16,
    legend_font_size=12,
)

fig.show()