# Unsupervised learning


Why We Chose K-Means for Clustering:

K-Means is a good fit because it groups data points by similarity, which lets us discover patterns in fitness characteristics. Since age, height, and weight contribute significantly to a person’s fitness tendencies, clustering on these three features forms meaningful groups that align with those distinctions. K-Means is also efficient, scalable, and well suited to numerical data. We validated the choice with the Silhouette Score, the Total Within-Cluster Sum of Squares (Inertia), and BCubed Precision and Recall, checking that the selected number of clusters (K=2) effectively separates individuals into the two primary fitness types. The Elbow Method further confirmed that two clusters provide the best balance between separation and cohesion.


How Clusters Improve Recommendations:

By assigning each user to either the Cardio or the Muscular cluster, we can offer personalized workout and nutrition plans. Cardio users receive endurance-based programs, while Muscular users receive strength-focused routines. This improves recommendations by matching each user with a fitness plan suited to their body type, leading to better results and a more tailored experience.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
# The order of the randint/uniform calls below determines the generated data.
np.random.seed(42)
num_samples = 100

# Synthetic "Cardio" group (true label 0): integer draws from half-open ranges
# plus uniform jitter in [-1, 1) so the values are not all whole numbers.
age_cardio = np.random.randint(18, 40, num_samples // 2) + np.random.uniform(-1, 1, num_samples // 2)
height_cardio = np.random.randint(150, 170, num_samples // 2) + np.random.uniform(-1, 1, num_samples // 2)
weight_cardio = np.random.randint(50, 70, num_samples // 2) + np.random.uniform(-1, 1, num_samples // 2)

# Synthetic "Muscular" group (true label 1). Height and weight ranges sit
# directly above the Cardio ranges; the age ranges overlap (18-40 vs 25-50),
# so age alone does not separate the two groups.
age_muscular = np.random.randint(25, 50, num_samples // 2) + np.random.uniform(-1, 1, num_samples // 2)
height_muscular = np.random.randint(170, 190, num_samples // 2) + np.random.uniform(-1, 1, num_samples // 2)
weight_muscular = np.random.randint(70, 90, num_samples // 2) + np.random.uniform(-1, 1, num_samples // 2)

# Combine the data: Cardio samples first, Muscular samples second
age = np.concatenate([age_cardio, age_muscular])
height = np.concatenate([height_cardio, height_muscular])
weight = np.concatenate([weight_cardio, weight_muscular])

# True labels in the same order as the concatenation above:
# "Cardio" (0) for the first half, "Muscular" (1) for the second half
labels_true = np.concatenate([np.zeros(num_samples // 2), np.ones(num_samples // 2)])

# Normalize features so age, height and weight contribute on the same scale
# to the Euclidean distances K-Means minimizes
scaler = StandardScaler()
features = np.vstack([age, height, weight]).T  # shape (num_samples, 3): one row per person
features_scaled = scaler.fit_transform(features)

# Apply K-Means clustering (2 clusters).
# n_init is set explicitly: its default differs between scikit-learn releases
# (10 before 1.4, 'auto' from 1.4 on), so pinning it keeps the number of
# restarts -- and therefore the result -- the same across versions and avoids
# the FutureWarning emitted by 1.2/1.3.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
# NOTE: cluster ids are arbitrary; cluster 0 is not guaranteed to be "Cardio".
labels_pred = kmeans.fit_predict(features_scaled)

# Evaluate using Silhouette Score (separation/cohesion of the predicted
# clusters) and Inertia (within-cluster sum of squared distances)
silhouette_avg = silhouette_score(features_scaled, labels_pred)
inertia = kmeans.inertia_

# Print evaluation metrics
print(f"Silhouette Score: {silhouette_avg:.4f}")
print(f"Inertia (Total Within-Cluster Sum of Squares): {inertia:.2f}")

# BCubed Precision and Recall Calculation
def bcubed_precision_recall(true_labels, pred_labels):
    """Return the BCubed precision and recall of a clustering.

    For every item, precision is the fraction of items in its predicted
    cluster that share its true class, and recall is the fraction of items in
    its true class that share its predicted cluster. Both are averaged over
    all items. The scores do not depend on how cluster ids are numbered.

    Args:
        true_labels: Sequence or array of ground-truth class labels.
        pred_labels: Sequence or array of predicted cluster labels, with the
            same number of elements as ``true_labels``.

    Returns:
        A ``(precision, recall)`` tuple of Python floats in ``[0, 1]``.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Coerce to arrays so plain Python lists work too; comparing a list with a
    # scalar yields a single bool rather than an element-wise mask.
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(pred_labels).ravel()

    if true_arr.size != pred_arr.size:
        raise ValueError(
            "true_labels and pred_labels must have the same length "
            f"(got {true_arr.size} and {pred_arr.size})"
        )
    n = true_arr.size
    if n == 0:
        raise ValueError("BCubed precision/recall is undefined for empty input")

    # Map arbitrary labels to dense 0..k-1 indices.
    _, class_ids = np.unique(true_arr, return_inverse=True)
    _, cluster_ids = np.unique(pred_arr, return_inverse=True)

    # contingency[c, k] = number of items with true class c in cluster k.
    contingency = np.zeros((class_ids.max() + 1, cluster_ids.max() + 1), dtype=np.int64)
    np.add.at(contingency, (class_ids, cluster_ids), 1)

    class_sizes = contingency.sum(axis=1)    # items per true class (all > 0)
    cluster_sizes = contingency.sum(axis=0)  # items per predicted cluster (all > 0)

    # Each of the contingency[c, k] items in cell (c, k) contributes
    # contingency[c, k] / cluster_sizes[k] to precision and
    # contingency[c, k] / class_sizes[c] to recall, hence the squared counts.
    overlap_sq = contingency.astype(float) ** 2
    precision = (overlap_sq / cluster_sizes[np.newaxis, :]).sum() / n
    recall = (overlap_sq / class_sizes[:, np.newaxis]).sum() / n
    return float(precision), float(recall)

# Calculate BCubed Precision and Recall of the predicted clusters against the
# known synthetic labels
bcubed_precision, bcubed_recall = bcubed_precision_recall(labels_true, labels_pred)

# Print BCubed Precision and Recall
print(f"BCubed Precision: {bcubed_precision:.4f}")
print(f"BCubed Recall: {bcubed_recall:.4f}")

# Sweep candidate cluster counts for the Elbow Method and silhouette comparison
k_range = range(2, 10)  # Testing clusters from 2 to 9
inertia_list = []
silhouette_scores_list = []

for k in k_range:
    # Use a separate name for the sweep model so the fitted K=2 model held in
    # `kmeans` is not overwritten by the K=9 model when the loop ends.
    # n_init is pinned because its default differs between scikit-learn
    # releases (10 before 1.4, 'auto' from 1.4 on).
    candidate_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    candidate_model.fit(features_scaled)
    inertia_list.append(candidate_model.inertia_)
    silhouette_scores_list.append(silhouette_score(features_scaled, candidate_model.labels_))

# Plot Elbow Method for optimal k: inertia as a function of cluster count
plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia_list, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.show()

# Plot Silhouette Scores for different cluster numbers
plt.figure(figsize=(8, 5))
plt.plot(k_range, silhouette_scores_list, marker='s', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Different Cluster Numbers')
plt.show()

# Visualize the K=2 clusters on the original (unscaled) Age and Weight axes;
# height was used for clustering but is not shown in this 2-D view
plt.figure(figsize=(8, 6))
sns.scatterplot(x=age, y=weight, hue=labels_pred, palette='viridis', alpha=0.7)
plt.xlabel('Age')
plt.ylabel('Weight')
plt.title('Clustering Results (K-Means)')
plt.show()

# Final clustering visualization with KMeans
# NOTE: this figure shows the same data as the previous one; only the title differs
plt.figure(figsize=(8, 6))
sns.scatterplot(x=age, y=weight, hue=labels_pred, palette='viridis', alpha=0.7)
plt.xlabel('Age')
plt.ylabel('Weight')
plt.title('K-Means Clustering (Age vs Weight)')
plt.show()