# Fast Food Nutrition Clustering

This notebook loads a fast food nutrition dataset, constructs a clustering-ready dataset, selects an appropriate number of clusters using the elbow method and silhouette scores, fits a K-Means model, and interprets the resulting clusters.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Plot settings
# Apply seaborn's "whitegrid" theme. `set_theme` is the preferred spelling;
# `sns.set` is only kept as an alias for it.
sns.set_theme(style="whitegrid")
# Default figure size in inches (width, height). Written after the theme call
# so this explicit value is the last rcParams update in the cell.
plt.rcParams["figure.figsize"] = (8, 4)


## 1. Load Dataset

In [None]:
# Relative path of the nutrition CSV; edit it if the file lives elsewhere.
csv_path = "Nutrition_Value_Dataset.csv"

# Read the whole file into a DataFrame, report (rows, columns), then preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
print((len(df.index), len(df.columns)))
df.head(n=5)


## 2. Construct Clustering Dataset (Feature Selection & Cleaning)

In [None]:
# Columns that feed the clustering step (Option A: all numeric fields)
features = [
    "Energy (kCal)", "Carbohydrates (g)", "Protein (g)", "Fiber (g)", "Sugar (g)",
    "Total Fat (g)", "Saturated Fat (g)", "Trans Fat (g)", "Cholesterol (mg)", "Sodium (mg)",
]

# Take an independent copy of just those columns so `df` itself is left untouched
data = df.loc[:, features].copy()

# Keep only the rows in which every selected feature is present
data_clean = data.dropna(axis=0, how="any")

# Report how many rows the NA filter removed
print("Original rows:", len(data))
print("Rows after dropping NA:", len(data_clean))

# Summary statistics, transposed so each feature is one row
data_clean.describe().transpose()


## 3. Scale Features

In [None]:
# Fit the scaler on the cleaned features, then transform that same data.
# `scaler` stays available as the fitted StandardScaler instance.
scaler = StandardScaler().fit(data_clean)
scaled = scaler.transform(data_clean)

print("Scaled shape:", scaled.shape)


## 4. Choose k Using the Elbow Method

In [None]:
# Candidate cluster counts k = 2..9 (the silhouette cells iterate over K too)
K = range(2, 10)

# Fit one K-Means model per candidate k and record its inertia
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(scaled)
    inertia.append(kmeans.inertia_)

# Inertia versus k, with one tick per candidate k
fig, ax = plt.subplots()
ax.plot(K, inertia, marker='o')
ax.set_title("Elbow Method For Optimal k")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia (Within-cluster Sum of Squares)")
ax.set_xticks(list(K))
fig.tight_layout()
plt.show()


## 5. Choose k Using Silhouette Scores

In [None]:
# Silhouette score of a K-Means fit for each candidate k, keyed by k
sil_scores = {}

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
    # Fit, then read the assigned cluster id of every row from the model
    labels = kmeans.fit(scaled).labels_
    sil_scores[k] = silhouette_score(scaled, labels)

sil_scores


In [None]:
# Silhouette score versus k; dict iteration order is the order in which
# the scores were inserted.
fig, ax = plt.subplots()
ax.plot(list(sil_scores), list(sil_scores.values()), marker='o')
ax.set_title("Silhouette Score by Number of Clusters")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Silhouette score")
ax.set_xticks(list(K))
fig.tight_layout()
plt.show()


## 6. Fit Final K-Means Model

In [None]:
# <<< IMPORTANT >>>
# best_k is a manual choice: read it off the elbow and silhouette plots,
# then edit the value below (4 is only an example).

best_k = 4  # <-- change this after you inspect the plots

# Fit the final model; labels_ holds one cluster id per row of `scaled`
kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init="auto").fit(scaled)
cluster_labels = kmeans_final.labels_

# Attach the labels to the original rows that were actually clustered:
# data_clean.index identifies the rows that survived the NA filter.
df_clustered = df.loc[data_clean.index].assign(Cluster=cluster_labels)

df_clustered.loc[:, ["Company", "Category", "Product", "Cluster"]].head()


## 7. Visualize Clusters with PCA (2D Projection)

In [None]:
# Reduce the scaled features to two principal components for plotting
pca = PCA(n_components=2)
components = pca.fit_transform(scaled)

# One row per clustered item: its 2-D coordinates plus its cluster id
pca_df = pd.DataFrame(components, columns=["PC1", "PC2"])
pca_df["Cluster"] = cluster_labels

# Scatter of the projection, colored by cluster
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster", palette="Set2", alpha=0.8, ax=ax)
ax.set_title("PCA Visualization of Nutrition Clusters")
fig.tight_layout()
plt.show()


## 8. Cluster Summary Statistics

In [None]:
# Mean of every clustering feature within each cluster, rounded to 2 decimals
feature_means = df_clustered.groupby("Cluster")[features].mean()
cluster_summary = feature_means.round(decimals=2)
cluster_summary


## 9. Example Items from Each Cluster

In [None]:
# Up to five example products per cluster, grouped together by cluster id.
# The sort is explicitly stable (mergesort) so rows that share a cluster keep
# their original dataset order. The default sort kind does not guarantee the
# order of ties, which would make the "first five" of each cluster arbitrary.
example_items = (
    df_clustered[["Company", "Category", "Product", "Cluster"]]
    .sort_values("Cluster", kind="mergesort")
    .groupby("Cluster")
    .head(5)
)

example_items


## 10. (Optional) Save Results for Medium & GitHub

In [None]:
# Write both result tables as CSV files using relative paths.
# The summary is written with its index; the example items without one.
summary_path = "cluster_summary.csv"
examples_path = "cluster_example_items.csv"

cluster_summary.to_csv(summary_path)
example_items.to_csv(examples_path, index=False)

print("Saved: cluster_summary.csv and cluster_example_items.csv")
