In [None]:
import pandas as pd

# Read the meal nutrient table; every later cell works on this frame as `df`.
MEAL_FEATURES_CSV = "meal_features_cleaned.csv"
df = pd.read_csv(MEAL_FEATURES_CSV)


In [None]:
import plotly.express as px

# Draw one histogram per nutrient column; marginal="box" places a box plot
# above each histogram.
nutrient_columns = ["calories", "protein_g", "carbs_g", "fat_g", "fiber_g"]
histogram_layout = dict(width=600, height=400, title_x=0.5, template="plotly_white")

for nutrient in nutrient_columns:
    histogram = px.histogram(
        df,
        x=nutrient,
        nbins=30,
        marginal="box",
        title=f"Distribution of {nutrient}",
        color_discrete_sequence=["teal"],
    )
    histogram.update_layout(**histogram_layout)
    histogram.show()


## Summary of Nutrient Distributions

* The three nutrients summarized here — fiber, carbs, and fat — all show right-skewed distributions.
* Most food items have low nutrient content, while a few are high in nutrients.
* Fiber: Majority below 1 g, few reach up to 4 g (high-fiber foods).
* Carbs: Most under 20 g, with some outliers above 60 g (high-carb foods).
* Fat: Majority below 5 g, few extend above 20 g (high-fat foods like oils).
* No major outliers beyond upper fences — data appears clean and accurate.
* This distribution pattern is typical of real-world nutritional data.


In [None]:
import plotly.express as px

# Correlate only the nutrient columns. A bare df.corr() uses whatever columns
# df currently holds, so re-running this cell after the clustering cells have
# added 'Cluster' / 'cluster' / 'meal_type' would mix label columns into the
# heatmap (and raise on the string column under pandas >= 2.0).
nutrient_cols = ["calories", "protein_g", "carbs_g", "fat_g", "fiber_g"]
corr = df[nutrient_cols].corr()

# Heatmap of the correlation matrix with the coefficient printed in each cell.
fig = px.imshow(
    corr,
    text_auto=True,       # shows correlation values
    color_continuous_scale="RdBu_r",  # similar to coolwarm
    title="Feature Correlation Heatmap",
    aspect="auto"
)

fig.update_layout(
    width=800,
    height=600,
    title_x=0.5,
    template="plotly_white"
)

fig.show()



### Correlation

Calories show strong positive correlation with both fat and carbohydrates, indicating these two nutrients are the main drivers of total calorie content.
Protein has a moderate correlation with calories and fat, suggesting it contributes to energy but less strongly.
Fiber is weakly correlated with other features, meaning it can be increased independently without significantly affecting calories.

In [None]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Feature matrix used by this cell and by the PCA cell that follows. It has to
# be the numeric data itself: handing the bare list of column names to
# fit_predict / the score functions fails, because they expect a 2-D numeric
# array rather than a list of strings.
feature_cols = ["calories", "protein_g", "carbs_g", "fat_g", "fiber_g"]
X_features = df[feature_cols]
# NOTE(review): the features are used unscaled here, while a later cell
# standardizes them before KMeans — confirm which variant the comparison
# is meant to reflect.

# Candidate clustering models, all configured for four groups where applicable.
models = {
    "KMeans": KMeans(n_clusters=4, random_state=42),
    "Agglomerative": AgglomerativeClustering(n_clusters=4),
    "DBSCAN": DBSCAN(eps=5, min_samples=5),  # tune eps/min_samples
    "GMM": GaussianMixture(n_components=4, random_state=42)
}

results = []

for name, model in models.items():
    # Every estimator above, GaussianMixture included, provides fit_predict,
    # so no per-model branching is needed.
    labels = model.fit_predict(X_features)

    # The three scores need at least two distinct labels (DBSCAN can return a
    # single one). DBSCAN's noise label (-1) is counted as a label here.
    if len(set(labels)) > 1:
        silhouette = silhouette_score(X_features, labels)
        ch_score = calinski_harabasz_score(X_features, labels)
        db_score = davies_bouldin_score(X_features, labels)
    else:
        silhouette, ch_score, db_score = None, None, None

    results.append({
        "Model": name,
        "Silhouette": silhouette,
        "Calinski-Harabasz": ch_score,
        "Davies-Bouldin": db_score
    })

# One row per model, one column per metric.
df_results = pd.DataFrame(results)
print(df_results)


           Model  Silhouette  Calinski-Harabasz  Davies-Bouldin
0         KMeans    0.574376       20816.014107        0.570225
1  Agglomerative    0.551309       19631.750831        0.562961
2         DBSCAN   -0.591254         160.948966        2.875458
3            GMM    0.060652        1216.597244        3.419533


In [None]:
import plotly.express as px
from sklearn.decomposition import PCA

# Project the feature matrix onto its first two principal components so each
# model's cluster assignment can be drawn in 2-D.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_features)

# One scatter figure per model, collected so they remain available after the
# loop (previously this list was created but never filled).
figs = []

for name, model in models.items():
    labels = model.fit_predict(X_features)
    df_pca = pd.DataFrame({
        "PCA1": X_pca[:, 0],
        "PCA2": X_pca[:, 1],
        # Stored as strings so plotly colors the labels as discrete categories
        # and titles the legend "Cluster".
        "Cluster": labels.astype(str)
    })

    fig = px.scatter(
        df_pca,
        x="PCA1",
        y="PCA2",
        color="Cluster",
        title=f"{name} Clustering (PCA Projection)",
        color_discrete_sequence=px.colors.qualitative.Set2
    )

    fig.update_layout(
        width=600,
        height=500,
        title_x=0.5,
        template="plotly_white"
    )
    figs.append(fig)
    fig.show()



* KMeans and Agglomerative Clustering best capture the dataset’s structure, forming clear and interpretable clusters in the PCA space.
* DBSCAN struggles with over-fragmentation, implying non-uniform density.
* GMM performs decently but produces overlapping clusters, which might complicate interpretation.
* Best choice: KMeans or Agglomerative Clustering — both provide clean separation and meaningful groupings for this dataset.

In [None]:
import plotly.graph_objects as go
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Sweep k on the nutrient columns only. Clustering the whole of df would also
# use any label columns ('Cluster', 'cluster', 'meal_type') that later cells
# add, so a re-run of this cell would score different data or fail outright.
sweep_features = df[["calories", "protein_g", "carbs_g", "fat_g", "fiber_g"]]

scores_kmeans = []
scores_agglom = []
K = range(2, 11)

for k in K:
    # Silhouette score of KMeans with k clusters.
    km = KMeans(n_clusters=k, random_state=42)
    labels_km = km.fit_predict(sweep_features)
    scores_kmeans.append(silhouette_score(sweep_features, labels_km))

    # Silhouette score of agglomerative clustering with the same k.
    ac = AgglomerativeClustering(n_clusters=k)
    labels_ac = ac.fit_predict(sweep_features)
    scores_agglom.append(silhouette_score(sweep_features, labels_ac))

# Create interactive line plot: one line per algorithm, k on the x axis.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=list(K), y=scores_kmeans,
    mode='lines+markers',
    name='KMeans',
    marker=dict(symbol='circle', size=8),
    line=dict(color='royalblue', width=2)
))

fig.add_trace(go.Scatter(
    x=list(K), y=scores_agglom,
    mode='lines+markers',
    name='Agglomerative',
    marker=dict(symbol='square', size=8),
    line=dict(color='firebrick', width=2)
))

fig.update_layout(
    title="KMeans vs Agglomerative Clustering (Silhouette Analysis)",
    xaxis_title="Number of Clusters (k)",
    yaxis_title="Silhouette Score",
    template="plotly_white",
    title_x=0.5,
    width=800,
    height=500
)

fig.show()


k = 4 provides a strong trade-off between cluster separation and interpretability. It maintains a good silhouette score and yields well-defined, meaningful clusters in both KMeans and Agglomerative methods.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

# --- Preprocessing ---
# Standardize the nutrient columns only. Scaling the whole of df would feed
# the 'Cluster' column written below (and the 'cluster' / 'meal_type' columns
# added by later cells) back in as features whenever this cell is re-run.
nutrient_cols = ["calories", "protein_g", "carbs_g", "fat_g", "fiber_g"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[nutrient_cols])

# --- KMeans clustering with k=4 ---
kmeans = KMeans(n_clusters=4, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# --- Cluster Means ---
# Mean of each nutrient per cluster, in original (unscaled) units; 'Cluster'
# becomes the first column after reset_index().
cluster_means = df.groupby('Cluster')[nutrient_cols].mean().reset_index()
print("Cluster Means:\n", cluster_means)

# ======================
# 1️⃣ Donut Chart (Cluster Distribution)
# ======================
# Number of rows assigned to each cluster, ordered by cluster id.
cluster_counts = df['Cluster'].value_counts().sort_index().reset_index()
cluster_counts.columns = ['Cluster', 'Count']

fig_donut = px.pie(
    cluster_counts,
    names='Cluster',
    values='Count',
    hole=0.4,
    title='Meal Cluster Distribution (KMeans k=4)',
    color_discrete_sequence=px.colors.qualitative.Pastel
)
fig_donut.update_traces(textposition='inside', textinfo='percent+label')
fig_donut.show()

# ======================
# 2️⃣ Radar Chart (Cluster Nutrition Profiles)
# ======================

# Divide each nutrient's cluster means by the largest cluster mean for that
# nutrient, so the biggest value on every radar axis is 1.
normalized_means = cluster_means.copy()
normalized_means[nutrient_cols] = (
    cluster_means[nutrient_cols] / cluster_means[nutrient_cols].max()
)

categories = normalized_means.columns[1:]  # exclude 'Cluster'

fig_radar = go.Figure()

# One filled polygon per cluster.
for i, row in normalized_means.iterrows():
    fig_radar.add_trace(go.Scatterpolar(
        r=row[categories].values,
        theta=list(categories),
        fill='toself',
        name=f'Cluster {int(row["Cluster"])}'
    ))

fig_radar.update_layout(
    polar=dict(
        radialaxis=dict(visible=True, range=[0, 1])
    ),
    title="Normalized Nutrition Profile by Cluster",
    showlegend=True
)

fig_radar.show()


Cluster Means:
    Cluster    calories  protein_g    carbs_g      fat_g   fiber_g
0        0  225.697453  13.859973  15.592511  11.600447  1.012011
1        1   61.283225   3.334978   6.582880   1.807670  0.322753
2        2  327.719853   7.174495  42.058972  11.720587  1.953211
3        3  109.007753   3.698961  13.880462   3.578358  1.973621


In [None]:
import plotly.express as px

# Assumes X_pca has one row per row of df, in the same order — TODO confirm.
df_plot = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
# Color by the KMeans assignment stored in df['Cluster'] instead of the loose
# `labels` variable, which holds the output of whichever model an earlier
# loop happened to fit last. Strings give one discrete color per cluster
# rather than a continuous color scale.
df_plot['cluster'] = df['Cluster'].astype(str).to_numpy()
df_plot['meal_name'] = df.index  # optional, if your DataFrame has meal names
# to_numpy() copies the values by position, matching how X_pca is laid out.
df_plot['calories'] = df['calories'].to_numpy()

# Plot
fig = px.scatter(
    df_plot,
    x='PCA1',
    y='PCA2',
    color='cluster',
    hover_data=['meal_name', 'calories'],
    title='PCA Visualization of Meal Clusters (Interactive)'
)
fig.show()

In [None]:
import pandas as pd
from sklearn.cluster import KMeans

# Cluster on the nutrient columns only. Fitting on the whole of df would use
# the 'Cluster' labels written by the earlier scaled-KMeans cell as a feature
# (and, on a re-run, this cell's own 'cluster' column and the 'meal_type'
# strings as well).
nutrient_cols = ["calories", "protein_g", "carbs_g", "fat_g", "fiber_g"]

k = 4
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(df[nutrient_cols])

# Add cluster labels to your DataFrame
df['cluster'] = labels

# Compute mean of each nutrient per cluster (indexed by cluster id)
cluster_means = df.groupby('cluster')[nutrient_cols].mean()
print(cluster_means)


           calories  protein_g    carbs_g      fat_g   fiber_g   Cluster
cluster                                                                 
0        260.916098  10.334430  27.310521  11.010384  1.581524  1.038436
1         44.642436   2.786357   5.647869   1.322221  0.717003  1.420802
2        418.361162   6.787325  43.173794  14.977039  1.568421  1.921053
3        133.805489   6.537425  14.790071   5.074232  1.200656  1.845271


In [None]:
import plotly.graph_objects as go

# Nutrient columns shown as pie slices.
macro_cols = ["protein_g", "carbs_g", "fat_g", "fiber_g"]


def plot_macro_composition(cluster_id, title):
    """Show a donut chart of one cluster's mean macro-nutrient amounts.

    The slice values are read from ``cluster_means`` (indexed by cluster id)
    at call time, so the chart always matches the current clustering instead
    of numbers copied by hand from an earlier printout.

    Parameters:
        cluster_id: index label of the cluster's row in ``cluster_means``.
        title: figure title.

    Returns:
        The plotly Figure that was shown.
    """
    values = cluster_means.loc[cluster_id, macro_cols].tolist()
    fig = go.Figure(data=[go.Pie(labels=macro_cols, values=values, hole=0.3)])
    fig.update_layout(title_text=title)
    fig.show()
    return fig


# Chart title for each cluster id.
# NOTE(review): these names are tied to the numeric ids KMeans happened to
# assign; re-check them against cluster_means if the clustering changes.
macro_titles = {
    0: 'Macro Composition - High Protein Cluster',
    1: 'Macro Composition - Low calorie',
    2: 'Macro Composition - High calorie',
    3: 'Macro Composition - Balanced diet',
}

for cluster_id, title in macro_titles.items():
    plot_macro_composition(cluster_id, title)

In [None]:
 # Human-readable name for each KMeans cluster id.
 # NOTE(review): the ids are arbitrary KMeans output; confirm the names still
 # match cluster_means whenever the clustering is re-fitted.
 cluster_labels = {
     0: 'High Protein',
     1: 'Low calorie',
     2: 'High Calorie',
     3: 'Balanced Meal'
 }

 # .map() turns any id missing from the dict into NaN without complaint, so
 # fail loudly instead of writing rows with no meal_type.
 unnamed_ids = sorted(set(df['cluster'].unique()) - set(cluster_labels))
 if unnamed_ids:
     raise ValueError(f"No meal_type name defined for cluster id(s): {unnamed_ids}")

 df['meal_type'] = df['cluster'].map(cluster_labels)


### FINAL DATASET

In [None]:
# Preview the first five rows of the labelled frame.
df.head(n=5)

Unnamed: 0,calories,protein_g,carbs_g,fat_g,fiber_g,Cluster,cluster,meal_type
0,102.0,3.8,18.7,1.58,2.1,3,3,Balanced Meal
1,94.0,3.15,2.65,0.64,1.6,3,3,Balanced Meal
2,62.0,4.8,10.1,0.5,0.8,1,1,Low calorie
3,96.0,2.36,4.89,0.86,3.2,3,3,Balanced Meal
4,43.0,3.46,7.02,1.12,2.1,3,1,Low calorie


In [None]:
# Write the labelled frame to disk without the row index.
CLUSTERED_CSV = "Clustered_data.csv"
df.to_csv(CLUSTERED_CSV, index=False)