# 02_prepare_features.ipynb

## Goal
Engineer features (Health Score, Affordability Score), perform clustering, and export the final dataset for the backend.

## Steps
1. Load the raw dataset.
2. Create `product_id`.
3. Engineer `health_score` (0-100).
4. Engineer `affordability_score` (0-100).
5. Compute `nutri_score_app`.
6. Perform K-Means Clustering.
7. Export to `outputs/foods_scored.csv`.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans
import os

# 1. Read the input table from disk
DATA_PATH = "outputs/canada_grocery_nutrition_clean.csv"
df = pd.read_csv(DATA_PATH)

# 2. Add product_id: each row's index label cast to int
df = df.assign(product_id=df.index.astype(int))
print(f"Loaded {len(df)} rows.")

In [None]:
# 3. Engineer Health Score (0-100)
# Reward: protein, fiber, nutriscore (assumed numeric, higher = healthier; check the data)
# Penalize: sugar, saturated_fat, trans_fat, sodium, FPro

# Note: Adjust the column names if they differ in the actual CSV.
# 'nutriscore' is assumed to be numeric already, based on the prompt
# "nutriscore # higher = healthier".
# If the CSV stores letter grades instead ('a', 'b', etc.), map them to numbers
# first (e.g. A=5 ... E=1) so that higher still means healthier.

def calculate_raw_health(row):
    """Return the unscaled health score for one product row.

    The score is a weighted sum: protein, fiber and nutriscore add to it;
    sugar, saturated fat, trans fat, sodium and FPro subtract from it.
    A column that is absent from ``row`` or whose value is missing
    (``None``/NaN) contributes 0, so a single empty cell cannot turn the
    whole score into NaN.

    Args:
        row: Mapping-like object exposing ``.get(key, default)``, e.g. the
            ``pandas.Series`` passed by ``DataFrame.apply(..., axis=1)`` or
            a plain ``dict``.

    Returns:
        The raw, unbounded score as a number.
    """

    def nutrient(column):
        # .get() only falls back to its default when the key is absent; a
        # present-but-empty cell comes back as NaN/None and has to be
        # zeroed explicitly.
        value = row.get(column, 0)
        return 0 if pd.isna(value) else value

    score = 0
    # Rewards
    score += nutrient("protein") * 2
    score += nutrient("fiber") * 3
    # Assumes nutriscore is numeric with higher = healthier; its scale is
    # not established here -- TODO confirm against the data.
    score += nutrient("nutriscore") * 5

    # Penalties
    score -= nutrient("sugar") * 1
    score -= nutrient("saturated_fat") * 2
    score -= nutrient("trans_fat") * 5
    score -= (nutrient("sodium") / 100) * 1  # sodium assumed to be in mg
    score -= nutrient("FPro") * 20  # FPro assumed to lie in 0-1

    return score

# Compute the raw score for every row, then min-max rescale it onto 0-100.
df["raw_health"] = df.apply(calculate_raw_health, axis=1)

scaler = MinMaxScaler(feature_range=(0, 100))
scaled_health = scaler.fit_transform(df[["raw_health"]])
df["health_score"] = scaled_health.round(1)

print("Health Score Stats:")
health_summary = df["health_score"].describe()
print(health_summary)

In [None]:
# 4. Affordability Score (0-100): the cheaper the product, the higher the score.
# Derived from price_per_100g.

price_col = "price_per_100g"
# Keep only rows with a strictly positive price so the reciprocal below is
# defined; rows where the comparison is False are dropped.
positive_price = df[price_col] > 0
df = df.loc[positive_price].copy()

# Reciprocal of the price: a lower price gives a larger value.
df["price_inv"] = 1 / df[price_col]

# Min-max rescale the reciprocal onto 0-100.
scaler_afford = MinMaxScaler(feature_range=(0, 100))
afford_scaled = scaler_afford.fit_transform(df[["price_inv"]])
df["affordability_score"] = afford_scaled.round(1)

print("Affordability Score Stats:")
afford_summary = df["affordability_score"].describe()
print(afford_summary)

In [None]:
# 5. NutriScore App: blended score, 60% health + 40% affordability,
# rounded to one decimal place.
health_part = 0.6 * df["health_score"]
afford_part = 0.4 * df["affordability_score"]
df["nutri_score_app"] = (health_part + afford_part).round(1)

print("NutriScore App Stats:")
app_summary = df["nutri_score_app"].describe()
print(app_summary)

In [None]:
# 6. Clustering (K-Means)
features = ["calories", "protein", "carbs", "fat", "sugar", "fiber", "FPro", "nutriscore"]
# Missing feature values are replaced with 0 before standardising.
X = df[features].fillna(0)

# Standardise each feature so that no single column dominates the
# distance computation because of its scale.
scaler_cluster = StandardScaler()
X_scaled = scaler_cluster.fit_transform(X)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" in 1.4 (with a FutureWarning in 1.2-1.3), so leaving it unset makes
# the resulting clusters depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(X_scaled)

# Per-cluster feature means, printed so the clusters can be inspected and
# given meaningful names.
cluster_means = df.groupby("cluster")[features].mean()
print(cluster_means)

# NOTE: K-Means cluster ids are arbitrary, so this id -> name mapping is a
# placeholder. Check it against the printed cluster means and adjust it
# (or assign labels dynamically, e.g. by ranking clusters on FPro or protein)
# before relying on cluster_label.
cluster_map = {
    0: "Staples / Mixed",
    1: "Veg & Wholefoods",
    2: "Processed / Snacks",
    3: "High Energy / Fatty"
}

df["cluster_label"] = df["cluster"].map(cluster_map)

In [None]:
# 7. Export
# Make sure the target directory is there before writing.
os.makedirs("outputs", exist_ok=True)

# Columns wanted in the exported file, in output order.
output_cols = [
    # identifiers and descriptive fields
    "product_id", "product_name", "store", "brand",
    "category", "sub_category", "food_type", "veg_nonveg",
    # nutrition values
    "calories", "protein", "carbs", "fat", "sugar", "fiber",
    # prices
    "price_per_gram", "price_per_100g",
    # source scores
    "FPro", "nutriscore",
    # engineered scores and cluster assignment
    "health_score", "affordability_score", "nutri_score_app", "cluster", "cluster_label",
]

# Keep only the requested columns that are present in df; any others are
# skipped silently.
available = set(df.columns)
final_cols = [col for col in output_cols if col in available]

df.loc[:, final_cols].to_csv("outputs/foods_scored.csv", index=False)
print("Saved to outputs/foods_scored.csv")