In [79]:
import pandas as pd
import numpy as np

# --- Load pizza types dataset ---
# Each row looks like:
# cali_ckn,The California Chicken Pizza,Chicken,"Chicken, Artichoke, Spinach, Garlic, Jalapeno Peppers, Fontina Cheese, Gouda Cheese"

def parse_ingredients(s):
    """Split a comma-separated ingredient string into a list of clean names.

    Intended for use as a ``read_csv`` converter on the ``ingredients``
    column, where the CSV reader has already removed the surrounding quotes.

    Args:
        s: Raw cell value. Normally a string such as
            ``"Chicken, Artichoke, Spinach"``; ``None``/NaN is treated as
            "no ingredients".

    Returns:
        A list of ingredient names with surrounding whitespace stripped and
        empty entries removed. A missing or blank cell gives ``[]``.
    """
    if pd.isna(s):
        return []
    # A blank cell may reach a converter as "" instead of NaN, and stray
    # commas ("a,,b" / "a, b,") would otherwise produce "" as an ingredient.
    stripped = (part.strip() for part in s.split(","))
    return [name for name in stripped if name]

# Read the pizza catalogue; the "ingredients" column is converted to a
# list of names by parse_ingredients while the file is being parsed.
pizza_types = pd.read_csv(
    "data/pizza_types.csv",
    converters={"ingredients": parse_ingredients},
)


# --- Define base ingredient units (for display) ---
# Maps ingredient name -> display unit. Built from one group per unit so the
# unit label is written once; group and item order match the flat listing.
ingredient_units = {
    name: unit
    for unit, names in (
        # Produce, labelled "kg".
        ("kg", (
            "Tomatoes",
            "Red Peppers",
            "Green Peppers",
            "Red Onions",
            "Mushrooms",
            "Spinach",
            "Corn",
            "Garlic",
            "Jalapeno Peppers",
            "Pineapple",
            "Artichokes",
            "Zucchini",
            "Eggplant",
        )),
        # Meats and cheeses, labelled "packet".
        ("packet", (
            "Bacon",
            "Chicken",
            "Pepperoni",
            "Sausage",
            "Mozzarella Cheese",
            "Feta Cheese",
            "Fontina Cheese",
            "Gouda Cheese",
            "Asiago Cheese",
            "Goat Cheese",
            "Provolone Cheese",
            "Ricotta Cheese",
            "Blue Cheese",
        )),
        # Sauces, labelled "bucket".
        ("bucket", (
            "Barbecue Sauce",
            "Pesto Sauce",
            "Tomato Sauce",
            "Alfredo Sauce",
            "Chipotle Sauce",
            "Thai Sweet Chilli Sauce",
        )),
    )
    for name in names
}


# --- Default sauce & cheese rule ---
def ensure_base_ingredients(ingredients):
    """Return the ingredients with a default sauce and cheese added if absent.

    An ingredient counts as a sauce or a cheese when its name contains the
    substring ``"Sauce"`` or ``"Cheese"`` respectively (case-sensitive).

    Args:
        ingredients: Iterable of ingredient names. It is not modified.

    Returns:
        A new list holding the original names in order, followed by
        ``"Tomato Sauce"`` if no sauce was present and ``"Mozzarella Cheese"``
        if no cheese was present.
    """
    # Copy first: appending to the caller's list would silently change the
    # lists stored in the source DataFrame.
    completed = list(ingredients)
    if not any("Sauce" in name for name in completed):
        completed.append("Tomato Sauce")
    if not any("Cheese" in name for name in completed):
        completed.append("Mozzarella Cheese")
    return completed


# --- Expand dataset ---
# Seed the generator so that Restart & Run All reproduces the same
# synthetic quantities on every run.
SEED = 42
rng = np.random.default_rng(SEED)

rows = []
for _, row in pizza_types.iterrows():
    # Pass a copy so the lists stored in pizza_types stay untouched
    # regardless of how the helper handles its argument.
    ingredients = ensure_base_ingredients(list(row["ingredients"]))

    for ing in ingredients:
        # Ingredients missing from the units table get the generic "unit" label.
        unit = ingredient_units.get(ing, "unit")

        # Random quantity of ingredient used (between 0.1 and 1.0)
        quantity = round(rng.uniform(0.1, 1.0), 2)

        rows.append({
            "pizza_type_id": row["pizza_type_id"],
            "pizza_name": row["name"],
            "category": row["category"],
            "ingredient": ing,
            "quantity_unit": unit,
            "fraction_used": quantity
        })


# --- Create final augmented dataset ---
augmented_df = pd.DataFrame(rows)

# Normalize fractions per pizza so they sum to 1. The stored values keep full
# precision: rounding each share to 2 decimals after normalizing would make
# the per-pizza totals drift away from 1.
pizza_totals = augmented_df.groupby("pizza_type_id")["fraction_used"].transform("sum")
augmented_df["fraction_used"] = augmented_df["fraction_used"] / pizza_totals

assert np.allclose(
    augmented_df.groupby("pizza_type_id")["fraction_used"].sum(), 1.0
), "fractions must sum to 1 for every pizza"

# Round only for display; augmented_df itself is left unrounded.
print(augmented_df.head(10).round(2))


  pizza_type_id                    pizza_name category         ingredient  \
0       bbq_ckn    The Barbecue Chicken Pizza  Chicken  Barbecued Chicken   
1       bbq_ckn    The Barbecue Chicken Pizza  Chicken        Red Peppers   
2       bbq_ckn    The Barbecue Chicken Pizza  Chicken      Green Peppers   
3       bbq_ckn    The Barbecue Chicken Pizza  Chicken           Tomatoes   
4       bbq_ckn    The Barbecue Chicken Pizza  Chicken         Red Onions   
5       bbq_ckn    The Barbecue Chicken Pizza  Chicken     Barbecue Sauce   
6       bbq_ckn    The Barbecue Chicken Pizza  Chicken  Mozzarella Cheese   
7      cali_ckn  The California Chicken Pizza  Chicken            Chicken   
8      cali_ckn  The California Chicken Pizza  Chicken          Artichoke   
9      cali_ckn  The California Chicken Pizza  Chicken            Spinach   

  quantity_unit  fraction_used  
0          unit           0.13  
1            kg           0.10  
2            kg           0.19  
3            kg     

In [80]:
# --- Pivot to wide (sparse) format ---
# One row per pizza, one column per ingredient; ingredients a pizza does not
# use are filled with 0.
ingredient_matrix = augmented_df.pivot_table(
    index=["pizza_type_id", "pizza_name", "category"],
    columns="ingredient",
    values="fraction_used",
    fill_value=0
)

# --- Save to CSV ---
# CSV is a dense text format, so write it from the dense frame before
# converting the in-memory copy to sparse.
ingredient_matrix.to_csv("data/pizza_ingredient_matrix.csv")

# Use 0 as the sparse fill value. The plain "Sparse[float]" alias treats NaN
# as the fill value, so every 0 would be stored explicitly and nothing would
# actually be compressed.
ingredient_matrix = ingredient_matrix.astype(pd.SparseDtype(float, 0.0))

print("✅ Sparse pizza ingredient matrix saved as: data/pizza_ingredient_matrix.csv")
print(ingredient_matrix.head())


✅ Sparse pizza ingredient matrix saved as: data/pizza_ingredient_matrix.csv
ingredient                                           Alfredo Sauce  Anchovies  \
pizza_type_id pizza_name                   category                             
bbq_ckn       The Barbecue Chicken Pizza   Chicken             0.0        0.0   
big_meat      The Big Meat Pizza           Classic             0.0        0.0   
brie_carre    The Brie Carre Pizza         Supreme             0.0        0.0   
calabrese     The Calabrese Pizza          Supreme             0.0        0.0   
cali_ckn      The California Chicken Pizza Chicken             0.0        0.0   

ingredient                                           Artichoke  Artichokes  \
pizza_type_id pizza_name                   category                          
bbq_ckn       The Barbecue Chicken Pizza   Chicken         0.0         0.0   
big_meat      The Big Meat Pizza           Classic         0.0         0.0   
brie_carre    The Brie Carre Pizza         S