**Final Things: clean the recipes dataset and export a JSON sample**

In [1]:
import pandas as pd
import json
import re
import ast

In [2]:

# --- Configuration ---
# Path of the source CSV that the main workflow loads with pd.read_csv.
input_file = 'recipes.csv'
# Path of the JSON file the main workflow writes the cleaned recipe list to.
output_file = 'recipes_clean_1000.json'
# Upper bound on the number of recipes sampled into the output; the main
# workflow keeps fewer when the cleaned dataset has fewer rows than this.
num_recipes_to_keep = 1000

# --- Data Cleaning Functions ---
def simplify_category(cat):
    """Collapse a free-form recipe category into one of three buckets.

    Args:
        cat: Raw category value. Anything that is not a non-blank string
            (e.g. NaN from pandas, None) is treated as uncategorised.

    Returns:
        "dessert", "beverages" or "savory". Dessert keywords are checked
        first, so a category matching both lists is reported as "dessert".
        Uncategorised and unmatched values fall back to "savory".
    """
    if not isinstance(cat, str) or cat.strip() == "":
        return "savory"

    cat = cat.lower()

    dessert_keywords = ["dessert", "cake", "cookie", "pie", "pudding", "brownie", "chocolate", "ice cream", "sweet"]
    beverage_keywords = ["drink", "beverage", "juice", "smoothie", "coffee", "tea", "cocktail", "punch"]

    def has_keyword(keywords):
        # A keyword only counts when it starts a word ("cookies", "teas") or
        # ends one, optionally pluralised ("cheesecake", "cupcakes"). Plain
        # substring matching also fired in the middle of unrelated words,
        # e.g. "tea" inside "steak" turned steak recipes into beverages.
        for keyword in keywords:
            word = re.escape(keyword)
            if re.search(rf"\b{word}|{word}(?:e?s)?\b", cat):
                return True
        return False

    if has_keyword(dessert_keywords):
        return "dessert"
    elif has_keyword(beverage_keywords):
        return "beverages"
    else:
        return "savory"

def parse_ingredients(ing_str):
    """Parse a serialized ingredient list into unique lowercase names.

    Three encodings are tried in order, moving on whenever the previous one
    produced nothing:
      1. a JSON array, e.g. '["flour", "sugar"]';
      2. an R-style vector, e.g. 'c("flour", "sugar")' (double-quoted items);
      3. any single- or double-quoted substrings, e.g. "['flour', 'sugar']".

    Args:
        ing_str: Raw cell value. Non-string values (NaN, None, numbers)
            yield an empty list.

    Returns:
        A list of lowercase, whitespace-trimmed ingredient strings with
        duplicates removed. Order of first appearance is preserved so the
        result is identical from run to run.
    """
    if not isinstance(ing_str, str):
        return []

    try:
        decoded = json.loads(ing_str)
    except json.JSONDecodeError:
        decoded = None

    raw_items = []
    if isinstance(decoded, list):
        # Only scalar entries are kept; nested lists/dicts/nulls are ignored.
        raw_items = [str(i) for i in decoded if isinstance(i, (str, int, float))]

    if not raw_items and ing_str.strip().startswith("c("):
        raw_items = re.findall(r'"([^"]+)"', ing_str)

    if not raw_items:
        raw_items = re.findall(r"['\"]([^'\"]+)['\"]", ing_str)

    cleaned = (item.strip().lower() for item in raw_items)

    # dict.fromkeys de-duplicates while keeping insertion order; list(set(...))
    # returned a hash-dependent order that changed between interpreter runs.
    return list(dict.fromkeys(item for item in cleaned if item))

def infer_diet(ingredients):
    """Guess a diet label from ingredient names using keyword heuristics.

    Args:
        ingredients: Iterable of ingredient names. Matching is
            case-insensitive.

    Returns:
        "non-vegetarian" if any meat/fish keyword is present, otherwise
        "vegetarian" if any dairy/egg keyword is present, otherwise "vegan".
        This is a keyword heuristic, not a guarantee: ingredients outside
        the keyword lists are not recognised.
    """
    non_veg = ["chicken", "beef", "pork", "fish", "shrimp", "bacon", "turkey"]
    dairy_eggs = ["milk", "cheese", "butter", "cream", "egg", "yogurt"]
    # Plant-based ingredients whose names contain a dairy/egg keyword. They
    # are removed before the dairy/egg check so they do not turn an otherwise
    # vegan recipe into "vegetarian".
    plant_based_lookalikes = [
        "eggplant", "butternut", "butter bean",
        "peanut butter", "almond butter", "cashew butter", "cocoa butter", "apple butter",
        "coconut milk", "almond milk", "soy milk", "oat milk", "rice milk",
        "coconut cream", "cream of coconut", "cream of tartar",
    ]

    # Joined with a newline so that a look-alike phrase can never be formed
    # across two separate ingredients (e.g. "peanut" followed by "butter").
    ingredients_str = "\n".join(str(item).lower() for item in ingredients)

    if any(item in ingredients_str for item in non_veg):
        return "non-vegetarian"
    # Additional meats are matched as whole words only, because several are
    # substrings of unrelated ingredients ("ham" in "graham").
    if re.search(r"\b(?:lamb|veal|ham|sausage|salmon|tuna|anchovy|anchovie)s?\b", ingredients_str):
        return "non-vegetarian"

    for phrase in plant_based_lookalikes:
        ingredients_str = ingredients_str.replace(phrase, " ")

    if any(item in ingredients_str for item in dairy_eggs):
        return "vegetarian"
    return "vegan"

def infer_gluten(ingredients):
    """Guess whether a recipe contains gluten using keyword heuristics.

    Args:
        ingredients: Iterable of ingredient names. Matching is
            case-insensitive.

    Returns:
        "contains gluten" if any ingredient mentions a gluten source,
        otherwise "gluten-free". This is a keyword heuristic, not a
        guarantee: gluten sources outside the keyword list are not detected.
    """
    gluten_sources = ["wheat", "flour", "barley", "pasta", "bread"]
    # Gluten-free ingredients whose names contain a gluten keyword.
    gluten_free_lookalikes = [
        "buckwheat", "breadfruit",
        "almond flour", "coconut flour", "rice flour", "corn flour",
        "tapioca flour", "chickpea flour", "potato flour",
    ]

    for item in ingredients:
        text = str(item).lower()

        # An ingredient explicitly sold as gluten-free is not a gluten source
        # even though its name mentions pasta, flour, bread, etc.
        if "gluten-free" in text or "gluten free" in text:
            continue

        for phrase in gluten_free_lookalikes:
            text = text.replace(phrase, " ")

        if any(source in text for source in gluten_sources):
            return "contains gluten"
        # "rye" is matched as a whole word: as a plain substring it also
        # occurs inside "fryer" (as in "fryer chicken").
        if re.search(r"\brye\b", text):
            return "contains gluten"

    return "gluten-free"

# --- Main Script Workflow ---
# Loads the raw CSV, drops unusable rows, derives the ingredient list plus the
# category/diet/gluten labels, samples up to `num_recipes_to_keep` recipes and
# writes them to `output_file` as a JSON array of objects.
print(f"Loading data from {input_file}...")
try:
    df = pd.read_csv(input_file)

    print("Cleaning and filtering the full dataset...")

    # 1. Drop rows with missing critical fields
    df = df.dropna(subset=["Name", "RecipeInstructions", "RecipeIngredientParts"])

    # 2. Parse ingredients and drop recipes with insufficient ingredients
    df["ingredients"] = df["RecipeIngredientParts"].apply(parse_ingredients)

    # Keep only recipes with at least two parsed ingredients, then renumber
    # the rows so the index is contiguous again.
    df = df[df["ingredients"].str.len() > 1].reset_index(drop=True)

    # 3. Apply all other cleaning functions
    df["RecipeCategory"] = df["RecipeCategory"].apply(simplify_category)
    df["diet"] = df["ingredients"].apply(infer_diet)
    df["gluten"] = df["ingredients"].apply(infer_gluten)

    # 4. Take a reproducible random sample of at most `num_recipes_to_keep`
    #    recipes from the now clean dataset
    df_sampled = df.sample(n=min(num_recipes_to_keep, len(df)), random_state=42)

    # 5. Build and save the final JSON structure
    # Output key -> source column for the nutrition values. These columns are
    # not part of the dropna subset above, so they may hold NaN.
    nutrition_columns = {
        "calories": "Calories",
        "fat": "FatContent",
        "carbs": "CarbohydrateContent",
        "protein": "ProteinContent",
    }

    recipes = []
    for _, row in df_sampled.iterrows():
        recipe = {
            "name": row["Name"],
            "category": row["RecipeCategory"],
        }
        for key, column in nutrition_columns.items():
            value = row[column]
            # json.dump would write a missing value as the bare token NaN,
            # which is not valid JSON; emit null instead.
            recipe[key] = None if pd.isna(value) else value
        recipe["instructions"] = row["RecipeInstructions"]
        recipe["ingredients"] = row["ingredients"]
        recipe["diet"] = row["diet"]
        recipe["gluten"] = row["gluten"]
        recipes.append(recipe)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(recipes, f, indent=2, ensure_ascii=False)

    print(f"✅ Successfully processed and saved {len(recipes)} recipes to {output_file}")

except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found. Please check the file path.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    # Re-raise so the traceback is visible and "Run All" stops here instead of
    # silently continuing without an output file.
    raise

Loading data from recipes.csv...
Cleaning and filtering the full dataset...
✅ Successfully processed and saved 1000 recipes to recipes_clean_1000.json
