# Cuisine Classification

This notebook trains a classifier to predict cuisine type from ingredient lists using network-based features.

## Overview

The classification process:
1. Load the ingredient network
2. Extract network-based features from recipes
3. Train a classifier (Random Forest) on recipe-cuisine pairs
4. Evaluate classifier performance
5. Test on new recipes


In [None]:
# Setup
import sys
from pathlib import Path
import networkx as nx
import pandas as pd
import numpy as np

# Put the parent of the current working directory at the front of the module
# search path so the project imports that follow (network, inference) resolve.
pipeline_root = Path.cwd().parent
_root_entry = str(pipeline_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

from network.graph import IngredientGraph
from inference.cuisine_classifier import CuisineClassifier


## Step 1: Load Network and Data


In [None]:
# Load the ingredient network from GraphML.
graph_path = Path("./data/ingredient_network.graphml")
if not graph_path.exists():
    # Fail fast: every later cell needs `ing_graph`, so merely printing a
    # warning here would only resurface as a NameError further down.
    raise FileNotFoundError(
        f"Network not found: {graph_path}. "
        "Please run 01_build_network.ipynb first!"
    )

# NOTE(review): nx.read_graphml yields str node ids unless node_type is given,
# while the recipe-preparation cell casts ingredient ids to int — confirm that
# IngredientGraph reconciles the two.
graph = nx.read_graphml(graph_path)
ing_graph = IngredientGraph(graph)
print(f"Loaded network: {ing_graph.num_nodes:,} nodes, {ing_graph.num_edges:,} edges")

# Load the encoded recipe table; pd.read_parquet raises if the file is absent.
data_path = Path("../preprocess_pipeline/data/encoded_combined_datasets_with_cuisine_encoded.parquet")
df = pd.read_parquet(data_path)
print(f"\nLoaded {len(df):,} recipes")


## Step 2: Prepare Training Data

Extract recipes and cuisine labels for training.


In [None]:
# Prepare recipes and labels: two parallel lists, one ingredient-id list and
# one cuisine id per usable row of `df`.

# Container types accepted for list-valued cells. numpy arrays are included
# because parquet list columns are commonly materialised as ndarray objects,
# which a plain (list, tuple) check would silently reject.
_SEQUENCE_TYPES = (list, tuple, np.ndarray)


def _is_missing(value):
    """Return True for None and float NaN."""
    if value is None:
        return True
    return isinstance(value, (float, np.floating)) and bool(np.isnan(value))


def _parse_ingredients(raw):
    """Return the usable ingredient ids in ``raw`` as a list of ints.

    Falsy entries (0, None, '') and NaN are skipped. A cell that is not a
    list, tuple or numpy array yields an empty list.
    """
    if not isinstance(raw, _SEQUENCE_TYPES):
        return []
    return [int(value) for value in raw if not _is_missing(value) and value]


def _parse_cuisine(raw):
    """Return the cuisine id for a cell as an int, or None if unusable.

    For a sequence cell only the first element is used. Scalar cells may be
    Python or numpy numbers. Zero, None, NaN and empty sequences give None.
    """
    if isinstance(raw, _SEQUENCE_TYPES):
        if len(raw) == 0:
            return None
        raw = raw[0]
    elif not isinstance(raw, (int, float, np.integer, np.floating)):
        return None
    if _is_missing(raw) or not raw:
        return None
    return int(raw)


recipes = []
cuisines = []

# Iterate the two columns directly instead of df.iterrows(): it avoids building
# a Series per row, and a missing column raises KeyError here rather than
# silently producing zero training pairs.
for raw_ingredients, raw_cuisine in zip(df['encoded_ingredients'], df['cuisine_encoded']):
    ing_list = _parse_ingredients(raw_ingredients)
    cuisine_id = _parse_cuisine(raw_cuisine)

    # Keep only rows that have at least one ingredient and a cuisine label.
    if ing_list and cuisine_id is not None:
        recipes.append(ing_list)
        cuisines.append(cuisine_id)

print(f"Prepared {len(recipes)} recipe-cuisine pairs")
print(f"Unique cuisines: {len(set(cuisines))}")


## Step 3: Train Classifier

Train a Random Forest classifier using network-based features.


In [None]:
# Instantiate the classifier with the loaded ingredient graph.
classifier = CuisineClassifier(ing_graph)

# Ask the classifier to split the recipe/cuisine pairs; test_size=0.2 is
# passed straight through to prepare_training_data.
_split = classifier.prepare_training_data(recipes=recipes, cuisines=cuisines, test_size=0.2)
X_train, X_test, y_train, y_test = _split

# Fit on the training portion only; X_test / y_test are used by the
# evaluation cell.
classifier.train(X_train, y_train)
print("Classifier trained!")


## Step 4: Evaluate Classifier

Evaluate the trained classifier's performance on the held-out test set.


In [None]:
# Score the trained classifier on the test split produced during training.
results = classifier.evaluate(X_test, y_test)

# `results` is read by key: 'accuracy' first, then 'classification_report'.
test_accuracy = results['accuracy']
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"\nClassification Report:")
report_text = results['classification_report']
print(report_text)


## Step 5: Test on New Recipes

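Spot-check the classifier on a few example recipes drawn from the prepared dataset. These recipes come from the same pool that was split for training, so some of them may have been seen during training; treat the output as a sanity check rather than an unbiased estimate.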


In [None]:
# Test on sample recipes, identified by their position in `recipes`/`cuisines`.
# NOTE(review): these samples come from the same lists that were passed to
# prepare_training_data, so some may have been part of the training split.
sample_indices = [0, 100, 500]

# An empty list marks a sample index that lies beyond the prepared data.
test_recipes = [
    recipes[idx] if idx < len(recipes) else []
    for idx in sample_indices
]

print("Testing classifier on sample recipes:")
for i, (idx, recipe) in enumerate(zip(sample_indices, test_recipes)):
    if not recipe:
        continue

    predicted_cuisine, confidence = classifier.predict(recipe)

    # Look up the label at the recipe's own dataset index. The loop counter
    # `i` only numbers the printed samples; using it here would compare the
    # prediction against the label of an unrelated recipe.
    actual_cuisine = cuisines[idx] if idx < len(cuisines) else None

    print(f"\n  Recipe {i+1}:")
    print(f"    Ingredients: {len(recipe)} ingredients")
    print(f"    Predicted cuisine: {predicted_cuisine} (confidence: {confidence:.4f})")
    if actual_cuisine is not None:
        print(f"    Actual cuisine: {actual_cuisine}")
        print(f"    Correct: {predicted_cuisine == actual_cuisine}")
