In [None]:
# 1. Upload files to Google Drive first
# 2. Then in Colab:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/MyDrive/Recipe_dataset')

# Verify
!ls -lh

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 1.3G
-rw------- 1 root root 203M Feb 10 13:41 cuisine_classifier.pkl
-rw------- 1 root root  653 Feb 10 13:54 dataset_final_stats.json
-rw------- 1 root root  12M Feb  8 10:52 epicurious_processed.jsonl
-rw------- 1 root root  53M Feb  8 09:46 epi_r.csv
-rw------- 1 root root 374M Feb 10 13:42 final_recipes_enriched.jsonl
-rw------- 1 root root 308M Feb  8 09:52 food_com_cleaned.jsonl
-rw------- 1 root root 319M Feb  8 10:45 food_com_with_cuisines.jsonl
-rw------- 1 root root 1.7M Feb 10 13:54 sample_recipes_1k.jsonl
-rw------- 1 root root  12M Feb  8 09:46 train.json


In [None]:
import os
import json
import pandas as pd

# Check files exist
files_needed = ['train.json', 'epi_r.csv', 'food_com_cleaned.jsonl']

for file in files_needed:
    if os.path.exists(file):
        size = os.path.getsize(file) / (1024*1024)  # MB
        print(f"✓ {file} ({size:.1f} MB)")
    else:
        print(f"✗ {file} - NOT FOUND!")

# Verify Yummly
with open('train.json', 'r') as f:
    yummly = json.load(f)
    print(f"\n✓ Yummly: {len(yummly)} recipes")
    print(f"  Sample: {yummly[0]['cuisine']}")

# Verify Epicurious
epi = pd.read_csv('epi_r.csv')
print(f"\n✓ Epicurious: {len(epi)} recipes")
print(f"  Columns: {len(epi.columns)}")

# Verify Food.com
food_count = 0
with open('food_com_cleaned.jsonl', 'r') as f:
    for line in f:
        food_count += 1
print(f"\n✓ Food.com: {food_count} recipes")

print("\n" + "="*50)
print("✅ ALL FILES READY!")
print("="*50)

✓ train.json (11.8 MB)
✓ epi_r.csv (52.7 MB)
✓ food_com_cleaned.jsonl (307.4 MB)

✓ Yummly: 39774 recipes
  Sample: greek

✓ Epicurious: 20052 recipes
  Columns: 680

✓ Food.com: 226101 recipes

✅ ALL FILES READY!


In [None]:
import json
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

print("="*60)
print("TRAINING CUISINE CLASSIFIER")
print("="*60)

# Load Yummly data (should already be loaded from Step 1)
print("\n[1/5] Loading Yummly dataset...")
with open('train.json', 'r') as f:
    yummly_data = json.load(f)

print(f"✓ Loaded {len(yummly_data)} recipes")
print(f"✓ {len(set([r['cuisine'] for r in yummly_data]))} unique cuisines")

# Prepare training data
print("\n[2/5] Preparing training data...")
X = []  # Ingredient text
y = []  # Cuisine labels

for recipe in yummly_data:
    # Join ingredients into single text string
    ingredients_text = ' '.join(recipe['ingredients']).lower()
    X.append(ingredients_text)
    y.append(recipe['cuisine'])

print(f"✓ Prepared {len(X)} training samples")

# Split into train/test
print("\n[3/5] Splitting data (80% train, 20% test)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Ensure balanced split
)

print(f"✓ Training set: {len(X_train)} samples")
print(f"✓ Test set: {len(X_test)} samples")

# Create TF-IDF features
print("\n[4/5] Creating TF-IDF features...")
print("This converts ingredient text into numerical features...")

vectorizer = TfidfVectorizer(
    max_features=3000,      # Use top 3000 most important ingredient words
    ngram_range=(1, 2),     # Use single words and word pairs
    min_df=2,               # Ignore very rare ingredients
    max_df=0.8,             # Ignore super common words
    lowercase=True
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"✓ Feature matrix shape: {X_train_vec.shape}")
print(f"  ({X_train_vec.shape[0]} samples × {X_train_vec.shape[1]} features)")

# Train Random Forest classifier
print("\n[5/5] Training Random Forest classifier...")
print("This may take 1-2 minutes...")

classifier = RandomForestClassifier(
    n_estimators=200,       # Use 200 decision trees
    max_depth=30,           # Maximum tree depth
    min_samples_split=5,    # Minimum samples to split a node
    random_state=42,        # For reproducibility
    n_jobs=-1,              # Use all CPU cores
    verbose=1               # Show progress
)

classifier.fit(X_train_vec, y_train)

print("✓ Training complete!")

# Evaluate the model
print("\n" + "="*60)
print("MODEL EVALUATION")
print("="*60)

y_pred = classifier.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n✓ Overall Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")

# Detailed classification report
print("\nDetailed Performance by Cuisine:")
print(classification_report(y_test, y_pred, digits=3))

# Save the trained model
print("\n" + "="*60)
print("SAVING MODEL")
print("="*60)

model_data = {
    'vectorizer': vectorizer,
    'classifier': classifier,
    'accuracy': accuracy,
    'cuisines': list(set(y)),
    'training_date': str(np.datetime64('today'))
}

with open('cuisine_classifier.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("✓ Model saved to: cuisine_classifier.pkl")
print(f"✓ Model accuracy: {accuracy:.1%}")
print(f"✓ Number of cuisines: {len(set(y))}")

# Test the model with examples
print("\n" + "="*60)
print("TESTING MODEL WITH EXAMPLES")
print("="*60)

test_examples = [
    ['pasta', 'tomato sauce', 'basil', 'parmesan', 'olive oil'],
    ['soy sauce', 'rice', 'ginger', 'garlic', 'sesame oil'],
    ['tortilla', 'beans', 'salsa', 'cheese', 'avocado'],
    ['curry powder', 'coconut milk', 'rice', 'turmeric', 'garam masala'],
    ['chicken', 'potatoes', 'carrots', 'onion', 'thyme']
]

for ingredients in test_examples:
    ing_text = ' '.join(ingredients).lower()
    vec = vectorizer.transform([ing_text])

    cuisine = classifier.predict(vec)[0]
    probabilities = classifier.predict_proba(vec)[0]
    confidence = max(probabilities)

    print(f"\nIngredients: {', '.join(ingredients)}")
    print(f"→ Predicted: {cuisine} (confidence: {confidence:.2%})")

print("\n" + "="*60)
print("✓ CUISINE CLASSIFIER READY!")
print("="*60)

TRAINING CUISINE CLASSIFIER

[1/5] Loading Yummly dataset...
✓ Loaded 39774 recipes
✓ 20 unique cuisines

[2/5] Preparing training data...
✓ Prepared 39774 training samples

[3/5] Splitting data (80% train, 20% test)...
✓ Training set: 31819 samples
✓ Test set: 7955 samples

[4/5] Creating TF-IDF features...
This converts ingredient text into numerical features...
✓ Feature matrix shape: (31819, 3000)
  (31819 samples × 3000 features)

[5/5] Training Random Forest classifier...
This may take 1-2 minutes...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   53.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s


✓ Training complete!

MODEL EVALUATION


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.4s finished



✓ Overall Accuracy: 0.679 (67.9%)

Detailed Performance by Cuisine:
              precision    recall  f1-score   support

   brazilian      1.000     0.237     0.383        93
     british      1.000     0.012     0.025       161
cajun_creole      0.827     0.618     0.707       309
     chinese      0.708     0.856     0.775       535
    filipino      0.914     0.212     0.344       151
      french      0.524     0.285     0.370       529
       greek      0.792     0.438     0.564       235
      indian      0.823     0.899     0.859       601
       irish      1.000     0.015     0.030       133
     italian      0.640     0.878     0.740      1568
    jamaican      1.000     0.390     0.562       105
    japanese      0.845     0.539     0.658       284
      korean      0.882     0.542     0.672       166
     mexican      0.801     0.914     0.854      1288
    moroccan      0.888     0.482     0.625       164
     russian      1.000     0.082     0.151        98
 southern_us

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      |


Ingredients: soy sauce, rice, ginger, garlic, sesame oil
→ Predicted: chinese (confidence: 48.88%)

Ingredients: tortilla, beans, salsa, cheese, avocado
→ Predicted: mexican (confidence: 64.75%)


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished



Ingredients: curry powder, coconut milk, rice, turmeric, garam masala
→ Predicted: indian (confidence: 72.04%)

Ingredients: chicken, potatoes, carrots, onion, thyme
→ Predicted: southern_us (confidence: 16.30%)

✓ CUISINE CLASSIFIER READY!


In [None]:
# FAST VERSION: Cuisine Prediction with Batch Processing
## 10-20x Speed Improvement

import json
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter

print("="*60)
print("ADDING CUISINES TO FOOD.COM RECIPES (FAST VERSION)")
print("="*60)

# Load trained model
print("\n[1/4] Loading trained classifier...")
with open('cuisine_classifier.pkl', 'rb') as f:
    model_data = pickle.load(f)

vectorizer = model_data['vectorizer']
classifier = model_data['classifier']
print(f"✓ Model loaded (accuracy: {model_data['accuracy']:.1%})")

# Load your cleaned Food.com recipes
print("\n[2/4] Loading Food.com recipes...")
food_com_recipes = []

with open('food_com_cleaned.jsonl', 'r') as f:
    for line in f:
        food_com_recipes.append(json.loads(line))

print(f"✓ Loaded {len(food_com_recipes):,} recipes")

# FAST BATCH PROCESSING
print("\n[3/4] Predicting cuisines (BATCH MODE - MUCH FASTER!)...")
print("Estimated time: 5-15 minutes instead of 1-2 hours!")

# Prepare all ingredient texts at once
ingredient_texts = []
valid_indices = []  # Track which recipes have ingredients

for idx, recipe in enumerate(food_com_recipes):
    ingredients = recipe.get('ingredients', [])
    if ingredients and len(ingredients) > 0:
        ingredient_texts.append(' '.join(ingredients).lower())
        valid_indices.append(idx)
    else:
        # No ingredients - set default immediately
        recipe['cuisine'] = 'american'
        recipe['cuisine_confidence'] = 0.0

print(f"  Processing {len(ingredient_texts):,} recipes with ingredients...")

# BATCH PROCESSING PARAMETERS
BATCH_SIZE = 10000  # Process 10k recipes at a time (adjust based on RAM)

# Process in batches
all_cuisines = []
all_confidences = []

num_batches = (len(ingredient_texts) + BATCH_SIZE - 1) // BATCH_SIZE

for batch_idx in tqdm(range(num_batches), desc="Batch processing"):
    # Get batch
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, len(ingredient_texts))
    batch_texts = ingredient_texts[start_idx:end_idx]

    # Vectorize entire batch at once (FAST!)
    batch_vectors = vectorizer.transform(batch_texts)

    # Predict entire batch at once (FAST!)
    batch_cuisines = classifier.predict(batch_vectors)
    batch_probabilities = classifier.predict_proba(batch_vectors)
    batch_confidences = np.max(batch_probabilities, axis=1)

    # Store results
    all_cuisines.extend(batch_cuisines)
    all_confidences.extend(batch_confidences)

# Assign results back to recipes
for i, recipe_idx in enumerate(valid_indices):
    food_com_recipes[recipe_idx]['cuisine'] = all_cuisines[i]
    food_com_recipes[recipe_idx]['cuisine_confidence'] = round(float(all_confidences[i]), 3)

print(f"\n✓ All {len(food_com_recipes):,} recipes now have cuisine predictions!")

# Save updated recipes WITH STREAMING (memory efficient)
print("\n[4/4] Saving results...")
output_file = 'food_com_with_cuisines.jsonl'

with open(output_file, 'w') as f:
    for recipe in tqdm(food_com_recipes, desc="Saving"):
        f.write(json.dumps(recipe) + '\n')

print(f"✓ Saved to: {output_file}")

# Show statistics
print("\n" + "="*60)
print("CUISINE DISTRIBUTION IN FOOD.COM")
print("="*60)

cuisine_dist = Counter([r['cuisine'] for r in food_com_recipes])

print(f"\nTop 15 cuisines:")
for cuisine, count in cuisine_dist.most_common(15):
    pct = (count / len(food_com_recipes)) * 100
    print(f"  {cuisine:15s}: {count:6,} ({pct:5.1f}%)")

print(f"\nTotal cuisines: {len(cuisine_dist)}")

# Confidence statistics
confidences = [r['cuisine_confidence'] for r in food_com_recipes]
print(f"\nPrediction Confidence Stats:")
print(f"  Mean:   {np.mean(confidences):.3f}")
print(f"  Median: {np.median(confidences):.3f}")
print(f"  Std:    {np.std(confidences):.3f}")
print(f"  Min:    {np.min(confidences):.3f}")
print(f"  Max:    {np.max(confidences):.3f}")

# Show confidence distribution
low_conf = sum(1 for c in confidences if c < 0.5)
mid_conf = sum(1 for c in confidences if 0.5 <= c < 0.7)
high_conf = sum(1 for c in confidences if c >= 0.7)

print(f"\nConfidence Distribution:")
print(f"  High confidence (≥0.7): {high_conf:6,} ({high_conf/len(confidences)*100:.1f}%)")
print(f"  Mid confidence (0.5-0.7): {mid_conf:6,} ({mid_conf/len(confidences)*100:.1f}%)")
print(f"  Low confidence (<0.5):  {low_conf:6,} ({low_conf/len(confidences)*100:.1f}%)")

# Sample predictions
print("\n" + "="*60)
print("SAMPLE PREDICTIONS")
print("="*60)

import random
for _ in range(5):
    recipe = random.choice(food_com_recipes)
    print(f"\nTitle: {recipe['title']}")
    ingredients = recipe.get('ingredients', [])
    if ingredients:
        print(f"Ingredients: {', '.join(ingredients[:5])}...")
    print(f"Predicted: {recipe['cuisine']} (confidence: {recipe['cuisine_confidence']:.2%})")

print("\n" + "="*60)
print("✓ STEP 3 COMPLETE!")
print("="*60)

ADDING CUISINES TO FOOD.COM RECIPES (FAST VERSION)

[1/4] Loading trained classifier...
✓ Model loaded (accuracy: 67.9%)

[2/4] Loading Food.com recipes...
✓ Loaded 226,101 recipes

[3/4] Predicting cuisines (BATCH MODE - MUCH FASTER!)...
Estimated time: 5-15 minutes instead of 1-2 hours!
  Processing 226,101 recipes with ingredients...


Batch processing:   0%|          | 0/23 [00:00<?, ?it/s][Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.4s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.4s finished
Batch processing:   4%|▍         | 1/23 [00:01<00:22,  1.03s/it][Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 con


✓ All 226,101 recipes now have cuisine predictions!

[4/4] Saving results...


Saving: 100%|██████████| 226101/226101 [00:08<00:00, 26472.75it/s]


✓ Saved to: food_com_with_cuisines.jsonl

CUISINE DISTRIBUTION IN FOOD.COM

Top 15 cuisines:
  southern_us    : 99,653 ( 44.1%)
  italian        : 64,697 ( 28.6%)
  mexican        : 30,026 ( 13.3%)
  chinese        :  9,323 (  4.1%)
  indian         :  7,906 (  3.5%)
  french         :  4,582 (  2.0%)
  greek          :  2,205 (  1.0%)
  thai           :  2,071 (  0.9%)
  cajun_creole   :  1,906 (  0.8%)
  japanese       :  1,252 (  0.6%)
  moroccan       :  1,138 (  0.5%)
  korean         :    536 (  0.2%)
  filipino       :    215 (  0.1%)
  vietnamese     :    212 (  0.1%)
  jamaican       :    129 (  0.1%)

Total cuisines: 20

Prediction Confidence Stats:
  Mean:   0.344
  Median: 0.281
  Std:    0.183
  Min:    0.102
  Max:    0.982

Confidence Distribution:
  High confidence (≥0.7): 16,520 (7.3%)
  Mid confidence (0.5-0.7): 24,710 (10.9%)
  Low confidence (<0.5):  184,871 (81.8%)

SAMPLE PREDICTIONS

Title: cranberry sauce whole berry
Ingredients: cranberries, orange juice, orang

In [None]:
import pandas as pd

print("="*60)
print("PROCESSING EPICURIOUS DATASET")
print("="*60)

# Load Epicurious
print("\n[1/3] Loading Epicurious...")
epi_df = pd.read_csv('epi_r.csv')
print(f"✓ Loaded {len(epi_df)} recipes")
print(f"✓ Columns: {len(epi_df.columns)}")

# Define dietary and cuisine columns
dietary_cols = {
    'vegetarian': 'vegetarian',
    'vegan': 'vegan',
    'wheat/gluten-free': 'gluten-free',
    'peanut free': 'peanut-free',
    'soy free': 'soy-free',
    'tree nut free': 'tree-nut-free',
    'dairy free': 'dairy-free',
    'egg free': 'egg-free',
    'low-cal': 'low-calorie',
    'low-fat': 'low-fat',
    'low-sodium': 'low-sodium',
    'high-protein': 'high-protein',
    'paleo': 'paleo',
    'kosher': 'kosher',
    'pescatarian': 'pescatarian'
}

cuisine_cols = ['italian', 'mexican', 'chinese', 'japanese', 'thai',
                'indian', 'french', 'greek', 'mediterranean', 'spanish',
                'korean', 'vietnamese', 'middle eastern', 'moroccan']

# Process recipes
print("\n[2/3] Processing recipes...")
epi_recipes = []

for idx, row in tqdm(epi_df.iterrows(), total=len(epi_df), desc="Processing"):
    # Extract dietary tags
    dietary_tags = []
    for col, tag_name in dietary_cols.items():
        if col in row.index and row[col] == 1.0:
            dietary_tags.append(tag_name)

    # Extract cuisine
    cuisine = 'american'  # default
    for col in cuisine_cols:
        if col in row.index and row[col] == 1.0:
            cuisine = col.replace(' ', '-')
            break

    # Extract meal type
    meal_types = []
    if 'dessert' in row.index and row['dessert'] == 1.0:
        meal_types.append('dessert')
    if 'breakfast' in row.index and row['breakfast'] == 1.0:
        meal_types.append('breakfast')
    if 'lunch' in row.index and row['lunch'] == 1.0:
        meal_types.append('lunch')
    if not meal_types:
        meal_types = ['dinner']

    # Calculate nutrition
    calories = row.get('calories', 0)
    protein_g = row.get('protein', 0)
    fat_g = row.get('fat', 0)
    sodium_mg = row.get('sodium', 0)

    # Estimate carbs: Calories = (4*protein) + (9*fat) + (4*carbs)
    protein_cal = protein_g * 4
    fat_cal = fat_g * 9
    remaining_cal = max(0, calories - protein_cal - fat_cal)
    carbs_g = remaining_cal / 4

    recipe = {
        'id': f"epi_{idx}",
        'source_id': idx,
        'title': str(row['title']).strip(),
        'description': '',
        'ingredients': [],  # Epicurious doesn't have ingredient lists
        'instructions': [],
        'n_ingredients': 0,
        'n_steps': 0,
        'total_time_min': 0,
        'submitted': None,

        # Nutrition (actual grams!)
        'nutrition': {
            'calories': round(calories, 1),
            'protein_g': round(protein_g, 1),
            'fat_g': round(fat_g, 1),
            'carbs_g': round(carbs_g, 1),
            'sodium_mg': round(sodium_mg, 1),
            'fiber_g': round(carbs_g * 0.1, 1),  # Estimate
            'sugar_g': 0,
            'saturated_fat_g': 0
        },
        'nutrition_valid': True,

        # REAL dietary tags!
        'dietary_tags': dietary_tags,
        'cuisine': cuisine,
        'cuisine_confidence': 1.0,  # These are verified, not predicted
        'meal_types': meal_types,
        'difficulty': 'medium',

        # Quality
        'rating': round(row.get('rating', 0), 2),

        # To be filled
        'regions': [],
        'allergens': [],

        'source': 'epicurious'
    }

    epi_recipes.append(recipe)

print(f"✓ Processed {len(epi_recipes)} Epicurious recipes")

# Save
print("\n[3/3] Saving Epicurious recipes...")
with open('epicurious_processed.jsonl', 'w') as f:
    for recipe in epi_recipes:
        f.write(json.dumps(recipe) + '\n')

print("✓ Saved to: epicurious_processed.jsonl")

# Statistics
print("\n" + "="*60)
print("EPICURIOUS STATISTICS")
print("="*60)

# Dietary tags
all_dietary = []
for r in epi_recipes:
    all_dietary.extend(r['dietary_tags'])

dietary_dist = Counter(all_dietary)
print(f"\nTop dietary tags:")
for tag, count in dietary_dist.most_common(10):
    print(f"  {tag}: {count}")

# Cuisines
cuisine_dist = Counter([r['cuisine'] for r in epi_recipes])
print(f"\nTop cuisines:")
for cuisine, count in cuisine_dist.most_common(10):
    print(f"  {cuisine}: {count}")

print("\n" + "="*60)
print("✓ STEP 4 COMPLETE!")
print("="*60)

PROCESSING EPICURIOUS DATASET

[1/3] Loading Epicurious...
✓ Loaded 20052 recipes
✓ Columns: 680

[2/3] Processing recipes...


Processing: 100%|██████████| 20052/20052 [00:05<00:00, 3542.99it/s]


✓ Processed 20052 Epicurious recipes

[3/3] Saving Epicurious recipes...
✓ Saved to: epicurious_processed.jsonl

EPICURIOUS STATISTICS

Top dietary tags:
  peanut-free: 8390
  soy-free: 8088
  tree-nut-free: 7044
  vegetarian: 6846
  kosher: 6175
  pescatarian: 6042
  gluten-free: 4906
  dairy-free: 3206
  vegan: 1851
  paleo: 779

Top cuisines:
  american: 20052

✓ STEP 4 COMPLETE!


In [None]:
print("="*60)
print("ADDING REGIONAL AVAILABILITY")
print("="*60)

def add_regional_availability(recipe):
    """Infer regional availability from cuisine and ingredients"""

    # Cuisine → region mapping
    cuisine_to_region = {
        'italian': 'europe',
        'french': 'europe',
        'greek': 'mediterranean',
        'spanish': 'europe',
        'mediterranean': 'mediterranean',
        'british': 'europe',
        'mexican': 'latin_america',
        'brazilian': 'latin_america',
        'cajun_creole': 'north_america',
        'southern_us': 'north_america',
        'american': 'north_america',
        'chinese': 'asia',
        'japanese': 'asia',
        'thai': 'asia',
        'indian': 'asia',
        'korean': 'asia',
        'vietnamese': 'asia',
        'filipino': 'asia',
        'moroccan': 'middle_east',
        'middle-eastern': 'middle_east',
        'irish': 'europe',
        'russian': 'europe',
        'jamaican': 'latin_america'
    }

    regions = set()

    # Add region from cuisine
    cuisine = recipe.get('cuisine', 'american')
    if cuisine in cuisine_to_region:
        regions.add(cuisine_to_region[cuisine])

    # Check for exotic/rare ingredients
    exotic_keywords = [
        'saffron', 'tamarind', 'miso', 'tahini', 'harissa',
        'sumac', 'za\'atar', 'galangal', 'lemongrass', 'kaffir lime',
        'fish sauce', 'garam masala', 'cardamom pods'
    ]

    ingredients = recipe.get('ingredients', [])
    if ingredients:
        ingredients_text = ' '.join(ingredients).lower()
        has_exotic = any(kw in ingredients_text for kw in exotic_keywords)

        # If no exotic ingredients, it's more globally available
        if not has_exotic:
            regions.add('global')
    else:
        # If no ingredients list, mark as global
        regions.add('global')

    # Ensure at least one region
    if not regions:
        regions.add('global')

    return list(regions)

# Add to Food.com recipes
print("\n[1/2] Adding regions to Food.com recipes...")
for recipe in tqdm(food_com_recipes, desc="Food.com"):
    recipe['regions'] = add_regional_availability(recipe)

# Add to Epicurious recipes (if you processed them)
print("\n[2/2] Adding regions to Epicurious recipes...")
for recipe in tqdm(epi_recipes, desc="Epicurious"):
    recipe['regions'] = add_regional_availability(recipe)

print("\n✓ Regional availability added!")

# Show distribution
all_regions = []
for r in food_com_recipes + epi_recipes:
    all_regions.extend(r['regions'])

region_dist = Counter(all_regions)
print(f"\nRegion distribution:")
for region, count in region_dist.most_common():
    print(f"  {region}: {count:,}")

print("\n" + "="*60)
print("✓ STEP 5 COMPLETE!")
print("="*60)

ADDING REGIONAL AVAILABILITY

[1/2] Adding regions to Food.com recipes...


Food.com: 100%|██████████| 226101/226101 [00:02<00:00, 77390.30it/s]



[2/2] Adding regions to Epicurious recipes...


Epicurious: 100%|██████████| 20052/20052 [00:00<00:00, 206158.42it/s]



✓ Regional availability added!

Region distribution:
  global: 241,259
  north_america: 121,611
  europe: 69,498
  latin_america: 30,186
  asia: 21,515
  mediterranean: 2,205
  middle_east: 1,138

✓ STEP 5 COMPLETE!


In [None]:
print("="*60)
print("ADDING ALLERGEN DETECTION")
print("="*60)

def detect_allergens(ingredients_list):
    """Detect common allergens in ingredient list"""

    allergen_keywords = {
        'dairy': ['milk', 'cheese', 'butter', 'cream', 'yogurt', 'whey',
                  'casein', 'ghee', 'buttermilk', 'sour cream', 'ricotta',
                  'mozzarella', 'cheddar', 'parmesan', 'brie'],
        'eggs': ['egg', 'eggs', 'mayonnaise', 'mayo', 'meringue'],
        'peanuts': ['peanut', 'peanuts', 'peanut butter'],
        'tree_nuts': ['almond', 'walnut', 'cashew', 'pecan', 'pistachio',
                      'hazelnut', 'macadamia', 'pine nut', 'brazil nut'],
        'soy': ['soy', 'tofu', 'edamame', 'miso', 'tempeh', 'soy sauce',
                'tamari', 'soybean'],
        'wheat': ['flour', 'wheat', 'bread', 'pasta', 'couscous', 'bulgur',
                  'semolina', 'farro', 'spelt', 'wheat germ'],
        'fish': ['salmon', 'tuna', 'cod', 'tilapia', 'fish', 'anchovy',
                 'sardine', 'mackerel', 'halibut', 'trout'],
        'shellfish': ['shrimp', 'crab', 'lobster', 'clam', 'mussel',
                      'oyster', 'scallop', 'prawn', 'crayfish']
    }

    if not ingredients_list:
        return []

    ingredients_text = ' '.join(ingredients_list).lower()
    detected_allergens = []

    for allergen, keywords in allergen_keywords.items():
        if any(kw in ingredients_text for kw in keywords):
            detected_allergens.append(allergen)

    return detected_allergens

# Add to all recipes
print("\n[1/2] Detecting allergens in Food.com...")
for recipe in tqdm(food_com_recipes, desc="Food.com"):
    recipe['allergens'] = detect_allergens(recipe.get('ingredients', []))

print("\n[2/2] Detecting allergens in Epicurious...")
for recipe in tqdm(epi_recipes, desc="Epicurious"):
    recipe['allergens'] = detect_allergens(recipe.get('ingredients', []))

print("\n✓ Allergen detection complete!")

# Statistics
all_allergens = []
for r in food_com_recipes + epi_recipes:
    all_allergens.extend(r['allergens'])

allergen_dist = Counter(all_allergens)
print(f"\nAllergen distribution:")
for allergen, count in allergen_dist.most_common():
    print(f"  {allergen}: {count:,}")

print("\n" + "="*60)
print("✓ STEP 6 COMPLETE!")
print("="*60)

ADDING ALLERGEN DETECTION

[1/2] Detecting allergens in Food.com...


Food.com: 100%|██████████| 226101/226101 [00:05<00:00, 43904.91it/s]



[2/2] Detecting allergens in Epicurious...


Epicurious: 100%|██████████| 20052/20052 [00:00<00:00, 595162.40it/s]



✓ Allergen detection complete!

Allergen distribution:
  dairy: 141,905
  wheat: 79,128
  eggs: 70,288
  tree_nuts: 24,413
  soy: 14,241
  shellfish: 8,715
  fish: 8,705
  peanuts: 7,366

✓ STEP 6 COMPLETE!


In [None]:
print("="*60)
print("ADDING DIFFICULTY ESTIMATION")
print("="*60)

def estimate_difficulty(n_steps, total_time_min, n_ingredients):
    """Estimate recipe difficulty from steps, time, and ingredients"""

    # Simple scoring system
    score = 0

    # Steps contribution
    if n_steps <= 5:
        score += 1
    elif n_steps <= 10:
        score += 2
    else:
        score += 3

    # Time contribution
    if total_time_min <= 30:
        score += 1
    elif total_time_min <= 60:
        score += 2
    else:
        score += 3

    # Ingredients contribution
    if n_ingredients <= 5:
        score += 1
    elif n_ingredients <= 10:
        score += 2
    else:
        score += 3

    # Classify based on total score
    if score <= 4:
        return 'easy'
    elif score <= 7:
        return 'medium'
    else:
        return 'hard'

# Add to all recipes
print("\n[1/2] Estimating difficulty for Food.com...")
for recipe in tqdm(food_com_recipes, desc="Food.com"):
    if not recipe.get('difficulty'):  # Only if not set
        recipe['difficulty'] = estimate_difficulty(
            recipe.get('n_steps', 0),
            recipe.get('total_time_min', 0),
            recipe.get('n_ingredients', 0)
        )

print("\n[2/2] Estimating difficulty for Epicurious...")
for recipe in tqdm(epi_recipes, desc="Epicurious"):
    if not recipe.get('difficulty'):
        recipe['difficulty'] = 'medium'  # Default for Epicurious

print("\n✓ Difficulty estimation complete!")

# Statistics
all_difficulties = [r['difficulty'] for r in food_com_recipes + epi_recipes]
diff_dist = Counter(all_difficulties)
print(f"\nDifficulty distribution:")
for difficulty, count in diff_dist.items():
    pct = (count / len(all_difficulties)) * 100
    print(f"  {difficulty}: {count:,} ({pct:.1f}%)")

print("\n" + "="*60)
print("✓ STEP 7 COMPLETE!")
print("="*60)

ADDING DIFFICULTY ESTIMATION

[1/2] Estimating difficulty for Food.com...


Food.com: 100%|██████████| 226101/226101 [00:00<00:00, 966221.08it/s]



[2/2] Estimating difficulty for Epicurious...


Epicurious: 100%|██████████| 20052/20052 [00:00<00:00, 1387216.86it/s]



✓ Difficulty estimation complete!

Difficulty distribution:
  medium: 187,655 (76.2%)
  easy: 58,498 (23.8%)

✓ STEP 7 COMPLETE!


In [None]:
import numpy as np
from collections import Counter

print("="*60)
print("MERGING DATASETS & QUALITY CHECKS (FIXED)")
print("="*60)

# 1️⃣ Merge datasets
print("\n[1/4] Merging datasets...")
all_recipes = food_com_recipes + epi_recipes

print(f"✓ Total recipes: {len(all_recipes):,}")
print(f"  - Food.com: {len(food_com_recipes):,}")
print(f"  - Epicurious: {len(epi_recipes):,}")

# 2️⃣ Quality filtering (SOURCE-AWARE)
print("\n[2/4] Filtering invalid recipes...")

valid_recipes = []
filtered_out = []

for recipe in all_recipes:
    source = recipe.get('source', '')

    # Core requirements (must-have for RAG)
    has_title = recipe.get('title', '').strip() != ''
    has_cuisine = recipe.get('cuisine', '') != ''

    # Source-specific requirements
    if source == 'epicurious':
        is_valid = has_title and has_cuisine and recipe.get('nutrition_valid', False)
    else:  # food.com and others
        is_valid = has_title and has_cuisine

    if is_valid:
        valid_recipes.append(recipe)
    else:
        filtered_out.append(recipe)

print(f"✓ Valid recipes: {len(valid_recipes):,}")
print(f"✗ Filtered out: {len(filtered_out):,}")

# 3️⃣ Final statistics
print("\n[3/4] Generating final statistics...")

stats = {
    'total_recipes': len(valid_recipes),
    'sources': Counter([r['source'] for r in valid_recipes]),
    'unique_cuisines': len(set(r['cuisine'] for r in valid_recipes)),
    'avg_ingredients': np.mean(
        [r['n_ingredients'] for r in valid_recipes if r.get('n_ingredients', 0) > 0]
    ),
    'avg_time_min': np.mean(
        [r['total_time_min'] for r in valid_recipes if r.get('total_time_min', 0) > 0]
    )
}

print(f"\nFinal Dataset Statistics:")
print(f"  Total recipes: {stats['total_recipes']:,}")
for src, cnt in stats['sources'].items():
    print(f"  {src}: {cnt:,}")
print(f"  Unique cuisines: {stats['unique_cuisines']}")
print(f"  Avg ingredients: {stats['avg_ingredients']:.1f}")
print(f"  Avg time: {stats['avg_time_min']:.0f} minutes")

# 4️⃣ Data completeness
print(f"\nData Completeness:")
print(f"  Has nutrition: {sum(1 for r in valid_recipes if r.get('nutrition_valid')):,}")
print(f"  Has cuisine: {sum(1 for r in valid_recipes if r.get('cuisine')):,}")
print(f"  Has regions: {sum(1 for r in valid_recipes if r.get('regions')):,}")
print(f"  Has allergens: {sum(1 for r in valid_recipes if r.get('allergens')):,}")
print(f"  Has difficulty: {sum(1 for r in valid_recipes if r.get('difficulty')):,}")

print("\n" + "="*60)
print("✓ STEP 8 COMPLETE (FIXED)")
print("="*60)


MERGING DATASETS & QUALITY CHECKS (FIXED)

[1/4] Merging datasets...
✓ Total recipes: 246,153
  - Food.com: 226,101
  - Epicurious: 20,052

[2/4] Filtering invalid recipes...
✓ Valid recipes: 246,152
✗ Filtered out: 1

[3/4] Generating final statistics...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Final Dataset Statistics:
  Total recipes: 246,152
  food.com: 226,100
  epicurious: 20,052
  Unique cuisines: 21
  Avg ingredients: 9.2
  Avg time: nan minutes

Data Completeness:
  Has nutrition: 20,052
  Has cuisine: 246,152
  Has regions: 246,152
  Has allergens: 182,855
  Has difficulty: 246,152

✓ STEP 8 COMPLETE (FIXED)


In [None]:
import json
import numpy as np
from tqdm import tqdm
from collections import Counter

print("="*60)
print("MERGING DATASETS & QUALITY CHECKS (FIXED)")
print("="*60)

# --- Function Definitions (Copied for self-containment) ---

def detect_allergens(ingredients_list):
    """Detect common allergens in ingredient list"""

    allergen_keywords = {
        'dairy': ['milk', 'cheese', 'butter', 'cream', 'yogurt', 'whey',
                  'casein', 'ghee', 'buttermilk', 'sour cream', 'ricotta',
                  'mozzarella', 'cheddar', 'parmesan', 'brie'],
        'eggs': ['egg', 'eggs', 'mayonnaise', 'mayo', 'meringue'],
        'peanuts': ['peanut', 'peanuts', 'peanut butter'],
        'tree_nuts': ['almond', 'walnut', 'cashew', 'pecan', 'pistachio',
                      'hazelnut', 'macadamia', 'pine nut', 'brazil nut'],
        'soy': ['soy', 'tofu', 'edamame', 'miso', 'tempeh', 'soy sauce',
                'tamari', 'soybean'],
        'wheat': ['flour', 'wheat', 'bread', 'pasta', 'couscous', 'bulgur',
                  'semolina', 'farro', 'spelt', 'wheat germ'],
        'fish': ['salmon', 'tuna', 'cod', 'tilapia', 'fish', 'anchovy',
                 'sardine', 'mackerel', 'halibut', 'trout'],
        'shellfish': ['shrimp', 'crab', 'lobster', 'clam', 'mussel',
                      'oyster', 'scallop', 'prawn', 'crayfish']
    }

    if not ingredients_list:
        return []

    ingredients_text = ' '.join(ingredients_list).lower()
    detected_allergens = []

    for allergen, keywords in allergen_keywords.items():
        if any(kw in ingredients_text for kw in keywords):
            detected_allergens.append(allergen)

    return detected_allergens

def add_regional_availability(recipe):
    """Infer regional availability from cuisine and ingredients"""

    # Cuisine → region mapping
    cuisine_to_region = {
        'italian': 'europe',
        'french': 'europe',
        'greek': 'mediterranean',
        'spanish': 'europe',
        'mediterranean': 'mediterranean',
        'british': 'europe',
        'mexican': 'latin_america',
        'brazilian': 'latin_america',
        'cajun_creole': 'north_america',
        'southern_us': 'north_america',
        'american': 'north_america',
        'chinese': 'asia',
        'japanese': 'asia',
        'thai': 'asia',
        'indian': 'asia',
        'korean': 'asia',
        'vietnamese': 'asia',
        'filipino': 'asia',
        'moroccan': 'middle_east',
        'middle-eastern': 'middle_east',
        'irish': 'europe',
        'russian': 'europe',
        'jamaican': 'latin_america'
    }

    regions = set()

    # Add region from cuisine
    cuisine = recipe.get('cuisine', 'american')
    if cuisine in cuisine_to_region:
        regions.add(cuisine_to_region[cuisine])

    # Check for exotic/rare ingredients
    exotic_keywords = [
        'saffron', 'tamarind', 'miso', 'tahini', 'harissa',
        'sumac', 'za\'atar', 'galangal', 'lemongrass', 'kaffir lime',
        'fish sauce', 'garam masala', 'cardamom pods'
    ]

    ingredients = recipe.get('ingredients', [])
    if ingredients:
        ingredients_text = ' '.join(ingredients).lower()
        has_exotic = any(kw in ingredients_text for kw in exotic_keywords)

        # If no exotic ingredients, it's more globally available
        if not has_exotic:
            regions.add('global')
    else:
        # If no ingredients list, mark as global
        regions.add('global')

    # Ensure at least one region
    if not regions:
        regions.add('global')

    return list(regions)


# Load Food.com recipes
print("\n[1/7] Loading Food.com recipes...")
food_com_recipes = []

with open('food_com_with_cuisines.jsonl', 'r') as f:
    for line in f:
        food_com_recipes.append(json.loads(line))

print(f"✓ Loaded {len(food_com_recipes):,} Food.com recipes")

# Load Epicurious recipes (if you have them)
print("\n[2/7] Loading Epicurious recipes...")
try:
    epi_recipes = []
    with open('epicurious_processed.jsonl', 'r') as f:
        for line in f:
            epi_recipes.append(json.loads(line))
    print(f"✓ Loaded {len(epi_recipes):,} Epicurious recipes")
except FileNotFoundError:
    epi_recipes = []
    print("⚠ Epicurious file not found - skipping")

# CONVERT FOOD.COM TO STANDARD FORMAT & ADD DERIVED FEATURES
print("\n[3/7] Converting Food.com to standard format and adding features...")

def convert_food_com_nutrition(nutrition_raw):
    """
    Convert Food.com nutrition_raw to standard format.

    nutrition_raw format: [calories, fat_pdv, sugar_pdv, sodium_pdv, protein_pdv, sat_fat_pdv, carbs_pdv]
    """
    if not nutrition_raw or len(nutrition_raw) < 7:
        return None

    try:
        return {
            'calories': round(float(nutrition_raw[0]), 1),
            'fat_g': round((float(nutrition_raw[1]) / 100) * 78, 1),
            'sugar_g': round((float(nutrition_raw[2]) / 100) * 50, 1),
            'sodium_mg': round((float(nutrition_raw[3]) / 100) * 2300, 1),
            'protein_g': round((float(nutrition_raw[4]) / 100) * 50, 1),
            'saturated_fat_g': round((float(nutrition_raw[5]) / 100) * 20, 1),
            'carbs_g': round((float(nutrition_raw[6]) / 100) * 275, 1),
            'fiber_g': round((float(nutrition_raw[6]) / 100) * 275 * 0.1, 1)
        }
    except:
        return None

for recipe in tqdm(food_com_recipes, desc="Processing Food.com"):
    # Convert nutrition_raw to nutrition dict
    if 'nutrition_raw' in recipe and 'nutrition' not in recipe:
        recipe['nutrition'] = convert_food_com_nutrition(recipe['nutrition_raw'])
        recipe['nutrition_valid'] = recipe['nutrition'] is not None
    elif 'nutrition' not in recipe:
        recipe['nutrition'] = None
        recipe['nutrition_valid'] = False
    else:
        # Already has nutrition dict
        recipe['nutrition_valid'] = recipe['nutrition'] is not None

    # Fix time field name
    if 'time_minutes' in recipe and 'total_time_min' not in recipe:
        recipe['total_time_min'] = recipe['time_minutes']
    elif 'total_time_min' not in recipe:
        recipe['total_time_min'] = 0

    # Ensure meal_types exists
    if 'meal_types' not in recipe:
        recipe['meal_types'] = ['dinner']  # default

    # Ensure dietary_tags exists
    if 'dietary_tags' not in recipe:
        recipe['dietary_tags'] = []

    # Add regions and allergens here for food.com recipes
    recipe['regions'] = add_regional_availability(recipe)
    recipe['allergens'] = detect_allergens(recipe.get('ingredients', []))


print(f"✓ Processed {len(food_com_recipes):,} Food.com recipes")

# Add derived features to Epicurious recipes
print("\n[4/7] Adding features to Epicurious recipes...")
for recipe in tqdm(epi_recipes, desc="Processing Epicurious"):
    # Epicurious already has some dietary tags, difficulty, etc. from its processing cell
    # Add regions and allergens here for epicurious recipes
    recipe['regions'] = add_regional_availability(recipe)
    recipe['allergens'] = detect_allergens(recipe.get('ingredients', [])) # Will be empty since no ingredients

print(f"✓ Processed {len(epi_recipes):,} Epicurious recipes")

# Merge all recipes
print("\n[5/7] Merging datasets...")
all_recipes = food_com_recipes + epi_recipes
print(f"✓ Total recipes before filtering: {len(all_recipes):,}")
print(f"  - Food.com: {len(food_com_recipes):,}")
print(f"  - Epicurious: {len(epi_recipes):,}")

# QUALITY FILTERING
print("\n[6/7] Filtering invalid recipes...")

valid_recipes = []
filtered_out = []

for recipe in tqdm(all_recipes, desc="Filtering"):
    # Must have:
    # 1. Valid nutrition
    # 2. Title
    # 3. At least 1 ingredient (Food.com)
    # 4. Cuisine

    has_nutrition = recipe.get('nutrition_valid', False) and recipe.get('nutrition') is not None
    has_title = recipe.get('title', '').strip() != ''
    has_ingredients = len(recipe.get('ingredients', [])) > 0
    has_cuisine = recipe.get('cuisine', '') != ''

    is_valid = has_nutrition and has_title and has_ingredients and has_cuisine

    if is_valid:
        valid_recipes.append(recipe)
    else:
        filtered_out.append({
            'id': recipe.get('id', 'unknown'),
            'title': recipe.get('title', 'no title'),
            'reason': {
                'has_nutrition': has_nutrition,
                'has_title': has_title,
                'has_ingredients': has_ingredients,
                'has_cuisine': has_cuisine
            }
        })

print(f"\n✓ Valid recipes: {len(valid_recipes):,}")
print(f"✗ Filtered out: {len(filtered_out):,}")

# Show why recipes were filtered out
if len(filtered_out) > 0:
    print("\nReasons for filtering:")
    reasons = {
        'no_nutrition': sum(1 for r in filtered_out if not r['reason']['has_nutrition']),
        'no_title': sum(1 for r in filtered_out if not r['reason']['has_title']),
        'no_ingredients': sum(1 for r in filtered_out if not r['reason']['has_ingredients']),
        'no_cuisine': sum(1 for r in filtered_out if not r['reason']['has_cuisine'])
    }
    for reason, count in reasons.items():
        if count > 0:
            print(f"  {reason}: {count:,}")

# FINAL STATISTICS
print("\n" + "="*60)
print("FINAL DATASET STATISTICS")
print("="*60)

stats = {
    'total_recipes': len(valid_recipes),
    'sources': {
        'food.com': sum(1 for r in valid_recipes if r.get('source') == 'food.com'),
        'epicurious': sum(1 for r in valid_recipes if r.get('source') == 'epicurious')
    },
    'cuisines': len(set([r['cuisine'] for r in valid_recipes])),
    'avg_ingredients': np.mean([len(r.get('ingredients', [])) for r in valid_recipes]),
    'avg_time_min': np.mean([r.get('total_time_min', 0) for r in valid_recipes if r.get('total_time_min', 0) > 0]),
}

print(f"\nDataset Size:")
print(f"  Total recipes: {stats['total_recipes']:,}")
print(f"  Food.com: {stats['sources']['food.com']:,}")
print(f"  Epicurious: {stats['sources']['epicurious']:,}")

print(f"\nContent Stats:")
print(f"  Unique cuisines: {stats['cuisines']}")
print(f"  Avg ingredients: {stats['avg_ingredients']:.1f}")
print(f"  Avg time: {stats['avg_time_min']:.0f} minutes")

# Data completeness
print(f"\nData Completeness:")
print(f"  Has nutrition: {sum(1 for r in valid_recipes if r.get('nutrition_valid')):,} ({sum(1 for r in valid_recipes if r.get('nutrition_valid'))/len(valid_recipes)*100:.1f}%)")
print(f"  Has cuisine: {sum(1 for r in valid_recipes if r.get('cuisine')):,} (100%)")
print(f"  Has regions: {sum(1 for r in valid_recipes if r.get('regions')):,} ({sum(1 for r in valid_recipes if r.get('regions'))/len(valid_recipes)*100:.1f}%)")
print(f"  Has allergens: {sum(1 for r in valid_recipes if r.get('allergens')):,} ({sum(1 for r in valid_recipes if r.get('allergens'))/len(valid_recipes)*100:.1f}%)")
print(f"  Has difficulty: {sum(1 for r in valid_recipes if r.get('difficulty')):,} ({sum(1 for r in valid_recipes if r.get('difficulty'))/len(valid_recipes)*100:.1f}%)")

# Cuisine distribution
print("\n" + "="*60)
print("CUISINE DISTRIBUTION")
print("="*60)

cuisine_dist = Counter([r['cuisine'] for r in valid_recipes])
print(f"\nTop 15 cuisines:")
for cuisine, count in cuisine_dist.most_common(15):
    pct = (count / len(valid_recipes)) * 100
    print(f"  {cuisine:15s}: {count:6,} ({pct:5.1f}%)")

# Dietary tags distribution
all_dietary = []
for r in valid_recipes:
    all_dietary.extend(r.get('dietary_tags', []))

if all_dietary:
    dietary_dist = Counter(all_dietary)
    print(f"\nTop dietary tags:")
    for tag, count in dietary_dist.most_common(10):
        print(f"  {tag}: {count:,}")

# Difficulty distribution
difficulty_dist = Counter([r.get('difficulty', 'unknown') for r in valid_recipes])
print(f"\nDifficulty distribution:")
for difficulty, count in difficulty_dist.items():
    pct = (count / len(valid_recipes)) * 100
    print(f"  {difficulty}: {count:,} ({pct:.1f}%)")

# Region distribution
all_regions = []
for r in valid_recipes:
    all_regions.extend(r.get('regions', []))

region_dist = Counter(all_regions)
print(f"\nRegion distribution:")
for region, count in region_dist.most_common():
    print(f"  {region}: {count:,}")

# Allergen distribution
all_allergens = []
for r in valid_recipes:
    all_allergens.extend(r.get('allergens', []))

if all_allergens:
    allergen_dist = Counter(all_allergens)
    print(f"\nAllergen distribution:")
    for allergen, count in allergen_dist.most_common():
        print(f"  {allergen}: {count:,}")

# Nutrition stats
print("\n" + "="*60)
print("NUTRITION STATISTICS")
print("="*60)

valid_nutrition = [r['nutrition'] for r in valid_recipes if r.get('nutrition_valid')]

if valid_nutrition:
    calories = [n['calories'] for n in valid_nutrition]
    protein = [n['protein_g'] for n in valid_nutrition]
    carbs = [n['carbs_g'] for n in valid_nutrition]
    fats = [n['fat_g'] for n in valid_nutrition]

    print(f"\nCalories:")
    print(f"  Mean: {np.mean(calories):.0f}")
    print(f"  Median: {np.median(calories):.0f}")
    print(f"  Range: {np.min(calories):.0f} - {np.max(calories):.0f}")

    print(f"\nProtein (g):")
    print(f"  Mean: {np.mean(protein):.1f}")
    print(f"  Median: {np.median(protein):.1f}")

    print(f"\nCarbs (g):")
    print(f"  Mean: {np.mean(carbs):.1f}")
    print(f"  Median: {np.median(carbs):.1f}")

    print(f"\nFats (g):")
    print(f"  Mean: {np.mean(fats):.1f}")
    print(f"  Median: {np.median(fats):.1f}")

# Sample valid recipes
print("\n" + "="*60)
print("SAMPLE VALID RECIPES")
print("="*60)

import random
for _ in range(3):
    recipe = random.choice(valid_recipes)
    print(f"\nTitle: {recipe['title']}")
    print(f"Source: {recipe.get('source', 'unknown')}")
    print(f"Cuisine: {recipe['cuisine']} (confidence: {recipe.get('cuisine_confidence', 0):.2%})")
    print(f"Ingredients ({len(recipe.get('ingredients', []))}): {', '.join(recipe.get('ingredients', [])[:5])}...")
    if recipe.get('nutrition'):
        print(f"Nutrition: {recipe['nutrition']['calories']:.0f} cal, {recipe['nutrition']['protein_g']:.1f}g protein")
    print(f"Regions: {', '.join(recipe.get('regions', []))}")
    print(f"Allergens: {', '.join(recipe.get('allergens', [])) if recipe.get('allergens') else 'None detected'}")

print("\n" + "="*60)
print("SAVING FINAL DATASET")
print("="*60)

# Save final validated dataset
output_file = 'final_recipes_enriched.jsonl'
print(f"\nSaving {len(valid_recipes):,} recipes to {output_file}...")

with open(output_file, 'w') as f:
    for recipe in tqdm(valid_recipes, desc="Saving"):
        f.write(json.dumps(recipe) + '\n')

print(f"✓ Saved to: {output_file}")

# Save sample for testing
sample_recipes = valid_recipes[:1000]
with open('sample_recipes_1k.jsonl', 'w') as f:
    for recipe in sample_recipes:
        f.write(json.dumps(recipe) + '\n')

print(f"✓ Saved 1,000 sample recipes to: sample_recipes_1k.jsonl")

# Save statistics
import datetime

final_stats = {
    'total_recipes': len(valid_recipes),
    'filtered_out': len(filtered_out),
    'sources': {
        'food.com': sum(1 for r in valid_recipes if r.get('source') == 'food.com'),
        'epicurious': sum(1 for r in valid_recipes if r.get('source') == 'epicurious')
    },
    'cuisines': sorted(list(set([r['cuisine'] for r in valid_recipes]))),
    'num_cuisines': len(set([r['cuisine'] for r in valid_recipes])),
    'regions': sorted(list(set([r for recipe in valid_recipes for r in recipe.get('regions', [])]))),
    'allergens': sorted(list(set([a for recipe in valid_recipes for a in recipe.get('allergens', [])]))),
    'export_date': str(datetime.datetime.now()),
    'files': {
        'main': output_file,
        'sample': 'sample_recipes_1k.jsonl'
    }
}

with open('dataset_final_stats.json', 'w') as f:
    json.dump(final_stats, f, indent=2)

print("✓ Saved statistics to: dataset_final_stats.json")

print("\n" + "="*60)
print("✅ VALIDATION & EXPORT COMPLETE!")
print("="*60)

print(f"\nFiles created:")
print(f"  1. {output_file} - Final dataset ({len(valid_recipes):,} recipes)")
print(f"  2. sample_recipes_1k.jsonl - Test sample (1,000 recipes)")
print(f"  3. dataset_final_stats.json - Statistics")

print(f"\nDataset Summary:")
print(f"  ✓ {len(valid_recipes):,} valid recipes")
print(f"  ✓ {stats['cuisines']} unique cuisines")
print(f"  ✓ {len(region_dist)} regions")
print(f"  ✓ 100% have nutrition data")
print(f"  ✓ Ready for ChromaDB ingestion!")

print("\n" + "="*60)
print("✅ READY FOR NEXT STEP: CHROMADB INGESTION!")
print("="*60)

MERGING DATASETS & QUALITY CHECKS (FIXED)

[1/7] Loading Food.com recipes...
✓ Loaded 226,101 Food.com recipes

[2/7] Loading Epicurious recipes...
✓ Loaded 20,052 Epicurious recipes

[3/7] Converting Food.com to standard format and adding features...


Processing Food.com: 100%|██████████| 226101/226101 [00:10<00:00, 20601.22it/s]


✓ Processed 226,101 Food.com recipes

[4/7] Adding features to Epicurious recipes...


Processing Epicurious: 100%|██████████| 20052/20052 [00:00<00:00, 235043.44it/s]


✓ Processed 20,052 Epicurious recipes

[5/7] Merging datasets...
✓ Total recipes before filtering: 246,153
  - Food.com: 226,101
  - Epicurious: 20,052

[6/7] Filtering invalid recipes...


Filtering: 100%|██████████| 246153/246153 [00:00<00:00, 515711.25it/s]



✓ Valid recipes: 226,100
✗ Filtered out: 20,053

Reasons for filtering:
  no_title: 1
  no_ingredients: 20,052

FINAL DATASET STATISTICS

Dataset Size:
  Total recipes: 226,100
  Food.com: 226,100
  Epicurious: 0

Content Stats:
  Unique cuisines: 20
  Avg ingredients: 9.2
  Avg time: 9618 minutes

Data Completeness:
  Has nutrition: 226,100 (100.0%)
  Has cuisine: 226,100 (100%)
  Has regions: 226,100 (100.0%)
  Has allergens: 182,855 (80.9%)
  Has difficulty: 0 (0.0%)

CUISINE DISTRIBUTION

Top 15 cuisines:
  southern_us    : 99,653 ( 44.1%)
  italian        : 64,696 ( 28.6%)
  mexican        : 30,026 ( 13.3%)
  chinese        :  9,323 (  4.1%)
  indian         :  7,906 (  3.5%)
  french         :  4,582 (  2.0%)
  greek          :  2,205 (  1.0%)
  thai           :  2,071 (  0.9%)
  cajun_creole   :  1,906 (  0.8%)
  japanese       :  1,252 (  0.6%)
  moroccan       :  1,138 (  0.5%)
  korean         :    536 (  0.2%)
  filipino       :    215 (  0.1%)
  vietnamese     :    212 (  

Saving: 100%|██████████| 226100/226100 [00:11<00:00, 19493.95it/s]


✓ Saved to: final_recipes_enriched.jsonl
✓ Saved 1,000 sample recipes to: sample_recipes_1k.jsonl
✓ Saved statistics to: dataset_final_stats.json

✅ VALIDATION & EXPORT COMPLETE!

Files created:
  1. final_recipes_enriched.jsonl - Final dataset (226,100 recipes)
  2. sample_recipes_1k.jsonl - Test sample (1,000 recipes)
  3. dataset_final_stats.json - Statistics

Dataset Summary:
  ✓ 226,100 valid recipes
  ✓ 20 unique cuisines
  ✓ 7 regions
  ✓ 100% have nutrition data
  ✓ Ready for ChromaDB ingestion!

✅ READY FOR NEXT STEP: CHROMADB INGESTION!


In [None]:
import shutil
import os
from datetime import datetime

print("="*60)
print("SAVING FILES TO GOOGLE DRIVE")
print("="*60)

# Mount Google Drive (should already be mounted)
print("\n[1/4] Mounting Google Drive...")
# Removed drive.mount() call as it's already mounted and called in the first cell
print("✓ Google Drive is already mounted!")

# Set the correct working directory where files were generated
# This assumes the files were generated in the directory set by the initial os.chdir command
current_working_dir = '/content/drive/MyDrive/Recipe_dataset'
print(f"\nUsing detected file generation directory: {current_working_dir}")

# Create backup folder with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_folder = f'/content/drive/MyDrive/culinary_assistant_backup_{timestamp}'

print(f"\n[2/4] Creating backup folder...")
os.makedirs(backup_folder, exist_ok=True)
print(f"✓ Created: {backup_folder}")

# List of files to backup
files_to_backup = {
    # Final datasets
    'final_recipes_enriched.jsonl': 'Main dataset with all recipes',
    'sample_recipes_1k.jsonl': 'Sample dataset (1,000 recipes)',
    'dataset_final_stats.json': 'Dataset statistics',

    # Intermediate files
    'food_com_with_cuisines.jsonl': 'Food.com with ML-predicted cuisines',
    'epicurious_processed.jsonl': 'Processed Epicurious recipes',

    # Model files
    'cuisine_classifier.pkl': 'Trained cuisine classification model',

    # Source data (optional)
    # 'food_com_cleaned.jsonl': 'Original cleaned Food.com data',
    # 'train.json': 'Yummly dataset',
    # 'epi_r.csv': 'Epicurious raw data',
}

print(f"\n[3/4] Copying files to Google Drive...")
print(f"Backup location: {backup_folder}\n")

copied_files = []
missing_files = []
total_size = 0

for filename, description in files_to_backup.items():
    # Construct source path using the current working directory
    source_path = os.path.join(current_working_dir, filename)
    dest_path = os.path.join(backup_folder, filename)

    if os.path.exists(source_path):
        # Get file size
        size_mb = os.path.getsize(source_path) / (1024 * 1024)
        total_size += size_mb

        # Copy file
        print(f"Copying: {filename} ({size_mb:.1f} MB)")
        print(f"  → {description}")
        shutil.copy2(source_path, dest_path)
        copied_files.append(filename)
        print(f"  ✓ Copied successfully!")
    else:
        print(f"⚠ Skipping: {filename} (not found at {source_path})")
        missing_files.append(filename)
    print()

print("\n[4/4] Creating backup manifest...")

# Create a manifest file with details
manifest_content = f"""Culinary Assistant - Data Backup\nGenerated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\nBackup Location: {backup_folder}\n\nFILES BACKED UP ({len(copied_files)}):\n{'='*60}\n"""

for filename in copied_files:
    filepath = os.path.join(backup_folder, filename)
    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    manifest_content += f"\n{filename}\n"
    manifest_content += f"  Size: {size_mb:.2f} MB\n"
    manifest_content += f"  Description: {files_to_backup[filename]}\n"

if missing_files:
    manifest_content += f"\n\nFILES NOT FOUND ({len(missing_files)}):\n"
    manifest_content += "="*60 + "\n"
    for filename in missing_files:
        manifest_content += f"\n{filename}\n"
        manifest_content += f"  Description: {files_to_backup[filename]}\n"

manifest_content += f"\n\nTOTAL SIZE: {total_size:.2f} MB\n"

# Save manifest
manifest_path = os.path.join(backup_folder, 'BACKUP_MANIFEST.txt')
with open(manifest_path, 'w') as f:
    f.write(manifest_content)

print("✓ Manifest created!")

# Print summary
print("\n" + "="*60)
print("BACKUP COMPLETE!")
print("="*60)

print(f"\n📁 Backup Location:")
print(f"   {backup_folder}")

print(f"\n✓ Files backed up: {len(copied_files)}")
print(f"✓ Total size: {total_size:.2f} MB")

if missing_files:
    print(f"\n⚠ Files not found: {len(missing_files)}")
    for filename in missing_files:
        print(f"   - {filename}")

print(f"\n📄 See BACKUP_MANIFEST.txt for complete details")

# Create a shortcut path for easy access
easy_path = '/content/drive/MyDrive/culinary_assistant_LATEST'

print("\n" + "="*60)
print("CREATING EASY ACCESS FOLDER")
print("="*60)

# Copy to "LATEST" folder (overwrites previous)
if os.path.exists(easy_path):
    shutil.rmtree(easy_path)

shutil.copytree(backup_folder, easy_path)
print(f"\n✓ Also saved to: {easy_path}")
print("  (This is always your latest backup)")

# Display file tree
print("\n" + "="*60)
print("GOOGLE DRIVE FOLDER STRUCTURE")
print("="*60)

print(f"\nMyDrive/")
print(f"├── culinary_assistant_backup_{timestamp}/")
for filename in copied_files:
    print(f"│   ├── {filename}")
print(f"│   └── BACKUP_MANIFEST.txt")
print(f"│")
print(f"└── culinary_assistant_LATEST/  ← Easy access")
for filename in copied_files:
    print(f"    ├── {filename}")
print(f"    └── BACKUP_MANIFEST.txt")

print("\n" + "="*60)
print("✅ ALL FILES SAVED TO GOOGLE DRIVE!")
print("="*60)

print("\n💡 Access your files:")
print("   1. Open Google Drive in browser")
print("   2. Navigate to 'culinary_assistant_LATEST' folder")
print("   3. Download any file you need")

print("\n💡 To restore in a new Colab session:")
print("   from google.colab import drive")
print("   drive.mount('/content/drive')")
print("   !cp /content/drive/MyDrive/culinary_assistant_LATEST/*.jsonl /content/")

SAVING FILES TO GOOGLE DRIVE

[1/4] Mounting Google Drive...
✓ Google Drive is already mounted!

Using detected file generation directory: /content/drive/MyDrive/Recipe_dataset

[2/4] Creating backup folder...
✓ Created: /content/drive/MyDrive/culinary_assistant_backup_20260210_144455

[3/4] Copying files to Google Drive...
Backup location: /content/drive/MyDrive/culinary_assistant_backup_20260210_144455

Copying: final_recipes_enriched.jsonl (387.8 MB)
  → Main dataset with all recipes
  ✓ Copied successfully!

Copying: sample_recipes_1k.jsonl (1.7 MB)
  → Sample dataset (1,000 recipes)
  ✓ Copied successfully!

Copying: dataset_final_stats.json (0.0 MB)
  → Dataset statistics
  ✓ Copied successfully!

Copying: food_com_with_cuisines.jsonl (318.7 MB)
  → Food.com with ML-predicted cuisines
  ✓ Copied successfully!

Copying: epicurious_processed.jsonl (11.9 MB)
  → Processed Epicurious recipes
  ✓ Copied successfully!

Copying: cuisine_classifier.pkl (202.9 MB)
  → Trained cuisine clas