In [40]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bishn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bishn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [41]:
import pandas as pd

# Define file paths
recipe_file = '../data/raw/food-com/RAW_recipes.csv'
ratings_file = '../data/raw/food-com/interactions_train.csv'

# Load them
df_recipes = pd.read_csv(recipe_file)
df_ratings = pd.read_csv(ratings_file)

# See what you have
print(df_recipes.info())
print(df_recipes.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB
None
                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2    

In [42]:
import re

def extract_oven_data(steps_text):
    temperature = None
    duration = None

    # 1. Find temperature
    # Looks for patterns like "400 F", "350°F", "200C", "oven to 375"
    temp_pattern = re.compile(r'(\d{3})\s?°?[FfCc]|\b(at|to)\s(\d{3})\b')
    temp_match = temp_pattern.search(str(steps_text))

    if temp_match:
        # Grab the first number found (e.g., '350' from "at 350")
        temperature = temp_match.group(1) or temp_match.group(3)

    # 2. Find duration
    # Looks for patterns like "30 minutes", "1 hour", "25-30 min"
    duration_pattern = re.compile(r'(\d+)\s?(to|-)\s?(\d+)\s?(minutes|min)|(\d+)\s?(minutes|min|hour|hr)')
    duration_match = duration_pattern.search(str(steps_text))

    if duration_match:
        if duration_match.group(1): # It's a range like "25-30"
            avg_time = (int(duration_match.group(1)) + int(duration_match.group(3))) / 2
            duration = avg_time
        elif duration_match.group(5): # It's a single time like "45"
            duration = int(duration_match.group(5))
            if 'hour' in duration_match.group(6):
                duration = duration * 60 # Standardize to minutes

    # Only return if we found *both* (it's an oven recipe)
    if temperature and duration:
        return int(temperature), int(duration)
    else:
        return None, None

# --- Apply this to your DataFrame ---
# This will create two new columns: 'Oven_Temp' and 'Oven_Duration'
df_recipes[['Oven_Temp', 'Oven_Duration']] = df_recipes['steps'].apply(
    lambda x: pd.Series(extract_oven_data(x))
)

# CRITICAL: Drop all recipes that are not for an oven
df_oven_recipes = df_recipes.dropna(subset=['Oven_Temp', 'Oven_Duration']).copy()

print(f"Found {len(df_oven_recipes)} oven-specific recipes out of {len(df_recipes)}")

Found 82832 oven-specific recipes out of 231637


In [43]:
import pandas as pd
import numpy as np

# Define file paths
pp_recipe_file = '../data/raw/food-com/PP_recipes.csv'
raw_recipe_file = '../data/raw/food-com/RAW_recipes.csv'

# Load them
df_pp_recipes = pd.read_csv(pp_recipe_file)
df_raw_recipes = pd.read_csv(raw_recipe_file)
df_avg_ratings = df_ratings.groupby('recipe_id')['rating'].mean().reset_index()
# --- 1. FIND THE 'SERVINGS' COLUMN ---
# Let's inspect the columns in PP_recipes.csv
print("--- Columns in PP_recipes.csv ---")
print(df_pp_recipes.columns.to_list())
print("\n--- PP_recipes.csv Head ---")
print(df_pp_recipes.head())

# --- 2. FIND 'INGREDIENTS' AND 'TAGS' ---
# We already know these are in RAW_recipes.csv
print("\n--- Columns in RAW_recipes.csv ---")
print(df_raw_recipes.columns.to_list())

--- Columns in PP_recipes.csv ---
['id', 'i', 'name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'calorie_level', 'ingredient_ids']

--- PP_recipes.csv Head ---
       id       i                                        name_tokens  \
0  424415      23  [40480, 37229, 2911, 1019, 249, 6878, 6878, 28...   
1  146223   96900       [40480, 18376, 7056, 246, 1531, 2032, 40481]   
2  312329  120056     [40480, 21044, 16954, 8294, 556, 10837, 40481]   
3   74301  168258                       [40480, 10025, 31156, 40481]   
4   76272  109030  [40480, 17841, 252, 782, 2373, 1641, 2373, 252...   

                                   ingredient_tokens  \
0  [[2911, 1019, 249, 6878], [1353], [6953], [153...   
1  [[17918], [25916], [2507, 6444], [8467, 1179],...   
2  [[5867, 24176], [1353], [6953], [1301, 11332],...   
3  [[1270, 1645, 28447], [21601], [27952, 29471, ...   
4  [[1430, 11434], [1430, 17027], [1615, 23, 695,...   

                                        steps_tokens  

In [44]:
print("Merging all data sources...")
# Start with our oven recipes (which already has 'id', 'name', 'tags', 'Oven_Temp', 'Oven_Duration')
df_merged = df_oven_recipes

# Merge 1: Add ingredient_ids
df_merged = pd.merge(df_merged, df_pp_recipes, on='id', how='inner')

# Merge 2: Add average ratings
df_merged = pd.merge(df_merged, df_avg_ratings, left_on='id', right_on='recipe_id', how='left')

# Drop any rows where key context is missing
df_final_v2 = df_merged.dropna(subset=['ingredient_ids', 'tags', 'rating']).copy()
print(f"Created merged dataset with {len(df_final_v2)} recipes.")

Merging all data sources...
Created merged dataset with 56588 recipes.


In [45]:
import numpy as np

print("Augmenting data with random environment values...")

# Get the number of recipes we have
num_recipes = len(df_final_v2) # This is 82,832

# Create a random temperature for each recipe (e.g., between 15°C and 30°C)
df_final_v2['Room_Temp'] = np.random.uniform(low=15.0, high=30.0, size=num_recipes).round(1)

# Create a random humidity for each recipe (e.g., between 30% and 70%)
df_final_v2['Room_Humidity'] = np.random.uniform(low=30.0, high=70.0, size=num_recipes).round(1)

print("Data augmentation complete.")
print(df_final_v2[['name', 'Room_Temp', 'Room_Humidity']].head())

final_features = [
    'id', 'name', 'rating',
    'Room_Temp', 'Room_Humidity',   # Inputs
    'ingredient_ids', 'tags',       # Inputs
    'Oven_Temp', 'Oven_Duration'    # Outputs
]
df_final_v2_clean = df_final_v2[final_features]
# 4. Save your new, clean dataset!
output_path = '../data/processed/processed_oven_recipes_v2.csv'
df_final_v2_clean.to_csv(output_path, index=False)

print(f"Optimal dataset saved to {output_path}")
print(df_final_v2_clean.head())

Augmenting data with random environment values...
Data augmentation complete.
                                         name  Room_Temp  Room_Humidity
0  arriba   baked winter squash mexican style       20.5           67.5
1            a bit different  breakfast pizza       19.0           63.9
2                          alouette  potatoes       25.5           50.3
3                     beat this  banana bread       24.9           35.8
4             better then bush s  baked beans       19.0           33.4
Optimal dataset saved to ../data/processed/processed_oven_recipes_v2.csv
       id                                        name    rating  Room_Temp  \
0  137739  arriba   baked winter squash mexican style  5.000000       20.5   
1   31490            a bit different  breakfast pizza  3.000000       19.0   
2   59389                          alouette  potatoes  4.000000       25.5   
3   75452                     beat this  banana bread  4.250000       24.9   
4   67547             bette