# Recipe Dataset Cleaning

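This notebook cleans and processes the recipe dataset: it adds metadata columns (ingredient count, instruction step count, title keywords, and a vegetarian flag), removes recipes with duplicate titles, and saves the cleaned result. The final sections load the pantry data for a first exploratory look.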

## 1. Import Libraries

In [None]:
import ast

import pandas as pd

## 2. Load Dataset

In [None]:
# Read the raw recipe table into `df`, the frame every later recipe cell works on.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs unchanged on that machine.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\User\Desktop\ELO2-Smart-Pantry-Manager\the_app\data\Recipe_Dataset.csv"
)
print(f"Dataset shape: {df.shape}")
df.head(5)

## 3. Add Ingredient Count

In [None]:
def _count_ingredients(raw):
    """Return the number of ingredients held in one ``Ingredients`` cell.

    A column loaded with ``pd.read_csv`` contains text, so calling ``len`` on a
    cell counts characters rather than ingredients. This helper parses the text
    first.
    NOTE(review): assumes each cell is the string form of a Python list, e.g.
    "['1 cup flour', '2 eggs']" -- confirm against the CSV.

    Parameters
    ----------
    raw : object
        A cell value: a list/tuple/set, a string, or a missing value.

    Returns
    -------
    int
        Number of items for a list-like (or a string that parses to one),
        0 for a missing or blank cell, and 1 for any other text.
    """
    if isinstance(raw, (list, tuple, set)):
        return len(raw)
    if not isinstance(raw, str) or not raw.strip():
        # NaN / None / blank cell: there is nothing to count.
        return 0
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Text that is not a Python literal: treat it as one free-text entry.
        return 1
    return len(parsed) if isinstance(parsed, (list, tuple, set)) else 1


df["ingredient_count"] = df["Ingredients"].apply(_count_ingredients)
print(f"Average ingredients per recipe: {df['ingredient_count'].mean():.1f}")
df["ingredient_count"].head()

## 4. Count Instruction Steps

In [None]:
def _count_instruction_steps(text):
    """Approximate the number of steps in one ``Instructions`` cell.

    A step is any non-blank fragment obtained by splitting the text on ".".
    Missing values (None, NaN, pd.NA) count as 0 steps; passing them through
    ``str`` would turn NaN into the text "nan" and count it as one step.

    Parameters
    ----------
    text : object
        A cell value, normally a string.

    Returns
    -------
    int
        Number of non-blank "."-separated fragments.
    """
    if not isinstance(text, str):
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return 0
        text = str(text)
    return sum(1 for fragment in text.split(".") if fragment.strip())


df["instruction_steps"] = df["Instructions"].apply(_count_instruction_steps)
print(f"Average steps per recipe: {df['instruction_steps'].mean():.1f}")
df["instruction_steps"].head()

## 5. Extract Keywords from Titles

In [None]:
def extract_keywords(title):
    """Return up to three keywords taken from a recipe title.

    The title is converted with ``str``, lower-cased and split on whitespace.
    A word is kept when it is longer than three characters and is not one of
    the filler words below. Words are returned in their original order.

    Parameters
    ----------
    title : object
        The recipe title; any value is accepted because it is passed to ``str``.

    Returns
    -------
    list of str
        At most three lower-cased words.
    """
    filler_words = {"with", "and", "the", "for", "from", "or"}
    selected = []
    for token in str(title).lower().split():
        if len(token) <= 3 or token in filler_words:
            continue
        selected.append(token)
        if len(selected) == 3:
            # Only the first three qualifying words are wanted.
            break
    return selected

# Derive the keyword list for every title, then preview it next to the title.
df["keywords"] = df["Title"].map(extract_keywords)
df.loc[:, ["Title", "keywords"]].head()

## 6. Identify Vegetarian Recipes

In [None]:
def is_vegetarian(ingredients_list):
    """Return True when no meat, fish or seafood keyword appears in the ingredients.

    Matching is a case-insensitive substring search, so a keyword also matches
    inside a longer word (for example "ham" inside "graham").

    Parameters
    ----------
    ingredients_list : iterable of object, or str
        Either an iterable of ingredient entries, or the whole ingredient list
        as a single string (the form a CSV column has after ``pd.read_csv``).
        A string is scanned as-is; iterating it would split it into single
        characters and no keyword could ever match.

    Returns
    -------
    bool
        False if any keyword is found, True otherwise. An empty or
        non-iterable (missing) value contains no keyword and returns True.
    """
    non_veg_keywords = (
        "chicken", "beef", "pork", "lamb", "turkey", "duck", "veal", "fish",
        "salmon", "tuna", "shrimp", "prawn", "lobster", "crab", "anchovy",
        "anchovies", "bacon", "ham", "sausage", "chorizo", "meat", "steak",
        "ribs", "wings", "drumstick", "thigh", "cod", "halibut", "snapper",
        "sardine", "clam", "oyster", "mussels", "scallop", "octopus", "squid",
        "gelatin",
    )
    if isinstance(ingredients_list, str):
        ingredients_text = ingredients_list.lower()
    else:
        try:
            entries = iter(ingredients_list)
        except TypeError:
            # None / NaN and other non-iterables: nothing to scan.
            entries = iter(())
        ingredients_text = " ".join(str(entry).lower() for entry in entries)
    return not any(keyword in ingredients_text for keyword in non_veg_keywords)

# Flag each recipe, then preview the derived columns next to the title.
df["vegetarian"] = df["Ingredients"].map(is_vegetarian)
df.loc[:, ["Title", "ingredient_count", "instruction_steps", "vegetarian"]].head()

## 7. Check for and Remove Duplicates

In [None]:
# Mark every repeat of a Title after its first occurrence.
duplicates = df.duplicated(subset=["Title"], keep="first")
print(f"Duplicate recipes found: {duplicates.sum()}")
# Keep only the unmarked rows and renumber the index from 0.
df = df.loc[~duplicates].reset_index(drop=True)

## 8. Final Summary

In [None]:
# Shape, column dtypes and per-column missing-value counts of the cleaned frame.
print("Final shape:", df.shape)
print(df.dtypes)
print(df.isna().sum())
# Summary statistics for the two derived count columns.
df.loc[:, ["ingredient_count", "instruction_steps"]].describe()

## 9. Save Cleaned Dataset

In [None]:
# Write the cleaned frame, without the index, to a path relative to the
# current working directory.
output_path = "cleaned_recipes.csv"
df.to_csv(path_or_buf=output_path, index=False)
df.head(5)

## 10. Load and Explore Pantry Data

In [None]:
# Read the pantry workbook into `df2`, kept separate from the recipe frame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs unchanged on that machine.
df2 = pd.read_excel(
    io=r"C:\Users\User\Desktop\ELO2-Smart-Pantry-Manager\the_app\data\pantry_data.xlsx"
)
print(df2.shape)
df2.head(5)

## 11. Pantry Data Information

In [None]:
# Print the pantry frame's column names, non-null counts and dtypes.
df2.info()

## 12. Product Distribution

In [None]:
# Count how many rows each distinct "Product" value has, most frequent first.
df2["Product"].value_counts()