In [2]:
import os
import pickle
import pandas as pd

# ---------- INPUT ----------
# Pickled recipe metadata to be cleaned (some of its image paths are broken).
META_PATH = r"D:\food_recommender\embeddings\meta.pkl"
# Image folder under what looks like a Colab Google Drive mount.
# NOTE(review): not referenced in the visible cells; presumably used when
# building the Drive-path variant of the metadata. Confirm.
DRIVE_IMAGE_FOLDER = r"/content/drive/MyDrive/food_recommender/data/Food Images"
# ---------- OUTPUT ----------
# Destination for the cleaned metadata that keeps the original image paths.
OUT_ORIGINAL = r"D:\food_recommender\embeddings\meta_clean_original_paths.pkl"
# NOTE(review): relative path, so it resolves against the kernel's working
# directory; not written by any visible cell. Confirm where it is used.
OUT_DRIVE = "meta_clean_drive_paths.pkl"
# ----------------------------

# Read the pickled metadata from META_PATH and wrap it in a DataFrame.
# NOTE: pickle.load can execute arbitrary code while deserialising, so
# META_PATH must only ever point at a trusted file.
with open(META_PATH, mode="rb") as meta_file:
    meta = pickle.load(meta_file)

df = pd.DataFrame(data=meta)

# Report the row count before any filtering is applied.
print("Before cleaning:", df.shape[0])

# Function to check if an image file exists
def is_valid(path: object) -> bool:
    """Return True when ``path`` is a string naming an existing regular file.

    Args:
        path: Candidate image path. Any non-string value (for example ``None``
            or a float NaN) is rejected without touching the filesystem.

    Returns:
        True if ``path`` is a ``str`` and refers to an existing file,
        otherwise False.
    """
    # isfile rather than exists: a path that resolves to a directory (e.g. the
    # image folder itself) must not be counted as an available image.
    return isinstance(path, str) and os.path.isfile(path)

# Flag every row with whether its image_path passes is_valid.
df["Image_Exists"] = df["image_path"].apply(is_valid)

# Number of rows whose flag is False.
missing = (~df["Image_Exists"]).sum()
print(f"Missing images: {missing}")

# Keep only the flagged-True rows, renumber them from 0, and drop the helper
# flag column so df_clean has the same columns df had before flagging.
df_clean = (
    df.loc[df["Image_Exists"].eq(True)]
    .reset_index(drop=True)
    .drop(columns=["Image_Exists"])
)

print("After cleaning:", df_clean.shape[0])

# -------------------------------
# 1️⃣ meta_clean_original_paths.pkl
# -------------------------------
# Columns exported to the pickle, in the order they appear in the saved dict.
# image_path values are taken from df_clean unchanged (original Windows paths).
export_columns = (
    "title",
    "ingredients",
    "cleaned_ingredients",
    "instructions",
    "image_path",
)

# Column name -> plain Python list of that column's values.
meta_clean_original = {
    column: df_clean[column].tolist() for column in export_columns
}

# Serialise the dict of lists to OUT_ORIGINAL.
with open(OUT_ORIGINAL, mode="wb") as out_file:
    pickle.dump(meta_clean_original, out_file)

print("Saved:", OUT_ORIGINAL)



Before cleaning: 13501
Missing images: 30
After cleaning: 13471
Saved: D:\food_recommender\embeddings\meta_clean_original_paths.pkl


In [3]:
# Export the cleaned frame to CSV. index=True is the pandas default, spelled
# out here because it writes the row index as an extra, unnamed first column.
df_clean.to_csv(r"D:\food_recommender\data\cleaned_df.csv", index=True)

In [3]:
# Preview the first five rows of the cleaned frame.
df_clean.head(n=5)

Unnamed: 0,title,ingredients,cleaned_ingredients,instructions,image_path
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...",purpose bread onion oil total cored olive as r...,"Pat chicken dry with paper towels, season all ...",D:\food_recommender\data\Food Images\miso-butt...
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",salt about in thyme kosher new pepper potatoes...,Preheat oven to 400°F and line a rimmed baking...,D:\food_recommender\data\Food Images\crispy-sa...
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",onion cheese sharp full elbow macaroni pepper ...,Place a rack in middle of oven; preheat to 400...,D:\food_recommender\data\Food Images\thanksgiv...
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",celery turkey heavy ribs cubes oil olive butte...,Preheat oven to 350°F with rack in middle. Gen...,D:\food_recommender\data\Food Images\italian-s...
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",hot butter water garnish cinnamon apple orange...,Stir together brown sugar and hot water in a c...,D:\food_recommender\data\Food Images\newtons-l...


In [7]:
# Preview the first five image paths kept after cleaning.
df_clean["image_path"].head(n=5)

0    D:\food_recommender\data\Food Images\miso-butt...
1    D:\food_recommender\data\Food Images\crispy-sa...
2    D:\food_recommender\data\Food Images\thanksgiv...
3    D:\food_recommender\data\Food Images\italian-s...
4    D:\food_recommender\data\Food Images\newtons-l...
Name: image_path, dtype: object