In [19]:
import json
import os
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from PIL import Image
import shutil


from settings.config import RECIPES1M_PATH, RAW_RECIPES1M_PATH, VAL_SIZE, TEST_SIZE, IMG_STATS_FILENAME, METADATA_FILENAME
from settings.commons import tokenize_category

In [20]:
BASE_PATH = RECIPES1M_PATH
TEST_SIZE = 0.05
VAL_SIZE = 0.05

In [21]:
def trunk_str(string, max_len=15, char="_"):
    if len(string) <= max_len:
        return string
    
    trunk_id = string[:max_len].rfind(char)
    trunk_id = max_len if trunk_id == -1 else trunk_id
    
    return string[:trunk_id]

def download_img(url, save_path):
    if not url.startswith("https://"):
        url = "https://" + url    
    try:
        img_data = requests.get(url).content
        
        with open(save_path, "wb") as handler:
            handler.write(img_data)
    except Exception as e:
        # print(f"Error downloading image: {e}")
        raise FileNotFoundError

In [22]:
N_SAMPLES = 100000
seed = 42

recipes_raw = pd.read_csv(os.path.join(RAW_RECIPES1M_PATH, 'full_dataset.csv'), index_col=0)

In [30]:
recipes = recipes_raw.sample(frac=1, random_state=seed).reset_index().drop(columns=["index"]).reset_index()
recipes['name'] = recipes["title"].str.replace(" ", "_").str.lower().str.strip("_").fillna("UNK")
recipes = recipes.rename(columns={"link": "src_url", "index": 'id', 'NER': 'ingredients_ner'}).drop(columns=["title", "source", 'directions'])
recipes['image'] = recipes['name'].str.replace(":", "").apply(lambda name: trunk_str(name)).str.rstrip("_")
recipes['image'] = recipes['id'].astype(str) + "_____" + recipes['image'] + ".jpg"

recipes['ingredients'] = recipes['ingredients'].apply(lambda ingredients: json.loads(ingredients))
recipes['ingredients_ner'] = recipes['ingredients_ner'].apply(lambda ingredients_ok: json.loads(ingredients_ok))

In [31]:
recipes_downloaded_ids = [int(img.split("_____")[0]) for img in os.listdir(os.path.join(BASE_PATH, "download")) if img.endswith(".jpg")]
recipes_downloaded = recipes.loc[recipes['id'].isin(recipes_downloaded_ids), :]

recipes_downloaded.to_json(os.path.join(BASE_PATH, "download", METADATA_FILENAME), orient="records", indent=4)



In [32]:

tmp_name = recipes_downloaded['name'].apply(lambda name: trunk_str(name)).str.rstrip("_").values
recipes_downloaded["old_old_image"] = recipes_downloaded['id'].astype(str) + "_____" +  tmp_name + ".jpg"
recipes_downloaded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipes_downloaded["old_old_image"] = recipes_downloaded['id'].astype(str) + "_____" +  tmp_name + ".jpg"


Unnamed: 0,id,ingredients,src_url,ingredients_ner,name,image,old_old_image
0,0,"[1 1/2 pound flank steak, 1/2 c. finely minced...",cookeatshare.com/recipes/marinated-flank-steak...,"[flank steak, green onions, red wine, soy sauc...",marinated_flank_steak_recipe,0_____marinated.jpg,0_____marinated.jpg
1,1,"[1 tablespoon rosemary, 1 teaspoon thyme, 3 ba...",www.yummly.com/recipe/French-Chicken-Stew-1433580,"[rosemary, thyme, bay leaves, paprika, pepper,...",french_chicken_stew,1_____french_chicken.jpg,1_____french_chicken.jpg
3,3,"[4.5 Cups Flour, 1.5 Tsp Salt, Pinch Baking Po...",www.epicurious.com/recipes/member/views/moms-p...,"[Flour, Salt, Baking Powder, Sugar, Crisco, eg...",moms_pie_dough,3_____moms_pie_dough.jpg,3_____moms_pie_dough.jpg
5,5,"[3/4 cup sugar, 1/2 cup fresh orange juice, 1/...",www.epicurious.com/recipes/food/views/citrus-s...,"[sugar, orange juice, lemon juice]",citrus_syrup,5_____citrus_syrup.jpg,5_____citrus_syrup.jpg
6,6,"[1 large navel orange with skin, 7 cups water,...",www.epicurious.com/recipes/food/views/cranberr...,"[orange with skin, water, sugar, cinnamon stic...",cranberry_and_candied_orange_chutney,6_____cranberry_and.jpg,6_____cranberry_and.jpg
...,...,...,...,...,...,...,...
172146,172146,[1/2 cup sliced strawberries Safeway 1 lb For ...,www.kraftrecipes.com/recipes/patriotic-floatin...,"[boiling water, Gelatin, cold water, Topping]",patriotic_floating_fruit_parfaits,172146_____patriotic.jpg,172146_____patriotic.jpg
172147,172147,"[5 -6 cups noodles, 5 (15 ounce) cans tomato s...",www.food.com/recipe/mexican-soupa-11170,"[noodles, tomato sauce, hamburger, onion, butt...",mexican_soupa,172147_____mexican_soupa.jpg,172147_____mexican_soupa.jpg
172148,172148,"[4 medium artichokes, 2 lemons, halved, 1 cup ...",food52.com/recipes/35537-stuffed-artichokes,"[artichokes, lemons, breadcrumbs, Romano chees...",stuffed_artichokes,172148_____stuffed.jpg,172148_____stuffed.jpg
172152,172152,"[4 ounces granulated sugar, 3 to 4 tablespoons...",www.seriouseats.com/recipes/2012/08/strawberry...,"[sugar, starch, fresh strawberries, Egg, sandi...",strawberry_galette_recipe,172152_____strawberry.jpg,172152_____strawberry.jpg


In [33]:
train_size = len(recipes_downloaded) - int(len(recipes_downloaded) * (VAL_SIZE + TEST_SIZE))
val_size = int(len(recipes_downloaded) * VAL_SIZE)
test_size = int(len(recipes_downloaded) * TEST_SIZE)

recipes_train, recipes_test = train_test_split(recipes_downloaded, shuffle=True, test_size=test_size, random_state=seed)
recipes_train, recipes_val = train_test_split(recipes_train, shuffle=True, test_size=val_size, random_state=seed)
recipes_train, recipes_val, recipes_test = recipes_train.sort_values("id").reset_index(drop=True), recipes_val.sort_values("id").reset_index(drop=True), recipes_test.sort_values("id").reset_index(drop=True)

In [34]:
index_offset = 0
for recipes in [recipes_train, recipes_val, recipes_test]:
    recipes['old_id'] = recipes['id']
    recipes['old_image'] = recipes['image']
    recipes['id'] = recipes.index + index_offset
    recipes['image'] = recipes['id'].astype(str) + "_____" + recipes['image'].str.split("_____").str[1]
    
    index_offset += len(recipes)

In [35]:
for phase, recipes_dataset in zip(["train", "val", "test"], [recipes_train, recipes_val, recipes_test]):
    os.makedirs(os.path.join(BASE_PATH, phase), exist_ok=True)
    pbar = tqdm(total=len(recipes_dataset), desc=f"Copying {phase} recipes")
    
    for _, recipe in recipes_dataset.iterrows():
        try:
            shutil.copy(str(os.path.join(BASE_PATH, "download", recipe['old_image'])), str(os.path.join(BASE_PATH, phase, recipe['image'])))
        except FileNotFoundError:
            try:
                shutil.copy(str(os.path.join(BASE_PATH, "download", recipe['old_old_image'])), str(os.path.join(BASE_PATH, phase, recipe['image'])))
                recipe['old_image'] = recipe['old_old_image']
            except FileNotFoundError:
                print(f"Error copying image: {recipe['old_image']}")


        pbar.update(1)
    
    recipes_dataset.drop(columns=["old_old_image"]).to_json(os.path.join(BASE_PATH, phase, METADATA_FILENAME), orient="records", indent=4)
        

Copying train recipes:   0%|          | 0/89982 [00:00<?, ?it/s]

Copying val recipes:   0%|          | 0/4998 [00:00<?, ?it/s]

Copying test recipes:   0%|          | 0/4998 [00:00<?, ?it/s]

In [36]:
img_stats = os.path.join(BASE_PATH, IMG_STATS_FILENAME)
if os.path.exists(img_stats):
    os.remove(img_stats)