In [1]:
!pip install opendatasets --upgrade
!pip install isodate



In [2]:
import os
import requests
import random
import opendatasets as od
import pandas as pd
import isodate # for total, prep and cook times
import matplotlib.pyplot as plt
import numpy as np
import os
import h5py
import shutil
import zipfile
import torch
import h5py
import tensorflow as tf


from keras.applications import vgg16, vgg19, resnet50
from keras.models import Model
from keras.utils import load_img, img_to_array
from keras.applications.imagenet_utils import preprocess_input


from PIL import Image
from torchvision import models, transforms


from tqdm import tqdm
from joblib import Parallel, delayed


# Descarga de imágenes

In [None]:
# Fetch the Food.com recipes-and-reviews dataset from Kaggle into ./data.
od.download_kaggle_dataset(
    "https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews",
    "data",
)

In [None]:
# Load the recipes table from the downloaded dataset; the trailing
# expression previews the first rows as the cell output.
recipes = pd.read_parquet(
    "data/foodcom-recipes-and-reviews/recipes.parquet"
)
recipes.head()

In [None]:
# Inspect the "Images" entry of the row labelled 0 (used further down as a
# collection of image URLs per recipe).
recipes["Images"][0]

In [None]:
# Guard for the bulk image download below: the download only runs when True.
FLAG = False
def download_image(url, recipe_idx, image_idx):
    """Download one image and store it under ``images/recipe_<recipe_idx>/``.

    Args:
        url: Address of the image to fetch.
        recipe_idx: Index of the recipe; used to name the destination folder.
        image_idx: Index of the image; used to name the destination file.

    Returns:
        True only when the server answered 200 and the file was written;
        False when the request failed or returned any other status code.
    """
    recipe_folder = f"images/recipe_{recipe_idx}"
    os.makedirs(recipe_folder, exist_ok=True)
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f"Error al descargar la imagen {image_idx} de la receta {recipe_idx}: {e}")
        return False

    # A non-200 answer means nothing usable was received: report failure
    # instead of claiming success without having written a file.
    if response.status_code != 200:
        print(
            f"Error al descargar la imagen {image_idx} de la receta {recipe_idx}: "
            f"HTTP {response.status_code}"
        )
        return False

    with open(f"{recipe_folder}/image_{image_idx}.jpg", "wb") as file:
        file.write(response.content)
    return True
if FLAG:
    # Only one randomly chosen image is downloaded per recipe (to keep things
    # simple); it is always stored with image index 0. The alternative of
    # downloading every URL of every recipe was tried and dropped.
    download_tasks = [
        (random.choice(links), position, 0)
        for position, links in enumerate(recipes["Images"])
        if not (links is None or len(links) <= 0)
    ]

    num_threads = 10

    # Fan the downloads out over joblib workers; tqdm tracks task dispatch.
    results = Parallel(n_jobs=num_threads)(
        delayed(download_image)(*task) for task in tqdm(download_tasks)
    )

    print("Descarga completada.")


# Embeddings ResNet50

In [None]:
# Pretrained torchvision ResNet50 with its last child module dropped, so the
# network outputs features instead of class scores. `.eval()` returns the
# module itself, so the chained call leaves `model` in inference mode.
model = torch.nn.Sequential(
    *list(models.resnet50(pretrained=True).children())[:-1]
).eval()

# Preprocessing pipeline applied to every image before it is fed to `model`.
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
def get_embedding(image_path):
    """Compute the embedding of a single image with the module-level `model`.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A NumPy array with the squeezed model output, or None when the image
        cannot be opened or converted to RGB.
    """
    try:
        # The context manager closes the underlying file as soon as the RGB
        # copy has been produced.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception:
        # Unreadable images are skipped on purpose. `Exception` (instead of a
        # bare `except`) lets KeyboardInterrupt/SystemExit propagate, so a
        # long run can still be stopped.
        return None
    input_tensor = preprocess(image)
    input_batch = input_tensor.unsqueeze(0)  # add the batch dimension
    with torch.no_grad():
        embedding = model(input_batch)
    return embedding.squeeze().numpy()

def process_recipe(recipe_idx):
    """Embed the first image of one recipe folder.

    Args:
        recipe_idx: Name of the recipe folder inside ``images``.

    Returns:
        ``(recipe_idx, embedding)`` when ``image_0.jpg`` exists in the folder
        (the embedding is whatever `get_embedding` returns), otherwise None.
    """
    image_path = os.path.join('images', recipe_idx, 'image_0.jpg')
    if not os.path.exists(image_path):
        return None
    return recipe_idx, get_embedding(image_path)

In [None]:
# Every sub-directory of ./images is treated as one recipe folder.
recipe_folders = list(
    filter(
        lambda name: os.path.isdir(os.path.join('images', name)),
        os.listdir('images'),
    )
)

# A single joblib worker is used for the embedding pass.
num_threads = 1
results = Parallel(n_jobs=num_threads)(
    delayed(process_recipe)(recipe_idx) for recipe_idx in tqdm(recipe_folders)
)

In [None]:
# Keep only the (recipe folder, embedding) pairs that carry a real embedding.
final_results = []
for result in results:
    if result is None:
        # process_recipe found no image_0.jpg for this recipe folder.
        print("No tiene fotos")
    elif not isinstance(result[1], np.ndarray):
        # The image exists but get_embedding could not load it.
        print("No se puede cargar la foto")
    else:
        final_results.append(result)



In [None]:
# Persist every embedding as its own dataset, named "<recipe folder>_embedding".
with h5py.File('food_embeddings_ResNet1.h5', 'w') as h5f:
    for result in final_results:
        if result is None:
            continue
        recipe_idx, embedding = result
        h5f.create_dataset(
            f"{recipe_idx}_embedding",
            data=embedding,
        )
print("Embeddings generados y guardados.")

# Embeddings vistos en Ayudantía

In [None]:
# Build a feature extractor that outputs the "fc2" layer of an
# ImageNet-pretrained VGG19.
with tf.device('/GPU:0'):
    vgg19_model = vgg19.VGG19(weights='imagenet')
    feat_extractor = Model(inputs=vgg19_model.input, outputs=vgg19_model.get_layer("fc2").output)


# Input resolution the images are resized to before being fed to the network.
imgs_model_width, imgs_model_height = 224, 224

recipe_folders = [folder for folder in os.listdir('images') if os.path.isdir(os.path.join('images', folder))]
embeddings = []
ids = []

# One dataset per recipe folder, named "<folder>_embedding".
with h5py.File('food_embeddings_vgg19.h5', 'w') as h5f:
    for folder in tqdm(recipe_folders):
        image_path = os.path.join('images', folder, 'image_0.jpg')
        if not os.path.exists(image_path):
            continue
        try:
            # Keras expects target_size as (height, width).
            original = load_img(image_path, target_size=(imgs_model_height, imgs_model_width))
            numpy_image = img_to_array(original)
            image_batch = np.expand_dims(numpy_image, axis=0)
            processed_img = preprocess_input(image_batch)
        except Exception:
            # Unreadable images are skipped on purpose; `Exception` (not a
            # bare `except`) keeps Ctrl-C able to stop this multi-hour loop.
            continue
        with tf.device('/GPU:0'):
            embedding = feat_extractor.predict(processed_img, verbose=0)

        h5f.create_dataset(f"{folder}_embedding", data=embedding.flatten())

print("Embeddings generados y guardados.")

100%|██████████| 165896/165896 [9:51:41<00:00,  4.67it/s]  

Embeddings generados y guardados.





In [None]:
# Name of the Keras backbone to use: 'vgg16' or 'resnet50'.
modelo_escogido = 'vgg16'

# NOTE(review): `model` rebinds the name that the torchvision network was
# assigned to earlier in the notebook and that `get_embedding` reads; re-running
# that function after this cell will use the Keras model instead.
with tf.device('/GPU:0'):
    if modelo_escogido == 'resnet50':
        model = resnet50.ResNet50(weights='imagenet')
        feat_extractor = Model(inputs=model.input, outputs=model.get_layer("avg_pool").output)
    elif modelo_escogido == 'vgg16':
        model = vgg16.VGG16(weights='imagenet')
        feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output)
    else:
        # Fail loudly instead of silently reusing a feature extractor left
        # over from a previous cell.
        raise ValueError(f"Modelo no soportado: {modelo_escogido!r}")

# Input resolution the images are resized to before being fed to the network.
imgs_model_width, imgs_model_height = 224, 224

recipe_folders = [folder for folder in os.listdir('images') if os.path.isdir(os.path.join('images', folder))]
embeddings = []
ids = []

# The output file follows the chosen model, so a 'resnet50' run no longer
# lands in the VGG16 file ('vgg16' still yields 'food_embeddings_vgg16.h5').
output_path = f'food_embeddings_{modelo_escogido}.h5'

# One dataset per recipe folder, named "<folder>_embedding".
with h5py.File(output_path, 'w') as h5f:
    for folder in tqdm(recipe_folders):
        image_path = os.path.join('images', folder, 'image_0.jpg')
        if not os.path.exists(image_path):
            continue
        try:
            # Keras expects target_size as (height, width).
            original = load_img(image_path, target_size=(imgs_model_height, imgs_model_width))
            numpy_image = img_to_array(original)
            image_batch = np.expand_dims(numpy_image, axis=0)
            processed_img = preprocess_input(image_batch)
        except Exception:
            # Unreadable images are skipped on purpose; `Exception` (not a
            # bare `except`) keeps Ctrl-C able to stop the loop.
            continue
        with tf.device('/GPU:0'):
            # verbose=0 avoids printing one progress bar per image.
            embedding = feat_extractor.predict(processed_img, verbose=0)

        h5f.create_dataset(f"{folder}_embedding", data=embedding.flatten())

print("Embeddings generados y guardados.")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
[1m553467096/553467096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 0us/step
