In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [12]:
# Descargar stopwords si no están disponibles
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# URL principal
main_url = "https://www.allrecipes.com/recipes-a-z-6735880"  

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.6778.265 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,/;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

In [4]:
# Listas para almacenar los datos
data = []

# Hacer la solicitud inicial
response = requests.get(main_url, headers=headers)

In [5]:
# Verificar si la solicitud fue exitosa
if response.status_code == 200:
    # Parsear el HTML de la página principal
    soup = BeautifulSoup(response.text, 'html.parser')

    # Buscar los enlaces principales
    main_items = soup.find_all('li', class_='mntl-link-list__item')

    # Usar tqdm para mostrar la barra de progreso
    for main_item in tqdm(main_items, desc="Procesando enlaces principales"):
        main_link = main_item.find('a')
        if main_link:
            main_text = main_link.get_text(strip=True)  # Texto del enlace principal
            main_href = main_link['href']  # URL del enlace principal
            
            # Hacer una solicitud a la URL del enlace principal
            inner_response = requests.get(main_href, headers=headers)
            
            # Verificar si la solicitud fue exitosa
            if inner_response.status_code == 200:
                # Parsear el HTML de la página interna
                inner_soup = BeautifulSoup(inner_response.text, 'html.parser')

                # Buscar los enlaces internos (según la clase indicada en tu imagen)
                inner_items = inner_soup.find_all('a', class_='comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image')

                for inner_item in inner_items:
                    inner_text = inner_item.get_text(strip=True) if inner_item else None
                    inner_href = inner_item['href'] if 'href' in inner_item.attrs else None

                    # Agregar datos al conjunto de resultados
                    data.append({
                        'Texto': main_text,
                        'URL': main_href,
                        'Receta': inner_text,
                        'Receta_URL': inner_href
                    })

else:
    print(f"Error al acceder a la página principal: {response.status_code}")

# Crear un DataFrame con los datos recopilados
df = pd.DataFrame(data)




Procesando enlaces principales: 100%|██████████| 378/378 [07:01<00:00,  1.11s/it]


In [6]:
df

Unnamed: 0,Texto,URL,Receta,Receta_URL
0,Air Fryer Recipes,https://www.allrecipes.com/recipes/23070/every...,Air Fryer Lemon Garlic Parmesan Chicken2Ratings,https://www.allrecipes.com/air-fryer-lemon-gar...
1,Air Fryer Recipes,https://www.allrecipes.com/recipes/23070/every...,Our 15 Best Air Fryer Thanksgiving Recipes,https://www.allrecipes.com/best-air-fryer-than...
2,Air Fryer Recipes,https://www.allrecipes.com/recipes/23070/every...,Air Fryer S’Mores1Rating,https://www.allrecipes.com/air-fryer-s-mores-r...
3,Air Fryer Recipes,https://www.allrecipes.com/recipes/23070/every...,Air Fryer Baked Yams,https://www.allrecipes.com/air-fryer-baked-yam...
4,Air Fryer Recipes,https://www.allrecipes.com/recipes/23070/every...,Lemon Garlic Butter Chicken Spiedini26Ratings,https://www.allrecipes.com/lemon-garlic-butter...
...,...,...,...,...
19088,Zucchini Breads,https://www.allrecipes.com/recipes/348/bread/q...,Gluten-Free Zucchini Bread (or Muffins)11Ratings,https://www.allrecipes.com/recipe/244775/glute...
19089,Zucchini Breads,https://www.allrecipes.com/recipes/348/bread/q...,Cherry-Zucchini Bread2Ratings,https://www.allrecipes.com/recipe/277978/cherr...
19090,Zucchini Breads,https://www.allrecipes.com/recipes/348/bread/q...,Savory Zucchini Muffins38Ratings,https://www.allrecipes.com/recipe/204983/savor...
19091,Zucchini Breads,https://www.allrecipes.com/recipes/348/bread/q...,Andy's Jalapeno Zucchini Bread3Ratings,https://www.allrecipes.com/recipe/239859/andys...


In [7]:
# Guardar en un archivo excel
df.to_excel('resultados_recetas_bs4.xlsx', index=False)

In [8]:
# Reduciremos los datos a 1000
df = df[:1000]

In [9]:
# Agregar nuevas columnas vacías al DataFrame
df['Nombre'] = ''
df['Ingredientes'] = ''
df['Preparación'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Nombre'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ingredientes'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Preparación'] = ''


In [10]:
# Iterar por las primeras 50 URLs en la columna "Receta_URL" con barra de progreso
for i, url in enumerate(tqdm(df['Receta_URL'], desc="Procesando URLs")):
    try:
        # Hacer la solicitud HTTP
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extraer el nombre
        nombre = soup.find('h1', class_='article-heading text-headline-400')
        df.at[i, 'Nombre'] = nombre.get_text(strip=True) if nombre else 'No encontrado'

        # Extraer los ingredientes
        ingredientes_list = soup.find_all('li', class_='mm-recipes-structured-ingredients__list-item')
        ingredientes_text = []
        for ingrediente in ingredientes_list:
            cantidad = ingrediente.find('span', {'data-ingredient-quantity': 'true'})
            unidad = ingrediente.find('span', {'data-ingredient-unit': 'true'})
            nombre_ingrediente = ingrediente.find('span', {'data-ingredient-name': 'true'})
            
            item = f"{cantidad.get_text(strip=True) if cantidad else ''} {unidad.get_text(strip=True) if unidad else ''} {nombre_ingrediente.get_text(strip=True) if nombre_ingrediente else ''}"
            ingredientes_text.append(item.strip())

        df.at[i, 'Ingredientes'] = ', '.join(ingredientes_text) if ingredientes_text else 'No encontrado'

        # Extraer la preparación
        preparacion_list = soup.find('ol', class_='comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--OL')
        pasos = []
        if preparacion_list:
            for paso in preparacion_list.find_all('li'):
                pasos.append(paso.get_text(strip=True))
        
        df.at[i, 'Preparación'] = ' '.join(pasos) if pasos else 'No encontrado'

    except Exception as e:
        # Manejo de errores
        df.at[i, 'Nombre'] = 'Error'
        df.at[i, 'Ingredientes'] = 'Error'
        df.at[i, 'Preparación'] = 'Error'
        print(f"Error procesando la URL {url}: {e}")

Procesando URLs: 100%|██████████| 1000/1000 [20:24<00:00,  1.22s/it]


In [11]:
# Guardar el DataFrame actualizado en un nuevo archivo Excel
output_file = 'resultados_recetas_actualizado_completo.xlsx'
df.to_excel(output_file, index=False)

In [13]:
def clean_text(raw_text):
    """
    Preprocesa texto eliminando caracteres especiales, palabras vacías (stopwords),
    y dejando solo palabras relevantes.

    Args:
        raw_text (str): Texto original a procesar.

    Returns:
        str: Texto limpio y preprocesado.
    """
    # Convertir a minúsculas
    raw_text = raw_text.lower()
    # Sustituir caracteres especiales por espacios
    raw_text = re.sub(r'[^a-zA-Z]', ' ', raw_text)
    # Tokenizar texto
    tokens = word_tokenize(raw_text)
    # Filtrar palabras que no sean relevantes (stopwords y palabras cortas)
    meaningful_words = filter(lambda word: word not in stop_words and len(word) > 2, tokens)
    return ' '.join(meaningful_words)

In [15]:
# Combinar columnas de texto y preprocesar
df['corpus'] = df['Ingredientes'].fillna('') + ' ' + df['Preparación'].fillna('')
df['tokens'] = df['corpus'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['corpus'] = df['Ingredientes'].fillna('') + ' ' + df['Preparación'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['corpus'].apply(clean_text)


In [16]:
# Generación de embeddings utilizando TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
recipe_embeddings = tfidf_vectorizer.fit_transform(df['tokens'])

In [17]:
def cluster_recipes(embedding_matrix, num_clusters=10):
    """
    Realiza agrupamiento (clustering) de recetas basado en los embeddings generados.

    Args:
        embedding_matrix (sparse matrix): Matriz de embeddings generada.
        num_clusters (int): Número de clusters a formar.

    Returns:
        list: Etiquetas de cluster asignadas a cada receta.
    """
    print("Realizando clustering con K-means...")
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans_model.fit_predict(embedding_matrix)
    return cluster_labels

In [18]:
# Agregar etiquetas de cluster a los datos
df['cluster'] = cluster_recipes(recipe_embeddings, num_clusters=10)

Realizando clustering con K-means...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cluster'] = cluster_recipes(recipe_embeddings, num_clusters=10)


In [23]:
def search_similar_recipes(user_query, embedding_matrix, recipe_df):
    """
    Busca recetas similares a una consulta proporcionada por el usuario usando similitud de coseno.

    Args:
        user_query (str): Consulta del usuario.
        embedding_matrix (sparse matrix): Matriz de embeddings de las recetas.
        recipe_df (DataFrame): DataFrame que contiene las recetas originales.

    Returns:
        DataFrame: DataFrame con las recetas más similares a la consulta.
    """
    
    # Preprocesar la consulta del usuario
    processed_query = clean_text(user_query)
    # Convertir la consulta en embeddings usando el mismo vectorizador
    query_embedding = tfidf_vectorizer.transform([processed_query])
    # Calcular similitudes de coseno
    similarity_scores = cosine_similarity(query_embedding, embedding_matrix).flatten()
    # Agregar las similitudes al DataFrame
    recipe_df['similarity'] = similarity_scores
    # Ordenar recetas por similitud y devolver las 10 mejores
    top_recipes = recipe_df.sort_values(by='similarity', ascending=False).head(10)
    return top_recipes[['Receta_URL', 'Receta', 'similarity', 'cluster', 'Ingredientes', 'Preparación']]

In [24]:
def example_query_run(user_query, recipe_df):
    """
    Ejecuta una consulta de ejemplo y muestra los resultados en formato legible.

    Args:
        user_query (str): Consulta del usuario.
        recipe_df (DataFrame): DataFrame con las recetas ya procesadas.
    """
    results = search_similar_recipes(user_query, recipe_embeddings, recipe_df)
    print("Recetas similares encontradas:")
    for _, row in results.iterrows():
        ingredientes = '\n'.join(f"- {ing.strip()}" for ing in row['Ingredientes'].split(','))
        preparacion = '\n'.join(f"- {step.strip()}" for step in row['Preparación'].split('.'))
        print(f"\nReceta: {row['Receta']}\n{row['Receta_URL']}\nSimilitud: {row['similarity']:.2f}\nCluster: {row['cluster']}\n\nIngredientes:\n{ingredientes}\n\nPreparación:\n{preparacion}")

In [25]:
# Ejecutar una consulta de ejemplo
user_query_example = "chicken with lemon"
example_query_run(user_query_example, df)

Recetas similares encontradas:

Receta: Sheet Pan Lemon Garlic Chicken with Vegetables
https://www.allrecipes.com/sheet-pan-lemon-garlic-chicken-with-vegetables-recipe-8750712
Similitud: 0.69
Cluster: 9

Ingredientes:
- 2 pounds skinless
- boneless chicken thighs
- salt and freshly ground black pepper to taste
- 1/4 cup unsalted butter
- 1/2 cup red onion
- 4  cloves garlic
- 2 teaspoons seasoning
- 1 pound asparagus
- 6  mini bell peppers
- 1/2 cup chicken broth
- 1  lemon
- zested and juiced
- lemon slices
- for garnish (optional)
- fresh parsley sprigs
- for garnish (optional)

Preparación:
- Preheat the oven to 400 degrees F (200 degrees C) and line a sheet pan with foil or parchment paper
- Pat chicken thighs dry with paper towels and season with salt and pepper
- Melt butter in a large skillet over medium-high heat
- When butter is sizzling, add chicken and cook until browned, 3 to 5 minutes per side
- Place chicken on the prepared sheet pan
- Roast chicken in the preheated oven 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipe_df['similarity'] = similarity_scores
