# Web Scrapping

## Nombre: Michael Pillaga


## Instalación de dependencias


In [1]:
!pip install requests beautifulsoup4



In [2]:
!pip install tqdm



## Paso 1: Web Scraping

Este código realiza **web scraping** en el sitio web de AllRecipes para extraer información detallada de recetas. Primero, recopila enlaces de todas las categorías de recetas disponibles. Luego, para cada categoría, extrae las URL de las recetas y recopila detalles como el título, descripción, ingredientes y pasos de cada receta. Los resultados, organizados por categoría, se almacenan en un archivo JSON llamado `all_recipes.json`. Se utiliza la biblioteca `BeautifulSoup` para analizar el contenido HTML, y se limita la extracción a un máximo de 500 recetas para evitar sobrecargar el servidor.


In [2]:
import requests
from bs4 import BeautifulSoup
import time
import json
from tqdm import tqdm

# Encabezados para simular un navegador
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive"
}

BASE_URL = "https://www.allrecipes.com"
CATEGORIES_URL = f"{BASE_URL}/recipes-a-z-6735880"
MAX_RECIPES = 500

# Función para obtener todas las URLs de las categorías
def get_category_urls():
    response = requests.get(CATEGORIES_URL, headers=HEADERS)
    if response.status_code != 200:
        print(f"Error fetching categories page: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    category_links = []
    for group in soup.select(".mntl-alphabetical-list__group"):
        for a_tag in group.select("a[href]"):
            category_links.append(a_tag["href"])

    print(f"Found {len(category_links)} categories.")
    return category_links

# Función para obtener todas las recetas de una categoría
def get_recipes_from_category(category_url):
    recipes = []

    while category_url and len(recipes) < MAX_RECIPES:
        print(f"Fetching recipes from: {category_url}")
        response = requests.get(category_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Error fetching category page: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")

        # Obtener las URLs de las recetas en los contenedores con IDs dinámicos
        for group_container in soup.select('[id^="mntl-taxonomysc-article-list-group_"]'):
            for recirc_container in group_container.select('[id^="tax-sc__recirc-list-container_"]'):
                for recipe_tag in recirc_container.select("a[href]"):
                    recipes.append(recipe_tag["href"])
                    if len(recipes) >= MAX_RECIPES:
                        break
                if len(recipes) >= MAX_RECIPES:
                    break
            if len(recipes) >= MAX_RECIPES:
                break

        print(f"Found {len(recipes)} recipes so far.")

        # Buscar el enlace a la siguiente página
        next_page_tag = soup.select_one(".category-page-list-related-nav-next-button a")
        category_url = next_page_tag["href"] if next_page_tag else None

        # Espera para evitar bloqueos
        time.sleep(1)

    return recipes

# Función para extraer información de una receta
def get_recipe_details(recipe_url):
    print(f"Fetching details for recipe: {recipe_url}")
    response = requests.get(recipe_url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Error fetching recipe page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Extraer título
    title_tag = soup.find(class_="article-heading text-headline-400")
    title = title_tag.text.strip() if title_tag else "No title found"

    # Extraer descripción
    description_tag = soup.find(class_="article-subheading text-body-100")
    description = description_tag.text.strip() if description_tag else "No description found"

    # Extraer ingredientes
    ingredients_tag = soup.find(class_="mm-recipes-structured-ingredients__list")
    ingredients = [li.text.strip() for li in ingredients_tag.find_all("li")] if ingredients_tag else []

    # Extraer pasos
    steps_tag = soup.find(id="mm-recipes-steps__content_1-0")
    steps = [p.text.strip() for p in steps_tag.find_all("p")] if steps_tag else []

    return {
        "title": title,
        "description": description,
        "ingredients": ingredients,
        "steps": steps,
        "url": recipe_url
    }

# Función principal para obtener todas las recetas por categoría
def scrape_all_recipes():
    categories = get_category_urls()
    if not categories:
        print("No categories found. Exiting.")
        return {}

    all_recipes = {}

    total_recipes = 0
    for category_url in tqdm(categories, desc="Scraping categories"):
        if total_recipes >= MAX_RECIPES:
            print("Reached the maximum number of recipes.")
            break

        category_name = category_url.split("/")[-2]
        print(f"Scraping category: {category_name}")

        recipes = get_recipes_from_category(category_url)
        recipe_details = []

        for recipe_url in recipes:
            if len(recipe_details) >= MAX_RECIPES:
                break
            details = get_recipe_details(recipe_url)
            if details:
                recipe_details.append(details)

        all_recipes[category_name] = {
            "count": len(recipe_details),
            "recipes": recipe_details
        }

        total_recipes += len(recipe_details)
        print(f"Category '{category_name}' has {len(recipe_details)} recipes.")

        # Espera para evitar bloqueos
        time.sleep(2)

    return all_recipes

if __name__ == "__main__":
    all_recipes_data = scrape_all_recipes()

    # Mostrar resultados en el output (solo las primeras 3 recetas de cada categoría)
    for category, data in all_recipes_data.items():
        print(f"Category: {category}")
        print(f"Total Recipes: {data['count']}")
        print("Sample Recipes:")
        for recipe in data["recipes"][:3]:
            print(f"- {recipe['title']}: {recipe['url']}")
        print()

    # Guardar los datos en un archivo JSON (con todas las recetas hasta el límite de 500)
    with open("all_recipes.json", "w", encoding="utf-8") as f:
        json.dump(all_recipes_data, f, ensure_ascii=False, indent=4)

    print("Scraping completado. Datos guardados en 'all_recipes.json'.")


Found 378 categories.


Scraping categories:   0%|          | 0/378 [00:00<?, ?it/s]

Scraping category: air-fryer
Fetching recipes from: https://www.allrecipes.com/recipes/23070/everyday-cooking/cookware-and-equipment/air-fryer/
Found 64 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/air-fryer-lemon-garlic-parmesan-chicken-recipe-8726749
Fetching details for recipe: https://www.allrecipes.com/best-air-fryer-thanksgiving-recipes-8729089
Fetching details for recipe: https://www.allrecipes.com/air-fryer-s-mores-recipe-8736955
Fetching details for recipe: https://www.allrecipes.com/air-fryer-baked-yams-recipe-8737640
Fetching details for recipe: https://www.allrecipes.com/lemon-garlic-butter-chicken-spiedini-recipe-8727930
Fetching details for recipe: https://www.allrecipes.com/air-fryer-grilled-pimento-cheese-recipe-8720986
Fetching details for recipe: https://www.allrecipes.com/air-fryer-chicken-parmesan-recipe-8698442
Fetching details for recipe: https://www.allrecipes.com/air-fryer-eggplant-recipe-8700773
Fetching details for recipe: https://ww

Scraping categories:   0%|          | 1/378 [00:33<3:29:35, 33.36s/it]

Scraping category: allrecipes-allstars
Fetching recipes from: https://www.allrecipes.com/recipes/16492/everyday-cooking/special-collections/allrecipes-allstars/
Found 64 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/halibut-en-papillote-recipe-8775088
Fetching details for recipe: https://www.allrecipes.com/thai-peanut-butter-ramen-recipe-8775397
Fetching details for recipe: https://www.allrecipes.com/italian-ricotta-cookie-bars-recipe-8775003
Fetching details for recipe: https://www.allrecipes.com/passion-fruit-mojito-recipe-8774704
Fetching details for recipe: https://www.allrecipes.com/apple-cider-pork-tenderloin-with-sweet-potatoes-recipe-8774666
Fetching details for recipe: https://www.allrecipes.com/mediterranean-salmon-baked-in-parchment-recipe-8768509
Fetching details for recipe: https://www.allrecipes.com/nordstrom-s-tomato-basil-soup-recipe-8769886
Fetching details for recipe: https://www.allrecipes.com/baked-sweet-and-sour-chicken-thighs-with-pineapp

Scraping categories:   1%|          | 2/378 [01:06<3:28:13, 33.23s/it]

Scraping category: angel-food-cake
Fetching recipes from: https://www.allrecipes.com/recipes/385/desserts/cakes/angel-food-cake/
Found 15 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/recipe/7868/mock-angel-food-cake/
Fetching details for recipe: https://www.allrecipes.com/recipe/244321/heavenly-raspberry-dessert/
Fetching details for recipe: https://www.allrecipes.com/recipe/25981/cherry-angel-food-cake/
Fetching details for recipe: https://www.allrecipes.com/recipe/72998/high-altitude-angel-food-cake/
Fetching details for recipe: https://www.allrecipes.com/recipe/283630/2-ingredient-pineapple-angel-food-cake/
Fetching details for recipe: https://www.allrecipes.com/recipe/8428/homemade-angel-food-cake/
Fetching details for recipe: https://www.allrecipes.com/recipe/283629/microwave-mini-angel-food-cake/
Fetching details for recipe: https://www.allrecipes.com/article/2-ingredient-cake-makes-cheap-and-easy-dessert/
Fetching details for recipe: https://www.allrec

Scraping categories:   1%|          | 3/378 [01:16<2:21:59, 22.72s/it]

Scraping category: antipasto
Fetching recipes from: https://www.allrecipes.com/recipes/102/appetizers-and-snacks/antipasto/
Found 34 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/recipe/106695/prosciutto-e-melone-italian-ham-and-melon/
Fetching details for recipe: https://www.allrecipes.com/recipe/53144/cherry-pepper-poppers/
Fetching details for recipe: https://www.allrecipes.com/antipasti-garlic-bread-recipe-8716000
Fetching details for recipe: https://www.allrecipes.com/recipe/104806/tomato-and-mozzarella-bites/
Fetching details for recipe: https://www.allrecipes.com/recipe/234616/fried-stuffed-squash-blossoms/
Fetching details for recipe: https://www.allrecipes.com/recipe/23401/antipasto/
Fetching details for recipe: https://www.allrecipes.com/recipe/57808/antipasto-squares/
Fetching details for recipe: https://www.allrecipes.com/recipe/213849/caprese-on-a-stick/
Fetching details for recipe: https://www.allrecipes.com/recipe/254419/watermelon-caprese-appet

Scraping categories:   1%|          | 4/378 [01:36<2:14:50, 21.63s/it]

Scraping category: appetizers-and-snacks
Fetching recipes from: https://www.allrecipes.com/recipes/76/appetizers-and-snacks/
Found 64 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/recipe/51326/chinese-tea-leaf-eggs/
Fetching details for recipe: https://www.allrecipes.com/cheese-puff-sliders-recipe-8768983
Fetching details for recipe: https://www.allrecipes.com/miso-aioli-recipe-8768230
Fetching details for recipe: https://www.allrecipes.com/bang-bang-chicken-nuggets-recipe-8767654
Fetching details for recipe: https://www.allrecipes.com/mediterranean-marinated-cheese-recipe-8767492
Fetching details for recipe: https://www.allrecipes.com/the-best-salmon-cakes-recipe-8764931
Fetching details for recipe: https://www.allrecipes.com/salmon-cream-cheese-recipe-8764414
Fetching details for recipe: https://www.allrecipes.com/justin-rowlands-philly-cheesesteak-egg-rolls-recipe-8762569
Fetching details for recipe: https://www.allrecipes.com/baked-boursin-tomato-dip-recip

Scraping categories:   1%|▏         | 5/378 [02:10<2:41:06, 25.92s/it]

Scraping category: apple-pie
Fetching recipes from: https://www.allrecipes.com/recipes/788/desserts/pies/apple-pie/
Found 64 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/caramel-apple-pie-cookies-recipe-7642173
Fetching details for recipe: https://www.allrecipes.com/recipe/230243/moms-cranberry-apple-pie/
Fetching details for recipe: https://www.allrecipes.com/recipe/12682/apple-pie-by-grandma-ople/
Fetching details for recipe: https://www.allrecipes.com/recipe/12681/apple-pie-filling/
Fetching details for recipe: https://www.allrecipes.com/recipe/230610/apple-tarte-tatin/
Fetching details for recipe: https://www.allrecipes.com/recipe/15942/canned-apple-pie-filling/
Fetching details for recipe: https://www.allrecipes.com/recipe/12324/apple-pie-i/
Fetching details for recipe: https://www.allrecipes.com/recipe/15587/apple-crumb-pie/
Fetching details for recipe: https://www.allrecipes.com/croatian-apple-pie-squares-recipe-7508367
Fetching details for recipe: htt

Scraping categories:   2%|▏         | 6/378 [02:47<3:04:23, 29.74s/it]

Scraping category: applesauce
Fetching recipes from: https://www.allrecipes.com/recipes/1333/side-dish/applesauce/
Found 31 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/recipe/234078/applesauce-for-the-freezer/
Fetching details for recipe: https://www.allrecipes.com/applesauce-muffins-recipe-8702299
Fetching details for recipe: https://www.allrecipes.com/recipe/13643/cranberry-sauce-extraordinaire/
Fetching details for recipe: https://www.allrecipes.com/recipe/267958/cranberry-sauce-with-apples/
Fetching details for recipe: https://www.allrecipes.com/recipe/223485/no-cook-applesauce/
Fetching details for recipe: https://www.allrecipes.com/recipe/233160/willis-farm-applesauce/
Fetching details for recipe: https://www.allrecipes.com/recipe/142643/blushing-applesauce/
Fetching details for recipe: https://www.allrecipes.com/recipe/228653/slow-cooker-cider-applesauce-no-sugar-added/
Fetching details for recipe: https://www.allrecipes.com/recipe/219275/dougs-easy-a

Scraping categories:   2%|▏         | 7/378 [03:05<2:40:28, 25.95s/it]

Scraping category: artichoke-dip
Fetching recipes from: https://www.allrecipes.com/recipes/14913/appetizers-and-snacks/dips-and-spreads/artichoke-dip/
Found 51 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/recipe/33474/artichoke-spinach-dip-restaurant-style/
Fetching details for recipe: https://www.allrecipes.com/recipe/268013/instant-pot-spinach-and-artichoke-dip/
Fetching details for recipe: https://www.allrecipes.com/recipe/223012/amazing-no-cook-spinach-artichoke-dip/
Fetching details for recipe: https://www.allrecipes.com/recipe/231086/hot-artichoke-and-crab-dip/
Fetching details for recipe: https://www.allrecipes.com/recipe/14814/hot-artichoke-spinach-dip/
Fetching details for recipe: https://www.allrecipes.com/recipe/16651/delicious-artichoke-dip/
Fetching details for recipe: https://www.allrecipes.com/recipe/60357/veronicas-hot-spinach-artichoke-and-chile-dip/
Fetching details for recipe: https://www.allrecipes.com/recipe/221191/chef-johns-hot-spinach-

Scraping categories:   2%|▏         | 8/378 [03:33<2:43:31, 26.52s/it]

Scraping category: bagels
Fetching recipes from: https://www.allrecipes.com/recipes/1537/bread/yeast-bread/bagels/
Found 24 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/breakfast-bagel-ideas-8415826
Fetching details for recipe: https://www.allrecipes.com/recipe/220619/real-homemade-bagels/
Fetching details for recipe: https://www.allrecipes.com/recipe/7178/bread-machine-bagels/
Fetching details for recipe: https://www.allrecipes.com/recipe/6849/boiled-bagels/
Fetching details for recipe: https://www.allrecipes.com/everything-bagel-grilled-cheese-recipe-8753365
Fetching details for recipe: https://www.allrecipes.com/article/bialys-vs-bagels/
Fetching details for recipe: https://www.allrecipes.com/recipe/223557/pumpernickel-bagels/
Fetching details for recipe: https://www.allrecipes.com/recipe/140020/moist-passover-bagel/
Fetching details for recipe: https://www.allrecipes.com/recipe/231089/multigrain-bagels/
Fetching details for recipe: https://www.allrecipes.

Scraping categories:   2%|▏         | 9/378 [03:47<2:20:18, 22.81s/it]

Scraping category: baked-beans
Fetching recipes from: https://www.allrecipes.com/recipes/1673/side-dish/beans-and-peas/baked-beans/
Found 64 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/southern-baked-beans-recipe-7554356
Fetching details for recipe: https://www.allrecipes.com/drunken-baked-beans-frijoles-borrachos-recipe-7511596
Fetching details for recipe: https://www.allrecipes.com/recipe/18255/boston-baked-beans/
Fetching details for recipe: https://www.allrecipes.com/recipe/165249/baked-beans-from-scratch/
Fetching details for recipe: https://www.allrecipes.com/recipe/21655/simple-baked-beans/
Fetching details for recipe: https://www.allrecipes.com/recipe/58389/slow-cooker-homemade-beans/
Fetching details for recipe: https://www.allrecipes.com/recipe/232677/homemade-baked-beans/
Fetching details for recipe: https://www.allrecipes.com/recipe/272258/instant-pot-baked-beans/
Fetching details for recipe: https://www.allrecipes.com/recipe/236866/texas-style-b

Scraping categories:   3%|▎         | 10/378 [04:22<2:42:27, 26.49s/it]

Scraping category: banana-bread
Fetching recipes from: https://www.allrecipes.com/recipes/343/bread/quick-bread/fruit-bread/banana-bread/
Found 64 recipes so far.
Fetching details for recipe: https://www.allrecipes.com/recipe/20144/banana-banana-bread/
Fetching details for recipe: https://www.allrecipes.com/recipe/230482/banana-chocolate-chip-bread/
Fetching details for recipe: https://www.allrecipes.com/recipe/241707/joys-easy-banana-bread/
Fetching details for recipe: https://www.allrecipes.com/recipe/17066/janets-rich-banana-bread/
Fetching details for recipe: https://www.allrecipes.com/recipe/67937/extreme-banana-nut-bread-ebnb/
Fetching details for recipe: https://www.allrecipes.com/article/banana-bread-mistakes/
Fetching details for recipe: https://www.allrecipes.com/recipe/6984/banana-sour-cream-bread/
Fetching details for recipe: https://www.allrecipes.com/recipe/16952/the-best-banana-bread/
Fetching details for recipe: https://www.allrecipes.com/article/how-to-ripen-bananas-fa

Scraping categories:   3%|▎         | 11/378 [04:59<2:46:18, 27.19s/it]

Reached the maximum number of recipes.
Category: air-fryer
Total Recipes: 64
Sample Recipes:
- Air Fryer Lemon Garlic Parmesan Chicken: https://www.allrecipes.com/air-fryer-lemon-garlic-parmesan-chicken-recipe-8726749
- Our 15 Best Air Fryer Thanksgiving Recipes: https://www.allrecipes.com/best-air-fryer-thanksgiving-recipes-8729089
- Air Fryer S’Mores: https://www.allrecipes.com/air-fryer-s-mores-recipe-8736955

Category: allrecipes-allstars
Total Recipes: 64
Sample Recipes:
- Halibut en Papillote: https://www.allrecipes.com/halibut-en-papillote-recipe-8775088
- Thai Peanut Butter Ramen: https://www.allrecipes.com/thai-peanut-butter-ramen-recipe-8775397
- Italian Ricotta Cookie Bars: https://www.allrecipes.com/italian-ricotta-cookie-bars-recipe-8775003

Category: angel-food-cake
Total Recipes: 15
Sample Recipes:
- Mock Angel Food Cake: https://www.allrecipes.com/recipe/7868/mock-angel-food-cake/
- Heavenly Raspberry Dessert: https://www.allrecipes.com/recipe/244321/heavenly-raspberry-




## Paso 2: Conversion del archivo json a csv con dataframe


Este código convierte un archivo JSON con información de recetas (`all_recipes.json`) en un archivo CSV (`all_recipes.csv`) utilizando `pandas`. Cada receta se transforma en una fila del DataFrame con las siguientes columnas:
- `category`: Categoría de la receta.
- `title`: Título de la receta.
- `description`: Descripción breve de la receta.
- `ingredients`: Lista de ingredientes, convertida en una cadena separada por punto y coma.
- `steps`: Pasos de preparación, convertidos en una cadena separada por punto y coma.
- `url`: Enlace a la receta original.

El archivo CSV resultante se guarda con todas las recetas organizadas, permitiendo un análisis o procesamiento posterior. El script también muestra en la consola cuántas recetas se guardaron.


In [3]:
import pandas as pd
import json

# Cargar el archivo JSON
with open("all_recipes.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Transformar los datos en un DataFrame
recipes_list = []
for category, details in data.items():
    for recipe in details["recipes"]:
        recipes_list.append({
            "category": category,
            "title": recipe["title"],
            "description": recipe["description"],
            "ingredients": "; ".join(recipe["ingredients"]),  # Convertir lista en cadena separada por punto y coma
            "steps": "; ".join(recipe["steps"]),  # Convertir lista en cadena separada por punto y coma
            "url": recipe["url"]
        })

# Crear el DataFrame
df = pd.DataFrame(recipes_list)

# Guardar el DataFrame en un archivo CSV
output_file = "all_recipes.csv"
df.to_csv(output_file, index=False, encoding="utf-8")

print(f"Archivo CSV guardado como '{output_file}' con {len(df)} recetas.")


Archivo CSV guardado como 'all_recipes.csv' con 539 recetas.


Aqui se mostraran de resultado las primeras 5 filas del dataframe del archivo all_recipes.csv

In [4]:
import pandas as pd

# Cargar el archivo CSV
csv_file = "all_recipes.csv"
df = pd.read_csv(csv_file)

# Mostrar las primeras 5 filas del DataFrame
print(df.head())


    category                                       title  \
0  air-fryer     Air Fryer Lemon Garlic Parmesan Chicken   
1  air-fryer  Our 15 Best Air Fryer Thanksgiving Recipes   
2  air-fryer                           Air Fryer S’Mores   
3  air-fryer                        Air Fryer Baked Yams   
4  air-fryer        Lemon Garlic Butter Chicken Spiedini   

                                         description  \
0  These simple lemon garlic Parmesan chicken thi...   
1       Thanksgiving dinner doesn't need to be hard.   
2  This recipe for air fryer s'mores is perfect f...   
3  These air fryer baked yams free up the oven, w...   
4  These lemon garlic butter chicken spiedini are...   

                                         ingredients  \
0  1 1/2 pounds skinless boneless chicken thighs;...   
1                                                NaN   
2  1 sleeve graham crackers; 5 (1.5 ounce) chocol...   
3                      1 yam; 1/2 teaspoon olive oil   
4  1/2 cup extra-virgi

## Paso 3: Preprocesamiento y normalización


Este código realiza el **preprocesamiento y normalización** de los textos en un archivo CSV de recetas (`all_recipes.csv`). El proceso incluye:
1. **Preprocesamiento**:
   - Convertir todo el texto a minúsculas.
   - Eliminar caracteres especiales, puntuación y espacios múltiples.
2. **Normalización**:
   - **Lematización**: Reduce las palabras a su forma base gramatical (e.g., "running" → "run").
   - **Eliminación de stopwords**: Remueve palabras comunes como "and", "the", etc., que no aportan significado.

El preprocesamiento se aplica a las columnas `title`, `description`, `ingredients` y `steps`. El resultado se guarda en un nuevo archivo CSV llamado `all_recipes_preprocessed_normalized.csv`, listo para su uso en análisis posteriores o modelos de aprendizaje automático.

El script también utiliza la biblioteca NLTK para realizar la lematización y gestionar las stopwords.


In [6]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Descargar recursos necesarios de NLTK
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# Cargar el archivo CSV
csv_file = "all_recipes.csv"
df = pd.read_csv(csv_file)

# Inicializar lematizador y stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Función para preprocesar y normalizar texto
def preprocess_and_normalize(text):
    if pd.isna(text):
        return ""
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar caracteres especiales y puntuación
    text = re.sub(r"[^a-z0-9áéíóúñü ]", "", text)
    # Eliminar espacios múltiples
    text = re.sub(r"\s+", " ", text).strip()
    # Tokenizar y lematizar
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(tokens)

# Aplicar preprocesamiento y normalización a las columnas relevantes
df["title"] = df["title"].apply(preprocess_and_normalize)
df["description"] = df["description"].apply(preprocess_and_normalize)
df["ingredients"] = df["ingredients"].apply(preprocess_and_normalize)
df["steps"] = df["steps"].apply(preprocess_and_normalize)

# Guardar el DataFrame preprocesado y normalizado en un nuevo archivo CSV
output_file = "all_recipes_preprocessed_normalized.csv"
df.to_csv(output_file, index=False, encoding="utf-8")

print(f"Preprocesamiento y normalización completados. Archivo guardado como '{output_file}'.")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Preprocesamiento y normalización completados. Archivo guardado como 'all_recipes_preprocessed_normalized.csv'.


Se cargara el archivo creado con el código anterior y se mostraran los 5 datos del archivo

In [7]:
import pandas as pd

# Cargar el archivo CSV preprocesado y normalizado
csv_file = "all_recipes_preprocessed_normalized.csv"
df = pd.read_csv(csv_file)

# Mostrar las primeras 5 filas del DataFrame
print("Primeros 5 datos del archivo preprocesado y normalizado:")
print(df.head())


Primeros 5 datos del archivo preprocesado y normalizado:
    category                                    title  \
0  air-fryer  air fryer lemon garlic parmesan chicken   
1  air-fryer    15 best air fryer thanksgiving recipe   
2  air-fryer                         air fryer smores   
3  air-fryer                      air fryer baked yam   
4  air-fryer     lemon garlic butter chicken spiedini   

                                         description  \
0  simple lemon garlic parmesan chicken thigh coo...   
1               thanksgiving dinner doesnt need hard   
2  recipe air fryer smores perfect time craving b...   
3  air fryer baked yam free oven especially helpf...   
4  lemon garlic butter chicken spiedini airfried ...   

                                         ingredients  \
0  1 12 pound skinless boneless chicken thigh 3 c...   
1                                                NaN   
2  1 sleeve graham cracker 5 15 ounce chocolate c...   
3                        1 yam 12 teasp

## Paso 4: Embeddings

Este código utiliza el modelo de `SentenceTransformer` para generar **embeddings** (representaciones vectoriales) de las recetas contenidas en un archivo CSV preprocesado (`all_recipes_preprocessed_normalized.csv`). El flujo es el siguiente:

1. **Preparación del texto**:
   - Combina las columnas `title`, `description`, `ingredients` y `steps` en un único campo llamado `combined_text`.

2. **Generación de embeddings**:
   - Usa el modelo `all-mpnet-base-v2` para convertir cada texto combinado en un vector de alta dimensión que captura su significado semántico.

3. **Guardado de resultados**:
   - Los embeddings se guardan en un archivo binario llamado `recipe_embeddings.npy`.
   - El DataFrame actualizado, con los textos combinados y los índices de embeddings, se guarda en `all_recipes_with_embeddings.csv`.

4. **Validación**:
   - Muestra las primeras 5 recetas con sus títulos e índices de embeddings para verificar que los datos se procesaron correctamente.

Este código es útil para preparar las recetas para búsquedas semánticas o para entrenar modelos de aprendizaje automático.

In [8]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Cargar el archivo CSV preprocesado y normalizado
csv_file = "all_recipes_preprocessed_normalized.csv"
df = pd.read_csv(csv_file)

# Inicializar el modelo de embeddings
model = SentenceTransformer('all-mpnet-base-v2')

# Combinar las columnas relevantes en un solo texto para cada receta
def combine_text(row):
    return f"{row['title']} {row['description']} {row['ingredients']} {row['steps']}"

df['combined_text'] = df.apply(combine_text, axis=1)

# Generar embeddings para cada receta
print("Generando embeddings...")
embeddings = model.encode(df['combined_text'].tolist(), show_progress_bar=True)

# Guardar los embeddings como un archivo separado
np.save("recipe_embeddings.npy", embeddings)

# Agregar una columna con los índices de los embeddings en el DataFrame
df['embedding_index'] = range(len(embeddings))

# Guardar el DataFrame con los índices de embeddings
output_file = "all_recipes_with_embeddings.csv"
df.to_csv(output_file, index=False, encoding="utf-8")

print(f"Embeddings generados y guardados. Archivo guardado como '{output_file}'.")

# Mostrar una muestra de 5 datos para validar
print("Muestra de 5 datos del DataFrame con embeddings:")
print(df[['title', 'embedding_index']].head(5))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generando embeddings...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Embeddings generados y guardados. Archivo guardado como 'all_recipes_with_embeddings.csv'.
Muestra de 5 datos del DataFrame con embeddings:
                                     title  embedding_index
0  air fryer lemon garlic parmesan chicken                0
1    15 best air fryer thanksgiving recipe                1
2                         air fryer smores                2
3                      air fryer baked yam                3
4     lemon garlic butter chicken spiedini                4


## Instalación de dependencias

Este bloque de código instala **FAISS**.

1. **Instalación del paquete FAISS a través de `apt`**:
   - `!apt-get install -y libfaiss-dev`: Instala las bibliotecas base de FAISS en el sistema operativo.
   
2. **Instalación del módulo de Python**:
   - `!pip install faiss-cpu`: Instala la versión compatible con CPU de la biblioteca FAISS para Python.

Estas instalaciones son necesarias para usar FAISS en Python, lo cual es útil para realizar búsquedas rápidas y eficientes en grandes bases de datos vectoriales.


In [10]:
!apt-get install -y libfaiss-dev
!pip install faiss-cpu


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  libfaiss-dev
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 949 kB of archives.
After this operation, 6,224 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libfaiss-dev amd64 1.7.2-5 [949 kB]
Fetched 949 kB in 1s (1,287 kB/s)
Selecting previously unselected package libfaiss-dev:amd64.
(Reading database ... 124574 files and directories currently installed.)
Preparing to unpack .../libfaiss-dev_1.7.2-5_amd64.deb ...
Unpacking libfaiss-dev:amd64 (1.7.2-5) ...
Setting up libfaiss-dev:amd64 (1.7.2-5) ...
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [11]:
import faiss
print("FAISS importado correctamente.")


FAISS importado correctamente.


## Paso 5: Faiss

Este código utiliza la biblioteca **FAISS** para crear un índice de similitud basado en los embeddings generados previamente, permitiendo realizar búsquedas eficientes en el espacio vectorial.

1. **Cargar embeddings**:
   - Se cargan los embeddings desde el archivo `recipe_embeddings.npy`.

2. **Crear el índice FAISS**:
   - Se inicializa un índice plano (`IndexFlatL2`) basado en la distancia Euclidiana (L2) para calcular similitudes entre vectores.

3. **Agregar embeddings al índice**:
   - Los vectores de embeddings se agregan al índice FAISS para su futura búsqueda.

4. **Verificar el índice**:
   - Muestra cuántos vectores (embeddings) han sido indexados.

5. **Guardar el índice**:
   - El índice FAISS se guarda en un archivo llamado `faiss_recipe_index.index` para reutilizarlo posteriormente.

Este código es una etapa clave para sistemas de búsqueda semántica, ya que organiza los embeddings en una estructura eficiente para realizar consultas rápidas.


In [12]:
import faiss
import numpy as np

# Cargar los embeddings generados previamente
embeddings_file = "recipe_embeddings.npy"
embeddings = np.load(embeddings_file)

# Crear un índice FAISS (Flat Index para búsquedas de similitud exacta)
embedding_dimension = embeddings.shape[1]  # Dimensión de los vectores
index = faiss.IndexFlatL2(embedding_dimension)  # Índice con distancia L2 (Euclidiana)

# Agregar los embeddings al índice
print("Agregando embeddings al índice FAISS...")
index.add(embeddings)

# Verificar cuántos embeddings se han indexado
print(f"Número de embeddings en el índice: {index.ntotal}")

# Guardar el índice FAISS
faiss.write_index(index, "faiss_recipe_index.index")
print("Índice FAISS guardado como 'faiss_recipe_index.index'.")


Agregando embeddings al índice FAISS...
Número de embeddings en el índice: 539
Índice FAISS guardado como 'faiss_recipe_index.index'.


## Resultados con busquedas

Este código realiza **búsquedas** en un índice FAISS previamente creado, utilizando embeddings generados con Sentence Transformers. Permite encontrar recetas similares a una consulta de texto.

1. **Cargar los datos**:
   - Se carga el índice FAISS (`faiss_recipe_index.index`), los embeddings originales (`recipe_embeddings.npy`), y el DataFrame con información de recetas (`all_recipes_with_embeddings.csv`).

2. **Modelo de embeddings**:
   - Utiliza `SentenceTransformer` (`all-mpnet-base-v2`) para convertir la consulta en un embedding vectorial.

3. **Consulta en FAISS**:
   - La función `query_faiss`:
     - Genera un embedding para la consulta de texto.
     - Busca los `k` vecinos más cercanos en el índice FAISS.
     - Recupera las recetas más relevantes, incluyendo su título, descripción, y la distancia calculada.

4. **Resultados**:
   - Muestra los `k` resultados más relevantes para la consulta `"apple pie with cinnamon"`, ordenados por similitud (distancia más baja).

Este código es ideal para implementar sistemas de búsqueda semántica en bases de datos grandes, utilizando embeddings para capturar el significado del texto y FAISS para realizar consultas rápidas.


### Consulta #1

In [13]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Cargar el índice FAISS
index_file = "faiss_recipe_index.index"
index = faiss.read_index(index_file)

# Cargar los embeddings originales y el DataFrame de recetas
embeddings_file = "recipe_embeddings.npy"
embeddings = np.load(embeddings_file)

data_file = "all_recipes_with_embeddings.csv"
df = pd.read_csv(data_file)

# Inicializar el modelo de embeddings para procesar consultas
model = SentenceTransformer('all-mpnet-base-v2')

# Función para realizar una consulta en el índice FAISS
def query_faiss(query, k=5):
    # Generar el embedding de la consulta
    query_embedding = model.encode([query])

    # Buscar los k vecinos más cercanos
    distances, indices = index.search(query_embedding, k)

    # Mostrar resultados
    results = []
    for i, idx in enumerate(indices[0]):
        result = {
            "rank": i + 1,
            "title": df.loc[idx, "title"],
            "description": df.loc[idx, "description"],
            "distance": distances[0][i]
        }
        results.append(result)
    return results

# Prueba de consulta
query_text = "apple pie with cinnamon"
k = 5
results = query_faiss(query_text, k)

print(f"Resultados para la consulta: '{query_text}'")
for result in results:
    print(f"Rank {result['rank']}: {result['title']} (Distancia: {result['distance']:.4f})")


Resultados para la consulta: 'apple pie with cinnamon'
Rank 1: sunday apple pie (Distancia: 0.4953)
Rank 2: nosugar apple pie (Distancia: 0.5543)
Rank 3: easy apple pie (Distancia: 0.5604)
Rank 4: spiced apple topping pancake (Distancia: 0.5752)
Rank 5: apple crumble pie (Distancia: 0.5934)


### Consulta #2

In [14]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Cargar el índice FAISS
index_file = "faiss_recipe_index.index"
index = faiss.read_index(index_file)

# Cargar los embeddings originales y el DataFrame de recetas
embeddings_file = "recipe_embeddings.npy"
embeddings = np.load(embeddings_file)

data_file = "all_recipes_with_embeddings.csv"
df = pd.read_csv(data_file)

# Inicializar el modelo de embeddings para procesar consultas
model = SentenceTransformer('all-mpnet-base-v2')

# Función para realizar una consulta en el índice FAISS
def query_faiss(query, k=5):
    # Generar el embedding de la consulta
    query_embedding = model.encode([query])

    # Buscar los k vecinos más cercanos
    distances, indices = index.search(query_embedding, k)

    # Mostrar resultados
    results = []
    for i, idx in enumerate(indices[0]):
        result = {
            "rank": i + 1,
            "title": df.loc[idx, "title"],
            "description": df.loc[idx, "description"],
            "distance": distances[0][i]
        }
        results.append(result)
    return results

# Prueba de consulta
query_text = "chicken"
k = 5
results = query_faiss(query_text, k)

print(f"Resultados para la consulta: '{query_text}'")
for result in results:
    print(f"Rank {result['rank']}: {result['title']} (Distancia: {result['distance']:.4f})")


Resultados para la consulta: 'chicken'
Rank 1: bang bang chicken nugget (Distancia: 1.1280)
Rank 2: turmeric black pepper chicken broccoli (Distancia: 1.1507)
Rank 3: air fryer chicken bite (Distancia: 1.1663)
Rank 4: lemon garlic butter chicken spiedini (Distancia: 1.1772)
Rank 5: lemon garlic butter chicken spiedini (Distancia: 1.1772)
