In [1]:
%pip install requests beautifulsoup4 ingredient-parser-nlp scikit-learn

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.13.1-py3-none-any.whl.metadata (3.8 kB)
Collecting ingredient-parser-nlp
  Using cached ingredient_parser_nlp-1.3.2-py3-none-any.whl.metadata (5.2 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-

In [1]:
import re
import pickle
import json
import requests
from bs4 import BeautifulSoup
from ingredient_parser import parse_ingredient
import concurrent.futures
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/cristi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Browser-like request headers passed to every requests.get call in this notebook.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    # "br" is deliberately not advertised: requests/urllib3 only decode Brotli
    # when an optional brotli package is installed, and the install cell of
    # this notebook does not list one. Advertising it risks an undecodable body.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

# Entry page from which scrape_links_to_recipes starts crawling.
url = "https://www.allrecipes.com/recipes-a-z-6735880"

# Module-level crawl state, filled in by the scraping functions.
recipe_links_set = set()  # links to actual recipes
sublinks_set = set()  # links to other pages that contain recipes

In [None]:
def sublinks_retrieval(href):
    """Crawl one listing page: follow its sub-category links and collect recipe links.

    Sub-category links found in the page's taxonomy list are added to the
    module-level ``sublinks_set`` and crawled recursively. Recipe card links
    that contain a "favorite" element are added to ``recipe_links_set``.

    Args:
        href: URL of the listing page to fetch.

    Returns:
        None. Results are accumulated in ``sublinks_set`` and ``recipe_links_set``.
        A failed request is reported with ``print`` and the page is skipped.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(href, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Request error for {href}: {e}")
        return

    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        return

    # Keep the full-page soup under its own name. The original rebound `soup`
    # to the last <li> fragment, so the recipe-card search below never saw the
    # whole page whenever a taxonomy list was present.
    page_soup = BeautifulSoup(response.text, 'html.parser')

    taxonomy_lists = page_soup.find_all('ul', class_="comp mntl-taxonomy-nodes__list mntl-block")
    if taxonomy_lists:
        for list_item in taxonomy_lists[0].find_all('li'):
            for anchor in list_item.find_all('a'):
                sub_href = anchor.get('href')
                # Skip anchors without an href and pages that were already visited.
                if not sub_href or sub_href in sublinks_set:
                    continue
                # Mark as visited before recursing so cycles cannot loop forever.
                sublinks_set.add(sub_href)
                sublinks_retrieval(sub_href)

    # get links to recipes from the current page
    cards_hyperlink_elems = page_soup.find_all('a', class_="comp mntl-card-list-items mntl-universal-card mntl-document-card mntl-card card card--no-image")
    for hyperlink_elem in cards_hyperlink_elems:
        # Only cards that contain the "favorite" div are collected.
        fav_div = hyperlink_elem.find('div', class_="comp card__favorite mm-myrecipes-favorite")
        if fav_div is None:
            continue
        recipe_href = hyperlink_elem.get('href')
        if recipe_href:
            recipe_links_set.add(recipe_href)

def scrape_links_to_recipes():
    """Crawl the A-Z index page and write every discovered recipe link to disk.

    Fetches the module-level ``url``, passes each not-yet-visited category link
    to ``sublinks_retrieval``, then writes the contents of ``recipe_links_set``
    to ``./recipe_links.txt``, one link per line.

    Returns:
        None. If the index page cannot be fetched, the failure is printed and
        the function returns without touching ``recipe_links.txt``, so a failed
        run does not overwrite the result of an earlier successful one.
    """
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Request error for {url}: {e}")
        return

    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        return

    print("Scrapping started")
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', class_="mntl-link-list__link text-body-100 global-link text-body-100 global-link")

    # set() drops duplicate anchor elements before crawling.
    for link in set(links):
        href = link.get('href')
        if not href:
            # Anchor without an href: nothing to crawl.
            continue
        if href not in sublinks_set:
            print(f"Scrapping {href} total_recipes {len(recipe_links_set)}")
            sublinks_set.add(href)
            sublinks_retrieval(href)
        else:
            print(f"Already scrapped {href}")

    with open("./recipe_links.txt", "w", encoding="utf-8") as file:
        for item in recipe_links_set:
            file.write(f"{item}\n")

    print("Scrapping finished")

In [None]:
def scrape_recipe(link):
    """Scrape a single recipe page and return its data as a dict.

    Args:
        link: URL of the recipe page; surrounding whitespace is stripped.

    Returns:
        A dict with the keys ``link``, ``title``, ``description``,
        ``total_time``, ``ingredients`` (list of parsed ingredient names) and
        ``steps`` (list of step texts), or ``None`` when the page cannot be
        fetched, has no main content, or an unexpected error occurs. Missing
        title/description/total-time fields are filled with a placeholder
        string rather than failing the whole recipe.
    """
    link = link.strip()
    recipe_data = {}
    print(f"Scraping: {link}") # Print which link is being scraped
    try:
        response = requests.get(link, headers=headers, allow_redirects=True, timeout=10)
        if response.status_code != 200:
            print(f"Cannot access the recipe link: {link}")
            return None  # Return None if there's an issue

        recipe_data['link'] = link
        soup = BeautifulSoup(response.text, 'html.parser')
        main_tag = soup.find('main', class_="loc main")

        if main_tag is None:
            print(f"Main content not found for link: {link}")
            return None

        # All lookups below search inside the parsed <main> tag directly;
        # serialising it back to a string and re-parsing is unnecessary.

        # Title
        title_element = main_tag.find('h1', class_='article-heading')
        recipe_data['title'] = title_element.text.strip() if title_element else "Title not found."

        # Description
        description_element = main_tag.find('p', class_='article-subheading')
        recipe_data['description'] = description_element.text.strip() if description_element else "Description not found."

        # Total Time
        total_time_element = main_tag.find('div', class_='mm-recipes-details__label', string='Total Time:')
        if total_time_element:
            total_time_value_element = total_time_element.find_next_sibling('div', class_='mm-recipes-details__value')
            recipe_data['total_time'] = total_time_value_element.text.strip() if total_time_value_element else "Total Time value not found."
        else:
            recipe_data['total_time'] = "Total Time label not found."

        # Ingredients
        ingredients = []
        ingredients_div = main_tag.find('div', class_="comp mm-recipes-structured-ingredients")
        ingredients_ul = None
        if ingredients_div is not None:
            # NOTE(review): only the first matching <ul> is read; if a page splits
            # its ingredients into several lists the later ones are skipped -- confirm.
            ingredients_ul = ingredients_div.find('ul', class_="mm-recipes-structured-ingredients__list")
        if ingredients_ul is not None:
            for ingredient_li in ingredients_ul.find_all('li', class_="mm-recipes-structured-ingredients__list-item"):
                p_tag = ingredient_li.find('p')
                if p_tag is None:
                    # A list item without a <p> used to raise AttributeError and
                    # discard the entire recipe; skip just that item instead.
                    continue
                # Only direct <span data-ingredient-name="true"> children carry the name.
                name_spans = p_tag.find_all('span', attrs={'data-ingredient-name': 'true'}, recursive=False)
                for name_span in name_spans:
                    raw_name = name_span.text.strip().lower()
                    try:
                        parsed_ingredient = parse_ingredient(raw_name)
                        if parsed_ingredient.name and len(parsed_ingredient.name.text) > 1:
                            ingredients.append(parsed_ingredient.name.text)
                    except Exception as e:
                        print(f"Error parsing ingredient: {raw_name} - Error: {e}")
        recipe_data['ingredients'] = ingredients

        # Steps
        steps = []
        for li in main_tag.find_all('li', class_='mntl-sc-block-group--LI'):
            p = li.find('p', class_='mntl-sc-block-html')
            if p:
                steps.append(p.text.strip())
        recipe_data['steps'] = steps

        return recipe_data

    except requests.exceptions.RequestException as e: # Catch request errors
        print(f"Request error for {link}: {e}")
        return None
    except Exception as e: # Catch any other error
        print(f"Error processing {link}: {e}")
        return None

def scrape_all_recipes():
    """Scrape every link listed in recipe_links.txt and save the results as JSON.

    Links are read one per line, scraped concurrently with ``scrape_recipe`` on
    a pool of 10 worker threads, and every non-empty result is written to
    ``recipes.json`` in completion order.
    """
    with open('recipe_links.txt', 'r') as links_file:
        # One link per line; strip the trailing newline and stray whitespace.
        links = list(map(str.strip, links_file))

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(scrape_recipe, target) for target in links]
        # Collect results as each task finishes; falsy results (None) are dropped.
        finished_results = (done.result() for done in concurrent.futures.as_completed(pending))
        all_recipes = [scraped for scraped in finished_results if scraped]

    with open("recipes.json", 'w', encoding="utf-8") as out_file:
        json.dump(all_recipes, out_file, indent=4, ensure_ascii=False)

    print("Scraping and saving to JSON complete.")

In [None]:
# Shared WordNet lemmatizer instance used by sanitize_ingredient.
lemmatizer = WordNetLemmatizer()

def sanitize_ingredient(ingredient):
    """Normalise a raw ingredient name into a compact, comparable form.

    Steps, in order: lowercase and trim; drop registered/trademark symbols;
    drop parenthetical notes; drop every character that is not a letter,
    digit, whitespace or hyphen; collapse whitespace; strip one leading
    qualifier word or number; lemmatize with the module-level ``lemmatizer``.

    Args:
        ingredient: Raw ingredient name.

    Returns:
        The sanitised name, which may be an empty string if nothing is left.
    """
    ingredient = ingredient.lower().strip()
    ingredient = re.sub(r"[\u00AE\u2122]", "", ingredient).strip()  # Remove registered and trademark symbols
    ingredient = re.sub(r" \(.+?\)", "", ingredient).strip()  # Remove parenthetical info
    ingredient = re.sub(r"[^a-zA-Z0-9\s-]", "", ingredient).strip()  # Remove special characters (except hyphen)
    ingredient = re.sub(r"\s+", " ", ingredient).strip()  # Remove extra spaces

    # Strip one leading qualifier. Word qualifiers must be followed by
    # whitespace or the end of the string: without that check "a", "or" and
    # "in" were cut out of the front of ordinary words ("apple" -> "pple",
    # "orange" -> "ange"). The former "100%", "%", "#" and "$" alternatives are
    # omitted because those characters are already removed two steps above;
    # a leading "100" is still handled by the numeric branch.
    ingredient = re.sub(
        r"^(?:"
        r"(?:fresh|dried|frozen|canned|chopped|sliced|ground|minced|diced|cooked|raw"
        r"|or|and|with|in|of|the|a|an|all-natural|all-purpose|all-beef|number)(?=\s|$)"
        r"|\d+(?:st|nd|rd|th)?(?:-less)?(?:-fat)?(?:-free)?"
        r")\s*",
        "",
        ingredient,
    ).strip()

    # NOTE(review): the whole phrase is lemmatized as a single token; presumably
    # multi-word names pass through unchanged -- confirm this is intended.
    ingredient = lemmatizer.lemmatize(ingredient)  # Lemmatize
    ingredient = ingredient.strip()  # Remove any remaining whitespace
    return ingredient

def create_sanitized_datasets():
    """Build the sanitised ingredient vocabulary and the sanitised recipe file.

    Reads ``recipes.json``, runs every ingredient through
    ``sanitize_ingredient``, stores the non-empty results under each recipe's
    ``sanitized_ingredients`` key, and writes two files:
    ``sanitized_ingredients.txt`` (one unique name per line) and
    ``sanitized_recipes.json`` (the recipes with the added key). Ingredients
    that sanitise to an empty string are printed and left out.
    """
    with open("recipes.json", 'r', encoding='utf-8') as source:
        recipes_data = json.load(source)

    sanitized_ingredients_set = set()
    for recipe in recipes_data:
        kept_names = []
        for raw_name in recipe['ingredients']:
            cleaned = sanitize_ingredient(raw_name)
            if cleaned:
                kept_names.append(cleaned)
            else:
                # Nothing survived sanitising: show the raw value and drop it.
                print(raw_name)

        recipe['sanitized_ingredients'] = kept_names
        sanitized_ingredients_set.update(kept_names)

    with open("sanitized_ingredients.txt", 'w') as vocabulary_file:
        vocabulary_file.writelines(f"{name}\n" for name in sanitized_ingredients_set)

    with open("sanitized_recipes.json", 'w', encoding='utf-8') as recipes_file:
        json.dump(recipes_data, recipes_file, indent=4, ensure_ascii=False)

In [None]:
def create_model_and_matrix_files(recipes_data):
    """Fit a TF-IDF model on the recipes' sanitised ingredients and pickle it.

    Each recipe becomes one document: its ``sanitized_ingredients`` (empty list
    if the key is missing), lowercased, stripped and joined with spaces. The
    fitted vectorizer is pickled to ``model_file`` and the resulting matrix to
    ``matrix_file``.

    Args:
        recipes_data: Iterable of recipe dicts.
    """
    documents = [
        " ".join(name.lower().strip() for name in recipe.get('sanitized_ingredients', []))
        for recipe in recipes_data
    ]

    vectorizer = TfidfVectorizer()
    document_matrix = vectorizer.fit_transform(documents)

    # Persist the model first, then the matrix, each to its own file.
    for target_path, payload in (('model_file', vectorizer), ('matrix_file', document_matrix)):
        with open(target_path, 'wb') as sink:
            pickle.dump(payload, sink)

In [2]:
def lookup_ingredients(all_ingredients, target_ingredients):
    """Return the known ingredients that contain any of the target names.

    Matching is a case-insensitive substring test. Each ingredient appears at
    most once in the result, even when it matches several targets, so a single
    ingredient is not counted multiple times downstream. Blank targets are
    ignored because an empty string would match every ingredient.

    Args:
        all_ingredients: Iterable of known ingredient names.
        target_ingredients: Iterable of names to look for.

    Returns:
        List of matching entries from ``all_ingredients``, in iteration order.
    """
    # Normalise the targets once instead of on every comparison.
    needles = [target.lower().strip() for target in target_ingredients]
    needles = [needle for needle in needles if needle]

    return [
        ingredient
        for ingredient in all_ingredients
        if any(needle in ingredient.lower() for needle in needles)
    ]

def calculate_similarity(user_ingredients, loaded_tfidf, loaded_tfidf_matrix):
    """Score the user's ingredients against the stored TF-IDF matrix.

    The ingredients are lowercased, stripped and joined into one
    space-separated document, transformed with ``loaded_tfidf``, and compared
    to ``loaded_tfidf_matrix`` with cosine similarity.

    Returns:
        The result of ``cosine_similarity`` for the query vector and the matrix.
    """
    normalized_names = (name.lower().strip() for name in user_ingredients)
    query_document = " ".join(normalized_names)
    # Transform with the already-fitted model so the query uses the same vocabulary.
    query_vector = loaded_tfidf.transform([query_document])
    return cosine_similarity(query_vector, loaded_tfidf_matrix)

def load_and_test_model(model_file, matrix_file, ingredients_set, recipes, user_ingredients=None, top_n=5):
    """Load the pickled TF-IDF model and matrix and print recipe recommendations.

    Args:
        model_file: Path to the pickled vectorizer.
        matrix_file: Path to the pickled TF-IDF matrix.
        ingredients_set: Known ingredient names used to expand the user's input
            via ``lookup_ingredients``.
        recipes: Sequence of recipe dicts, indexed in the same order as the
            rows of the matrix; each needs ``title``, ``ingredients`` and ``link``.
        user_ingredients: Ingredients to search for. Defaults to the original
            example list ``["onion", "butter", "eggs", "bread", "milk"]``.
        top_n: Number of recommendations to print. Defaults to 5.

    Returns:
        None. Recommendations are printed. Load failures are reported with
        ``print`` and end the function early.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only load model/matrix files that this notebook produced itself.
        with open(model_file, 'rb') as f:
            loaded_tfidf = pickle.load(f)
        with open(matrix_file, 'rb') as f:
            loaded_tfidf_matrix = pickle.load(f)
        print("Model and matrix loaded successfully.")

    except FileNotFoundError:
        print("Error: Model or matrix file not found.")
        return
    except Exception as e:
        print(f"Error loading model or matrix: {e}")
        return

    if user_ingredients is None:
        user_ingredients = ["onion", "butter", "eggs", "bread", "milk"]  # Example user input

    enriched_user_ingredients = lookup_ingredients(ingredients_set, user_ingredients)
    if not enriched_user_ingredients:
        # With no matches the query would be empty and every recipe would tie
        # at the same score, making the "top" picks arbitrary. Fall back to the
        # raw input.
        print("No known ingredients matched the input; using the raw ingredients instead.")
        enriched_user_ingredients = list(user_ingredients)

    similarity_scores = calculate_similarity(enriched_user_ingredients, loaded_tfidf, loaded_tfidf_matrix)

    # Highest scores first; a non-positive top_n yields no recommendations.
    top_n_indices = similarity_scores.argsort()[0][::-1][:max(top_n, 0)]

    for i in top_n_indices:
        recipe = recipes[i]
        print(recipe['title'])
        print(recipe['ingredients'])
        print(recipe['link'])
        print("-" * 20)


In [None]:
create_sanitized_datasets()

# Load the ingredient vocabulary that create_sanitized_datasets() just wrote.
# It was previously left as an empty set, so the ingredient lookup in
# load_and_test_model could never match anything.
with open("sanitized_ingredients.txt", 'r') as f:
    ingredients_set = {line.strip() for line in f if line.strip()}

with open("sanitized_recipes.json", 'r', encoding='utf-8') as f:
    recipes_data = json.load(f)

create_model_and_matrix_files(recipes_data)

# load_and_test_model("model_file", "matrix_file", ingredients_set, recipes_data)