# This notebook scrapes two websites with different mochi recipes and combines them into one JSON inspiring set

#### Import Libraries

In [3]:
import re
import requests
import string
import json
import random

from bs4 import BeautifulSoup

#### General definitions

In [4]:
def make_soup(URL, timeout=30):
    """Download a web page and parse it into a BeautifulSoup document.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely on an unresponsive host.

    Returns
    -------
    BeautifulSoup
        The page content parsed with the built-in "html.parser".

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or the server answers with an HTTP
        error status (4xx/5xx).
    """
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were recipes
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

def convert_to_number(amount):
    """Convert a recipe amount into a float.

    Supported forms:
      * plain numbers ................. "7", "2.5" (ints/floats pass through)
      * a single vulgar fraction ...... "½"
      * mixed numbers ................. "1 ¼" or "1¼"
      * ASCII fractions ............... "1/2", "1 1/2"

    Parameters
    ----------
    amount : str | int | float
        The amount as written in the recipe.

    Returns
    -------
    float
        The numeric value of ``amount``.

    Raises
    ------
    ValueError
        If ``amount`` matches none of the supported forms (for example an
        empty string, a range such as "6-8", or more than two parts).
    """
    # Dictionary to map common vulgar fractions to their decimal equivalents
    fraction_map = {
        "½": 0.5,
        "⅓": 1/3,
        "¼": 0.25,
        "¾": 0.75,
        "⅛": 0.125,
        "⅔": 2/3,
        "⅘": 4/5,
        "⅕": 1/5,
        "⅖": 2/5,
        "⅗": 3/5,
        "⅙": 1/6,
        "⅚": 5/6,
        "⅜": 3/8,
        "⅝": 5/8,
        "⅞": 7/8,
    }

    # Already numeric (e.g. a hard-coded override): nothing to parse
    if isinstance(amount, (int, float)):
        return float(amount)

    def fraction_value(token):
        """Return the value of a vulgar or ASCII fraction token, else None."""
        if token in fraction_map:
            return fraction_map[token]
        if token.count("/") == 1:
            numerator, denominator = token.split("/")
            return float(numerator) / float(denominator)
        return None

    # Split the amount into whitespace-separated parts
    parts = amount.split()

    try:
        # One part: a fraction ("½", "1/2"), an attached mixed number ("1½")
        # or a plain number ("7")
        if len(parts) == 1:
            token = parts[0]
            value = fraction_value(token)
            if value is not None:
                return value
            if len(token) > 1 and token[-1] in fraction_map:
                return float(token[:-1]) + fraction_map[token[-1]]
            return float(token)

        # Two parts: a whole number followed by a fraction ("1 ¼", "1 1/2")
        if len(parts) == 2:
            whole_part = float(parts[0])
            fraction_part = fraction_value(parts[1])
            if fraction_part is not None:
                return whole_part + fraction_part
    except (ValueError, ZeroDivisionError):
        # Fall through to the single, descriptive error below
        pass

    # Anything else is reported explicitly instead of returning None or a
    # message string that would only fail later inside float()
    raise ValueError(f"No match for {amount!r}")
    

In [5]:
# Sanity check: a plain whole-number string should come back as a float
convert_to_number("7")

7.0

# 1. Scrape recipes from *All Purpose Veggies*

##### Definitions

In [6]:
def scrape_ingredients(dom):
    """Extract the ingredient list from a parsed recipe page.

    Every ``<li class="wprm-recipe-ingredient">`` element becomes one dict in
    the format ``{"amount": 150.0, "unit": "g", "ingredient": "minced beef"}``.

    Parameters
    ----------
    dom : BeautifulSoup
        Parsed recipe page.

    Returns
    -------
    list[dict]
        One dict per ingredient, in page order. A line whose unit is
        "recipe" contributes two extra hard-coded entries (see below).

    Raises
    ------
    ValueError
        If an amount cannot be converted to a number.
    """
    # Abbreviated units are spelled out so the data set uses one spelling
    unit_aliases = {"tsp": "teaspoons", "tbsp": "tablespoons"}

    def span_text(item, css_class, default):
        # A missing span gets a default instead of raising AttributeError;
        # the defaults mirror the ones the second scraper in this notebook uses
        span = item.find("span", class_=css_class)
        return default if span is None else span.text

    ingredients = []

    for ingredient in dom.find_all("li", class_="wprm-recipe-ingredient"):
        # find all objects in the page
        amount = span_text(ingredient, "wprm-recipe-ingredient-amount", "1")
        unit = span_text(ingredient, "wprm-recipe-ingredient-unit", "whole")
        name = ingredient.find("span", class_="wprm-recipe-ingredient-name").text

        if unit == "package":
            # Hard-coded override: a "package" is recorded as 1 cup
            amount = 1
            unit = "cup"
        elif unit == "recipe":
            # Hard-coded override: the line is replaced by a fixed bean-paste
            # entry plus two fixed pumpkin entries.
            # NOTE(review): presumably this line points to a separate
            # sub-recipe on the site -- confirm the quantities.
            amount = 300
            unit = "g"
            name = "sweetened white bean paste (shiro an)"

            ingredients.append({
                "amount": 2.0,
                "unit": "tablespoons",
                "ingredient": "pumpkin puree",
            })
            ingredients.append({
                "amount": 2.0,
                "unit": "teaspoons",
                "ingredient": "pumpkin pie spice",
            })
        elif unit.lower() in unit_aliases:
            unit = unit_aliases[unit.lower()]
            # The amount still has to be parsed here: float() alone cannot
            # read vulgar fractions such as "½", so "½ tsp" used to crash
            amount = convert_to_number(amount)
        elif amount == "6-8":
            # Hard-coded override: the range is recorded as 7
            amount = 7
        elif amount == '1 ¼':
            # Hard-coded override: this amount is recorded as 250 g of tofu
            amount = 250
            unit = "g"
            name = "mashed silken firm tofu"
        else:
            amount = convert_to_number(amount)

        # make for each ingredient a separate dict
        ingredients.append({
            'amount': float(amount),
            'unit': unit,
            'ingredient': name,
        })

    return ingredients

def scrape_servings(dom):
    """Return the number of servings stated on a recipe page as an int.

    The value is read from the text of the span with class
    "wprm-recipe-servings" that ``dom.find`` returns.
    """
    servings_span = dom.find("span", class_="wprm-recipe-servings")
    return int(servings_span.text)

def scrape_rating(dom):
    """Return the average rating of a recipe page.

    Looks up the div with class "wprm-recipe-rating-details". If its text
    contains "No", the string "needs rating" is returned; otherwise the text
    of the nested span with class "wprm-recipe-rating-average" is returned
    as a float.
    """
    details = dom.find("div", class_="wprm-recipe-rating-details")

    # Any details text containing "No" is treated as an unrated recipe
    if "No" in details.text:
        return "needs rating"

    average_span = details.find("span", class_="wprm-recipe-rating-average")
    return float(average_span.text)

### 1. Get main page content and recipe links

In [7]:
# Overview page that lists the mochi recipes
URL = "https://allpurposeveggies.com/12967/12-mochi-flavors-easy-recipes-for-mochi-ice-cream-and-more/"
soup = make_soup(URL)

# Collect the targets of all anchors whose text contains "See Recipe", in
# page order, and keep only the first 17.
# NOTE(review): 17 is hard-coded -- presumably the number of links that have
# a matching recipe heading; confirm if the page changes.
recipe_links = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if 'See Recipe' in anchor.get_text()
][:17]

### 2. Get the list of recipe names from the main page

In [8]:
# Characters that make up the numbering in front of a heading ("13. ", "2) ")
numbering_chars = string.punctuation + string.whitespace + string.digits

recipe_names = []
for heading in soup.find_all('h2', class_="wp-block-heading"):
    title = heading.text
    # Only numbered headings are recipe titles; slicing instead of indexing
    # keeps an empty heading from raising IndexError
    if title[:1].isdigit():
        # Remove the numbering from the front only. Stripping the same
        # characters from the end as well would cut off punctuation and digits
        # that belong to the title (e.g. a closing parenthesis).
        title = title.lstrip(numbering_chars).strip()
        recipe_names.append(title)

### 3. Scrape recipes into list

In [9]:
recipes_all_1 = []
for position, recipe_url in enumerate(recipe_links, start=1):
    # Links and names are paired by their position in the two lists
    title = recipe_names[position - 1]
    print('Scraping recipe %d ...' % position, title)

    # Download the recipe page and parse it
    recipe_dom = make_soup(recipe_url)

    # Extract the relevant information and store it as one record
    recipes_all_1.append({
        "name": title,
        "rating": scrape_rating(recipe_dom),
        "ingredients": scrape_ingredients(recipe_dom),
        "servings": scrape_servings(recipe_dom),
    })

Scraping recipe 1 ... Strawberry Mochi
Scraping recipe 2 ... Fresh Mango
Scraping recipe 3 ... Applesauce Mochi
Scraping recipe 4 ... Sweet Potato
Scraping recipe 5 ... Raspberry Chocolate
Scraping recipe 6 ... Banana Chocolate
Scraping recipe 7 ... Tofu
Scraping recipe 8 ... Pumpkin
Scraping recipe 9 ... Savory Sweet Corn Mochi
Scraping recipe 10 ... Pistachio Butter
Scraping recipe 11 ... Black Sesame
Scraping recipe 12 ... Purple Sweet Potato Mochi
Scraping recipe 13 ... Green Tea / Matcha (with a Twist
Scraping recipe 14 ... Almond Milk
Scraping recipe 15 ... Crunchy Peanut Butter
Scraping recipe 16 ... Blueberry Mochi Ice Cream
Scraping recipe 17 ... Mugwort Mochi


# 2. Scrape recipes from *The rice chick*

##### Definitions

In [11]:
def scrape_ingredients(dom):
    """Extract the ingredient list from a parsed recipe page.

    Every ``<li class="wprm-recipe-ingredient">`` element becomes one dict in
    the format ``{"amount": 150.0, "unit": "g", "ingredient": "minced beef"}``.

    Parameters
    ----------
    dom : BeautifulSoup
        Parsed recipe page.

    Returns
    -------
    list[dict]
        One dict per ingredient, in page order.

    Raises
    ------
    ValueError
        If an amount cannot be converted to a number.
    """
    ingredients = []

    for ingredient in dom.find_all("li", class_="wprm-recipe-ingredient"):
        # find all objects in the page; amount and unit may be absent
        amount = ingredient.find("span", class_="wprm-recipe-ingredient-amount")
        unit = ingredient.find("span", class_="wprm-recipe-ingredient-unit")
        name = ingredient.find("span", class_="wprm-recipe-ingredient-name").text

        # A missing amount counts as one
        if amount is None:
            amount = "1"
        else:
            amount = amount.text

        # A missing unit means the ingredient is counted in whole pieces
        if unit is None:
            unit = "whole"
        else:
            # Drop parenthesised remarks from the unit text
            unit = re.sub(r"\(.*?\)", "", unit.text).strip()
            # Compare the unit *text*: comparing the tag object itself with a
            # string is never true, so this override could never trigger
            if unit == "Pinch":
                amount = "0.3"
                unit = "g"

        if "ml/grams" in name:
            unit = "ml/grams"
        elif "grams" in name:
            # Hard-coded override: such a name is recorded as rice flour
            name = "glutinous rice flour"
        # Drop parenthesised remarks from the ingredient name
        name = re.sub(r"\(.*?\)", "", name).strip()

        # "Pinch of" written in the amount column is recorded as 0.3 g
        if amount == "Pinch of":
            amount = "0.3"
            unit = "g"

        amount = convert_to_number(amount)

        # make for each ingredient a separate dict
        ingredients.append({
            'amount': float(amount),
            'unit': unit,
            'ingredient': name,
        })

    return ingredients

def scrape_servings(dom):
    """Read the servings count from a recipe page and return it as an int.

    The count is taken from the text of the span with class
    "wprm-recipe-servings" that ``dom.find`` returns.
    """
    span = dom.find("span", class_="wprm-recipe-servings")
    servings_text = span.text
    return int(servings_text)

def scrape_rating(dom):
    """Return the average rating of a recipe page.

    Parameters
    ----------
    dom : BeautifulSoup
        Parsed recipe page.

    Returns
    -------
    float | str
        The text of the span with class "wprm-recipe-rating-average" as a
        float, matching the numeric ratings produced by the first scraper in
        this notebook. Returns "needs rating" when the page has no such span,
        and the raw text if it is not a number.
    """
    average_span = dom.find("span", class_="wprm-recipe-rating-average")
    if average_span is None:
        return "needs rating"

    rating_text = average_span.text
    try:
        # Numeric ratings keep the combined JSON free of mixed str/float values
        return float(rating_text)
    except ValueError:
        # Keep the previous behaviour for text that is not a number
        return rating_text

### 1. Get main page content and recipe links

In [10]:
# Overview page that lists the mochi recipes
URL = "https://thericechick.com/mochi-recipes/"
soup = make_soup(URL)

# Each recipe is reached through a button-styled anchor: its href is the
# recipe URL and its text is used as the recipe name
recipe_buttons = soup.find_all(
    'a',
    class_="wp-block-button__link has-text-color has-background wp-element-button",
    href=True,
)
recipe_links = [button['href'] for button in recipe_buttons]
recipe_names = [button.text for button in recipe_buttons]

### 2. Scrape recipes into list

In [13]:
recipes_all_2 = []
for position, recipe_url in enumerate(recipe_links, start=1):
    # Links and names are paired by their position in the two lists
    title = recipe_names[position - 1]
    print('Scraping recipe %d ...' % position, title)

    # Download the recipe page and parse it
    recipe_dom = make_soup(recipe_url)

    # Extract the relevant information and store it as one record
    recipes_all_2.append({
        "name": title,
        "rating": scrape_rating(recipe_dom),
        "ingredients": scrape_ingredients(recipe_dom),
        "servings": scrape_servings(recipe_dom),
    })

Scraping recipe 1 ... Pandan Mochi
Scraping recipe 2 ... Black Sesame Mochi Muffins
Scraping recipe 3 ... Mango Mochi
Scraping recipe 4 ... Matcha Mochi Waffles
Scraping recipe 5 ... Ube Baked Mochi Donuts
Scraping recipe 6 ... Potato Mochi
Scraping recipe 7 ... Chocolate Mochi
Scraping recipe 8 ... Ube Mochi Muffins
Scraping recipe 9 ... Biscoff Baked Mochi Donuts
Scraping recipe 10 ... Nutella Mochi
Scraping recipe 11 ... Chocolate Mochi Cupcakes
Scraping recipe 12 ... Matcha Mochi Muffins
Scraping recipe 13 ... Ube Butter Mochi
Scraping recipe 14 ... Mochi Waffle Recipe
Scraping recipe 15 ... Mochi Bread
Scraping recipe 16 ... Chocolate Mochi Muffins
Scraping recipe 17 ... Mochi Cookies
Scraping recipe 18 ... Chocolate Mochi Donuts
Scraping recipe 19 ... Ube Mochi
Scraping recipe 20 ... Matcha Baked Mochi Donuts
Scraping recipe 21 ... Mochi Pancakes
Scraping recipe 22 ... Pandan Donuts
Scraping recipe 23 ... Milk Rice Cake


In [14]:
# Combine the recipes scraped from both sites under a single "recipes" key
recipes_all_json = {'recipes': recipes_all_1 + recipes_all_2}

# Serialise with a 4-space indent and write the result to disk
data = json.dumps(recipes_all_json, indent=4)
with open("data/sample.json", "w") as outfile:
    outfile.write(data)