# This notebook scrapes mochi recipes from two websites and combines them into one JSON inspiring set

#### Import Libraries

In [1]:
import re
import requests
import string
import json
from sugarcube import Mass, Volume, Liquids, Other_liquids, Butter, Sugar, Flour, Pastes, Solids, Powder, Bindings, Yeast, Salt, Icecream, Jam, Biscuits, Nori, Beans

from bs4 import BeautifulSoup

#### Global variables

In [2]:
# Keyword table used by check_category(): each category name maps to the
# substrings that identify an ingredient of that category.
# Order matters: check_category() walks this dict in insertion order and
# returns the FIRST category with a matching substring, so e.g. "rice flour"
# is claimed by "Flour" ("flour") before "Icecream" ("ice") is ever tried.
# Matching is a plain, case-sensitive `in` test on the ingredient name.
categories = {
    "Liquids": ["milk", "almond milk", "coconut", "water"],
    "Other_liquids": ["oil", "juice", "extract", "essence"],
    "Butter": ["butter", "nutella", "peanut butter", "spread"],
    "Sugar": ["sugar"],
    "Flour": ["flour", "starch", "cornstarch", "mochiko"],
    "Pastes": ["paste", "cream"],
    "Solids": ["fruit", "nuts", "chocolate", "raspberries", "strawberries", "banana", "mango", "potato", "corn"],
    "Powder": ["matcha", "powder", "spice"],
    "Bindings": ["egg", "mashed", "applesauce", "puree"],
    "Yeast": ["yeast"],
    "Salt": ["salt"],
    "Icecream": ["ice"],
    "Jam": ["jam", "pudding"],
    "Biscuits": ["biscuits"],
    "Nori": ["nori"],
    "Beans": ["beans"]
}

# Unit strings (as cleaned up in scrape_ingredients) -> sugarcube unit objects.
# gram_calculation() looks units up here; anything missing is reported there
# as "not registered".
# Despite the name, "ounce"/"oz" map to a Mass unit rather than a Volume unit.
# Grams are not listed because gram_calculation() short-circuits them.
volume_mapping = {
    "ounce": Mass.ounce,
    "oz": Mass.ounce,
    "teaspoon": Volume.teaspoon,
    "tablespoon": Volume.tablespoon,
    "teaspoons": Volume.teaspoon,
    "tablespoons": Volume.tablespoon,
    "cup": Volume.cup,
    "cups": Volume.cup,
    "pinch": Volume.pinch,
    "pinch of": Volume.pinch,
    "ml": Volume.milliliter
}

# Category name (the keys of `categories`) -> the object of the same name
# imported from sugarcube. gram_calculation() multiplies amount * unit by this
# object before converting to grams / millilitres.
# NOTE(review): presumably each object carries the density used for the
# conversion - confirm against the sugarcube version in use.
categories_mapping = {
    "Liquids": Liquids,
    "Other_liquids": Other_liquids,
    "Butter": Butter,
    "Sugar": Sugar,
    "Flour": Flour,
    "Pastes": Pastes,
    "Solids": Solids,
    "Powder": Powder,
    "Bindings": Bindings,
    "Yeast": Yeast,
    "Salt": Salt,
    "Icecream": Icecream,
    "Jam": Jam,
    "Biscuits": Biscuits,
    "Nori": Nori,
    "Beans": Beans
}

# Fallback ratings keyed by recipe title. scrape_rating() uses these when a
# recipe page has no "wprm-recipe-rating-average" element; a title missing
# from this table is reported there as "name incorrect".
# NOTE(review): the source of these numbers is not recorded in the notebook -
# presumably assigned by hand; confirm and document the provenance.
alternative_ratings = {
    "Mango Mochi": 3.8,
    "Matcha Mochi Waffles": 3.3,
    "Ube Baked Mochi Donuts": 3.5,
    "Potato Mochi": 3.3,
    "Ube Mochi Muffins": 3.5,
    "Biscoff Baked Mochi Donuts": 3.0,
    "Chocolate Mochi Cupcakes": 4.3,
    "Matcha Mochi Muffins": 3.3,
    "Ube Butter Mochi": 3.8,
    "Mochi Waffle Recipe": 4.0,
    "Chocolate Mochi Muffins": 4.7,
    "Chocolate Mochi Donuts": 4.3,
    "Matcha Baked Mochi Donuts": 3.3,
    "Pandan Donuts": 3.8,
    "Savory Sweet Corn Mochi": 4.0
}


#### General definitions

In [3]:
def make_soup(URL, timeout=30):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        URL: address of the page to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block forever on a stalled connection.

    Returns:
        A BeautifulSoup document built from the response body with the
        "html.parser" backend.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
            Failing here avoids silently parsing an error page, which would
            otherwise just look like a page without any recipes.
        requests.Timeout: no response arrived within `timeout` seconds.
    """
    page = requests.get(URL, timeout=timeout)
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

def check_category(ingredient, category_map=None):
    """Find the first category whose keyword occurs in an ingredient name.

    Args:
        ingredient: ingredient name to classify.
        category_map: optional mapping of category name -> list of keyword
            substrings. Defaults to the notebook-level `categories` table, so
            existing one-argument calls behave exactly as before.

    Returns:
        (True, category_name) for the first category (in the mapping's
        iteration order) that has a keyword contained in `ingredient`, or
        (False, ingredient) when nothing matches.

    Note:
        Matching is a case-sensitive substring test, so "Nori" does not match
        the keyword "nori". Callers that want case-insensitive matching must
        lower-case the name themselves.
    """
    if category_map is None:
        category_map = categories

    for category, keywords in category_map.items():
        if any(keyword in ingredient for keyword in keywords):
            return True, category
    return False, ingredient

def gram_calculation(amount, unit, category, name):
    """Convert a measured ingredient to grams (millilitres for "Liquids").

    Args:
        amount: numeric quantity of the ingredient.
        unit: unit string such as "cup", "oz" or "g"; may be None.
        category: category name, used to pick the sugarcube object from
            `categories_mapping`.
        name: ingredient name, only used in the printed messages.

    Returns:
        A (value, unit) tuple - always a tuple, so callers can unpack it:
        * gram units ("g" or anything containing "gram"): (amount, "g"),
          the amount is passed through untouched;
        * "Liquids": (text, "ml") where text is the part of the converted
          quantity's string form before " ml";
        * every other category: (text, "g"), built the same way from " g";
        * missing or unregistered unit: a warning is printed and
          (amount, unit) is returned unconverted. Previously this path
          returned None, which broke the caller's tuple unpacking.
    """
    # A missing unit cannot be converted (and `"gram" in None` would raise).
    if unit is None:
        print(f"volume {unit} not registered correctly from {name}")
        return amount, unit

    if unit == "g" or "gram" in unit:
        return amount, "g"

    volume = volume_mapping.get(unit, 0)
    if volume == 0:
        print(f"volume {unit} not registered correctly from {name}")
        return amount, unit

    category_obj = categories_mapping.get(category)
    if category == "Liquids":
        print(amount, volume, name)
        ml = str((amount * volume * category_obj).to(Volume.milliliter)).split(' ml')
        return ml[0], "ml"

    g = str((amount * volume * category_obj).to(Mass.gram)).split(' g')
    return g[0], "g"

def convert_to_number(amount):
    """Turn a recipe amount string into a float.

    Supported forms:
        "2", "0.5"      plain numbers (rounded to 2 decimals)
        "½", "3/4"      a single vulgar or ASCII fraction
        "1½"            whole number glued to a vulgar fraction
        "1 ¼", "1 1/2"  whole number followed by a fraction

    Args:
        amount: amount text as scraped from the page.

    Returns:
        The numeric value as a float. Amounts that are not understood
        (an unknown second part, more than two parts, or an empty string)
        yield the string f'No match for{amount}'; the two-part case
        previously fell through and returned None.

    Raises:
        ValueError: a single-part amount, or the whole-number part of a
            two-part amount, is not a number.
    """
    # Vulgar fraction characters and their decimal values
    fraction_map = {
        "½": 0.5,
        "⅓": 1/3,
        "¼": 0.25,
        "¾": 0.75,
        "⅛": 0.125,
        "⅔": 2/3,
        "⅘": 4/5,
        "⅕": 1/5,
        "⅖": 2/5,
        "⅗": 3/5,
        "⅙": 1/6,
        "⅚": 5/6,
        "⅜": 3/8,
        "⅝": 5/8,
        "⅞": 7/8
    }

    def _parse_fraction(token):
        """Return the value of a vulgar or 'a/b' fraction token, else None."""
        if token in fraction_map:
            return fraction_map[token]
        if "/" in token:
            numerator, _, denominator = token.partition("/")
            try:
                return float(numerator) / float(denominator)
            except (ValueError, ZeroDivisionError):
                return None
        return None

    parts = amount.split()

    # One part: a fraction, a glued "1½", or a plain number
    if len(parts) == 1:
        token = parts[0]
        value = _parse_fraction(token)
        if value is not None:
            return value
        if len(token) > 1 and token[-1] in fraction_map:
            return round(float(token[:-1]), 2) + fraction_map[token[-1]]
        return round(float(token), 2)

    # Two parts: whole number followed by a fraction (e.g. "1 ¼")
    if len(parts) == 2:
        whole_part = round(float(parts[0]), 2)
        fraction_value = _parse_fraction(parts[1])
        if fraction_value is not None:
            return whole_part + fraction_value

    return f'No match for{amount}'

def scrape_ingredients(dom, rating_tot, servings):
    """Collect the ingredients of one recipe page as a list of dictionaries.

    Every "wprm-recipe-ingredient" list item is read, its amount rescaled
    (amount / servings * 10), a set of hand-written exceptions for amount,
    unit and name applied, and the quantity converted with
    `gram_calculation` when `check_category` recognises the ingredient.

    Args:
        dom: parsed recipe page (must offer `find_all`).
        rating_tot: recipe rating, copied into every ingredient entry.
        servings: number of servings stated by the recipe.

    Returns:
        A list of dictionaries shaped like
        {"amount": 150.0, "unit": "g", "ingredient": "minced beef",
         "rating": 4.5, "category": "Solids"}.
        For an ingredient that matches no category, "category" holds the
        ingredient name (as returned by `check_category`) and the amount and
        unit are its own unconverted values. Previously such entries silently
        reused the converted amount of the preceding ingredient.
    """

    def _convert(amount, unit, lookup_name, calc_name):
        """Categorise `lookup_name` and convert; fall back to the raw values."""
        available, category = check_category(lookup_name)
        if available and unit is not None:
            converted = gram_calculation(amount, unit, category, calc_name)
            if converted is not None:
                return converted[0], converted[1], category
        return amount, unit, category

    def _entry(amount, unit, label, category):
        """Build one ingredient dictionary in the output format."""
        return {
            "amount": round(float(amount), 2),
            "unit": unit,
            "ingredient": label,
            "rating": rating_tot,
            "category": category
        }

    ingredients = []

    for ingredient in dom.find_all("li", class_="wprm-recipe-ingredient"):
        # find all objects in the page
        amount = ingredient.find("span", class_="wprm-recipe-ingredient-amount")
        unit = ingredient.find("span", class_="wprm-recipe-ingredient-unit")
        name_tag = ingredient.find("span", class_="wprm-recipe-ingredient-name")
        if name_tag is None:
            # an item without a name cannot be recorded
            continue
        name = name_tag.text

        # exceptions AMOUNT
        amount = "1" if amount is None else amount.text

        if amount == "Pinch of":
            amount = 1
            unit = "pinch"
        elif amount == "6-8":
            amount = 7
        else:
            amount = convert_to_number(amount)
            amount = amount/servings*10

        # exceptions UNIT
        if unit is None:
            if "egg" in name:
                amount = amount/5 # based on 5 eggs go in 1 cup
                unit = "cup"
            elif "starch" in name:
                amount = 0.5
                unit = "cup"
                name = "corn starch to dust"
            elif "water" in name:
                # NOTE(review): this rename makes the "dust" rule below drop
                # unit-less water entirely - confirm that is intended.
                amount = 0.5
                unit = "cup"
                name = "corn starch to dust"
            else:
                print("No unit available", amount, name)
        else:
            # "pinch" is the plain string set above; anything else is a tag
            if unit != "pinch":
                unit = unit.text
                unit = re.sub(r"\(.*?\)", "", unit).strip()
                unit = unit.lower()

            if unit == "package":
                amount = 1
                unit = "cup"
            elif "scoop" in unit:
                amount = amount/4
                unit = "cup"
            elif "slices" in unit:
                print("slices", amount, name)
                amount = (amount/15) / 1.5 # sliced bananas (15 slices in 1 banana, 1.5 banana in 1 cup)
                unit = "cup"
            elif "pkg" in unit:
                print("pkg", amount, name)
                amount = 400*amount # based on half a package of beans (400 in total)
                unit = "g"
            elif "can" in unit:
                print("can", amount, name)
                amount = amount*15 # based on 1 package = 15 oz
                unit = "oz"
            elif "box" in unit:
                print("box", amount, name)
                amount = amount*8 # based on 1 box almond paste = 8 oz
                unit = "oz"
            elif "tsp" in unit.lower():
                unit = "teaspoon"
            elif "tbsp" in unit.lower():
                unit = "tablespoon"

        # exceptions NAME
        if "dust" in name or "boil" in name:
            continue
        elif "ml/grams (3.53 ounces) milk" in name:
            unit = "ml"
            name = "milk"
        elif "grams" in name and "glutinous rice flour" in name:
            name = "glutinous rice flour"
        elif "grams" in name or "ounces" in name:
            name = re.sub(r"\(.*?\)", "", name).strip()
        elif "of salt" in name:
            name = "salt"
            unit = "pinch"
        elif "tofu" in name:
            amount = 250
            unit = "g"
            name = "mashed silken firm tofu"
        elif "milk of choice" in name:
            name = "milk"
        elif "an)" in name or "ur)" in name:
            name = re.sub(r'\s*\(.*?\)', '', name)
        elif "PURPLE SWEET POTATO FILLING" in name:
            amount = 260 # based on 2 whole medium sizes potatoes
            unit = "g"
            name = "mashed purple sweet potato"
        elif "PISTACHIO CUSTARD" in name:
            name = "pastry cream"
        elif "pumpkin spice bean paste" in name:
            amount = 300
            unit = "g"
            name = "sweetened white bean paste"

            # the paste is recorded as three entries: the two fixed extras
            # here, plus the bean paste itself via the common code below
            for extra_name, extra_unit in (("pumpkin puree", "tablespoon"),
                                           ("pumpkin pie spice", "teaspoon")):
                new_amount, new_unit, category = _convert(2.0, extra_unit, extra_name, extra_name)
                ingredients.append(_entry(new_amount, new_unit, extra_name, category))

        elif " or " in name and ", " in name: # splitting flours into three ingredients
            pieces = name.split(", ")
            options = pieces[1].split(" or ")

            # only split when the text really has the "a, b or c" shape;
            # otherwise the name is kept whole instead of raising IndexError
            if len(options) >= 2:
                first = pieces[0]
                second = options[0]
                third = options[1]

                if "starch" in name:
                    if "starch" not in first:
                        first = first + " starch"
                    if "starch" not in second:
                        second = second + " starch"
                elif "flour" in name:
                    if "flour" not in second:
                        second = second + " flour"

                name = third

                # the first two alternatives share the amount and category
                # derived from the first one
                new_amount, new_unit, category = _convert(amount, unit, first, name)
                ingredients.append(_entry(new_amount, new_unit, first, category))
                ingredients.append(_entry(new_amount, new_unit, second, category))

        # make for each ingredient a separate dict
        new_amount, new_unit, category = _convert(amount, unit, name, name)
        ingredients.append(_entry(new_amount, new_unit, name.lower().rstrip('*'), category))

    return ingredients

def scrape_servings(dom):
    """Return the recipe's serving count as an int.

    Reads the text of the "wprm-recipe-servings" span of `dom` and converts
    it with int().
    """
    servings_tag = dom.find("span", class_="wprm-recipe-servings")
    return int(servings_tag.text)

def scrape_rating(dom, name):
    """Return the rating of a recipe as a float.

    The value comes from the page's "wprm-recipe-rating-average" span when
    present. Otherwise `alternative_ratings` is consulted by recipe name; if
    that has no (non-zero) entry either, a message is printed and None is
    returned.
    """
    rating_tag = dom.find("span", class_="wprm-recipe-rating-average")
    if rating_tag is not None:
        return float(rating_tag.text)

    # no rating on the page: use the hand-maintained fallback table
    fallback = alternative_ratings.get(name, 0)
    if fallback == 0:
        print("name incorrect:", name)
        return None
    return float(fallback)

# 1. Scrape recipes from *All Purpose Veggies*

##### Definitions

### 1. Get main page content and recipe links

In [4]:
URL = "https://allpurposeveggies.com/12967/12-mochi-flavors-easy-recipes-for-mochi-ice-cream-and-more/"
soup = make_soup(URL)

# Targets of all anchors whose visible text contains 'See Recipe'
# (get_text() looks only at the text inside the <a> tag); only the
# first 17 are kept.
recipe_links = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if 'See Recipe' in anchor.get_text()
][:17]

### 2. Get list of names of recipe from main page

In [5]:
# Recipe titles are the numbered <h2> headings of the overview page
# (e.g. "13. Green Tea / Matcha (with a Twist)").
# The leading numbering is stripped together with any punctuation/whitespace.
# At the end, closing brackets are kept: stripping all punctuation on both
# sides used to turn the example above into "Green Tea / Matcha (with a Twist".
LEADING_JUNK = string.digits + string.punctuation + string.whitespace
TRAILING_JUNK = "".join(ch for ch in string.punctuation if ch not in ")]}") + string.whitespace

recipe_names = []
for heading in soup.find_all('h2', class_="wp-block-heading"):
    heading_text = heading.text
    # an empty heading has no first character to inspect
    if heading_text and heading_text[0].isdigit():
        recipe_names.append(heading_text.lstrip(LEADING_JUNK).rstrip(TRAILING_JUNK))

### 3. Scrape recipes into list

In [6]:
# Scrape every recipe of the first site into a list of
# {"name", "rating", "ingredients"} dictionaries.
# Links and names are paired by position; a length mismatch is reported
# instead of ending in an IndexError halfway through the run.
if len(recipe_links) != len(recipe_names):
    print(f"warning: {len(recipe_links)} links but {len(recipe_names)} names; "
          "only the pairs that line up are scraped")

recipes_all_1 = []
for position, (recipe, title) in enumerate(zip(recipe_links, recipe_names), start=1):
    print('Scraping recipe %d ...' % position, title)

    # Grab and parse the recipe page with the shared helper
    recipe_dom = make_soup(recipe)

    # Extract relevant information for each recipe
    servings = scrape_servings(recipe_dom)
    rating = scrape_rating(recipe_dom, title)
    ingredients = scrape_ingredients(recipe_dom, rating, servings)

    recipes_all_1.append({
        "name": title,
        "rating": rating,
        "ingredients": ingredients
    })

Scraping recipe 1 ... Strawberry Mochi
Scraping recipe 2 ... Fresh Mango
Scraping recipe 3 ... Applesauce Mochi
Scraping recipe 4 ... Sweet Potato
0.5555555555555556 cup water
Scraping recipe 5 ... Raspberry Chocolate
Scraping recipe 6 ... Banana Chocolate
1.6666666666666665 tablespoon water
slices 8.333333333333334 fresh banana
Scraping recipe 7 ... Tofu
pkg 1.0 sweetened adzuki beans
Scraping recipe 8 ... Pumpkin
Scraping recipe 9 ... Savory Sweet Corn Mochi
can 0.8333333333333333 sweet corn kernels
1.6666666666666665 tablespoon water
Scraping recipe 10 ... Pistachio Butter
1.0 cup water
1.0 cup water
Scraping recipe 11 ... Black Sesame
1.25 cup almond milk or water
Scraping recipe 12 ... Purple Sweet Potato Mochi
0.8333333333333333 cup water
Scraping recipe 13 ... Green Tea / Matcha (with a Twist
Scraping recipe 14 ... Almond Milk
1.3333333333333333 cup unsweetened almond milk
box 1.6666666666666665 almond paste
Scraping recipe 15 ... Crunchy Peanut Butter
1.0 cup water
1.0 cup wate

# 2. Scrape recipes from *The rice chick*

##### Definitions

### 1. Get main page content and recipe links

In [7]:
URL = "https://thericechick.com/mochi-recipes/"

soup = make_soup(URL)

# Each recipe is linked through a button-styled anchor: its href is the
# recipe page and its text is the recipe name.
recipe_buttons = soup.find_all(
    'a',
    class_="wp-block-button__link has-text-color has-background wp-element-button",
    href=True,
)
recipe_links = [button['href'] for button in recipe_buttons]
recipe_names = [button.text for button in recipe_buttons]

### 2. Scrape recipes into list

In [8]:
# Scrape every recipe of the second site into a list of
# {"name", "rating", "ingredients"} dictionaries.
# Links and names are paired by position; a length mismatch is reported
# instead of ending in an IndexError halfway through the run.
if len(recipe_links) != len(recipe_names):
    print(f"warning: {len(recipe_links)} links but {len(recipe_names)} names; "
          "only the pairs that line up are scraped")

recipes_all_2 = []
for position, (recipe, title) in enumerate(zip(recipe_links, recipe_names), start=1):
    print('Scraping recipe %d ...' % position, title)

    # Grab and parse the recipe page with the shared helper
    recipe_dom = make_soup(recipe)

    # Extract relevant information for each recipe
    servings = scrape_servings(recipe_dom)
    rating = scrape_rating(recipe_dom, title)
    ingredients = scrape_ingredients(recipe_dom, rating, servings)

    recipes_all_2.append({
        "name": title,
        "rating": rating,
        "ingredients": ingredients
    })


Scraping recipe 1 ... Pandan Mochi
150.0 milliliter coconut milk
0.8333333333333333 cup milk 
Scraping recipe 2 ... Black Sesame Mochi Muffins
1.25 cup coconut milk
Scraping recipe 3 ... Mango Mochi
128.57142857142858 milliliter water
Scraping recipe 4 ... Matcha Mochi Waffles
1.875 cup milk of choice
Scraping recipe 5 ... Ube Baked Mochi Donuts
Scraping recipe 6 ... Potato Mochi
No unit available 0.8333333333333333 Water to boil potatoes
No unit available 0.8333333333333333 Oil for shallow frying
No unit available 0.8333333333333333 Nori
Scraping recipe 7 ... Chocolate Mochi
0.3125 cup milk
Scraping recipe 8 ... Ube Mochi Muffins
1.25 cup coconut milk
Scraping recipe 9 ... Biscoff Baked Mochi Donuts
No unit available 10.0 Biscoff Biscuits
Scraping recipe 10 ... Nutella Mochi
90.0 milliliter water
No unit available 0.5 Pinch of salt
Scraping recipe 11 ... Chocolate Mochi Cupcakes
0.8333333333333333 cup milk of choice
0.2777777777777778 cup sweetened condensed milk of choice
0.208333333

In [9]:
# Combine the recipes of both sites under one "recipes" key and write them
# to disk as indented JSON.
recipes_all_json = {'recipes': recipes_all_1 + recipes_all_2}
with open("data/mochi.json", "w") as outfile:
    json.dump(recipes_all_json, outfile, indent=4)