# This notebook scrapes two websites with different mochi recipes and combines them into one JSON inspiring set

#### Import Libraries

In [183]:
import re
import requests
import string
import json
import random

from bs4 import BeautifulSoup

#### General definitions

In [209]:
def make_soup(URL, timeout=30):
    """Download a page and parse it with BeautifulSoup's "html.parser".

    Args:
        URL: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed BeautifulSoup tree of the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

def convert_to_number(amount):
    """Convert a recipe amount string to a float.

    Accepted forms: plain numbers ("2", "1.5"), Unicode vulgar fractions
    ("½"), ASCII fractions ("1/2"), a number glued to a vulgar fraction
    ("1½"), and a whole number followed by a fraction ("1 ¼", "1 1/2").

    Args:
        amount: Amount text, e.g. as scraped from a recipe page.

    Returns:
        The numeric value. Plain numbers are rounded to two decimals;
        fraction values are added unrounded.

    Raises:
        ValueError: If the text cannot be interpreted as an amount.
    """
    # Vulgar fraction characters mapped to their decimal equivalents.
    fraction_map = {
        "½": 0.5,
        "⅓": 1/3,
        "⅔": 2/3,
        "¼": 0.25,
        "¾": 0.75,
        "⅕": 1/5,
        "⅖": 2/5,
        "⅗": 3/5,
        "⅘": 4/5,
        "⅙": 1/6,
        "⅚": 5/6,
        "⅛": 0.125,
        "⅜": 0.375,
        "⅝": 0.625,
        "⅞": 0.875,
    }

    def parse_fraction(token):
        """Return the value of a fraction token, or None if it is not one."""
        if token in fraction_map:
            return fraction_map[token]
        numerator, slash, denominator = token.partition("/")
        if slash:
            try:
                return float(numerator) / float(denominator)
            except (ValueError, ZeroDivisionError):
                return None
        return None

    def parse_single(token):
        """Parse a lone token: a fraction, a glued "1½" form, or a number."""
        fraction = parse_fraction(token)
        if fraction is not None:
            return fraction
        if len(token) > 1 and token[-1] in fraction_map:
            return round(float(token[:-1]), 2) + fraction_map[token[-1]]
        return round(float(token), 2)

    parts = amount.split()
    try:
        if len(parts) == 1:
            return parse_single(parts[0])
        if len(parts) == 2:
            fraction = parse_fraction(parts[1])
            if fraction is not None:
                return round(float(parts[0]), 2) + fraction
    except ValueError as err:
        raise ValueError(f"No match for {amount!r}") from err

    # Raising here (instead of returning None or a message string) keeps
    # callers from doing arithmetic on a non-number.
    raise ValueError(f"No match for {amount!r}")

def _ingredient_entry(amount, unit, name, rating):
    """Build one ingredient record in the shape written to the JSON file."""
    return {
        "amount": round(float(amount), 2),
        "unit": unit,
        "ingredient": name,
        "rating": rating,
    }


def _split_alternatives(name):
    """Split a name shaped like "a, b or c" into the tuple (a, b, c).

    Returns None when the text after the first ", " holds no " or ", so the
    caller can keep the name as a single ingredient instead of crashing.
    """
    comma_parts = name.split(", ")
    if len(comma_parts) < 2:
        return None
    alternatives = comma_parts[1].split(" or ")
    if len(alternatives) < 2:
        return None
    return comma_parts[0], alternatives[0], alternatives[1]


def scrape_ingredients(dom, rating_tot, servings, *, verbose=True):
    """Collect the ingredient list of one recipe page.

    Args:
        dom: Parsed recipe page; every ``li.wprm-recipe-ingredient`` element
            is read as one ingredient line.
        rating_tot: Recipe rating, copied onto every ingredient record.
        servings: Number of servings the recipe yields; parsed amounts are
            divided by it and multiplied by 10.
        verbose: When True (default), print each "a, b or c" name together
            with the three ingredients it was split into.

    Returns:
        A list of dicts shaped like
        ``{"amount": 150.0, "unit": "g", "ingredient": "minced beef", "rating": 4.5}``.
        Names listing alternatives ("a, b or c") yield one record per
        alternative, all sharing the same amount and unit.
    """
    ingredients = []

    for ingredient in dom.find_all("li", class_="wprm-recipe-ingredient"):
        amount_tag = ingredient.find("span", class_="wprm-recipe-ingredient-amount")
        unit_tag = ingredient.find("span", class_="wprm-recipe-ingredient-unit")
        name_tag = ingredient.find("span", class_="wprm-recipe-ingredient-name")

        # An ingredient line without a name cannot be recorded; skip it
        # rather than failing on `.text` of None.
        if name_tag is None:
            continue
        name = name_tag.text

        # A missing amount is treated as "1".
        amount_text = "1" if amount_tag is None else amount_tag.text

        # exceptions AMOUNT / UNIT
        if amount_text == "Pinch of":
            # A pinch is stored as a fixed 0.3 g; the unit span is ignored.
            amount = 0.3
            unit = "g"
        else:
            if amount_text == "6-8":
                amount = 7  # fixed value for this range; not rescaled
            else:
                amount = convert_to_number(amount_text) / servings * 10

            if unit_tag is None:
                unit = "whole"
            else:
                # Drop parenthesised remarks from the unit text.
                unit = re.sub(r"\(.*?\)", "", unit_tag.text).strip()

                if unit == "Pinch":
                    amount = 0.3
                    unit = "g"
                elif unit == "package":
                    amount = 1
                    unit = "cup"
                elif "tsp" in unit.lower():
                    unit = "teaspoons"
                elif "tbsp" in unit.lower():
                    unit = "tablespoons"

        # exceptions NAME
        if "ml/grams (3.53 ounces) milk" in name:
            unit = "ml"
            name = "milk"
        elif "grams" in name and "glutinous rice flour" in name:
            name = "glutinous rice flour"
        elif "grams" in name or "ounces" in name:
            name = re.sub(r"\(.*?\)", "", name).strip()
        elif "of salt" in name:
            name = "salt"
        elif "tofu" in name:
            amount = 250
            unit = "g"
            name = "mashed silken firm tofu"
        elif "PURPLE SWEET POTATO FILLING" in name:
            amount = 2
            unit = "whole"
            name = "mashed purple sweet potato"
        elif "PISTACHIO CUSTARD" in name:
            name = "pastry cream"
        elif "pumpkin spice bean paste" in name:
            # Recorded as plain bean paste plus two fixed extra ingredients.
            amount = 300
            unit = "g"
            name = "sweetened white bean paste (shiro an)"
            ingredients.append(
                _ingredient_entry(2.0, "tablespoons", "pumpkin puree", rating_tot)
            )
            ingredients.append(
                _ingredient_entry(2.0, "teaspoons", "pumpkin pie spice", rating_tot)
            )
        elif " or " in name and ", " in name:
            # Split "a, b or c" into three separate ingredients.
            alternatives = _split_alternatives(name)
            if alternatives is not None:
                first, second, third = alternatives

                # Re-attach the shared suffix that only the last alternative
                # carries in the original text.
                if "starch" in name:
                    if "starch" not in first:
                        first = first + " starch"
                    if "starch" not in second:
                        second = second + " starch"
                elif "flour" in name:
                    if "flour" not in second:
                        second = second + " flour"

                if verbose:
                    print(name)
                    print(first, second, third)

                ingredients.append(_ingredient_entry(amount, unit, first, rating_tot))
                ingredients.append(_ingredient_entry(amount, unit, second, rating_tot))
                # The third alternative becomes the regular record below.
                name = third

        # Every ingredient line ends with one record of its own.
        ingredients.append(_ingredient_entry(amount, unit, name, rating_tot))

    return ingredients

def scrape_servings(dom):
    """Return the recipe's serving count as an int.

    The value is read from the text of the first
    ``span.wprm-recipe-servings`` element found in ``dom``.
    """
    servings_tag = dom.find("span", class_="wprm-recipe-servings")
    return int(servings_tag.text)

def scrape_rating(dom):
    """Return the recipe's average rating.

    The rating is the text of the first ``span.wprm-recipe-rating-average``
    element, converted to float. When the page has no such element, the
    placeholder value 4 is returned.
    """
    average_tag = dom.find("span", class_="wprm-recipe-rating-average")
    if average_tag is not None:
        return float(average_tag.text)

    # NOTE: temporary stand-in for unrated recipes (marked for removal by the
    # original author). Callers compare against "needs rating", which this
    # function never returns while the placeholder is in place.
    return 4

# 1. Scrape recipes from *all purpose veggies*

##### Definitions

### 1. Get main page content and recipe links

In [210]:
# Overview page that links to the individual recipes.
URL = "https://allpurposeveggies.com/12967/12-mochi-flavors-easy-recipes-for-mochi-ice-cream-and-more/"
soup = make_soup(URL)

# Keep the target of every anchor whose text contains 'See Recipe',
# then cap the list at the first 17 links.
recipe_links = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if 'See Recipe' in anchor.get_text()
][:17]

### 2. Get list of names of recipe from main page

In [211]:
# Recipe titles are the <h2 class="wp-block-heading"> headings whose text
# starts with a digit; the surrounding digits, punctuation and whitespace are
# stripped off.
recipe_names = []
for heading in soup.find_all('h2', class_="wp-block-heading"):
    heading_text = heading.text
    # Slicing instead of indexing keeps an empty heading from raising IndexError.
    if heading_text[:1].isdigit():
        title = heading_text.strip(string.punctuation + string.whitespace + string.digits)
        recipe_names.append(title)

### 3. Scrape recipes into list

In [212]:
# Visit every recipe page of the first site and collect its name, rating and
# ingredient list.
recipes_all_1 = []
# Titles whose rating comes back as "needs rating". scrape_rating currently
# substitutes a numeric placeholder, so this list stays empty.
needs_r = []
for number, recipe_url in enumerate(recipe_links, start=1):
    title = recipe_names[number - 1]
    print(f'Scraping recipe {number} ...', title)

    # Download and parse the recipe page.
    recipe_dom = make_soup(recipe_url)

    # Pull out the pieces needed for one recipe record.
    servings = scrape_servings(recipe_dom)
    rating = scrape_rating(recipe_dom)
    ingredients = scrape_ingredients(recipe_dom, rating, servings)

    if rating == "needs rating":
        needs_r.append(title)

    recipes_all_1.append({
        "name": title,
        "rating": rating,
        "ingredients": ingredients,
    })

print(needs_r)

Scraping recipe 1 ... Strawberry Mochi
mochiko flour, sweet rice or glutinous rice flour
mochiko flour sweet rice flour glutinous rice flour
cornstarch, potato starch or tapioca starch
cornstarch potato starch tapioca starch
Scraping recipe 2 ... Fresh Mango
mochiko flour, sweet rice or glutinous rice flour
mochiko flour sweet rice flour glutinous rice flour
pastry cream, vanilla pudding or ice cream of choice
pastry cream vanilla pudding ice cream of choice
Scraping recipe 3 ... Applesauce Mochi
mochiko flour, sweet rice or glutinous rice flour
mochiko flour sweet rice flour glutinous rice flour
corn, potato or tapioca starch
corn starch potato starch tapioca starch
Scraping recipe 4 ... Sweet Potato
mochiko flour, sweet rice or glutinous rice flour
mochiko flour sweet rice flour glutinous rice flour
corn, potato or tapioca starch
corn starch potato starch tapioca starch
Scraping recipe 5 ... Raspberry Chocolate
mochiko flour, sweet rice or glutinous rice flour
mochiko flour sweet ric

# 2. Scrape recipes from *The rice chick*

##### Definitions

### 1. Get main page content and recipe links

In [213]:
# Overview page of the second site.
URL = "https://thericechick.com/mochi-recipes/"
soup = make_soup(URL)

# Each recipe is linked through a button-styled anchor: its href is the
# recipe URL and its text is the recipe name.
recipe_buttons = soup.find_all(
    'a',
    class_="wp-block-button__link has-text-color has-background wp-element-button",
    href=True,
)
recipe_links = [button['href'] for button in recipe_buttons]
recipe_names = [button.text for button in recipe_buttons]

### 2. Scrape recipes into list

In [214]:
# Visit every recipe page of the second site and collect its name, rating and
# ingredient list.
recipes_all_2 = []
# URLs whose rating comes back as "needs rating". scrape_rating currently
# substitutes a numeric placeholder, so this list stays empty.
needs_r = []
for number, recipe_url in enumerate(recipe_links, start=1):
    title = recipe_names[number - 1]
    print(f'Scraping recipe {number} ...', title)

    # Download and parse the recipe page.
    recipe_dom = make_soup(recipe_url)

    # Pull out the pieces needed for one recipe record.
    servings = scrape_servings(recipe_dom)
    rating = scrape_rating(recipe_dom)
    ingredients = scrape_ingredients(recipe_dom, rating, servings)

    if rating == "needs rating":
        needs_r.append(recipe_url)

    recipes_all_2.append({
        "name": title,
        "rating": rating,
        "ingredients": ingredients,
    })
print(needs_r)

Scraping recipe 1 ... Pandan Mochi
Scraping recipe 2 ... Black Sesame Mochi Muffins
Scraping recipe 3 ... Mango Mochi
Scraping recipe 4 ... Matcha Mochi Waffles
Scraping recipe 5 ... Ube Baked Mochi Donuts
Scraping recipe 6 ... Potato Mochi
Scraping recipe 7 ... Chocolate Mochi
Scraping recipe 8 ... Ube Mochi Muffins
Scraping recipe 9 ... Biscoff Baked Mochi Donuts
Scraping recipe 10 ... Nutella Mochi
Scraping recipe 11 ... Chocolate Mochi Cupcakes
Scraping recipe 12 ... Matcha Mochi Muffins
Scraping recipe 13 ... Ube Butter Mochi
Scraping recipe 14 ... Mochi Waffle Recipe
Scraping recipe 15 ... Mochi Bread
Scraping recipe 16 ... Chocolate Mochi Muffins
Scraping recipe 17 ... Mochi Cookies
Scraping recipe 18 ... Chocolate Mochi Donuts
Scraping recipe 19 ... Ube Mochi
Scraping recipe 20 ... Matcha Baked Mochi Donuts
Scraping recipe 21 ... Mochi Pancakes
Scraping recipe 22 ... Pandan Donuts
Scraping recipe 23 ... Milk Rice Cake
[]


In [215]:
# Merge the recipes of both sites under a single "recipes" key and store the
# result as indented JSON.
recipes_all_json = {'recipes': [*recipes_all_1, *recipes_all_2]}
data = json.dumps(recipes_all_json, indent=4)
with open("data/mochi.json", "w") as outfile:
    outfile.write(data)