# SBS Maltese Food Recipes
---

In [2]:
# Libraries needed to get the html from a site and to parse the html
from bs4 import BeautifulSoup as bs
import requests
import csv
from parse_ingredients import parse_ingredient
import pyfood as pyf

In [8]:
# Save the listing URL, download the page and parse its HTML
mainURL = "https://www.sbs.com.au/food/cuisine/maltese"
# A timeout stops the cell from hanging forever if the server never answers
req = requests.get(mainURL, timeout=30)
# Fail loudly on an HTTP error instead of scraping an error page for links
req.raise_for_status()
soup = bs(req.text, "html.parser")

In [9]:
# Scheme and host of the site; the hrefs scraped from the listing pages
# are appended to this string to build full recipe URLs
baseLink = "https://www.sbs.com.au"

In [10]:
# Collect every tag carrying the 'link-underlay' class; these tags store the
# hrefs to the recipes.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
hrefs = soup.find_all(class_='link-underlay')

In [11]:
# Simply printing out the contents to see what the hrefs of the tags are

def printHrefs(hrefsIn):
    """Print the href of every tag in hrefsIn, each followed by a blank line.

    hrefsIn is any iterable of objects exposing ``.get('href')``; a missing
    href is printed as ``None``.
    """
    for tag in hrefsIn:
        href = tag.get('href')
        print(href, "\n")

# printHrefs(hrefs)

In [12]:
# This for loop will generate the links to all the recipes that are shown on the mainURL

def funcRecipeLinks(hrefsIn, baseLinkIn, recipeLinks):
    """Append the full URL of every recipe link in hrefsIn to recipeLinks.

    Parameters:
        hrefsIn: iterable of tags (anything with a ``.get('href')`` method).
        baseLinkIn: string prefixed to each href to build the full link.
        recipeLinks: list that is extended in place.

    Returns:
        The same ``recipeLinks`` list that was passed in.

    An href is treated as a recipe link when its character at index 6 is
    'r' (presumably hrefs of the form '/food/recipes/...' -- confirm against
    the site). Tags with no href, or with an href too short to have an
    index 6, are skipped instead of raising TypeError/IndexError.
    """
    for tag in hrefsIn:
        href = tag.get('href')
        # Slicing yields '' for short strings, so no IndexError is possible
        if href and href[6:7] == 'r':
            recipeLinks.append(baseLinkIn + href)
    return recipeLinks

# Start from an empty list and fill it with the recipe links found in hrefs
recipeLinks = funcRecipeLinks(hrefs, baseLink, [])

In [13]:
# Fetch the listing page addressed by `page=1` and add its recipe links
mainURL = "https://www.sbs.com.au/food/cuisine/maltese?sort_by=created&page=1"
# A timeout stops the cell from hanging forever if the server never answers
req = requests.get(mainURL, timeout=30)
# Fail loudly on an HTTP error instead of scraping an error page for links
req.raise_for_status()
soup = bs(req.text, "html.parser")

baseLink = 'https://www.sbs.com.au'
# find_all is the current Beautiful Soup name; findAll is a deprecated alias
hrefs = soup.find_all(class_='link-underlay')

# NOTE: recipeLinks is extended in place, so re-running this cell appends
# the same links a second time
recipeLinks = funcRecipeLinks(hrefs, baseLink, recipeLinks)


In [None]:
# Show every recipe link gathered so far, then how many links there are
print(recipeLinks, len(recipeLinks), sep="\n")

## Testing different ingredient extraction methods
---
### Method 1: parse_ingredients library

The parse_ingredients library extracts the ingredient name, quantity, unit and comment from a recipe ingredient line. In testing, the library was found to be inconsistent at removing qualifiers such as size and colour, and at removing plurals.

In [None]:
def parseIngredientsFunc(i, title):
    """Build one output row for an ingredient using parse_ingredient only.

    Parameters:
        i: ingredient tag; its text is read with ``get_text(strip=False)``.
        title: iterable of heading tags whose texts become the leading cells.

    Returns:
        A list holding the heading text(s) followed by the parser's
        original string, name, quantity, unit and comment.
    """
    dishNames = [str(heading.get_text(strip=False)) for heading in title]  # Dish Name

    parsed = parse_ingredient(str(i.get_text(strip=False)))

    return dishNames + [
        parsed.original_string,  # Original String
        parsed.name,  # Ingredient
        parsed.quantity,  # Quantity
        parsed.unit,  # Unit
        parsed.comment,  # Comment
    ]

### Method 2: pyfood library

Unlike the parse_ingredients library, the pyfood library only extracts the ingredient name from the input string. It is also prone to occasional errors, such as converting 'frozen peas' to 'green peppers', although these mistakes appear to be few and far between. On the other hand, it handles qualifiers better: it ignores indicators of size and colour, and it removes plurals.

In [None]:
def pyfoodFunc(title, i):
    """Build one output row for an ingredient using the pyfood library only.

    Parameters:
        title: iterable of heading tags whose texts become the leading cells.
        i: ingredient tag; its text is read with ``get_text(strip=False)``.

    Returns:
        A list holding the heading text(s), the original ingredient text and
        the food name reported by pyfood. When no food name is available the
        first entry of ``results['HS']`` is used instead.

    Reads the module-level ``shelf`` (a pyfood Shelf) that must exist before
    this function is called.
    """
    data = [str(j.get_text(strip=False)) for j in title]  # Dish Name

    text = str(i.get_text(strip=False))
    print(text)  # echo the raw ingredient line for manual inspection
    results = shelf.process_ingredients([text])
    try:
        # Other fields noted by the original author:
        # vegetarian, vegan, nutrition, seasonality
        temp = results['ingredients'][0]['foodname']
    except (KeyError, IndexError, TypeError):
        # Only a failed lookup falls back; unrelated errors are not hidden
        temp = results['HS'][0]
    # Store the text rather than the tag object so the row holds the plain
    # original string, consistent with parseIngredientsAndPyfoodFunc
    data.append(text)
    data.append(temp)

    return data

### Method 3: parse_ingredients and pyfood libraries

Given the strengths and weaknesses described above, we decided to use both libraries together for the best result. First, the parse_ingredients library is used to extract the quantity and the ingredient name. The ingredient name is then passed through the pyfood library to obtain a stripped-down version of the ingredient. Running parse_ingredients first also appears to reduce the errors made by pyfood.

In [96]:
def parseIngredientsAndPyfoodFunc(i, title):
    """Build one output row using parse_ingredient first and pyfood second.

    Parameters:
        i: ingredient tag; its text is read with ``get_text(strip=False)``.
        title: iterable of heading tags whose texts become the leading cells.

    Returns:
        A list holding the heading text(s), the original ingredient text and
        the food name pyfood reports for the name that parse_ingredient
        extracted. When no food name is available the first entry of
        ``results['HS']`` is used instead.

    Reads the module-level ``shelf`` (a pyfood Shelf) that must exist before
    this function is called.
    """
    data = [str(j.get_text(strip=False)) for j in title]  # Dish Name

    text = str(i.get_text(strip=False))
    parseResult = parse_ingredient(text)

    # Only the parsed ingredient name is handed to pyfood
    results = shelf.process_ingredients([parseResult.name])
    try:
        # Other fields noted by the original author:
        # vegetarian, vegan, nutrition, seasonality
        temp = results['ingredients'][0]['foodname']
    except (KeyError, IndexError, TypeError):
        # Only a failed lookup falls back; unrelated errors are not hidden
        temp = results['HS'][0]
    data.append(text)
    data.append(temp)

    return data

In [None]:
# Rows collected for the CSV output, one per ingredient
ingredientList = []
# Shelf instance read as a global by the pyfood-based helper functions
shelf = pyf.Shelf(region='Italy', month_id=0)

for recipe in recipeLinks:
    # Access the next recipe; the timeout keeps one stalled request from
    # hanging the whole scrape
    req = requests.get(recipe, timeout=30)
    if not req.ok:
        # Report the failed page instead of silently parsing an error page
        print("Skipping", recipe, "- HTTP status", req.status_code)
        continue
    soup = bs(req.text, "html.parser")
    title = soup.find_all('h1')  # finds the name of the recipe

    # Find all the divs containing the ingredients
    ingredientsDiv = soup.find_all('div', class_='field-name-field-ingredients')

    # For each div, go through its list items and extract the ingredients
    for div in ingredientsDiv:
        ingredients = div.find_all("li")

        for i in ingredients:
            # Swap in one of the commented calls to try another method
            # data = parseIngredientsFunc(i, title)
            # data = pyfoodFunc(title, i)
            data = parseIngredientsAndPyfoodFunc(i, title)

            ingredientList.append(data)

In [21]:
import csv
import os

# Column names for rows built by parseIngredientsFunc (Method 1)
header = ['Dish Name', 'Original String', 'Ingredient', 'Quantity', 'Unit', 'Comment']

# Create the output folder if needed so open() does not raise FileNotFoundError
os.makedirs('CSV', exist_ok=True)

with open('CSV/sbsRecipes.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(ingredientList)

In [98]:
import os

# Column names for rows built by pyfoodFunc or parseIngredientsAndPyfoodFunc
header = ['Dish Name', 'Original String', 'Ingredient']

# Create the output folder if needed so open() does not raise FileNotFoundError
os.makedirs('CSV', exist_ok=True)

with open('CSV/sbsRecipes3.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(ingredientList)