# Maltese Cuisine Food Recipes 
---

In [1]:
# Libraries needed to get the html from a site and to parse the html
from bs4 import BeautifulSoup as bs
import requests
from parse_ingredients import parse_ingredient
import pandas
import pyfood as pyf

## Base URL
This is the main URL or home page of the Maltese Cuisine site. It is being accessed, parsed and stored in the soup variable.

In [2]:
# Download the Maltese Cuisine home page and build a parse tree from its HTML
mainURL = "https://maltesecuisine.com/"
req = requests.get(url=mainURL)
soup = bs(markup=req.text, features="html.parser")

## getPages
This function goes through the main URL and the pages that follow it. These pages all contain hrefs to the recipes on this site. The links from these hrefs are saved to the hrefs list variable, which is returned.

In [3]:
def getPages(lastPage=11):
    """Collect the recipe links listed on the site's paginated index.

    Page 1 is the home page; pages 2..lastPage are fetched from
    ``/page/<n>``. On every page, the anchors inside each
    ``<div class="post-img">`` are read and their ``href`` is kept.

    Parameters
    ----------
    lastPage : int, optional
        Number of the last index page to visit (inclusive). Defaults to 11,
        the range this function has always covered.

    Returns
    -------
    list of str
        The ``href`` of every anchor found, in page order. Duplicates are
        kept; anchors without an ``href`` are skipped.
    """
    baseURL = "https://maltesecuisine.com"
    hrefs = []
    for pageNumber in range(1, lastPage + 1):
        # The first index page has no /page/ suffix.
        if pageNumber == 1:
            siteURL = baseURL
        else:
            siteURL = baseURL + "/page/" + str(pageNumber)

        req = requests.get(siteURL)
        soup = bs(req.text, "html.parser")
        for imageDiv in soup.findAll("div", class_="post-img"):
            for a in imageDiv.findAll("a"):
                href = a.get('href')
                # An anchor without an href would put None in the list and
                # break the later requests.get(link) calls.
                if href:
                    hrefs.append(href)
    return hrefs

In [22]:
# Collect the recipe links from every index page (one HTTP request per page)
hrefs = getPages()

## Save recipe links to CSV
These chunks of code will create a CSV file, ListOfRecipes, which will have two columns. The one on the left holds the recipe's name and the one on the right stores the link to the recipe.

To get the recipe's name, the code goes through the list of links/hrefs, accessing and parsing each page. It then finds the h1 tag in the html, which should be the dish name.

In [5]:
# Build one [recipe name, link] row per recipe page.
RecipeList = []

for recipe in hrefs:
    req = requests.get(recipe)  # Accesses the next recipe
    soup = bs(req.text, "html.parser")
    title = soup.findAll('h1')  # the first <h1> should be the name of the recipe

    # Always emit exactly two fields so that index 0 is the name and index 1
    # is the link, even when a page has no <h1> or more than one of them.
    if title:
        dishName = str(title[0].get_text(strip=False))
    else:
        dishName = ""

    row = [dishName, recipe]
    RecipeList.append(row)

In [6]:
# Split the [name, link] rows into two parallel lists (index 0 = name, 1 = link)
linkList = [pair[1] for pair in RecipeList]
dishList = [pair[0] for pair in RecipeList]

In [7]:
# Load the recipe-links table that is already stored on disk
listOfRecipesCSV = pandas.read_csv('../CSV/recipeLinks.csv')

In [8]:
# Pair each dish name with its link and wrap the pairs in a two-column table
recipeRows = list(zip(dishList, linkList))
recipeListDF = pandas.DataFrame(recipeRows, columns=['Recipe Name', 'Links'])

In [9]:
# Add the scraped rows to the stored table and renumber the index.
# DataFrame.append is deprecated (and removed in pandas 2.0); pandas.concat
# with ignore_index=True produces the same combined frame.
listOfRecipesCSV = pandas.concat([listOfRecipesCSV, recipeListDF], ignore_index=True)

  listOfRecipesCSV = listOfRecipesCSV.append(recipeListDF, ignore_index = True)


In [10]:
# Write the table back over the same file. index=False is not passed, so the
# row index is written as well (pandas default) as a leading unnamed column.
listOfRecipesCSV.to_csv('../CSV/recipeLinks.csv')

## Append ingredients to CSV file
The CSV file created before is being used to get all the Maltese Cuisine dishes and their links.

The getListOfIngredients function goes through the recipe page whose URL is passed as a parameter and returns its list of ingredients. This function does not parse the ingredients; instead it takes the entire string found on the page, meaning that if one of the recipe ingredients were '2 breaded veal chops' it will return '2 breaded veal chops'.

The parseIngredientsAndPyfoodFunc function parses the ingredient string passed as a parameter and returns the parsed ingredient name. For the example used before, '2 breaded veal chops' is returned as 'veal' instead.

In [30]:
# Re-read the recipe-links table from disk so its rows can be walked one by one
listOfRecipesCSV = pandas.read_csv('../CSV/recipeLinks.csv')

In [26]:
def getListOfIngredients(URL):
    """Return the raw ingredient strings listed on a recipe page.

    Downloads ``URL`` and gathers the text of every ``<li>`` found inside
    each ``<div class="recipe-ingredients">``. The text is returned exactly
    as it appears on the page (no stripping, no parsing), in document order.
    """
    page = requests.get(URL)
    tree = bs(page.text, "html.parser")
    return [
        str(item.get_text(strip=False))
        for block in tree.findAll("div", class_="recipe-ingredients")
        for item in block.findAll("li")
    ]

In [27]:
def _getShelf():
    """Return one shared pyfood Shelf, creating it on first use."""
    # NOTE(review): reuse assumes Shelf.process_ingredients keeps no state
    # between calls - confirm against the pyfood documentation.
    shelf = getattr(_getShelf, "shelf", None)
    if shelf is None:
        shelf = pyf.Shelf(region='Italy', month_id=0)
        _getShelf.shelf = shelf
    return shelf


def parseIngredientsAndPyfoodFunc(ing):
    """Reduce a raw ingredient line to a single underscore-joined name.

    The line is first parsed with ``parse_ingredient``; the parsed name is
    then passed to pyfood's ``Shelf.process_ingredients``. The returned name
    is the first of these that can be read:

    1. ``results['ingredients'][0]['foodname']``
    2. ``results['HS'][0]``
    3. the name produced by ``parse_ingredient``

    Spaces in the chosen name are replaced with underscores.

    Parameters
    ----------
    ing : str
        Ingredient text as scraped from a recipe page.

    Returns
    -------
    str
        The chosen ingredient name with spaces replaced by ``_``.
    """
    parseResult = parse_ingredient(ing)
    results = _getShelf().process_ingredients([parseResult.name])

    # Only failed lookups (missing key, empty list, None value) should trigger
    # the fallbacks; anything else is a real error and is allowed to surface.
    lookupErrors = (KeyError, IndexError, TypeError, AttributeError)
    try:
        pIng = results['ingredients'][0]['foodname'].replace(" ", "_")
    except lookupErrors:
        try:
            pIng = results['HS'][0].replace(" ", "_")
        except lookupErrors:
            pIng = parseResult.name.replace(" ", "_")

    return pIng

In [34]:
# Walk the recipe-links table and, for every Maltese Cuisine row, record the
# link, the dish name, the raw ingredient strings and their parsed names.
mcLinkList = []
mcDishList = []
mcIngDishList = []
mcParsedIngList = []

for _, row in listOfRecipesCSV.iterrows():
    # Positional access, as before: position 1 is read as the recipe name and
    # position 2 as the link.
    # NOTE(review): presumably position 0 is the index column saved with the
    # CSV - confirm against the file's header.
    link = row.iloc[2]

    # Skip rows from other sites, and empty cells (read back as NaN, not str),
    # which would make the substring test raise.
    if not isinstance(link, str) or 'maltesecuisine' not in link:
        continue

    mcLinkList.append(link)
    mcDishList.append(row.iloc[1])

    rawIngredients = getListOfIngredients(link)
    mcIngDishList.append(rawIngredients)
    # One parsed name per raw ingredient, kept in the same order.
    mcParsedIngList.append([parseIngredientsAndPyfoodFunc(ing) for ing in rawIngredients])

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [35]:
# Sanity check: show the parsed ingredient names of the first dish
print(mcParsedIngList[0])

['brown_sugar', 'sugar', 'egg', 'mascarpone', 'Qing_Pi', 'cinnamon', 'buttermilk', 'salt', 'vanilla', 'flour', 'apple']


## Finding each ingredient's type and index

In [36]:
# Load the tab-separated ingredient reference table and pull three of its
# columns out as plain lists that share the same row order.
ing_info = pandas.read_csv("../CSV/Compound CSVs/ingr_info.tsv", sep="\t")

ingNameList, ingIdxList, ingCatList = (
    list(ing_info[column]) for column in ('ingredient name', '# id', 'category')
)

In [37]:
# For every parsed ingredient (dish by dish, in order) look up its category and
# id in the reference lists; ingredients that are not listed get None for both.
mcIngCatList = []
mcIngIdxList = []

# Map each reference name to the position of its first occurrence - the same
# position list.index() would return - so the list is scanned only once.
namePosition = {}
for position, name in enumerate(ingNameList):
    namePosition.setdefault(name, position)

for dish in mcParsedIngList:
    for ing in dish:
        foundIdx = namePosition.get(ing)
        if foundIdx is None:
            mcIngCatList.append(None)
            mcIngIdxList.append(None)
        else:
            mcIngCatList.append(ingCatList[foundIdx])
            mcIngIdxList.append(ingIdxList[foundIdx])

# Preparing List for Dataframe
Two of the previous lists need to be rearranged before being placed in the dataframe. The dish list needs each dish name repeated once per ingredient, and the list of ingredients per dish needs to be flattened.

In [38]:
# Repeat each dish name once for every ingredient scraped for that dish
repMCdishList = [
    mcDishList[dishPos]
    for dishPos, dishIngredients in enumerate(mcIngDishList)
    for _ in dishIngredients
]

In [39]:
# Flatten the per-dish lists of parsed ingredients into one long list
flatMCingDishList = []
for parsedDish in mcParsedIngList:
    flatMCingDishList.extend(parsedDish)

In [40]:
# The four lists become the columns of one table, so print each length to
# check by eye that they agree.
for columnValues in (repMCdishList, flatMCingDishList, mcIngCatList, mcIngIdxList):
    print(len(columnValues))

2337
2337
2337
2337


# Creating and Appending Maltese Cuisine Dataframe

In [41]:
# One row per (dish, ingredient) pair.
# NOTE(review): the 'Ingredint ...' labels are kept exactly as spelled;
# presumably they must match the header of recipeList.csv - confirm before
# renaming them.
mcIngColumns = ['Recipe', 'Ingredient', 'Ingredint Category', 'Ingredint Index']
mcIngRows = list(zip(repMCdishList, flatMCingDishList, mcIngCatList, mcIngIdxList))
mcIngDF = pandas.DataFrame(mcIngRows, columns=mcIngColumns)

In [42]:
# Load the recipe/ingredient table that is already stored on disk
recList = pandas.read_csv("../CSV/recipeList.csv")

In [43]:
# Add the Maltese Cuisine rows to the stored table.
# DataFrame.append is deprecated (and removed in pandas 2.0); pandas.concat
# without ignore_index keeps each frame's own index labels, as append did.
recList = pandas.concat([recList, mcIngDF])

  recList = recList.append(mcIngDF)


In [16]:
# Write the combined table back over the same file, leaving out the row index
recList.to_csv("../CSV/recipeList.csv", index = False)