# Hanadi's Capstone Project (Personlized Recipe Recommendation)

In [823]:
import pandas as pd
import numpy as np


In [824]:
##Read the data
Recipe = pd.read_csv("../data/scraped-07-05-21.csv")

## Data preprocessing

In [825]:
##Check for duplicates
Recipe.duplicated().sum()

0

In [826]:
## original dataset shape was (35,516 instences and 47 features)
Recipe.shape

(35516, 47)

In [827]:
##Drop the features that I don't need, for example: (author, summary, rating_count, review_count, directions, sodium_mg, calories_from_fat, & all micronutirents)
Recipe.drop(['author', 'summary', 'rating','rating_count', 'review_count', 'directions', 'sugars_g', 'saturated_fat_g', 'cholesterol_mg', 'dietary_fiber_g', 'sodium_mg', 'calories_from_fat', 'prep', 'cook','calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg', 'vitamin_a_iu_IU', 'niacin_equivalents_mg', 'vitamin_c_mg', 'folate_mcg', 'thiamin_mg', 'zinc_mg', 'phosphorus_mg', 'vitamin_b6_mg', 'riboflavin_mg', 'vitamin_e_iu_IU', 'vitamin_k_mcg', 'biotin_mcg' , 'vitamin_b12_mcg', 'mono_fat_g', 'poly_fat_g', 'trans_fatty_acid_g', 'omega_3_fatty_acid_g', 'omega_6_fatty_acid_g'], axis=1, inplace=True )


In [828]:
#After deleting unnecssary features, the shape was (35516 instences and 10 features).
Recipe.shape


(35516, 11)

In [829]:
##checking for missing calue
Recipe.isna().sum()
#Maximum missing values were in total cooking time (1952) represents 5% of the total data.
#Other missing values were in the nutrient information features, all were less than (500).

name                  0
url                   0
category              0
ingredients           0
total              1952
servings              0
yield                 1
calories            114
carbohydrates_g     148
fat_g               418
protein_g           261
dtype: int64

In [830]:
###Dealing with missing data, I'll drop the instences that doesn't contain the maximum missing values feature represents only 5% of the total data.

Recipe.dropna(inplace=True)
Recipe.isna().sum()


name               0
url                0
category           0
ingredients        0
total              0
servings           0
yield              0
calories           0
carbohydrates_g    0
fat_g              0
protein_g          0
dtype: int64

### nutrition value columns

In [831]:

##Check the maximum values for nutrition values
Recipe.describe()


Unnamed: 0,servings,calories,carbohydrates_g,fat_g,protein_g
count,33102.0,33102.0,33102.0,33102.0,33102.0
mean,10.090357,318.526714,28.546888,16.734657,14.192982
std,11.51707,212.19222,24.092086,14.545362,14.063005
min,1.0,0.1,0.0,0.0,0.0
25%,4.0,167.625,10.8,6.7,3.7
50%,8.0,280.0,24.0,13.3,8.8
75%,12.0,421.9,40.4,22.9,22.0
max,832.0,4709.2,578.2,383.9,273.2


In [832]:
#Check the mode for serving
Recipe.servings.mode()


0    4
Name: servings, dtype: int64

In [833]:
##I found crazy large amount of max	serving=832, calories=4709.2, carbs=578.2, fat=383.9, protein=273.2
## So, I decided to put upper limit of calories at 1000 kcal, in which 50% are carbs = 125 g, 25% are fat = 28 g, and 25% are protein =63 g.
## For serving, the upper limit would be the 75th % = 12.

Recipe = Recipe[Recipe['calories'] <= 1000]
Recipe = Recipe[Recipe['fat_g'] <= 28]
Recipe = Recipe[Recipe['protein_g'] <= 63]
Recipe = Recipe[Recipe['carbohydrates_g'] <= 125]
Recipe = Recipe[Recipe['servings'] <= 12]

In [834]:
Recipe.describe()


Unnamed: 0,servings,calories,carbohydrates_g,fat_g,protein_g
count,21680.0,21680.0,21680.0,21680.0,21680.0
mean,6.792435,277.063584,28.013247,12.60494,13.390392
std,3.106487,134.686314,20.640242,7.478032,11.464151
min,1.0,3.2,0.0,0.0,0.0
25%,4.0,176.175,11.4,6.5,4.3
50%,6.0,266.75,24.6,12.0,9.3
75%,8.0,364.6,40.3,18.5,20.8
max,12.0,904.0,123.4,28.0,62.8


### total column

In [835]:
###Remove recipe that takes week to cook.
searchfor = ['week']
Recipe = Recipe[~Recipe.total.str.contains('|'.join(searchfor))]

In [836]:
##Defined a function
from collections import defaultdict
import re

def humantime2minutes(s):
    d = {
      'w':      7*24*60,
      'week':   7*24*60,
      'weeks':  7*24*60,
      'd':      24*60,
      'day':    24*60,
      'days':   24*60,
      'hr':      60,
      'hrs':     60,
      'hour':   60,
      'hours':  60,
    }
    mult_items = defaultdict(lambda: 1).copy()
    mult_items.update(d)

    parts = re.search(r'^(\d+)([^\d]*)', s.lower().replace(' ', ''))
    if parts:
        return int(parts.group(1)) * mult_items[parts.group(2)] + humantime2minutes(re.sub(r'^(\d+)([^\d]*)', '', s.lower()))
    else:
        return 0
        

In [837]:
#Apply the function to the column
Recipe['total'] = Recipe['total'].apply(humantime2minutes)

In [838]:

###Take a look
Recipe.sample(4)


Unnamed: 0,name,url,category,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g
16779,Taco Hand Pies,https://www.allrecipes.com/recipe/272513/taco-...,main-dish,"2 Roma tomatoes, diced ; ½ cup diced white on...",55,12,12 hand pies,226.2,22.9,9.9,11.7
8697,The Best Mashed Potatoes,https://www.allrecipes.com/recipe/15213/the-be...,side-dish,5 pounds Yukon Gold potatoes ; ½ cup butter ; ...,20,12,12 servings,380.6,36.3,21.8,11.6
29789,Zoodle Caprese Salad,https://www.allrecipes.com/recipe/265279/zoodl...,salad,"2 small zucchini ; 16 ounces fresh mozzarella,...",20,4,4 servings,343.9,10.5,20.9,29.8
29795,Pickled Onion and Cilantro Coleslaw for Pulled...,https://www.allrecipes.com/recipe/261570/pickl...,salad,1 teaspoon ground cumin ; 1 cup cider vinegar ...,513,10,10 servings,176.0,18.5,11.1,1.8


In [839]:
##save cleaned data in new csv
Recipe.to_csv('cleaned_data.csv', index = None)

### ingredient column

In [840]:
### First, I'll drop recipe that contains (pork, wine, beer, alcohol) as these are inconvenient for the target audience
searchfor = ['pork', 'wine', 'beer', 'alcohol']
Recipe = Recipe[~Recipe.ingredients.str.contains('|'.join(searchfor))]


In [841]:
### change ingredients sepration from ; to , & seprate them into '' 
Recipe['ingredients'] = Recipe['ingredients'].str.replace(" ; ", "','")
Recipe.update("['" + Recipe["ingredients"].astype(str) + "']")

Recipe.sample(2)

Unnamed: 0,name,url,category,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g
27488,Big Papa's Homemade Beef Stew,https://www.allrecipes.com/recipe/239269/big-p...,soups-stews-and-chili,"['2 tablespoons vegetable oil','2 pounds botto...",155,6,6 servings,273.5,19.8,10.7,24.3
28482,No-Bake Chocolate-Oatmeal Cookies,https://www.allrecipes.com/recipe/276670/no-ba...,desserts,"['2 cups white sugar','½ cup unsweetened cocoa...",75,12,12 cookies,290.4,48.1,11.0,3.1


In [842]:
### download required libraries

import nltk
import string
import ast
import re
import unidecode

# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import config


In [843]:
## use function by whatscooking-deployment. Reference (https://github.com/jackmleitch/whatscooking-deployment)
def ingredient_parser(ingreds):
    measures = [
        "teaspoon",
        "t",
        "tsp.",
        "tablespoon",
        "T",
        "tbl.",
        "tb",
        "tbsp.",
        "fluid ounce",
        "fl oz",
        "gill",
        "cup",
        "c",
        "pint",
        "p",
        "pt",
        "fl pt",
        "quart",
        "q",
        "qt",
        "fl qt",
        "gallon",
        "g",
        "gal",
        "ml",
        "milliliter",
        "millilitre",
        "cc",
        "mL",
        "l",
        "liter",
        "litre",
        "L",
        "dl",
        "deciliter",
        "decilitre",
        "dL",
        "bulb",
        "level",
        "heaped",
        "rounded",
        "whole",
        "pinch",
        "medium",
        "slice",
        "pound",
        "lb",
        "#",
        "ounce",
        "oz",
        "mg",
        "milligram",
        "milligramme",
        "g",
        "gram",
        "gramme",
        "kg",
        "kilogram",
        "kilogramme",
        "x",
        "of",
        "mm",
        "millimetre",
        "millimeter",
        "cm",
        "centimeter",
        "centimetre",
        "m",
        "meter",
        "metre",
        "inch",
        "in",
        "milli",
        "centi",
        "deci",
        "hecto",
        "kilo",
    ]
    words_to_remove = [
        "fresh",
        "minced",
        "chopped" "oil",
        "a",
        "red",
        "bunch",
        "and",
        "clove",
        "or",
        "leaf",
        "chilli",
        "large",
        "extra",
        "sprig",
        "ground",
        "handful",
        "free",
        "small",
        "pepper",
        "virgin",
        "range",
        "from",
        "dried",
        "sustainable",
        "black",
        "peeled",
        "higher",
        "welfare",
        "seed",
        "for",
        "finely",
        "freshly",
        "sea",
        "quality",
        "white",
        "ripe",
        "few",
        "piece",
        "source",
        "to",
        "organic",
        "flat",
        "smoked",
        "ginger",
        "sliced",
        "green",
        "picked",
        "the",
        "stick",
        "plain",
        "plus",
        "mixed",
        "mint",
        "bay",
        "basil",
        "your",
        "cumin",
        "optional",
        "fennel",
        "serve",
        "mustard",
        "unsalted",
        "baby",
        "paprika",
        "fat",
        "ask",
        "natural",
        "skin",
        "roughly",
        "into",
        "such",
        "cut",
        "good",
        "brown",
        "grated",
        "trimmed",
        "oregano",
        "powder",
        "yellow",
        "dusting",
        "knob",
        "frozen",
        "on",
        "deseeded",
        "low",
        "runny",
        "balsamic",
        "cooked",
        "streaky",
        "nutmeg",
        "sage",
        "rasher",
        "zest",
        "pin",
        "groundnut",
        "breadcrumb",
        "turmeric",
        "halved",
        "grating",
        "stalk",
        "light",
        "tinned",
        "dry",
        "soft",
        "rocket",
        "bone",
        "colour",
        "washed",
        "skinless",
        "leftover",
        "splash",
        "removed",
        "dijon",
        "thick",
        "big",
        "hot",
        "drained",
        "sized",
        "chestnut",
        "watercress",
        "fishmonger",
        "english",
        "dill",
        "caper",
        "raw",
        "worcestershire",
        "flake",
        "cider",
        "cayenne",
        "tbsp",
        "leg",
        "pine",
        "wild",
        "if",
        "fine",
        "herb",
        "almond",
        "shoulder",
        "cube",
        "dressing",
        "with",
        "chunk",
        "spice",
        "thumb",
        "garam",
        "new",
        "little",
        "punnet",
        "peppercorn",
        "shelled",
        "saffron",
        "other" "chopped",
        "salt",
        "olive",
        "taste",
        "can",
        "sauce",
        "water",
        "diced",
        "package",
        "italian",
        "shredded",
        "divided",
        "parsley",
        "vinegar",
        "all",
        "purpose",
        "crushed",
        "juice",
        "more",
        "coriander",
        "bell",
        "needed",
        "thinly",
        "boneless",
        "half",
        "thyme",
        "cubed",
        "cinnamon",
        "cilantro",
        "jar",
        "seasoning",
        "rosemary",
        "extract",
        "sweet",
        "baking",
        "beaten",
        "heavy",
        "seeded",
        "tin",
        "vanilla",
        "uncooked",
        "crumb",
        "style",
        "thin",
        "nut",
        "coarsely",
        "spring",
        "chili",
        "cornstarch",
        "strip",
        "cardamom",
        "rinsed",
        "honey",
        "cherry",
        "root",
        "quartered",
        "head",
        "softened",
        "container",
        "crumbled",
        "frying",
        "lean",
        "cooking",
        "roasted",
        "warm",
        "whipping",
        "thawed",
        "corn",
        "pitted",
        "sun",
        "kosher",
        "bite",
        "toasted",
        "lasagna",
        "split",
        "melted",
        "degree",
        "lengthwise",
        "romano",
        "packed",
        "pod",
        "anchovy",
        "rom",
        "prepared",
        "juiced",
        "fluid",
        "floret",
        "room",
        "active",
        "seasoned",
        "mix",
        "deveined",
        "lightly",
        "anise",
        "thai",
        "size",
        "unsweetened",
        "torn",
        "wedge",
        "sour",
        "basmati",
        "marinara",
        "dark",
        "temperature",
        "garnish",
        "bouillon",
        "loaf",
        "shell",
        "reggiano",
        "canola",
        "parmigiano",
        "round",
        "canned",
        "ghee",
        "crust",
        "long",
        "broken",
        "ketchup",
        "bulk",
        "cleaned",
        "condensed",
        "sherry",
        "provolone",
        "cold",
        "soda",
        "cottage",
        "spray",
        "tamarind",
        "pecorino",
        "shortening",
        "part",
        "bottle",
        "sodium",
        "cocoa",
        "grain",
        "french",
        "roast",
        "stem",
        "link",
        "firm",
        "asafoetida",
        "mild",
        "dash",
        "boiling",
        "oil",
        "chopped",
        "vegetable oil",
        "chopped oil",
        "garlic",
        "skin off",
        "bone out" "from sustrainable sources",
    ]
    # The ingredient list is now a string so we need to turn it back into a list. We use ast.literal_eval
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        ingredients = ast.literal_eval(ingreds)
    # We first get rid of all the punctuation. We make use of str.maketrans. It takes three input
    # arguments 'x', 'y', 'z'. 'x' and 'y' must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character
    #  in the string is mapped to None.
    translator = str.maketrans("", "", string.punctuation)
    lemmatizer = WordNetLemmatizer()
    ingred_list = []
    for i in ingredients:
        i.translate(translator)
        # We split up with hyphens as well as spaces
        items = re.split(" |-", i)
        # Get rid of words containing non alphabet letters
        items = [word for word in items if word.isalpha()]
        # Turn everything to lowercase
        items = [word.lower() for word in items]
        # remove accents
        items = [
            unidecode.unidecode(word) for word in items
        ]  #''.join((c for c in unicodedata.normalize('NFD', items) if unicodedata.category(c) != 'Mn'))
        # Lemmatize words so we can compare words to measuring words
        items = [lemmatizer.lemmatize(word) for word in items]
        # Gets rid of measuring words/phrases, e.g. heaped teaspoon
        items = [word for word in items if word not in measures]
        # Get rid of common easy words
        items = [word for word in items if word not in words_to_remove]
        if items:
            ingred_list.append(" ".join(items))
    # ingred_list = " ".join(ingred_list)
    return ingred_list

In [844]:
##Remove sympol in ingredients column
Recipe['ingredients'] = Recipe['ingredients'].str.replace("2 6 ounce", "")
Recipe['ingredients'] = Recipe['ingredients'].str.replace("®", "")
Recipe['ingredients'] = Recipe['ingredients'].str.replace("(", "")
Recipe['ingredients'] = Recipe['ingredients'].str.replace(")", "")
Recipe['ingredients'] = Recipe['ingredients'].str.replace("  ", "")
Recipe['ingredients'] = Recipe['ingredients'].str.replace("' ", "'")


  Recipe['ingredients'] = Recipe['ingredients'].str.replace("(", "")
  Recipe['ingredients'] = Recipe['ingredients'].str.replace(")", "")


In [846]:

##Apply the function to ingredients column
Recipe['ingredients'] = Recipe['ingredients'].apply(ingredient_parser)

SyntaxError: invalid syntax (<unknown>, line 1)