# Hanadi's Capstone Project (Personlized Recipe Recommendation)

In [187]:
import pandas as pd
import numpy as np


In [188]:
##Read the data
Recipe = pd.read_csv("../data/scraped-07-05-21.csv")

## Data preprocessing

In [189]:
## original dataset shape was (35,516 instences and 47 features)
Recipe.shape


(35516, 48)

In [190]:
### Check features
Recipe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35516 entries, 0 to 35515
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             35516 non-null  int64  
 1   name                   35516 non-null  object 
 2   url                    35516 non-null  object 
 3   category               35516 non-null  object 
 4   author                 35475 non-null  object 
 5   summary                35516 non-null  object 
 6   rating                 35516 non-null  float64
 7   rating_count           35516 non-null  int64  
 8   review_count           35516 non-null  int64  
 9   ingredients            35516 non-null  object 
 10  directions             35516 non-null  object 
 11  prep                   33390 non-null  object 
 12  cook                   27857 non-null  object 
 13  total                  33564 non-null  object 
 14  servings               35516 non-null  int64  
 15  yi

In [191]:
##Check for duplicates
Recipe.duplicated().sum()

0

In [192]:
##Drop the features that I don't need, for example: (author, summary, rating_count, review_count, directions, sodium_mg, calories_from_fat, & all micronutirents)
Recipe.drop(['author', 'summary', 'rating_count', 'review_count', 'directions', 'sugars_g', 'saturated_fat_g', 'cholesterol_mg', 'dietary_fiber_g', 'sodium_mg', 'calories_from_fat', 'prep', 'cook','calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg', 'vitamin_a_iu_IU', 'niacin_equivalents_mg', 'vitamin_c_mg', 'folate_mcg', 'thiamin_mg', 'zinc_mg', 'phosphorus_mg', 'vitamin_b6_mg', 'riboflavin_mg', 'vitamin_e_iu_IU', 'vitamin_k_mcg', 'biotin_mcg' , 'vitamin_b12_mcg', 'mono_fat_g', 'poly_fat_g', 'trans_fatty_acid_g', 'omega_3_fatty_acid_g', 'omega_6_fatty_acid_g'], axis=1, inplace=True )


In [193]:
#After deleting unnecssary features, the shape was (35516 instences and 10 features).
Recipe.shape


(35516, 13)

In [194]:
##checking for missing calue
Recipe.isna().sum()
#Maximum missing values were in total cooking time (1952) represents 5% of the total data.
#Other missing values were in the nutrient information features, all were less than (500).

Unnamed: 0            0
name                  0
url                   0
category              0
rating                0
ingredients           0
total              1952
servings              0
yield                 1
calories            114
carbohydrates_g     148
fat_g               418
protein_g           261
dtype: int64

In [195]:
###Dealing with missing data, I'll drop the instences that doesn't contain the maximum missing values feature represents only 5% of the total data.

Recipe.dropna(inplace=True)
Recipe.isna().sum()


Unnamed: 0         0
name               0
url                0
category           0
rating             0
ingredients        0
total              0
servings           0
yield              0
calories           0
carbohydrates_g    0
fat_g              0
protein_g          0
dtype: int64

### nutrition value columns

In [196]:

##Check the maximum values for nutrition values
Recipe.describe()


Unnamed: 0.1,Unnamed: 0,rating,servings,calories,carbohydrates_g,fat_g,protein_g
count,33102.0,33102.0,33102.0,33102.0,33102.0,33102.0,33102.0
mean,17815.433448,4.002661,10.090357,318.526714,28.546888,16.734657,14.192982
std,10265.5322,1.372287,11.51707,212.19222,24.092086,14.545362,14.063005
min,0.0,0.0,1.0,0.1,0.0,0.0,0.0
25%,8972.25,4.11,4.0,167.625,10.8,6.7,3.7
50%,17900.5,4.48,8.0,280.0,24.0,13.3,8.8
75%,26732.75,4.67,12.0,421.9,40.4,22.9,22.0
max,35515.0,5.0,832.0,4709.2,578.2,383.9,273.2


In [197]:
#Check the mode for serving
Recipe.servings.mode()


0    4
Name: servings, dtype: int64

In [198]:
##I found crazy large amount of max	serving=832, calories=4709.2, carbs=578.2, fat=383.9, protein=273.2
## So, I decided to put upper limit of calories at 1000 kcal, in which 50% are carbs = 125 g, 25% are fat = 28 g, and 25% are protein =63 g.
## For serving, the upper limit would be the 75th % = 12.

Recipe = Recipe[Recipe['calories'] <= 1000]
Recipe = Recipe[Recipe['fat_g'] <= 28]
Recipe = Recipe[Recipe['protein_g'] <= 63]
Recipe = Recipe[Recipe['carbohydrates_g'] <= 125]
Recipe = Recipe[Recipe['servings'] <= 12]

In [199]:
Recipe.describe()


Unnamed: 0.1,Unnamed: 0,rating,servings,calories,carbohydrates_g,fat_g,protein_g
count,21680.0,21680.0,21680.0,21680.0,21680.0,21680.0,21680.0
mean,17394.131965,4.001043,6.792435,277.063584,28.013247,12.60494,13.390392
std,10126.81339,1.359057,3.106487,134.686314,20.640242,7.478032,11.464151
min,1.0,0.0,1.0,3.2,0.0,0.0,0.0
25%,8682.5,4.1,4.0,176.175,11.4,6.5,4.3
50%,17528.5,4.47,6.0,266.75,24.6,12.0,9.3
75%,25869.25,4.67,8.0,364.6,40.3,18.5,20.8
max,35515.0,5.0,12.0,904.0,123.4,28.0,62.8


### total column

In [200]:
###Remove recipe that takes week to cook.
searchfor = ['week']
Recipe = Recipe[~Recipe.total.str.contains('|'.join(searchfor))]

In [201]:
##Defined a function
from collections import defaultdict
import re

def humantime2minutes(s):
    d = {
      'w':      7*24*60,
      'week':   7*24*60,
      'weeks':  7*24*60,
      'd':      24*60,
      'day':    24*60,
      'days':   24*60,
      'hr':      60,
      'hrs':     60,
      'hour':   60,
      'hours':  60,
    }
    mult_items = defaultdict(lambda: 1).copy()
    mult_items.update(d)

    parts = re.search(r'^(\d+)([^\d]*)', s.lower().replace(' ', ''))
    if parts:
        return int(parts.group(1)) * mult_items[parts.group(2)] + humantime2minutes(re.sub(r'^(\d+)([^\d]*)', '', s.lower()))
    else:
        return 0
        

In [202]:
#Apply the function to the column
Recipe['total'] = Recipe['total'].apply(humantime2minutes)

In [203]:

###Take a look at the summary statisitcs after the convertion.
Recipe.describe()


Unnamed: 0.1,Unnamed: 0,rating,total,servings,calories,carbohydrates_g,fat_g,protein_g
count,21668.0,21668.0,21668.0,21668.0,21668.0,21668.0,21668.0,21668.0
mean,17395.47194,4.000759,97.300166,6.790844,277.128124,28.017367,12.608639,13.39466
std,10126.606423,1.35936,231.32545,3.106085,134.667401,20.639325,7.476456,11.465576
min,1.0,0.0,1.0,1.0,3.2,0.0,0.0,0.0
25%,8687.5,4.1,25.0,4.0,176.2,11.4,6.5,4.3
50%,17531.0,4.47,45.0,6.0,266.8,24.6,12.0,9.3
75%,25869.25,4.67,80.0,8.0,364.6,40.3,18.5,20.9
max,35515.0,5.0,8640.0,12.0,904.0,123.4,28.0,62.8


### ingredient column

In [204]:
### First, I'll drop recipe that contains (pork, wine, beer, alcohol) as these are inconvenient for the target audience
searchfor = ['pork', 'wine', 'beer', 'alcohol']
Recipe = Recipe[~Recipe.ingredients.str.contains('|'.join(searchfor))]


In [205]:
### change ingredients sepration from ; to , & seprate them into '' 
Recipe['ingredients'] = Recipe['ingredients'].str.replace(" ; ", "','")
Recipe.update("['" + Recipe["ingredients"].astype(str) + "']")

In [206]:

### Convert the ingredients column to a list
Recipe['ingredients'].values.tolist()
Recipe.sample(2)

Unnamed: 0.1,Unnamed: 0,name,url,category,rating,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g
34050,34050,Easy Sweet 'n Hot Greek Salsa with Pita,https://www.allrecipes.com/recipe/233809/easy-...,trusted-brands-recipes-and-tips,0.0,['1 (8.75 ounce) jar Dickinson's® Sweet 'n' Ho...,10,4,4 servings,428.3,69.1,11.5,11.4
29361,29361,Garbanzo Bean Burgers,https://www.allrecipes.com/recipe/181040/garba...,main-dish,4.12,"['1 (15 ounce) can garbanzo beans, rinsed and ...",90,4,4 servings,139.8,21.7,4.2,4.9


In [207]:
##Save the cleaned dataset

Recipe.to_csv('cleaned_data.csv', index = None)

In [208]:

### download required libraries

import nltk
import string
import ast
import re
import unidecode

# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import config


In [209]:
## use function by whatscooking-deployment. Reference (https://github.com/jackmleitch/whatscooking-deployment)
def ingredient_parser(ingreds):
    measures = [
        "teaspoon",
        "t",
        "tsp.",
        "tablespoon",
        "T",
        "tbl.",
        "tb",
        "tbsp.",
        "fluid ounce",
        "fl oz",
        "gill",
        "cup",
        "c",
        "pint",
        "p",
        "pt",
        "fl pt",
        "quart",
        "q",
        "qt",
        "fl qt",
        "gallon",
        "g",
        "gal",
        "ml",
        "milliliter",
        "millilitre",
        "cc",
        "mL",
        "l",
        "liter",
        "litre",
        "L",
        "dl",
        "deciliter",
        "decilitre",
        "dL",
        "bulb",
        "level",
        "heaped",
        "rounded",
        "whole",
        "pinch",
        "medium",
        "slice",
        "pound",
        "lb",
        "#",
        "ounce",
        "oz",
        "mg",
        "milligram",
        "milligramme",
        "g",
        "gram",
        "gramme",
        "kg",
        "kilogram",
        "kilogramme",
        "x",
        "of",
        "mm",
        "millimetre",
        "millimeter",
        "cm",
        "centimeter",
        "centimetre",
        "m",
        "meter",
        "metre",
        "inch",
        "in",
        "milli",
        "centi",
        "deci",
        "hecto",
        "kilo",
    ]
    words_to_remove = [
        "fresh",
        "minced",
        "chopped" "oil",
        "a",
        "red",
        "bunch",
        "and",
        "clove",
        "or",
        "leaf",
        "chilli",
        "large",
        "extra",
        "sprig",
        "ground",
        "handful",
        "free",
        "small",
        "pepper",
        "virgin",
        "range",
        "from",
        "dried",
        "sustainable",
        "black",
        "peeled",
        "higher",
        "welfare",
        "seed",
        "for",
        "finely",
        "freshly",
        "sea",
        "quality",
        "white",
        "ripe",
        "few",
        "piece",
        "source",
        "to",
        "organic",
        "flat",
        "smoked",
        "ginger",
        "sliced",
        "green",
        "picked",
        "the",
        "stick",
        "plain",
        "plus",
        "mixed",
        "mint",
        "bay",
        "basil",
        "your",
        "cumin",
        "optional",
        "fennel",
        "serve",
        "mustard",
        "unsalted",
        "baby",
        "paprika",
        "fat",
        "ask",
        "natural",
        "skin",
        "roughly",
        "into",
        "such",
        "cut",
        "good",
        "brown",
        "grated",
        "trimmed",
        "oregano",
        "powder",
        "yellow",
        "dusting",
        "knob",
        "frozen",
        "on",
        "deseeded",
        "low",
        "runny",
        "balsamic",
        "cooked",
        "streaky",
        "nutmeg",
        "sage",
        "rasher",
        "zest",
        "pin",
        "groundnut",
        "breadcrumb",
        "turmeric",
        "halved",
        "grating",
        "stalk",
        "light",
        "tinned",
        "dry",
        "soft",
        "rocket",
        "bone",
        "colour",
        "washed",
        "skinless",
        "leftover",
        "splash",
        "removed",
        "dijon",
        "thick",
        "big",
        "hot",
        "drained",
        "sized",
        "chestnut",
        "watercress",
        "fishmonger",
        "english",
        "dill",
        "caper",
        "raw",
        "worcestershire",
        "flake",
        "cider",
        "cayenne",
        "tbsp",
        "leg",
        "pine",
        "wild",
        "if",
        "fine",
        "herb",
        "almond",
        "shoulder",
        "cube",
        "dressing",
        "with",
        "chunk",
        "spice",
        "thumb",
        "garam",
        "new",
        "little",
        "punnet",
        "peppercorn",
        "shelled",
        "saffron",
        "other" "chopped",
        "salt",
        "olive",
        "taste",
        "can",
        "sauce",
        "water",
        "diced",
        "package",
        "italian",
        "shredded",
        "divided",
        "parsley",
        "vinegar",
        "all",
        "purpose",
        "crushed",
        "juice",
        "more",
        "coriander",
        "bell",
        "needed",
        "thinly",
        "boneless",
        "half",
        "thyme",
        "cubed",
        "cinnamon",
        "cilantro",
        "jar",
        "seasoning",
        "rosemary",
        "extract",
        "sweet",
        "baking",
        "beaten",
        "heavy",
        "seeded",
        "tin",
        "vanilla",
        "uncooked",
        "crumb",
        "style",
        "thin",
        "nut",
        "coarsely",
        "spring",
        "chili",
        "cornstarch",
        "strip",
        "cardamom",
        "rinsed",
        "honey",
        "cherry",
        "root",
        "quartered",
        "head",
        "softened",
        "container",
        "crumbled",
        "frying",
        "lean",
        "cooking",
        "roasted",
        "warm",
        "whipping",
        "thawed",
        "corn",
        "pitted",
        "sun",
        "kosher",
        "bite",
        "toasted",
        "lasagna",
        "split",
        "melted",
        "degree",
        "lengthwise",
        "romano",
        "packed",
        "pod",
        "anchovy",
        "rom",
        "prepared",
        "juiced",
        "fluid",
        "floret",
        "room",
        "active",
        "seasoned",
        "mix",
        "deveined",
        "lightly",
        "anise",
        "thai",
        "size",
        "unsweetened",
        "torn",
        "wedge",
        "sour",
        "basmati",
        "marinara",
        "dark",
        "temperature",
        "garnish",
        "bouillon",
        "loaf",
        "shell",
        "reggiano",
        "canola",
        "parmigiano",
        "round",
        "canned",
        "ghee",
        "crust",
        "long",
        "broken",
        "ketchup",
        "bulk",
        "cleaned",
        "condensed",
        "sherry",
        "provolone",
        "cold",
        "soda",
        "cottage",
        "spray",
        "tamarind",
        "pecorino",
        "shortening",
        "part",
        "bottle",
        "sodium",
        "cocoa",
        "grain",
        "french",
        "roast",
        "stem",
        "link",
        "firm",
        "asafoetida",
        "mild",
        "dash",
        "boiling",
        "oil",
        "chopped",
        "vegetable oil",
        "chopped oil",
        "garlic",
        "skin off",
        "bone out" "from sustrainable sources",
    ]
    # The ingredient list is now a string so we need to turn it back into a list. We use ast.literal_eval
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        ingredients = ast.literal_eval(ingreds)
    # We first get rid of all the punctuation. We make use of str.maketrans. It takes three input
    # arguments 'x', 'y', 'z'. 'x' and 'y' must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character
    #  in the string is mapped to None.
    translator = str.maketrans("", "", string.punctuation)
    lemmatizer = WordNetLemmatizer()
    ingred_list = []
    for i in ingredients:
        i.translate(translator)
        # We split up with hyphens as well as spaces
        items = re.split(" |-", i)
        # Get rid of words containing non alphabet letters
        items = [word for word in items if word.isalpha()]
        # Turn everything to lowercase
        items = [word.lower() for word in items]
        # remove accents
        items = [
            unidecode.unidecode(word) for word in items
        ]  #''.join((c for c in unicodedata.normalize('NFD', items) if unicodedata.category(c) != 'Mn'))
        # Lemmatize words so we can compare words to measuring words
        items = [lemmatizer.lemmatize(word) for word in items]
        # Gets rid of measuring words/phrases, e.g. heaped teaspoon
        items = [word for word in items if word not in measures]
        # Get rid of common easy words
        items = [word for word in items if word not in words_to_remove]
        if items:
            ingred_list.append(" ".join(items))
    # ingred_list = " ".join(ingred_list)
    return ingred_list

In [210]:
##Remove sympol in ingredients column
Recipe['ingredients'].str.replace("®", "")
Recipe['ingredients'].str.replace("  ", "")

2        ['4eggs, lightly beaten','1 ⅓ cups milk','2 ta...
5        ['4skinless, boneless chicken breast halves','...
6        ['1 (16 ounce) can refrigerated jumbo buttermi...
9        ['1 ½ cups self-rising flour, plus more for kn...
10       ['2 pounds baking potatoes, peeled and quarter...
                               ...                        
35504    ['2 cups panko bread crumbs','salt to taste','...
35509    ['2 cups canned chickpeas, rinsed and drained'...
35512    ['1 sheet nori (dried seaweed), cut into thin ...
35514    ['1 cup garbanzo beans','⅓ cup canned jalapeno...
35515    ['1zucchini, thinly sliced','1 tablespoon garl...
Name: ingredients, Length: 19325, dtype: object

In [211]:
##Apply the function to ingredients column
if __name__ == "__main__":
    Recipe = pd.read_csv(config.RECIPES_PATH)
    Recipe["ingredients_parsed"] = Recipe["ingredients"].apply(
        lambda x: ingredient_parser(x)
    )

SyntaxError: invalid syntax (<unknown>, line 1)