# Hanadi's Capstone Project (Personalized Recipe Recommendation)

In [524]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict
import re

import nltk
import string
import ast
import re
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import config



## Background

Food is our source of energy, and eating food with high nutritional value is important for overall health. Being aware of the amount of nutrients in our food helps us keep track of whether we are meeting our daily energy and nutrient needs, and thus helps us maintain our health. The time available to cook varies from day to day, but that shouldn’t prevent us from having nutritious meals. For that reason, I wanted to create a recommendation system that suggests a recipe based on the user’s preferred main ingredient, time to cook, and nutritional information.


## Data

The original dataset contains 35,516 instances and 47 features. It was scraped from the recipe website Allrecipes and was obtained from this GitHub repo:
https://github.com/shaansubbaiah/allrecipes-scraper/tree/main/export

For this project I will be using 13 features which are:
| Feature         | Description |
|  :---:          |   :---:     |
|  name           | recipe name |
| category        | Recipe category, for example (main dish, desserts, bread, etc.) |
| rating          | Recipe rating |
| rating count    | Number of ratings |
| url             | link to this recipe |
| ingredients     | ingredients used in this recipe |
| total           | total recipe time, preparation and cooking time |
| servings        | number of servings in this recipe |
| yield           | what this recipe yields, for example (3 cups, 12 cupcakes, etc.) |
| calories        | amount of calories per serving |
| carbohydrates_g | grams of carbohydrate per serving |
| fat_g           | grams of fat per serving |
| protein_g       | grams of protein per serving |


## Data preprocessing

In [525]:
## Read the raw scraped recipe data; the first CSV column is used as the row index
Recipe = pd.read_csv("../data/scraped-07-05-21.csv", index_col=0)

In [526]:
## Shape (rows, columns) of the original dataset
Recipe.shape


(35516, 47)

In [527]:
### Inspect the dataset: column names, non-null counts and dtypes
Recipe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35516 entries, 0 to 35515
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   35516 non-null  object 
 1   url                    35516 non-null  object 
 2   category               35516 non-null  object 
 3   author                 35475 non-null  object 
 4   summary                35516 non-null  object 
 5   rating                 35516 non-null  float64
 6   rating_count           35516 non-null  int64  
 7   review_count           35516 non-null  int64  
 8   ingredients            35516 non-null  object 
 9   directions             35516 non-null  object 
 10  prep                   33390 non-null  object 
 11  cook                   27857 non-null  object 
 12  total                  33564 non-null  object 
 13  servings               35516 non-null  int64  
 14  yield                  35515 non-null  object 
 15  ca

In [528]:
## Count the rows that are exact duplicates of an earlier row
Recipe.duplicated().sum()

0

In [529]:
## Drop the features this project does not use: author, summary, review_count,
## directions, prep/cook times, sodium_mg, calories_from_fat and the detailed
## nutrient breakdown (sugars, fibre, cholesterol, fat sub-types, micronutrients).
Recipe = Recipe.drop(
    columns=[
        'author',
        'summary',
        'review_count',
        'directions',
        'sugars_g',
        'saturated_fat_g',
        'cholesterol_mg',
        'dietary_fiber_g',
        'sodium_mg',
        'calories_from_fat',
        'prep',
        'cook',
        'calcium_mg',
        'iron_mg',
        'magnesium_mg',
        'potassium_mg',
        'vitamin_a_iu_IU',
        'niacin_equivalents_mg',
        'vitamin_c_mg',
        'folate_mcg',
        'thiamin_mg',
        'zinc_mg',
        'phosphorus_mg',
        'vitamin_b6_mg',
        'riboflavin_mg',
        'vitamin_e_iu_IU',
        'vitamin_k_mcg',
        'biotin_mcg',
        'vitamin_b12_mcg',
        'mono_fat_g',
        'poly_fat_g',
        'trans_fatty_acid_g',
        'omega_3_fatty_acid_g',
        'omega_6_fatty_acid_g',
    ]
)


In [530]:
# Check the shape after deleting the unnecessary features
Recipe.shape


(35516, 13)

In [531]:
## Count the missing values in each remaining column
Recipe.isna().sum()
# `total` (total cooking time) has the most missing values: 1952, roughly 5% of the rows.
# Every other column that has missing values has fewer than 500 of them.

name                  0
url                   0
category              0
rating                0
rating_count          0
ingredients           0
total              1952
servings              0
yield                 1
calories            114
carbohydrates_g     148
fat_g               418
protein_g           261
dtype: int64

In [532]:
### Handle missing data: drop every row that has at least one missing value.
### The column with the most missing values (`total`) accounts for only about 5% of the data.

Recipe = Recipe.dropna()
Recipe.isna().sum()


name               0
url                0
category           0
rating             0
rating_count       0
ingredients        0
total              0
servings           0
yield              0
calories           0
carbohydrates_g    0
fat_g              0
protein_g          0
dtype: int64

### Nutrition values columns

In [533]:

## Summary statistics of the numeric columns, to check the maximum nutrition values
Recipe.describe()


Unnamed: 0,rating,rating_count,servings,calories,carbohydrates_g,fat_g,protein_g
count,33102.0,33102.0,33102.0,33102.0,33102.0,33102.0,33102.0
mean,4.002661,136.172135,10.090357,318.526714,28.546888,16.734657,14.192982
std,1.372287,521.55765,11.51707,212.19222,24.092086,14.545362,14.063005
min,0.0,0.0,1.0,0.1,0.0,0.0,0.0
25%,4.11,4.0,4.0,167.625,10.8,6.7,3.7
50%,4.48,21.0,8.0,280.0,24.0,13.3,8.8
75%,4.67,88.0,12.0,421.9,40.4,22.9,22.0
max,5.0,19358.0,832.0,4709.2,578.2,383.9,273.2


In [534]:
## The maxima are implausibly large (servings=832, calories=4709.2, carbs=578.2,
## fat=383.9, protein=273.2), so each column gets an upper limit:
##   - calories: 1000 kcal, of which 50% carbs = 125 g, 25% fat = 28 g, 25% protein = 63 g
##   - servings: the 75th percentile = 12
## A row is kept only if it satisfies every limit.

Recipe = Recipe[
    (Recipe['calories'] <= 1000)
    & (Recipe['fat_g'] <= 28)
    & (Recipe['protein_g'] <= 63)
    & (Recipe['carbohydrates_g'] <= 125)
    & (Recipe['servings'] <= 12)
]

In [535]:
## Check the summary statistics again after applying the upper limits
Recipe.describe()


Unnamed: 0,rating,rating_count,servings,calories,carbohydrates_g,fat_g,protein_g
count,21680.0,21680.0,21680.0,21680.0,21680.0,21680.0,21680.0
mean,4.001043,133.255397,6.792435,277.063584,28.013247,12.60494,13.390392
std,1.359057,530.601374,3.106487,134.686314,20.640242,7.478032,11.464151
min,0.0,0.0,1.0,3.2,0.0,0.0,0.0
25%,4.1,4.0,4.0,176.175,11.4,6.5,4.3
50%,4.47,21.0,6.0,266.75,24.6,12.0,9.3
75%,4.67,86.0,8.0,364.6,40.3,18.5,20.8
max,5.0,19358.0,12.0,904.0,123.4,28.0,62.8


### Total (cooking time) column

In [536]:
### Remove recipes whose total time mentions weeks.
searchfor = ['week']
Recipe = Recipe.loc[~Recipe['total'].str.contains('|'.join(searchfor))]

In [537]:
## Convert total from string to int
def humantime2minutes(s):
    """Convert a human-readable duration such as ``"1 hr 30 mins"`` to minutes.

    The text is lower-cased and stripped of whitespace, then read as a run of
    ``<number><unit>`` pairs whose values are summed.

    Parameters
    ----------
    s : str
        Duration text, e.g. ``"45 mins"``, ``"2 hrs"``, ``"1 day 4 hrs"``.

    Returns
    -------
    int
        Total duration in minutes. Units that are not in the table below
        (e.g. ``"min"``, ``"mins"``, or no unit at all) count as minutes.
        Returns 0 when the text does not start with a number.
    """
    # Minutes per recognised unit; anything else falls back to 1 (minutes).
    unit_minutes = {
        'w':      7*24*60,
        'week':   7*24*60,
        'weeks':  7*24*60,
        'd':      24*60,
        'day':    24*60,
        'days':   24*60,
        'hr':     60,
        'hrs':    60,
        'hour':   60,
        'hours':  60,
    }

    # Normalise once. The previous version matched against the space-stripped
    # text but consumed the prefix from the un-stripped text, so input with
    # leading whitespace (" 1 hr") never got shorter and recursed forever.
    compact = re.sub(r'\s+', '', s.lower())

    # Text that does not begin with a number is not treated as a duration.
    if not re.match(r'\d', compact):
        return 0

    total = 0
    # Each number is followed by everything up to the next digit as its unit.
    for amount, unit in re.findall(r'(\d+)([^\d]*)', compact):
        total += int(amount) * unit_minutes.get(unit, 1)
    return total
        

In [538]:
# Convert every `total` entry from duration text to a number of minutes
Recipe['total'] = Recipe['total'].map(humantime2minutes)

In [539]:

### Take a look at the summary statistics after the conversion.
Recipe.describe()


Unnamed: 0,rating,rating_count,total,servings,calories,carbohydrates_g,fat_g,protein_g
count,21668.0,21668.0,21668.0,21668.0,21668.0,21668.0,21668.0,21668.0
mean,4.000759,133.297582,97.300166,6.790844,277.128124,28.017367,12.608639,13.39466
std,1.35936,530.742108,231.32545,3.106085,134.667401,20.639325,7.476456,11.465576
min,0.0,0.0,1.0,1.0,3.2,0.0,0.0,0.0
25%,4.1,4.0,25.0,4.0,176.2,11.4,6.5,4.3
50%,4.47,21.0,45.0,6.0,266.8,24.6,12.0,9.3
75%,4.67,86.0,80.0,8.0,364.6,40.3,18.5,20.9
max,5.0,19358.0,8640.0,12.0,904.0,123.4,28.0,62.8


### Ingredients column

In [540]:
### First, drop recipes whose ingredients text contains any of (pork, wine, beer, alcohol),
### as these are not suitable for the target audience.
### The terms are joined into one pattern, so they match anywhere inside the text.
searchfor = ['pork', 'wine', 'beer', 'alcohol']
Recipe = Recipe.loc[~Recipe['ingredients'].str.contains('|'.join(searchfor))]


In [541]:
### Convert the ingredients column from one string per recipe into a list of strings:
### strip any leading/trailing parentheses, then split on commas.
# NOTE(review): the displayed samples look like individual ingredients are separated by ';'
# while ',' appears inside a single ingredient ("½ yellow onion, s...") — confirm that
# splitting on ',' is the intended behaviour.
Recipe['ingredients'] = Recipe['ingredients'].str.strip('()').str.split(',')
Recipe.sample(2)


Unnamed: 0,name,url,category,rating,rating_count,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g
16759,Thai Beef Curry,https://www.allrecipes.com/recipe/274073/thai-...,world-cuisine,5.0,2,"[1 tablespoon peanut oil ; ½ yellow onion, s...",35,6,6 servings,377.1,10.9,27.5,24.1
34231,Chocolate Date Loaf I,https://www.allrecipes.com/recipe/6896/chocola...,bread,3.33,3,"[1 cup dates, pitted and chopped ; ¾ cup boil...",85,12,1 -9x5 inch loaf,256.4,38.9,10.8,3.9


In [542]:
## Save the cleaned data to a new CSV file (the row index is not written)
Recipe.to_csv('cleaned_data.csv', index = False)

In [543]:
## use function by whatscooking-deployment. Reference (https://github.com/jackmleitch/whatscooking-deployment)
# Measurement units and quantity words to strip from ingredient text.
# Tokens are compared one word at a time, lower-cased and with punctuation
# removed, so the multi-word, upper-case and dotted entries kept from the
# original list can never match; the dot-less "tsp", "tbl" and "tbsp" are
# included so those abbreviations are still filtered once punctuation is gone.
_MEASURES = frozenset({
    "teaspoon", "t", "tsp.", "tsp", "tablespoon", "T", "tbl.", "tbl", "tb",
    "tbsp.", "tbsp", "fluid ounce", "fl oz", "gill", "cup", "c", "pint", "p",
    "pt", "fl pt", "quart", "q", "qt", "fl qt", "gallon", "g", "gal",
    "ml", "milliliter", "millilitre", "cc", "mL", "l", "liter", "litre", "L",
    "dl", "deciliter", "decilitre", "dL",
    "bulb", "level", "heaped", "rounded", "whole", "pinch", "medium", "slice",
    "pound", "lb", "#", "ounce", "oz",
    "mg", "milligram", "milligramme", "gram", "gramme",
    "kg", "kilogram", "kilogramme",
    "x", "of",
    "mm", "millimetre", "millimeter", "cm", "centimeter", "centimetre",
    "m", "meter", "metre", "inch", "in",
    "milli", "centi", "deci", "hecto", "kilo",
})

# Common descriptive / filler words to strip from ingredient text.
# The original list was missing commas in three places, which silently fused
# neighbouring entries ("chopped" "oil", "other" "chopped",
# "bone out" "from sustrainable sources"); they are separate entries here.
_WORDS_TO_REMOVE = frozenset({
    "fresh", "minced", "chopped", "oil", "a", "red", "bunch", "and",
    "clove", "or", "leaf", "chilli", "large", "extra", "sprig", "ground",
    "handful", "free", "small", "pepper", "virgin", "range", "from", "dried",
    "sustainable", "black", "peeled", "higher", "welfare", "seed", "for",
    "finely", "freshly", "sea", "quality", "white", "ripe", "few", "piece",
    "source", "to", "organic", "flat", "smoked", "ginger", "sliced", "green",
    "picked", "the", "stick", "plain", "plus", "mixed", "mint", "bay",
    "basil", "your", "cumin", "optional", "fennel", "serve", "mustard",
    "unsalted", "baby", "paprika", "fat", "ask", "natural", "skin",
    "roughly", "into", "such", "cut", "good", "brown", "grated", "trimmed",
    "oregano", "powder", "yellow", "dusting", "knob", "frozen", "on",
    "deseeded", "low", "runny", "balsamic", "cooked", "streaky", "nutmeg",
    "sage", "rasher", "zest", "pin", "groundnut", "breadcrumb", "turmeric",
    "halved", "grating", "stalk", "light", "tinned", "dry", "soft", "rocket",
    "bone", "colour", "washed", "skinless", "leftover", "splash", "removed",
    "dijon", "thick", "big", "hot", "drained", "sized", "chestnut",
    "watercress", "fishmonger", "english", "dill", "caper", "raw",
    "worcestershire", "flake", "cider", "cayenne", "tbsp", "leg", "pine",
    "wild", "if", "fine", "herb", "almond", "shoulder", "cube", "dressing",
    "with", "chunk", "spice", "thumb", "garam", "new", "little", "punnet",
    "peppercorn", "shelled", "saffron", "other", "salt", "olive", "taste",
    "can", "sauce", "water", "diced", "package", "italian", "shredded",
    "divided", "parsley", "vinegar", "all", "purpose", "crushed", "juice",
    "more", "coriander", "bell", "needed", "thinly", "boneless", "half",
    "thyme", "cubed", "cinnamon", "cilantro", "jar", "seasoning", "rosemary",
    "extract", "sweet", "baking", "beaten", "heavy", "seeded", "tin",
    "vanilla", "uncooked", "crumb", "style", "thin", "nut", "coarsely",
    "spring", "chili", "cornstarch", "strip", "cardamom", "rinsed", "honey",
    "cherry", "root", "quartered", "head", "softened", "container",
    "crumbled", "frying", "lean", "cooking", "roasted", "warm", "whipping",
    "thawed", "corn", "pitted", "sun", "kosher", "bite", "toasted",
    "lasagna", "split", "melted", "degree", "lengthwise", "romano", "packed",
    "pod", "anchovy", "rom", "prepared", "juiced", "fluid", "floret", "room",
    "active", "seasoned", "mix", "deveined", "lightly", "anise", "thai",
    "size", "unsweetened", "torn", "wedge", "sour", "basmati", "marinara",
    "dark", "temperature", "garnish", "bouillon", "loaf", "shell",
    "reggiano", "canola", "parmigiano", "round", "canned", "ghee", "crust",
    "long", "broken", "ketchup", "bulk", "cleaned", "condensed", "sherry",
    "provolone", "cold", "soda", "cottage", "spray", "tamarind", "pecorino",
    "shortening", "part", "bottle", "sodium", "cocoa", "grain", "french",
    "roast", "stem", "link", "firm", "asafoetida", "mild", "dash", "boiling",
    "vegetable oil", "chopped oil", "garlic", "skin off", "bone out",
    "from sustrainable sources",
})

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def ingredient_parser(ingreds):
    """Reduce raw ingredient strings to their essential ingredient words.

    Each ingredient string is split into words on spaces and hyphens. Every
    word then has its punctuation removed, is discarded unless purely
    alphabetic, and is lower-cased, de-accented and lemmatized. Measurement
    words and common descriptive words are dropped last.

    Parameters
    ----------
    ingreds : list of str or str
        Either a list of ingredient strings, or the string representation of
        such a list (as read back from a CSV), which is parsed with
        ``ast.literal_eval``.

    Returns
    -------
    list of str
        One space-joined string of remaining words per ingredient, in input
        order; ingredients with no remaining words are omitted.

    Raises
    ------
    ValueError, SyntaxError
        If ``ingreds`` is a string that is not a valid Python literal.
    """
    # A list column written to CSV is read back as a string; literal_eval
    # turns it back into a list and only accepts plain Python literals.
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        ingredients = ast.literal_eval(ingreds)

    lemmatizer = WordNetLemmatizer()
    ingred_list = []
    for ingredient in ingredients:
        words = []
        # Split on hyphens as well as spaces BEFORE removing punctuation:
        # "-" is itself punctuation, so stripping first would glue
        # "all-purpose" into "allpurpose".
        for token in re.split(" |-", ingredient):
            # str.translate returns a new string. The original discarded it,
            # so words next to punctuation ("flour,") were silently dropped.
            word = token.translate(_PUNCTUATION_TABLE)
            # Drop empty tokens and anything with digits or symbols.
            if not word.isalpha():
                continue
            # Lower-case, remove accents, then lemmatize so plurals compare
            # equal to the singular entries in the word sets.
            word = lemmatizer.lemmatize(unidecode.unidecode(word.lower()))
            if word in _MEASURES or word in _WORDS_TO_REMOVE:
                continue
            words.append(word)
        if words:
            ingred_list.append(" ".join(words))
    return ingred_list

In [544]:
## Load the recipes from the configured path and add a parsed-ingredients column
# NOTE(review): config.RECIPES_PATH is defined outside this notebook — presumably it
# points at the cleaned CSV saved earlier; confirm.
if __name__ == "__main__":
    Recipe = pd.read_csv(config.RECIPES_PATH)
    Recipe["ingredients_parsed"] = Recipe["ingredients"].apply(ingredient_parser)

In [545]:
### Take a look at the first three rows after parsing the ingredients
Recipe.head(3)


Unnamed: 0,name,url,category,rating,rating_count,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g,ingredients_parsed
0,Dessert Crepes,https://www.allrecipes.com/recipe/19037/desser...,breakfast-and-brunch,4.8,1156,"['4 eggs', ' lightly beaten ; 1\u2009⅓ cups m...",20,8,8 crepes,163.8,17.2,7.7,6.4,"[egg, milk butter, flour sugar]"
1,Chicken Parmesan,https://www.allrecipes.com/recipe/223042/chick...,world-cuisine,4.83,4245,"['4 skinless', ' boneless chicken breast halv...",60,4,4 servings,470.8,24.8,24.9,42.1,"[chicken breast egg panko bread, parmesan chee..."
2,Easy Sausage Gravy and Biscuits,https://www.allrecipes.com/recipe/216391/easy-...,trusted-brands-recipes-and-tips,4.81,1063,['1 (16 ounce) can refrigerated jumbo buttermi...,15,8,8 servings,332.8,30.8,18.7,9.8,[refrigerated jumbo buttermilk biscuit jimmy o...


In [546]:
### Save the fully cleaned data to CSV (the row index is not written)
Recipe.to_csv('full_clean_data.csv', index = False)

## EDA

In [None]:
##Summary statistics

In [None]:
##Correlation matrix


In [None]:
##Calories/protein/carb/fat distribution

In [None]:
##Rating vs recipe category

In [None]:
##Calories/protein/carb/fat vs recipe category

In [None]:
##Rating counts vs recipe category