In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import ast
import operator

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
# pip install fuzzywuzzy
# pip install python-Levenshtein

## Load the Input Data

In [117]:
raw_recipes = pd.read_csv("../../data/recommendation/RAW_recipes.csv")

In [118]:
# Load labels from Food 101
all_labels = pd.read_csv("../../data/food-101/meta/labels.txt", header=None, names = ['label'])
all_labels = all_labels['label'].str.lower().tolist()
all_labels

['apple pie',
 'baby back ribs',
 'baklava',
 'beef carpaccio',
 'beef tartare',
 'beet salad',
 'beignets',
 'bibimbap',
 'bread pudding',
 'breakfast burrito',
 'bruschetta',
 'caesar salad',
 'cannoli',
 'caprese salad',
 'carrot cake',
 'ceviche',
 'cheesecake',
 'cheese plate',
 'chicken curry',
 'chicken quesadilla',
 'chicken wings',
 'chocolate cake',
 'chocolate mousse',
 'churros',
 'clam chowder',
 'club sandwich',
 'crab cakes',
 'creme brulee',
 'croque madame',
 'cup cakes',
 'deviled eggs',
 'donuts',
 'dumplings',
 'edamame',
 'eggs benedict',
 'escargots',
 'falafel',
 'filet mignon',
 'fish and chips',
 'foie gras',
 'french fries',
 'french onion soup',
 'french toast',
 'fried calamari',
 'fried rice',
 'frozen yogurt',
 'garlic bread',
 'gnocchi',
 'greek salad',
 'grilled cheese sandwich',
 'grilled salmon',
 'guacamole',
 'gyoza',
 'hamburger',
 'hot and sour soup',
 'hot dog',
 'huevos rancheros',
 'hummus',
 'ice cream',
 'lasagna',
 'lobster bisque',
 'lobster

In [119]:
print(raw_recipes.shape)
raw_recipes.head()

(231637, 12)


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [125]:
raw_recipes['name'] = raw_recipes['name'].astype(str)
recipes = raw_recipes[['name','id','ingredients']]

# Convert ingredients into list format from string format
recipes["ingredients"] = recipes["ingredients"].apply(lambda x: ast.literal_eval(x))
recipes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipes["ingredients"] = recipes["ingredients"].apply(lambda x: ast.literal_eval(x))


Unnamed: 0,name,id,ingredients
0,arriba baked winter squash mexican style,137739,"[winter squash, mexican seasoning, mixed spice..."
1,a bit different breakfast pizza,31490,"[prepared pizza crust, sausage patty, eggs, mi..."
2,all in the kitchen chili,112140,"[ground beef, yellow onions, diced tomatoes, t..."
3,alouette potatoes,59389,"[spreadable cheese with garlic and herbs, new ..."
4,amish tomato ketchup for canning,44061,"[tomato juice, apple cider vinegar, sugar, sal..."


In [127]:
recipe_list = raw_recipes['name'].unique().tolist()
recipe_list[:5]

['arriba   baked winter squash mexican style',
 'a bit different  breakfast pizza',
 'all in the kitchen  chili',
 'alouette  potatoes',
 'amish  tomato ketchup  for canning']

## Extract Recipe -> Food-101 Label Mapping

Get the closest food-101 label for each recipe id in data

In [128]:
# Extract recipe name and id for each label in Food-101 dataset
MATCH_COUNT = 1
threshold = 80
all_recipe_matches = {}
for recipe in recipe_list:
    recipe_match = process.extract(recipe, all_labels, limit = MATCH_COUNT)
    name = recipe_match[0][0]
    score = recipe_match[0][1]
    if score >= threshold:
        all_recipe_matches[recipe] = name

In [129]:
matched_recipes = pd.DataFrame(list(all_recipe_matches.items()), columns=['name','label'])
matched_recipes.head()

Unnamed: 0,name,label
0,a bit different breakfast pizza,pizza
1,apple a day milk shake,apple pie
2,backyard style barbecued ribs,baby back ribs
3,bananas 4 ice cream pie,ice cream
4,beat this banana bread,bread pudding


In [131]:
matched_recipes = recipes.merge(matched_recipes, how='left', on = 'name')
matched_recipes.head()

Unnamed: 0,name,id,ingredients,label
0,arriba baked winter squash mexican style,137739,"[winter squash, mexican seasoning, mixed spice...",
1,a bit different breakfast pizza,31490,"[prepared pizza crust, sausage patty, eggs, mi...",pizza
2,all in the kitchen chili,112140,"[ground beef, yellow onions, diced tomatoes, t...",
3,alouette potatoes,59389,"[spreadable cheese with garlic and herbs, new ...",
4,amish tomato ketchup for canning,44061,"[tomato juice, apple cider vinegar, sugar, sal...",


In [135]:
matched_recipes.to_csv("../../data/recommendation/recipe_label_mapping.csv", index = False)

## Extract Food-101 Label -> Ingredients Mapping

Extract 1 recipe_id per label. Total 101 rows

In [55]:
# Extract recipe name and id for each label in Food-101 dataset
MATCH_COUNT = 1
all_ingr_matches = {}
for label in all_labels:
    ingr_match = process.extract(label, recipe_list, limit = MATCH_COUNT)
    ingr_match = ingr_match[0][0]
    all_ingr_matches[label] =ingr_match
    

In [56]:
matched_ingredients = pd.DataFrame(list(all_ingr_matches.items()), columns=['label','name'])
matched_ingredients.head()

Unnamed: 0,label,name
0,apple pie,apple pie
1,baby back ribs,baby back ribs
2,baklava,baklava
3,beef carpaccio,beef carpaccio
4,beef tartare,love is in the air beef fondue sauces


In [57]:
matched_ingredients = matched_ingredients.merge(recipes, how='inner', on = 'name')
matched_ingredients.head()

Unnamed: 0,label,name,id,ingredients
0,apple pie,apple pie,124853,"[apple juice, raw honey, whole cloves, evercle..."
1,apple pie,apple pie,65988,"[all-purpose flour, salt, shortening, cold wat..."
2,baby back ribs,baby back ribs,502407,"[baby back ribs, orange juice, margarita mix, ..."
3,baklava,baklava,48804,"[phyllo dough, nuts, butter, ground cinnamon, ..."
4,beef carpaccio,beef carpaccio,189130,"[beef tenderloin, arugula, vinaigrette, kosher..."


In [59]:
# Drop if duplicates
matched_ingredients = matched_ingredients.drop_duplicates(subset= ['label','name'])
matched_ingredients.shape

(101, 4)

In [60]:
matched_recipes.to_csv("../../data/recommendation/label_ingredients_mapping.csv", index = False)