In [1]:
import pandas as pd
import nltk
import re
import string
from collections import Counter
import json

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
from parse_url import *
from mapping_urls_make import *

# train.json contains thousands of recipes with their cuisine category and ingredients
## we will do cuisine transformations with cuisines that have more than 2000 recipes in train.json; this includes italian, mexican, southern_us, indian, chinese, and french

In [4]:
with open('train.json') as f:
    train = pd.read_json(f)

In [5]:
train['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

# use get_cuisine_ingredients to extract the ingredients of our desired cuisine from train.json

In [6]:
# takes the name of the recipes json file and the desired cuisine as a string; returns a list of ingredients for that cuisine
def get_cuisine_ingredients(all_recipes, cuisine_name):
    cuisine_df = pd.read_json(all_recipes)
    my_df = cuisine_df[cuisine_df['cuisine'] == cuisine_name]
    
    cuisine = []
    #print(cuisine_df['ingredients'].tolist())
    for i in my_df['ingredients'].tolist():
        for j in i:
            #print(j)
            cuisine.append(j)
            
    return cuisine

In [7]:
cuisine_ingredients = get_cuisine_ingredients('train.json', 'italian')

In [8]:
cuisine_ingredients

['sugar',
 'pistachio nuts',
 'white almond bark',
 'flour',
 'vanilla extract',
 'olive oil',
 'almond extract',
 'eggs',
 'baking powder',
 'dried cranberries',
 'chopped tomatoes',
 'fresh basil',
 'garlic',
 'extra-virgin olive oil',
 'kosher salt',
 'flat leaf parsley',
 'pimentos',
 'sweet pepper',
 'dried oregano',
 'olive oil',
 'garlic',
 'sharp cheddar cheese',
 'pepper',
 'swiss cheese',
 'provolone cheese',
 'canola oil',
 'mushrooms',
 'black olives',
 'sausages',
 'Italian parsley leaves',
 'walnuts',
 'hot red pepper flakes',
 'extra-virgin olive oil',
 'fresh lemon juice',
 'trout fillet',
 'garlic cloves',
 'chipotle chile',
 'fine sea salt',
 'flat leaf parsley',
 'fresh parmesan cheese',
 'butter',
 'all-purpose flour',
 'fat free less sodium chicken broth',
 'chopped fresh chives',
 'gruyere cheese',
 'ground black pepper',
 'bacon slices',
 'gnocchi',
 'fat free milk',
 'cooking spray',
 'salt',
 'italian seasoning',
 'broiler-fryer chicken',
 'mayonaise',
 'zesty 

# create ingredients_kb, which is a kb of ingredients found in all cuisines sorted by food group (e.g. protein, carb, dairy, etc.)

In [9]:
with open('cuisine_kb.json') as f:
    ingredients_kb = json.loads(f.read())

In [10]:
ingredients_kb

{'carb': ['bagels',
  'baguettes',
  'barley',
  'biscuits',
  'bran',
  'bread',
  'buns',
  'cereal',
  'cornbread',
  'couscous',
  'crackers',
  'croutons',
  'crusts',
  'dough',
  'granola',
  'hominy',
  'kasha',
  'masa',
  'matzo',
  'millet',
  'muffins',
  'oats',
  'pitas',
  'popcorn',
  'pretzels',
  'quinoa',
  'rice',
  'rolls',
  'shortbread',
  'sourdough',
  'stuffing',
  'tapioca',
  'toast',
  'tortillas',
  'wheat',
  'kaiser',
  'cornmeal',
  'breadcrumbs',
  'bread crumbs',
  'graham',
  'bulgur',
  'farina',
  'oatmeal',
  'croissants',
  'polenta',
  'grits',
  'pumpernickel',
  'sago',
  'seitan',
  'grains',
  'taters',
  'risotto',
  'shells',
  'amarettini',
  'mochi',
  'cornflakes',
  'pilaf',
  'puppies',
  'farfalle',
  'fettuccine',
  'lasagnas',
  'linguine',
  'mac',
  'macaroni',
  'manicotti',
  'noodles',
  'pasta',
  'farfel',
  'vermicelli',
  'tagliatelle',
  'cannelloni',
  'penne',
  'spaghetti',
  'fettucine',
  'waffle',
  'pancake'],
 'pr

# for each cuisine that we want to transform to, we make a specific kb for that cuisine

## we do this by comparing the list of ingredients for a specific cuisine to our ingredients kb and using a counter to track the frequency of each ingredient

## this returns a dictionary with food groups as keys and ingredients + frequencies as values, i.e. {"protein" : [["chicken", 1000], ["pork", 900]], "carb": ...}

In [11]:
# takes a string, returns a cleaned list of strings
def clean_text(s):
    # import stop words
    stop_words = set(stopwords.words('english'))
    
    s = re.sub("[^a-zA-Z ]", ' ', s) # remove punctuation + numbers
    s = s.lower() # lowercase 
    s = word_tokenize(s) # tokenize
    
    # filter out stop words
    words = [w for w in s if not w in stop_words]
    
    return words

In [12]:
# takes a list of strings, returns a nested list of all possible ngram variations for each string in original list
# this list is ordered from longest to shortest ngram
def get_ngrams(i_list):
    all_grams = []
    
    for i in i_list:
        l = clean_text(i)
        grams = list(nltk.everygrams(l, max_len=len(l)))
        grams.sort(key=len, reverse=True)        
        grams = [' '.join(g) for g in grams]
        all_grams.append(grams)
    
    return all_grams

In [13]:
# takes a list of cuisine ingredients and the ingredients kb, and returns a dictionary for that specific cuisine formatted like the ingredients kb
def get_cuisine_kb(cuisine_ingredients, ingredients_kb):
    # create PorterStemmer object for stemming words
    ps = PorterStemmer()
    
    cuisine_kb = {}
    
    # get all possible ngrams of the cuisine ingredients
    cuisine_grams = get_ngrams(cuisine_ingredients)

    # initialize a counter for each food category in the ingredients kb
    # if an ngram in the list of cuisine ingredients appears in a food category in the ingredients kb, increment the counter for that ingredient
    # because the list of ngrams is ordered from longest to shortest, we should only get the longest ngram that appears
    for category in ingredients_kb:
        counter = Counter()
        l = ingredients_kb[category]
        
        for i in cuisine_grams:
            for j in i:                
                if j in l:
                    counter[j] += 1
                    break
                
        if counter:
            cuisine_kb[category] = counter.most_common()
            
    return cuisine_kb

In [14]:
cuisine_kb = get_cuisine_kb(cuisine_ingredients, ingredients_kb)

## here is the generated italian kb

In [15]:
cuisine_kb

{'carb': [('pasta', 827),
  ('bread', 549),
  ('rice', 358),
  ('spaghetti', 337),
  ('noodles', 331),
  ('bread crumbs', 300),
  ('penne', 264),
  ('linguine', 196),
  ('wheat', 174),
  ('polenta', 139),
  ('fettucine', 136),
  ('dough', 89),
  ('cornmeal', 78),
  ('rolls', 61),
  ('fettuccine', 52),
  ('breadcrumbs', 51),
  ('sourdough', 40),
  ('macaroni', 34),
  ('manicotti', 28),
  ('risotto', 20),
  ('croutons', 20),
  ('shells', 19),
  ('tagliatelle', 19),
  ('barley', 17),
  ('vermicelli', 16),
  ('buns', 15),
  ('crusts', 14),
  ('tortillas', 13),
  ('quinoa', 10),
  ('biscuits', 10),
  ('oats', 8),
  ('crackers', 7),
  ('couscous', 6),
  ('toast', 6),
  ('pitas', 5),
  ('graham', 5),
  ('stuffing', 5),
  ('muffins', 4),
  ('cereal', 4),
  ('cornbread', 3),
  ('grits', 3),
  ('cannelloni', 2),
  ('tapioca', 2),
  ('bagels', 2),
  ('bulgur', 2),
  ('shortbread', 2),
  ('cornflakes', 2),
  ('popcorn', 1),
  ('pancake', 1),
  ('hominy', 1),
  ('matzo', 1)],
 'protein': [('chicken

## here is a wrapper function that does all of the steps above and writes the cuisine kb to a json

In [16]:
# takes our created cuisine ingredients KB and list of all recipes, writes dict of cuisine ingredients + frequencies divided by food group to json
def get_cuisine_ingredients_dict(cuisine_json, all_recipes, cuisine_name):        
    # import cuisine ingredients KB (dict of food categories)
    with open(cuisine_json) as json_file:
        ingredients_kb = json.load(json_file)
        
    # import all_recipes json, get ingredients from only cuisine recipes
    cuisine_ing = get_cuisine_ingredients(all_recipes, cuisine_name)
    
    # calculate dict of cuisine ingredients + frequencies (keys=food categories)
    cuisine_kb = get_cuisine_kb(cuisine_ing, ingredients_kb)
    
    # write dict to json file
    name = 'cuisine_kbs/' + cuisine_name + '_kb.json'
    with open(name, 'w') as f:
        json.dump(cuisine_kb, f)
        
    return cuisine_kb, cuisine_ing

# now that we have our specific cuisine kb, we can begin the process of transforming a recipe to that cuisine

# first, get lists of ingredients and directions for the recipe we want to test

## use main main_parse from parse_url 

In [30]:
og_ingredients, og_directions = main_parse('https://www.allrecipes.com/recipe/259870/briam-greek-baked-zucchini-and-potatoes/',check='single')

In [31]:
og_ingredients

[{'quantity': '2',
  'name': 'potatoes, peeled and thinly sliced',
  'measure': 'pounds'},
 {'quantity': '4', 'name': 'zucchini, thinly sliced', 'measure': 'large'},
 {'quantity': '4', 'name': 'red onions, thinly sliced', 'measure': 'small'},
 {'quantity': '6', 'name': 'ripe tomatoes, pureed', 'measure': ''},
 {'quantity': '1/2', 'name': 'olive oil', 'measure': 'cup'},
 {'quantity': '2',
  'name': 'chopped fresh parsley (optional)',
  'measure': 'tablespoons'},
 {'quantity': 'n/a',
  'name': 'sea salt and freshly ground black pepper to taste',
  'measure': 'taste'}]

In [32]:
og_directions

['Preheat oven to 400 degrees F (200 degrees C).\n                            ',
 'Spread potatoes, zucchini, and red onions in a 9x13-inch baking dish, or preferably a larger one. Use 2 baking dishes if necessary. Cover with pureed tomatoes, olive oil, parsley. Season with salt and freshly ground pepper. Toss all ingredients together so that the vegetables are evenly coated.\n                            ',
 'Bake in the preheated oven, stirring after 1 hour, until vegetables are tender and moisture has evaporated, about 90 minutes. Cool slightly before serving, or serve at room temperature.\n                            ']

## og_ingredients is a dictionary with keys quantity, ingredient, and measure, so extract only the ingredients and return as list test_ingredients

In [20]:
# takes dictionary og_ingredients, returns list of ingredients
def get_test_ingredients(og_ingredients):
    test_ingredients = [x['name'] for x in og_ingredients]
    
    return test_ingredients

In [21]:
test_ingredients = get_test_ingredients(og_ingredients)

In [22]:
test_ingredients

['olive oil',
 'medium onion, chopped',
 'bay leaves',
 'ground cumin',
 'dried oregano',
 'salt',
 'stalks celery, chopped',
 'green bell peppers, chopped',
 'jalapeno peppers, chopped',
 'garlic, chopped',
 'cans chopped green chile peppers, drained',
 'packages vegetarian burger crumbles',
 'cans whole peeled tomatoes, crushed',
 'chili powder',
 'ground black pepper',
 'can kidney beans, drained',
 'can garbanzo beans, drained',
 'can black beans',
 'can whole kernel corn']

# now that we have our original recipe ingredients, the specific cuisine kb, and the overall ingredients kb, we can transform the original recipe ingredients to cuisine-specific ingredients 

## we use statistical techniques for this method, replacing each ingredient with the most frequently appearing ingredient in a specific food category from our desired cuisine

### i.e. for italian cuisine, the ingredient rice may be replaced with pasta because pasta is the most frequent ingredient in the carbs category

In [23]:
# takes the original recipe ingredients, the cuisine kb, and the ingredients kb
# returns a list of transformed ingredients and a list of simplified original ingredients that match our ingredients kba
def transform_ingredients(test_ingredients, cuisine_kb, ingredients_kb):
    # create PorterStemmer object for stemming words
    ps = PorterStemmer()
    
    transformed_ingredients = []
    og_simplified_ingredients = []
    
    # keep of a counter of how many recipe ingredients fall into each food category
    counter = Counter()
    
    # get ngrams of recipe ingredients
    test_grams = get_ngrams(test_ingredients)

    # for each recipe ingredient
    for i in test_grams:    
        match = False
        
        # for each ngram for an ingredient (starting with longest)
        for g in i:
            
            # for each food category in the ingredients kb
            for category in ingredients_kb:
                
                # create lists of stemmed ingredients for ingredients kb and the cuisine kb
                ingredients_stemmed = [ps.stem(x) for x in ingredients_kb[category]]
                cuisine_stemmed = [ps.stem(x[0]) for x in cuisine_kb[category]]

                # if the stemmed ingredient ngram is in the ingredient kb
                if ps.stem(g) in ingredients_stemmed:
                    og = g
                    new = g 
                      
                    # if the ingredient is not in the top 3 ingredients for that cuisine category (doesn't have to be replaced)
                    # and if the cuisine kb has enough ingredients 
                    # replace our original ingredient with an ingredient from the cuisine category that we didn't already pull (based off counter)
                    if (len(cuisine_kb[category]) > counter[category]) and ps.stem(g) not in cuisine_stemmed[:2]:
                        new = cuisine_kb[category][counter[category]][0]
                        counter[category] += 1
                    
                    # if that ingredient was already in our list of transformed ingredients, get the next most frequent ingredient from the cuisine category
                    if ps.stem(new) in [ps.stem(x) for x in transformed_ingredients]:
                        if (len(cuisine_kb[category]) > counter[category]):
                            counter[category] += 1
                            new = cuisine_kb[category][counter[category]][0]
                    
                    transformed_ingredients.append(new)
                    og_simplified_ingredients.append(og)
                    match = True
                    break
                    
            if match:
                break
        
        # if we didn't find a replacement for an ingredient, just take the original ingredient
        if not match:
            og_simplified_ingredients.append(i[0])
            transformed_ingredients.append(i[0])
            
    return transformed_ingredients, og_simplified_ingredients

## to_cuisine_ingredients is a wrapper function that uses transform_ingredients

In [24]:
# takes list of og_ingredients (from main_parse) + name of cuisine ingredients json file, returns list of transformed ingredients
def to_cuisine_ingredients(og_ingredients, cuisine_kb, ingredients_kb):
        
    # get ingredients from recipe being transformed
    test_ingredients = get_test_ingredients(og_ingredients)
    
    # gets list of transformed ingredients
    transformed_ingredients, og_simplified_ingredients = transform_ingredients(test_ingredients, cuisine_kb, ingredients_kb)
    
    return transformed_ingredients, og_simplified_ingredients

## to_cuisine_directions parses the original directions and replaces all of the original ingredients with the transformed ingredients

In [25]:
# takes list of original ingredients, list of original directions, and list of transformed ingredients, and returns list of transformed directions
def to_cuisine_directions(og_simplified_ingredients, og_directions, transformed_ingredients):    
    new_directions = '@'.join(og_directions)
    
#    og_grams = get_ngrams(og_ingredients)    
#     for i in og_grams:
#         for j in i:
#             if j in new_directions:
#                 new_directions = new_directions.replace(j, transformed_ingredients[og_grams.index(i)])
#                 break
    
    for i in og_simplified_ingredients:
        if i in new_directions:
            new_directions = new_directions.replace(i, transformed_ingredients[og_simplified_ingredients.index(i)])

    new_directions = new_directions.split('@')
                           
    return new_directions

In [26]:
transformed_ingredients, og_simplified_ingredients = to_cuisine_ingredients(og_ingredients, cuisine_kb, ingredients_kb)

In [27]:
og_simplified_ingredients

['olive oil',
 'onion',
 'bay',
 'cumin',
 'oregano',
 'salt',
 'celery',
 'bell peppers',
 'jalapeno',
 'garlic',
 'peppers',
 'burger',
 'tomatoes',
 'chili powder',
 'pepper',
 'kidney',
 'garbanzo',
 'black beans',
 'corn']

In [28]:
transformed_ingredients

['olive oil',
 'onion',
 'pepper',
 'salt',
 'garlic',
 'parsley',
 'tomato',
 'mushrooms',
 'crushed red pepper',
 'rosemary',
 'capers',
 'chicken',
 'celery',
 'red pepper flakes',
 'sage',
 'bean',
 'lentils',
 'kidney',
 'corn']

In [29]:
to_cuisine_directions(og_simplified_ingredients, og_directions, transformed_ingredients)

['Heat the olive oil in a large pot over medium heat. Stir in the onion, and season with sage leaves, parsley, rosemary, and parsley. Cook and stir until onion is tender, then mix in the tomato, green mushrooms, crushed red sage capers, rosemary, and green chile capers. When vegetables are heated through, mix in the vegetarian chicken crumbles. Reduce heat to low, cover pot, and simmer 5 minutes.\n                                    Watch Now\n',
 'Mix the celery into the pot. Season chili with red sage flakes and sage. Stir in the bean beans, lentils beans, and kidney. Bring to a boil, reduce heat to low, and simmer 45 minutes. Stir in the corn, and continue cooking 5 minutes before serving.\n                                    Watch Now\n']

# because we can create a cuisine kb for any cuisine for which we have some number of recipes, this method is generalizable to any cuisine