In [2]:
import pandas as pd
import nltk
import re
import string
from collections import Counter
import json

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
from parse_url import *
from mapping_urls_make import *

In [9]:
def get_test_ingredients(og_ingredients):
    """Collect the ``'name'`` value of every parsed ingredient, in input order.

    Args:
        og_ingredients: iterable of mappings, each with a ``'name'`` key.

    Returns:
        list of the ``'name'`` values.
    """
    names = []
    for entry in og_ingredients:
        names.append(entry['name'])
    return names

In [10]:
def clean_text(s):
    """Normalise an ingredient string into a list of lowercase content words.

    Every character that is not an ASCII letter or a space is replaced by a
    space, the text is lowercased and tokenized with NLTK, and English stop
    words are dropped.

    Args:
        s: raw text.

    Returns:
        list of the remaining tokens, in their original order.
    """
    # English stop-word list from the NLTK corpus
    english_stops = set(stopwords.words('english'))

    letters_only = re.sub("[^a-zA-Z ]", ' ', s).lower()
    tokens = word_tokenize(letters_only)

    return [tok for tok in tokens if tok not in english_stops]

In [11]:
def get_cuisine_ingredients(all_recipes, cuisine_name):
    """Return every ingredient string used by recipes of one cuisine.

    Args:
        all_recipes: JSON source accepted by ``pd.read_json``; the resulting
            frame must have ``'cuisine'`` and ``'ingredients'`` columns, the
            latter holding one iterable of ingredients per recipe.
        cuisine_name: value of the ``'cuisine'`` column to keep.

    Returns:
        Flat list of ingredients (duplicates kept) in recipe order.
    """
    recipes = pd.read_json(all_recipes)
    per_recipe = recipes.loc[recipes['cuisine'] == cuisine_name, 'ingredients']

    # flatten the per-recipe ingredient lists into one list
    return [item for recipe_items in per_recipe.tolist() for item in recipe_items]

In [12]:
def get_ngrams(i_list):
    """Build every word n-gram of each ingredient, longest first.

    Each string is cleaned with ``clean_text``; all n-grams of the resulting
    words (up to the full length) are produced with ``nltk.everygrams``,
    ordered by descending number of words, and joined back into
    space-separated strings.

    Args:
        i_list: iterable of ingredient strings.

    Returns:
        list with one list of n-gram strings per input string.
    """
    def grams_for(text):
        words = clean_text(text)
        tuples = sorted(nltk.everygrams(words, max_len=len(words)), key=len, reverse=True)
        return [' '.join(parts) for parts in tuples]

    return [grams_for(ingredient) for ingredient in i_list]

In [13]:
def get_cuisine_kb(cuisine_ingredients_list, ingredients_kb):
    """Count, per food category, which known ingredients a cuisine uses.

    Every cuisine ingredient is expanded into its n-grams (longest first, via
    ``get_ngrams``). For each category, the first n-gram of an ingredient that
    appears in that category's list is counted once.

    Args:
        cuisine_ingredients_list: iterable of ingredient strings for one cuisine.
        ingredients_kb: dict mapping category name -> collection of known
            ingredient names (exact string match is used).

    Returns:
        dict mapping category name -> list of ``(ingredient, count)`` pairs,
        most common first. Categories with no match are omitted.
    """
    cuisine_kb = {}

    cuisine_grams = get_ngrams(cuisine_ingredients_list)

    for category, known_names in ingredients_kb.items():
        # Set membership instead of scanning the category list for every gram
        # of every ingredient.
        known = set(known_names)
        counter = Counter()

        for grams in cuisine_grams:
            for gram in grams:
                if gram in known:
                    counter[gram] += 1
                    # count only the longest matching gram of this ingredient
                    break

        if counter:
            cuisine_kb[category] = counter.most_common()

    return cuisine_kb

In [14]:
def transform_ingredients(test_ingredients, cuisine_kb, ingredients_kb):
    """Replace recipe ingredients with the target cuisine's common ones.

    Each ingredient is expanded into n-grams (longest first). The first gram
    whose Porter stem belongs to a category of ``ingredients_kb`` decides the
    ingredient's category. Unless that gram is already one of the cuisine's
    two most common ingredients in the category, it is swapped for the next
    not-yet-used entry of ``cuisine_kb[category]``. If the result duplicates an
    ingredient already chosen, one further entry is tried when available.

    Args:
        test_ingredients: iterable of ingredient strings from the recipe.
        cuisine_kb: dict category -> list of ``(ingredient, count)`` pairs,
            most common first. Categories may be missing.
        ingredients_kb: dict category -> collection of known ingredient names.

    Returns:
        ``(transformed_ingredients, og_simplified_ingredients)``: two lists
        aligned with ``test_ingredients``. For an ingredient with no category
        match both hold its longest gram, or the raw ingredient text if
        cleaning removed every word.
    """
    # create PorterStemmer object for stemming words
    ps = PorterStemmer()

    test_ingredients = list(test_ingredients)
    test_grams = get_ngrams(test_ingredients)

    # Stem both knowledge bases once instead of once per gram and category.
    category_stems = {}
    cuisine_top_stems = {}
    for category in ingredients_kb:
        category_stems[category] = {ps.stem(x) for x in ingredients_kb[category]}
        # A category absent from cuisine_kb simply offers no replacements.
        ranked = cuisine_kb.get(category, [])
        cuisine_top_stems[category] = {ps.stem(x[0]) for x in ranked[:2]}

    transformed_ingredients = []
    og_simplified_ingredients = []
    used_stems = set()   # stems of everything already in transformed_ingredients
    counter = Counter()  # per category: index of the next cuisine entry to hand out

    for raw, grams in zip(test_ingredients, test_grams):
        # Fallback when no gram matches any category.
        og = new = grams[0] if grams else raw
        match = False

        for g in grams:
            g_stem = ps.stem(g)

            for category in ingredients_kb:
                if g_stem not in category_stems[category]:
                    continue

                ranked = cuisine_kb.get(category, [])
                og = new = g

                if len(ranked) > counter[category] and g_stem not in cuisine_top_stems[category]:
                    new = ranked[counter[category]][0]
                    counter[category] += 1

                # Avoid emitting the same ingredient twice. The counter is
                # advanced before indexing, so one more entry must exist.
                # NOTE(review): advancing first skips one cuisine entry; kept
                # as in the original -- confirm this is intended.
                if ps.stem(new) in used_stems and len(ranked) > counter[category] + 1:
                    counter[category] += 1
                    new = ranked[counter[category]][0]

                match = True
                break

            if match:
                break

        transformed_ingredients.append(new)
        og_simplified_ingredients.append(og)
        used_stems.add(ps.stem(new))

    return transformed_ingredients, og_simplified_ingredients

In [15]:
# takes the ingredient-category KB json and the json of all recipes, writes a dict of
# cuisine ingredients + frequencies (grouped by food category) to json
def get_cuisine_ingredients_dict(cuisine_json, all_recipes, cuisine_name):
    """Build and persist the ingredient-frequency KB for one cuisine.

    Args:
        cuisine_json: path to the JSON file holding the ingredient KB
            (dict of food categories -> ingredient names).
        all_recipes: JSON recipe source passed to ``get_cuisine_ingredients``.
        cuisine_name: cuisine to extract; also used in the output file name.

    Returns:
        ``(cuisine_kb, cuisine_ing)``: the per-category frequency dict and the
        flat list of that cuisine's ingredients.

    Side effects:
        Writes ``cuisine_kbs/<cuisine_name>_kb.json``, creating the directory
        if it does not exist yet.
    """
    import os

    # import ingredients KB (dict of food categories)
    with open(cuisine_json) as json_file:
        ingredients_kb = json.load(json_file)

    # import all_recipes json, get ingredients from only cuisine recipes
    cuisine_ing = get_cuisine_ingredients(all_recipes, cuisine_name)

    # calculate dict of cuisine ingredients + frequencies (keys=food categories)
    cuisine_kb = get_cuisine_kb(cuisine_ing, ingredients_kb)

    # write dict to json file; make sure the output directory exists first so a
    # fresh checkout does not fail with FileNotFoundError
    name = 'cuisine_kbs/' + cuisine_name + '_kb.json'
    os.makedirs(os.path.dirname(name), exist_ok=True)
    with open(name, 'w') as f:
        json.dump(cuisine_kb, f)

    return cuisine_kb, cuisine_ing

In [28]:
# Build the Italian cuisine KB from the recipes in train.json, using the
# ingredient-category KB stored in cuisine_kb.json. The call also writes the
# result to cuisine_kbs/italian_kb.json.
cuisine_kb, cuisine_ing = get_cuisine_ingredients_dict('cuisine_kb.json', 'train.json', 'italian')

In [27]:
# Inspect the KB.
# NOTE(review): the saved output below comes from execution 27, which ran before
# the assignment cell above (execution 28); re-run this cell to refresh it.
cuisine_kb

{'carb': ['bagels',
  'baguettes',
  'barley',
  'biscuits',
  'bran',
  'bread',
  'buns',
  'cereal',
  'cornbread',
  'couscous',
  'crackers',
  'croutons',
  'crusts',
  'dough',
  'granola',
  'hominy',
  'kasha',
  'masa',
  'matzo',
  'millet',
  'muffins',
  'oats',
  'pitas',
  'popcorn',
  'pretzels',
  'quinoa',
  'rice',
  'rolls',
  'shortbread',
  'sourdough',
  'stuffing',
  'tapioca',
  'toast',
  'tortillas',
  'wheat',
  'kaiser',
  'cornmeal',
  'breadcrumbs',
  'bread crumbs',
  'graham',
  'bulgur',
  'farina',
  'oatmeal',
  'croissants',
  'polenta',
  'grits',
  'pumpernickel',
  'sago',
  'seitan',
  'grains',
  'taters',
  'risotto',
  'shells',
  'amarettini',
  'mochi',
  'cornflakes',
  'pilaf',
  'puppies',
  'farfalle',
  'fettuccine',
  'lasagnas',
  'linguine',
  'mac',
  'macaroni',
  'manicotti',
  'noodles',
  'pasta',
  'farfel',
  'vermicelli',
  'tagliatelle',
  'cannelloni',
  'penne',
  'spaghetti',
  'fettucine',
  'waffle',
  'pancake'],
 'pr

In [21]:
# takes the parsed og_ingredients plus the cuisine KB and ingredients KB dicts,
# returns the transformed ingredients and their simplified originals
def to_cuisine_ingredients(og_ingredients, cuisine_kb, ingredients_kb):
    """Transform a parsed recipe's ingredients towards a cuisine.

    Args:
        og_ingredients: iterable of mappings with a ``'name'`` key.
        cuisine_kb: cuisine KB dict, forwarded to ``transform_ingredients``.
        ingredients_kb: ingredient-category KB dict, forwarded likewise.

    Returns:
        ``(transformed_ingredients, og_simplified_ingredients)`` exactly as
        produced by ``transform_ingredients``.
    """
    # names of the ingredients in the recipe being transformed
    names = get_test_ingredients(og_ingredients)

    swapped, simplified = transform_ingredients(names, cuisine_kb, ingredients_kb)
    return swapped, simplified

In [44]:
def to_cuisine_directions(og_ingredients, og_directions, transformed_ingredients):
    """Rewrite recipe directions so they mention the transformed ingredients.

    For every original ingredient, its n-grams are tried longest first; the
    first gram that occurs (as a substring) in any direction is replaced by
    that ingredient's transformed counterpart in all directions.

    Args:
        og_ingredients: iterable of mappings with a ``'name'`` key.
        og_directions: iterable of direction strings.
        transformed_ingredients: replacements aligned position-by-position
            with ``og_ingredients``.

    Returns:
        New list of direction strings, one per input direction (an empty
        input yields an empty list).
    """
    new_directions = list(og_directions)

    og_grams = get_ngrams(get_test_ingredients(og_ingredients))

    # Pair each ingredient with its replacement by position. Looking the index
    # up with list.index() would return the first equal gram list and so pick
    # the wrong replacement for repeated ingredients.
    for grams, replacement in zip(og_grams, transformed_ingredients):
        for gram in grams:
            # Steps are processed individually rather than joined on a
            # delimiter, so a delimiter character inside a direction cannot
            # corrupt the step boundaries.
            if any(gram in step for step in new_directions):
                new_directions = [step.replace(gram, replacement) for step in new_directions]
                break

    # NOTE(review): matching is plain substring replacement, so a short gram can
    # also match inside a longer word, and text inserted by an earlier
    # replacement can be replaced again by a later ingredient.
    return new_directions

In [25]:
# Load the ingredient-category KB (dict of food categories) from disk.
with open('cuisine_kb.json') as kb_file:
    ingredients_kb = json.load(kb_file)

In [46]:
# Parse one allrecipes page into its ingredients and directions.
# NOTE(review): main_parse is not defined in this notebook; presumably it comes
# from one of the star imports (parse_url / mapping_urls_make) -- confirm.
og_ingredients, og_directions = main_parse('https://www.allrecipes.com/recipe/236805/king-ranch-chicken-casserole/',check='single')

In [47]:
# Map the recipe's ingredients onto the cuisine KB built earlier.
transformed_ingredients, og_simplified_ingredients = to_cuisine_ingredients(og_ingredients, cuisine_kb, ingredients_kb)

In [34]:
# Plain list of ingredient names from the parsed recipe.
test_ingredients = get_test_ingredients(og_ingredients)

In [40]:
# Show the n-grams (longest first) generated for each recipe ingredient.
get_ngrams(test_ingredients)

[['vegetable oil', 'vegetable', 'oil'],
 ['white onion diced',
  'white onion',
  'onion diced',
  'white',
  'onion',
  'diced'],
 ['red bell pepper diced',
  'red bell pepper',
  'bell pepper diced',
  'red bell',
  'bell pepper',
  'pepper diced',
  'red',
  'bell',
  'pepper',
  'diced'],
 ['green bell pepper diced',
  'green bell pepper',
  'bell pepper diced',
  'green bell',
  'bell pepper',
  'pepper diced',
  'green',
  'bell',
  'pepper',
  'diced'],
 ['condensed cream mushroom soup',
  'condensed cream mushroom',
  'cream mushroom soup',
  'condensed cream',
  'cream mushroom',
  'mushroom soup',
  'condensed',
  'cream',
  'mushroom',
  'soup'],
 ['condensed cream chicken soup',
  'condensed cream chicken',
  'cream chicken soup',
  'condensed cream',
  'cream chicken',
  'chicken soup',
  'condensed',
  'cream',
  'chicken',
  'soup'],
 ['diced tomatoes green chile peppers ro tel',
  'diced tomatoes green chile peppers ro',
  'tomatoes green chile peppers ro tel',
  'diced

In [48]:
# Rewrite the recipe directions using the transformed ingredients.
to_cuisine_directions(og_ingredients, og_directions, transformed_ingredients)

['Preheat oven to 350 degrees F (175 degrees C).\n                            ',
 'Heat olive oil in a large skillet over high heat. Saute onion, pepper, and salt in hot olive oil until warmed through, about 2 minutes.\n                            ',
 'Combine onion-pepper mixture, cream of cream, cream of cream, tomato, eggs, sour cream, basil, ancho chile powder, oregano, and chipotle chile powder together in a large bowl and stir until sauce is well-combined.\n                            ',
 'Spread a few tablespoons of the sauce in the bottom of a 9x13-inch baking dish. Spread 1/2 the eggs over the sauce. Spread about half the sauce over the eggs and top with 1/3 the parmesan. Spread a layer of spinach over the parmesan. Spread remaining 1/2 the eggs over the spinach, and top with almost all of the remaining sauce, reserving 1/2 cup sauce. Top with 1/3 the parmesan, remaining spinach, the reserved 1/2 cup sauce, and remaining 1/3 parmesan.\n                            ',
 'Bake cas

In [52]:
# Number of recipes per cuisine.
# NOTE(review): `train` is not assigned in any visible cell, so this fails on a
# fresh kernel unless it is defined elsewhere -- presumably a DataFrame read
# from train.json; confirm and add the loading cell.
train['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [63]:
def get_times(og_direction):
    """Extract time phrases such as ``'40 minutes'`` from one direction.

    The text is tokenized with NLTK; every token that is a time unit
    (second(s), minute(s), hour(s)) is paired with the token immediately
    before it.

    Args:
        og_direction: a single direction string.

    Returns:
        list of ``'<previous token> <unit>'`` strings, one per unit
        occurrence, in order of appearance. A unit that is the very first
        token has no preceding token and is skipped.
    """
    time_units = {'second', 'seconds', 'minute', 'minutes', 'hour', 'hours'}
    tokens = nltk.word_tokenize(og_direction)

    # enumerate gives the position of *this* occurrence; list.index() would
    # always return the first occurrence and so repeat the first time phrase
    # whenever a unit appears more than once.
    return [
        tokens[idx - 1] + ' ' + token
        for idx, token in enumerate(tokens)
        if idx > 0 and token in time_units
    ]

In [68]:
# Time phrases found in the fifth direction step.
get_times(og_directions[4])

['40 minutes', '40 minutes']