In [1]:
import pandas as pd
import nltk
import re
import string
from collections import Counter
import json

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
from parse_url import *
from mapping_urls_make import *

In [4]:
def get_test_ingredients(og_ingredients):
    """Collect the ``'name'`` value of every ingredient record.

    Args:
        og_ingredients: iterable of mappings that each expose a ``'name'`` key.

    Returns:
        list: the ``'name'`` values, in the order the records were given.
    """
    names = []
    for ingredient in og_ingredients:
        names.append(ingredient['name'])

    return names

In [5]:
def clean_text(s):
    """Normalise a piece of text into a list of lowercase content words.

    Every character that is not an ASCII letter or a space is replaced by a
    space, the result is lowercased and tokenised with NLTK's
    ``word_tokenize``, and NLTK's English stop words are dropped.

    Args:
        s: the text to clean.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    # NLTK's English stop-word list, as a set for fast membership checks
    english_stop_words = set(stopwords.words('english'))

    letters_only = re.sub("[^a-zA-Z ]", ' ', s).lower()

    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token not in english_stop_words:
            kept_tokens.append(token)

    return kept_tokens

In [6]:
def get_cuisine_ingredients(all_recipes, cuisine_name):
    """Flatten the ingredient lists of every recipe belonging to one cuisine.

    Args:
        all_recipes: anything ``pd.read_json`` accepts; the resulting frame
            must have a ``'cuisine'`` column and an ``'ingredients'`` column
            whose cells are iterables of ingredients.
        cuisine_name: value of ``'cuisine'`` to keep.

    Returns:
        list: every ingredient of every matching recipe, in row order
        (duplicates are kept).
    """
    recipes = pd.read_json(all_recipes)
    is_wanted_cuisine = recipes['cuisine'] == cuisine_name
    per_recipe_ingredients = recipes[is_wanted_cuisine]['ingredients'].tolist()

    # one flat list across all recipes of the cuisine
    return [
        ingredient
        for recipe_ingredients in per_recipe_ingredients
        for ingredient in recipe_ingredients
    ]

In [7]:
def get_ngrams(i_list):
    """Build, for each text, all of its word n-grams ordered longest first.

    Each text is cleaned with ``clean_text``; every n-gram of the resulting
    tokens (from single words up to the full token sequence) is produced with
    ``nltk.everygrams``, sorted by descending number of words, and joined
    back into a space-separated string.

    Args:
        i_list: iterable of texts.

    Returns:
        list[list[str]]: one list of n-gram strings per input text, in input
        order.
    """
    all_grams = []

    for text in i_list:
        tokens = clean_text(text)
        # sorted() is stable, so grams of equal length keep everygrams' order
        longest_first = sorted(
            nltk.everygrams(tokens, max_len=len(tokens)),
            key=len,
            reverse=True,
        )
        all_grams.append([' '.join(gram) for gram in longest_first])

    return all_grams

In [55]:
def get_cuisine_kb(cuisine_ingredients_list, ingredients_kb):
    """Count how often each known ingredient occurs in a cuisine, per category.

    For every category of ``ingredients_kb`` and every cuisine ingredient,
    the longest n-gram of the ingredient that is listed in that category is
    counted once (shorter grams of the same ingredient are then ignored for
    that category).

    Args:
        cuisine_ingredients_list: iterable of ingredient texts for the cuisine.
        ingredients_kb: mapping of category -> collection of ingredient names
            (assumed to be hashable strings -- TODO confirm against the KB
            json).

    Returns:
        dict: category -> list of ``(ingredient, count)`` pairs as returned by
        ``Counter.most_common()`` (most frequent first). Categories without
        any match are omitted.
    """
    cuisine_kb = {}

    # the n-grams do not depend on the category, so build them only once
    cuisine_grams = get_ngrams(cuisine_ingredients_list)

    for category, known_names in ingredients_kb.items():
        # a set keeps the membership test in the inner loop constant-time
        known = set(known_names)
        counter = Counter()

        for grams in cuisine_grams:
            # grams are ordered longest first: count only the first known one
            hit = next((gram for gram in grams if gram in known), None)
            if hit is not None:
                counter[hit] += 1

        if counter:
            cuisine_kb[category] = counter.most_common()

    return cuisine_kb

In [9]:
def _first_kb_match(grams, kb_stems, stemmer):
    """Return ``(gram, category)`` for the first gram whose stem is in a category, else ``None``."""
    for gram in grams:
        gram_stem = stemmer.stem(gram)
        for category, stems in kb_stems.items():
            if gram_stem in stems:
                return gram, category
    return None


def transform_ingredients(test_ingredients, cuisine_kb, ingredients_kb):
    """Swap a recipe's ingredients for ingredients of the target cuisine.

    For each ingredient, its n-grams (longest first) are compared by Porter
    stem against every category of ``ingredients_kb``; the first hit decides
    the simplified ingredient name and its category. Unless the ingredient
    already matches one of the first two entries of that category in
    ``cuisine_kb``, it is replaced by the next not-yet-used entry of the
    category. If the chosen name is already among the transformed
    ingredients, the following entry is used instead when one exists.

    Args:
        test_ingredients: iterable of ingredient texts of the recipe.
        cuisine_kb: mapping category -> ordered sequence of entries whose
            first item is an ingredient name (``get_cuisine_kb`` produces
            ``(name, count)`` pairs, most frequent first). Categories may be
            missing; ingredients of such categories are kept unchanged.
        ingredients_kb: mapping category -> collection of ingredient names.

    Returns:
        tuple[list, list]: ``(transformed_ingredients,
        og_simplified_ingredients)``, index-aligned with ``test_ingredients``.
        An ingredient with no known gram is returned unchanged in both lists
        (its longest gram, or the raw text when cleaning left no words).
    """
    # create PorterStemmer object for stemming words
    ps = PorterStemmer()

    # materialise once so the input can be both n-grammed and zipped below
    test_ingredients = list(test_ingredients)

    # Stem both knowledge bases once up front instead of once per gram/category.
    kb_stems = {
        category: {ps.stem(name) for name in names}
        for category, names in ingredients_kb.items()
    }
    # get_cuisine_kb omits categories without matches, so default to "no entries"
    # rather than raising KeyError.
    cuisine_entries = {
        category: cuisine_kb.get(category, []) for category in ingredients_kb
    }
    # stems of the first two cuisine entries per category: these are kept as-is
    staple_stems = {
        category: [ps.stem(entry[0]) for entry in entries[:2]]
        for category, entries in cuisine_entries.items()
    }

    transformed_ingredients = []
    og_simplified_ingredients = []
    transformed_stems = set()  # stems of everything in transformed_ingredients
    counter = Counter()        # per category: index of the next unused cuisine entry

    for raw_name, grams in zip(test_ingredients, get_ngrams(test_ingredients)):
        hit = _first_kb_match(grams, kb_stems, ps)

        if hit is None:
            # Unknown ingredient: keep it. grams[0] is the longest gram; grams is
            # empty when the name held no usable words, so fall back to the raw text.
            og = new = grams[0] if grams else raw_name
        else:
            og, category = hit
            new = og
            entries = cuisine_entries[category]

            if counter[category] < len(entries) and ps.stem(og) not in staple_stems[category]:
                new = entries[counter[category]][0]
                counter[category] += 1

            # Already used this name: move on to the following entry, but only
            # if it exists (the index is checked *after* the increment).
            if ps.stem(new) in transformed_stems and counter[category] + 1 < len(entries):
                counter[category] += 1
                new = entries[counter[category]][0]

        transformed_ingredients.append(new)
        og_simplified_ingredients.append(og)
        transformed_stems.add(ps.stem(new))

    return transformed_ingredients, og_simplified_ingredients

In [58]:
# takes the ingredients KB json and the json of all recipes, writes dict of cuisine ingredients + frequencies divided by food group to json
def get_cuisine_ingredients_dict(cuisine_json, all_recipes, cuisine_name):
    """Build the per-category ingredient frequencies of one cuisine and save them.

    Args:
        cuisine_json: path of the json file holding the ingredients KB
            (dict of food categories).
        all_recipes: recipes source passed on to ``get_cuisine_ingredients``.
        cuisine_name: cuisine to extract; also used in the output file name.

    Returns:
        tuple: ``(cuisine_kb, cuisine_ing)`` -- the frequency dict produced by
        ``get_cuisine_kb`` and the flat ingredient list it was computed from.

    Side effects:
        Writes ``cuisine_kbs/<cuisine_name>_kb.json``, creating the
        ``cuisine_kbs`` directory if it does not exist yet.
    """
    import os  # local import: only needed to prepare the output directory

    # import ingredients KB (dict of food categories)
    with open(cuisine_json) as json_file:
        ingredients_kb = json.load(json_file)

    # import all_recipes json, get ingredients from only cuisine recipes
    cuisine_ing = get_cuisine_ingredients(all_recipes, cuisine_name)

    # calculate dict of cuisine ingredients + frequencies (keys=food categories)
    cuisine_kb = get_cuisine_kb(cuisine_ing, ingredients_kb)

    # write dict to json file; open(..., 'w') does not create missing directories
    name = 'cuisine_kbs/' + cuisine_name + '_kb.json'
    os.makedirs(os.path.dirname(name), exist_ok=True)
    with open(name, 'w') as f:
        json.dump(cuisine_kb, f)

    return cuisine_kb, cuisine_ing

In [64]:
# build (and save to cuisine_kbs/) the ingredient frequencies for one cuisine
cuisine_kb, cuisine_ing = get_cuisine_ingredients_dict(
    cuisine_json='cuisine_kb.json',
    all_recipes='train.json',
    cuisine_name='chinese',
)

In [57]:
# NOTE(review): the saved output below carries execution count 57, i.e. it was
# produced before the In [64] call that requests 'chinese'. Its entries
# (cornmeal, grits, catfish, ...) look like a different cuisine -- presumably
# stale; re-run this cell after the call above to refresh it.
cuisine_kb

{'carb': [('cornmeal', 318),
  ('grits', 259),
  ('bread', 160),
  ('rice', 154),
  ('bread crumbs', 90),
  ('biscuits', 67),
  ('wheat', 56),
  ('macaroni', 53),
  ('graham', 51),
  ('cornbread', 39),
  ('breadcrumbs', 34),
  ('oats', 33),
  ('rolls', 33),
  ('buns', 23),
  ('dough', 21),
  ('crackers', 18),
  ('pasta', 16),
  ('hominy', 15),
  ('stuffing', 15),
  ('crusts', 13),
  ('cereal', 12),
  ('shells', 8),
  ('tapioca', 8),
  ('sourdough', 6),
  ('croutons', 6),
  ('noodles', 6),
  ('tortillas', 5),
  ('matzo', 5),
  ('penne', 4),
  ('masa', 4),
  ('polenta', 3),
  ('cornflakes', 3),
  ('spaghetti', 2),
  ('pretzels', 2),
  ('waffle', 2),
  ('toast', 2),
  ('linguine', 2),
  ('pitas', 2),
  ('kaiser', 2),
  ('oatmeal', 2),
  ('pancake', 1),
  ('quinoa', 1),
  ('shortbread', 1),
  ('risotto', 1),
  ('granola', 1)],
 'protein': [('eggs', 1386),
  ('chicken', 909),
  ('bacon', 426),
  ('ham', 223),
  ('ribs', 116),
  ('beef', 86),
  ('sausages', 67),
  ('catfish', 63),
  ('turkey

In [13]:
# recipe ingredient records + path of a saved cuisine KB json -> transformed ingredient names
def to_cuisine_ingredients(og_ingredients, cuisine_kb, ingredients_kb):
    """Transform a parsed recipe's ingredients towards a saved cuisine KB.

    Args:
        og_ingredients: ingredient records of the recipe (each with a
            ``'name'`` key, as read by ``get_test_ingredients``).
        cuisine_kb: path of the json file holding the cuisine's ingredient
            frequencies.
        ingredients_kb: ingredients KB, handed to ``transform_ingredients``
            unchanged.

    Returns:
        tuple[list, list]: ``(transformed_ingredients,
        og_simplified_ingredients)`` as produced by ``transform_ingredients``.
    """
    # load the cuisine's ingredient frequencies from disk
    with open(cuisine_kb) as kb_file:
        cuisine_frequencies = json.load(kb_file)

    # names of the ingredients in the recipe being transformed
    ingredient_names = get_test_ingredients(og_ingredients)

    transformed, simplified = transform_ingredients(
        ingredient_names, cuisine_frequencies, ingredients_kb
    )
    return transformed, simplified

In [14]:
def to_cuisine_directions(og_simplified_ingredients, og_directions, transformed_ingredients):
    """Rewrite recipe steps so they mention the transformed ingredients.

    Every occurrence of an original ingredient name inside a step is replaced
    by its transformed counterpart. Matching is a case-sensitive substring
    match (so ``'onion'`` also rewrites ``'onions'``).

    All replacements are applied in a single pass per step:
      * text inserted by one replacement is never rewritten by another;
      * when several names match at the same position the longest one wins,
        so ``'olive oil'`` is not clobbered by a separate ``'oil'`` mapping;
      * steps are processed individually, so any character (including
        ``'@'``) may appear in them.

    Args:
        og_simplified_ingredients: original ingredient names.
        og_directions: iterable of step strings.
        transformed_ingredients: replacement names, index-aligned with
            ``og_simplified_ingredients``.

    Returns:
        list[str]: one rewritten step per input step (an empty list for no
        steps).
    """
    replacements = {}
    for original, replacement in zip(og_simplified_ingredients, transformed_ingredients):
        # An empty name would match between every character; for duplicate
        # names the first mapping wins.
        if original:
            replacements.setdefault(original, replacement)

    if not replacements:
        return list(og_directions)

    # Longest names first so the alternation prefers the most specific match.
    # Names mapped to themselves stay in the pattern on purpose: they shield
    # their text from shorter names contained in them.
    longest_first = sorted(replacements, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(name) for name in longest_first))

    # a callable replacement keeps backslashes in ingredient names literal
    return [
        pattern.sub(lambda found: replacements[found.group(0)], step)
        for step in og_directions
    ]

In [51]:
# Load the recipe dataset into a DataFrame for the exploration cell below.
# (get_cuisine_ingredients reads the same file again by path on its own.)
with open('train.json') as f:
    train = pd.read_json(f)
    


In [52]:
# Number of recipes per cuisine; value_counts() orders the result by descending count.
train['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64