In [289]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from collections import Counter
import string

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import copy

In [290]:
# import nltk
# nltk.download()

In [291]:
def load_data():
    ### Load all three JSON datasets and merge into single dict ###
    filenames = ['recipes_raw_nosource_ar.json', 'recipes_raw_nosource_epi.json', 'recipes_raw_nosource_fn.json']
    websites = ['AllRecipes', 'FoodNetwork', 'Epicurious']
    abv = ['AR', 'FN', 'EP']
    
    full_data = []
    i = 0
    for file in filenames:
        with open('recipes_raw/'+file) as f:
            raw = json.load(f)
        data = list(raw.values())
        
        # add key for website scraped from
        for recipe in data:
            recipe['website'] = abv[i]
        
        full_data += list(data)
        
        # print statistics with regards to each dataset ##
        print('----------------')
        print(file)
        print('Total recipes from {} : {}'.format(websites[i], len(data)))
        print('Dictionary keys: {} \n'.format(data[0].keys()))
        
        i += 1
    
    print('----------------')
    print('Total recipes across all three websites: {}'.format(len(full_data)))
    return full_data

In [292]:
data = load_data()

----------------
recipes_raw_nosource_ar.json
Total recipes from AllRecipes : 39802
Dictionary keys: dict_keys(['title', 'ingredients', 'instructions', 'picture_link', 'website']) 

----------------
recipes_raw_nosource_epi.json
Total recipes from FoodNetwork : 25323
Dictionary keys: dict_keys(['ingredients', 'picture_link', 'instructions', 'title', 'website']) 

----------------
recipes_raw_nosource_fn.json
Total recipes from Epicurious : 60039
Dictionary keys: dict_keys(['instructions', 'ingredients', 'title', 'picture_link', 'website']) 

----------------
Total recipes across all three websites: 125164


In [298]:
# simple data preprocessing
def preprocessing(data):
    ### Remove all entries without title, ingredients, or instructions ###
    init_len = len(data)
    data_copy = copy.deepcopy(data)
    
    # keep track of recipe index
    i = 0
    for recipe in data:
        title = recipe.get('title')
        ingredients = recipe.get('ingredients')
        instructions = recipe.get('instructions')
        
        # if any of the three keys are blank, remove recipe from dataset
        if(not title or not ingredients or not instructions):
            del data_copy[i]
        else: # when element is deleted, index is removed
            i += 1 
        
        
    post_len = len(data_copy) 
    
    print('Preprocessing: \n----------------')
    print('Number of samples prior to preprocessing: {}'.format(init_len))
    print('Number of samples post preprocessing: {}'.format(post_len))
    print('Total number of recipes removed: {}'.format(init_len-post_len))
    
    return data_copy

In [304]:
data_complete = preprocessing(data)

print('---------------- \n')
print('SAMPLE RECIPE \n----------------')
for i in data_complete[0]:
    print(i, ': {} \n'.format(data_complete[0].get(i)))

Preprocessing: 
 ----------------
Number of samples prior to preprocessing: 125164
Number of samples post preprocessing: 122938
Total number of recipes removed: 2226
---------------- 

SAMPLE RECIPE 
----------------
title : Slow Cooker Chicken and Dumplings 

ingredients : ['4 skinless, boneless chicken breast halves ADVERTISEMENT', '2 tablespoons butter ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT', '1 onion, finely diced ADVERTISEMENT', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT', 'ADVERTISEMENT'] 

instructions : Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.
Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.
 

picture_link : 55lznCYBbs2mT8BTx6BTkLhynGHzM.S 

website : AR 



In [194]:
def exploration(data):
    # find highest word frequencies of ingredients
    all_ing = []
    
    i = 0
    for recipe in data:
        ingredients = recipe.get('ingredients')
        all_ing += ingredients
        
    # top 15 most frequent ingredient strings
    counter = Counter(all_ing)
    top15 = counter.most_common()[:15]
    
    print('Top 15 most frequent ingredient strings: ')
    print('----------------')
    
    for string in top15:
        print(string[0], ' --> Count: ', string[1])
    

In [286]:
exploration(data_complete)

Top 15 most frequent ingredient strings: 
----------------
ADVERTISEMENT  --> Count:  39519
Salt and freshly ground black pepper  --> Count:  5218
Kosher salt and freshly ground black pepper  --> Count:  4886
Kosher salt  --> Count:  4844
1/2 teaspoon salt  --> Count:  4246
1/2 teaspoon salt ADVERTISEMENT  --> Count:  3455
1 teaspoon salt  --> Count:  3372
2 tablespoons olive oil  --> Count:  3353
Salt and pepper  --> Count:  3203
Salt  --> Count:  3039
Freshly ground black pepper  --> Count:  3026
1 teaspoon vanilla extract ADVERTISEMENT  --> Count:  2998
1 teaspoon salt ADVERTISEMENT  --> Count:  2969
1/4 teaspoon salt  --> Count:  2889
salt and pepper to taste ADVERTISEMENT  --> Count:  2457


1) Need to get ride of 'ADVERTISEMENT' as it is present in almost every string.  
2) 11 of 15 top strings contain SALT. Need to strip down strings to individual words to get a better sense of most used ingredients. Clearly, salt is number 1.  
3) For Word2Vec representation, we need to eliminate all STOP words and measurements - 1/2 tsp salt --> salt

In [184]:
## List of common measurements ##
measurements = ['teaspon',
               'dessertspoon',
               'tablespoon',
               'fluid ounce',
               'cup',
               'pint',
               'quart',
               'gallon',
               'drop',
               'smidgen',
               'pinch',
               'dash',
               'saltspoon',
               'scruple',
               'coffeespoon',
               'dram',
               'teaspoon',
               'dessertspoon',
               'tablespoon',
               'fluid',
               'ounce',
               'wineglass',
               'gill',
               'teacup',
               'tsp',
               't',
               'fl',
               'oz',
               'tbsp',
               'dr',
               'gt',
               'gtt',
               'smdg',
               'smi',
               'pn',
               'ds',
               'ssp',
               'csp',
               'dsp',
               'dssp',
               'dstspn',
               'wgf',
               'tcf',
               'c',
               'pt',
               'qt',
               'gal',
               'minim',
               'bu',
               'bushel',
               'gal',
               'ml',
               'milliliter',
                'millilitre',
                'centiliter',
                'centilitre',
                'centimeter',
                'cl',
                'kilogramme',
                'gramme',
                'milligramme',
                '#',
                'slice',
                'heaped',
                'halves',
                'bulb',
                'level',
                'dl',
                'deciliter',
                'decilitre',
               'l',
               'liter',
               'litre',
               'pk',
               'peck',
               'tb',
               'g',
               'gram',
               'kg',
               'kilogram',
               'lb',
               'pound',
               'mg',
               'milligram',
               'doz',
               'dozen',
               'lg',
               'large',
               'small',
               'medium',
               'sm',
               'package',
               'half',
               'full',
               'third',
               'fourth',
               'whole',
               'inch',
                'clove',
                'bunch',
                'container'
               ]

miscellaneous_words = ['refrigerated',
                      'can',
                      'canned',
                      'fresh',
                      'chopped',
                      'skinless',
                      'boneless',
                      'kosher',
                      'ground',
                      'skin',
                      'off',
                      'on',
                      'boil',
                      'mild',
                      'spicy',
                      'cold',
                      'warm',
                      'grain',
                      'spray',
                      'cooking',
                      'condensed',
                      'bulk',
                      'round',
                      'loaf',
                      'temperature',
                      'wedge',
                      'slice',
                      'diced',
                      'peeled',
                      'zested',
                      'juiced',
                      'cooked',
                      'lightly',
                      'heavy',
                      'whipped',
                      'room',
                      'sour',
                      'sweet',
                      'bunch',
                      'floret',
                      'mix',
                      'dry',
                      'seasoned',
                      'prepared',
                      'melted',
                      'fried',
                      'grilled',
                      'sun',
                      'dried',
                      'pitted',
                      'thawed',
                      'lean',
                      'skim',
                      'fat-free',
                      'fat',
                      'free',
                      'whole',
                      'crumbled',
                      'head',
                      'root',
                      'shredded',
                      'pitted',
                      'soft',
                      'hard',
                      'tail',
                      'head',
                      'rinsed',
                      'spices',
                      'thin',
                      'thick-cut',
                      'thick',
                      'thin-cut',
                      'cut',
                      'beaten',
                      'baked',
                      'uncooked',
                      'tin',
                       'jar',
                       'needed',
                       'water',
                       'sauce',
                       'chunk',
                       'dressing',
                       'shoulder',
                       'rib',
                       'prime',
                       'ground',
                       'coarse',
                       'fine',
                       'leg',
                       'foot',
                       'liver',
                       'tongue',
                       'drained',
                       'leftover',
                       'frozen',
                       'seeded',
                       'roughly',
                       'grated',
                       'browned',
                       'natural',
                       'vegan',
                       'powder',
                       'baby',
                       'optional',
                       'bone',
                       'finely',
                       'torn',
                       'pieces',
                       'piece',
                       'minced',
                       'freshly'
                      ]

In [282]:
def clean_ingredients(data, measurements, miscellaneous_words):
    ### Takes list of ingredients as input and returns processed ingredients list ###
    data_copy = copy.deepcopy(data)
    
    # lemmatize list of measurements to remove
    lemmatizer = WordNetLemmatizer()
    for me in measurements:
        me = lemmatizer.lemmatize(me)
    for mi in miscellaneous_words:
        mi = lemmatizer.lemmatize(mi)
    
    # set stopwords using nltk
    stop_words = set(stopwords.words('english'))
    
    # iterate through each recipe's ingredients
    for recipe1 in data_copy:
        ingredients = recipe1.get('ingredients')
        
        ing_list = []
        # iterate through all ingredients
        for i in ingredients:
            # get rid of punctuation
            i = i.translate(str.maketrans('', '', string.punctuation))

            # turn string into list
            i = i.split(' ')

            # remove 'ADVERTISEMENT'
            if 'ADVERTISEMENT' in i: i.remove('ADVERTISEMENT')

            # remove any ints from measurements and non-alphabet words 
            i = [ing for ing in i if not ing.isdigit() or ing.isalpha()]

            # turn to lowercase
            i = [ing.lower() for ing in i]

            # lemmatize word
            i = [lemmatizer.lemmatize(ing) for ing in i]

            # remove any measurement words
            i = [ing for ing in i if ing not in measurements]

            # remove any STOP words
            i = [ing for ing in i if ing not in stop_words]

            # remove any miscellaneous words
            i = [ing for ing in i if ing not in miscellaneous_words]

            # only append to ing list is non-empty
            if i:
                i_str = ' '.join(i)
                ing_list.append(i_str)
                
        recipe1.update({'ingredients': ing_list})
                
    return data_copy

In [283]:
# further processing of 'data_complete'
clean_data = clean_ingredients(data_complete, measurements, miscellaneous_words)

In [285]:
# see how top ingredients have changed
exploration(clean_data)

Top 15 most frequent ingredient strings: 
----------------
salt  --> Count:  48473
egg  --> Count:  23693
olive oil  --> Count:  19389
sugar  --> Count:  18784
allpurpose flour  --> Count:  17418
onion  --> Count:  15859
butter  --> Count:  15683
garlic  --> Count:  15618
milk  --> Count:  13078
cream  --> Count:  12136
white sugar  --> Count:  10974
vanilla extract  --> Count:  10602
salt freshly black pepper  --> Count:  10526
garlic minced  --> Count:  10146
vegetable oil  --> Count:  9584
