In 0-clean-ingredients notebook, we use NYtimes-parser to process all the ingredients in the data. Here, we use many rule-based approaches to clean all the fields (e.g. title, ingredients, instructions) we want to use.

Even though the code is a little bit messy, this processing can be finished in 30min.

In [1]:
from dependency import parent_dir
from common.basics import *
from common.save import save_pickle, load_pickle 

Load two files: <br>
layer1 is provided by http://pic2recipe.csail.mit.edu/; <br>
recipe1M_ny.pickle is from  0-clean-recipe1m.ipynb

In [2]:
layer1 = json.load(open('../big_data/layer1.json','r'))
ny_data = load_pickle('../big_data/recipe1M_ny.pickle')

Define rule-based methods to clean the data

In [3]:
start_with = ['tbsp','pkt','g','tsp','x','cups','oz','mrs','can',
              'lb', 'pkg','tbsp','lbs','qt','lrg','grams','sm',
              'cans','bottle','and','cubes','o',',','handful',
              'container','t','bag','gram','jar','c','lg','ml','ounces','ounce','box']

remove = ['%s.' %str(i) for i in range(30)]

def clean_line(line):
    '''
    Args:
        line: a string, such as food name, sentences...
    '''
    assert type(line) == str
    
    # all lowercase
    line = line.lower()
    line = line.replace(' .', '.')
    line = line.replace(' !', '!')
    line = line.replace(')', '')
    line = line.replace('*', '')
    line = line.replace('..', '.')
    line = line.replace(' - ', '')
    
    # only reserve number and alphabets
    line = re.sub(r"[^a-z0-9+()-/?&'!.,]", ' ', line)
    
    # replace things in brace
    line = re.sub(r'\([^)]*\)', '', line)
    
    # remove extra spaces
    line = re.sub(' +',' ',line).strip()
    return line

def clean_prefix(ingr):
    cleaned = []
    for ans in ingr:
        
        # strip
        ans = re.sub(' +',' ',ans).strip()
        
        # remove number
        ans = re.sub(r'\d+', '', ans)
        
        # remove period
        ans = ans.replace('.', '')
        
        # remove prefixes
        for prefix in start_with:
            ans = re.sub('^'+prefix+'\s', '', ans)
            
        # strip again
        ans = re.sub(' +',' ',ans).strip()

        if ans:
            cleaned.append(ans)
            
    return cleaned

In [4]:
lst_undetectable = []
new_data = []

for i, v in tqdm.tqdm(enumerate(ny_data)):
    '''
    1. dealing with undetectable cases
    '''
    ingr = []
    for ny_full_ingredients in v['ny_full_ingredients']:
        if 'half and half' in ny_full_ingredients['input']:
            ingr.append('half and half')
        elif 'purpose flour' in ny_full_ingredients['input']:
            ingr.append('all purpose flour')
            
        elif type(ny_full_ingredients['name'])==float:
            ans = ''
            for word in ['salt', 'sugar', 'oil','mustard','water',
                         'steak','nuts','butter','garnish','ketchup',
                         'milk','mayonnaise','pepper','cumin', 'rice',
                         'seasoning','grated parmesan','raisin','olive oil',
                         'stuffing mix', 'sauce','syrup','mushroom soup',
                         'white sugar','brown sugar',
                         'chopped onions','sour cream','lean ground beef','tortilla',
                         'cayenne','paprika','corn', 'egg yolks', 'egg whites'
                         'condensed milk',
                         'crumb crust','jell o vanilla flavor instant pudding']:
                if word in ny_full_ingredients['input']:
                    ans = word
                    
            if ',' in ny_full_ingredients['input'] and not ans:
                ans = ny_full_ingredients['input'].split(',')[0]
                ans = ans if ans.count(' ') ==0 else ''
            
            if ans:
                ingr.append(ans)
            else:
                lst_undetectable.append(ny_full_ingredients['input'])
        
        elif 'recipe' not in ny_full_ingredients['name']:
            ans = ny_full_ingredients['name']
            ingr.append(ans)
    '''
    2. cleaning instruction
    '''
    # drop numbered list
    instr = ''
    instr = [line['text'] for line in layer1[i]['instructions'] if line['text'] not in remove]
    instr = [line[:-2] if line[-2:] in remove else line for line in instr]
    instr = [line[2:] if line[:2] in remove else line for line in instr]
    instr = [line for line in instr if line]
    
    # add period for certain sentences
    instr = [line+'.' if line[-1] not in ['!', '.', ';',','] else line for line in instr]
    
    # clean braces
    instr = ' '.join(instr)
    instr = clean_line(instr)
    
    # contain calorie info
    if 'calorie' in instr:
        for term in ['per serving','nutrition information','servings','each serving has','per slice','calories']:
            instr, sep, tail = instr.partition(term)
        if not instr.endswith('.'):
            instr, sep, tail = instr.rpartition('.')
            instr = instr+sep
            
    # contain author info
    instr, sep, tail = instr.partition('recipe from new new')
    tit = clean_line(layer1[i]['title'])
    
    # long enough
    cond1 = (len(ingr) == len(v['ny_full_ingredients']))
    cond2 = (instr.count('.') + instr.count('!') >=2)
    cond3 = (len(instr.split(' '))>10)
    cond4 = (i!=8219)

    '''
    3. cleaning mistakes of ny-times-parser
    '''   
    if cond1 and cond2 and cond3 and cond4:
        ingr = clean_prefix(ingr)
        ingr = clean_prefix(ingr)
        ingr = list(set(ingr))
        
        if len(ingr)>=2 and '. 1 tablespoon' not in instr and '. 2 tablespoons' not in instr:
            
            recipe = {'ingredients':ingr, 'title':tit, 
                      'instructions': instr, 'recipe1m_idx': i, 'url': layer1[i]['url']
                     }
            new_data.append(recipe)

1029720it [20:28, 837.96it/s]


In [5]:
#save_pickle(obj = new_data, filename='../big_data/data_1218.pickle',overwrite=True)
save_pickle(obj = new_data, filename='../big_data/data.pickle',overwrite=True)

In [6]:
len(new_data)

904401

In [7]:
data = new_data
len(data)/len(layer1)

0.8782979839179583