In [2]:
import csv
import json
import re
import sys
import warnings
from collections import Counter
from pprint import pprint

sys.path.append('..')

import nltk
import numpy as np
from tqdm import tqdm
warnings.filterwarnings('ignore')

import pandas as pd
from nltk.corpus import stopwords

# The star import stays last so any name it provides still takes precedence;
# the explicit imports above only make csv/Counter/pprint/pd available on a fresh kernel.
from utilities.helpers import *

In [3]:
# load_data() is not defined in this notebook; presumably it comes from
# `from utilities.helpers import *` — TODO confirm what the two returned objects contain.
user_data, recipes_users = load_data()

In [4]:
# Id of the recipe inspected by the exploratory cells below.
recipe_id = 6675

In [5]:
# Fetch the record of the selected recipe. The printed output below shows a dict with
# keys such as 'cook_time', 'description', 'ingredients' and 'instructions'.
recipe_data = load_recipe(recipe_id)

In [6]:
# Pretty-print the whole recipe record to inspect its structure.
pprint(recipe_data)

{'cook_time': '30 m',
 'description': '\r\n'
                '"A soft, moist cornbread center with a crunchy '
                'crust."        ',
 'featured_in_magazine': False,
 'ingredients': [{'id': '16317', 'text': '1 egg'},
                 {'id': '16278', 'text': '1 1/3 cups milk'},
                 {'id': '6379', 'text': '1/4 cup vegetable oil'},
                 {'id': '166', 'text': '2 cups self-rising corn meal mix'},
                 {'id': '4315', 'text': '1 (8 ounce) can cream-style corn'},
                 {'id': '16261', 'text': '1 cup sour cream'}],
 'instructions': ['Heat oven to 425 degrees F (220 degrees C). Grease a 9 inch '
                  'iron skillet.',
                  'In a large bowl, beat the egg. Add milk, oil, sour cream, '
                  'cream corn, and cornmeal mix; stir until cornmeal is just '
                  'dampened. Pour batter into greased skillet.',
                  'Bake for 25 to 30 minutes, or until knife inserted in '
              

In [7]:
def recipe_to_pd(recipe_id):
    """Return the nutrition facts of one recipe as a DataFrame.

    The recipe is loaded with ``load_recipe(recipe_id)`` and its ``'nutri_facts'``
    mapping (nutrient name -> text combining an amount and a unit) is split into
    one row per nutrient.

    Parameters
    ----------
    recipe_id
        Identifier forwarded unchanged to ``load_recipe``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Nutrient'``, ``'Unit'`` and ``'Value'``. ``'Value'`` holds the
        matched number as text (it is not converted to a numeric type), or ``None``
        when the nutrient text contains no number. ``'Unit'`` is the remaining text
        with all spaces removed.
    """
    # Optional sign, integer/decimal mantissa, optional exponent.
    number_pattern = r"[+-]? *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?"
    nutri_facts = load_recipe(recipe_id)['nutri_facts']

    nutrient_names = []
    values = []
    units = []
    for nutrient, value_unit in nutri_facts.items():
        numbers = re.findall(number_pattern, value_unit)
        if numbers:
            value = numbers[0]
            # Whatever is left once the number (and spaces) is removed is the unit.
            unit = value_unit.replace(value, '').replace(' ', '')
        else:
            # No numeric part (previously an IndexError): keep the text as the unit.
            value = None
            unit = value_unit.replace(' ', '')
        nutrient_names.append(nutrient)
        values.append(value)
        units.append(unit)

    return pd.DataFrame(data={'Nutrient': nutrient_names, 'Unit': units, 'Value': values})

In [8]:
# Show the nutrition table for recipe 6675 (the same id assigned to recipe_id above).
recipe_to_pd(6675)

Unnamed: 0,Nutrient,Unit,Value
0,sodiumContent,mg,772.0
1,carbohydrateContent,g,40.8
2,fatContent,g,15.9
3,proteinContent,g,6.4
4,calories,kcal,328.0
5,cholesterolContent,mg,39.0


In [9]:
# Parallel lists describing the daily reference amount of each nutrient:
# position i of `nutrients`, `units` and `values` belongs to the same nutrient.
# NOTE(review): the amounts look like daily values for a 2,000 kcal diet — confirm the source.
# NOTE(review): `units` is rebound to the keys of `unit_converter` further down the notebook,
# so re-running this cell afterwards changes what `units` refers to.
nutrients = ['fatContent',  'cholesterolContent', 'sodiumContent',  'carbohydrateContent', 'proteinContent']
units = ['g',  'mg', 'mg', 'g', 'g']
values = [65, 300, 2400, 300, 50]

In [10]:
# Daily-value reference table: one row per nutrient, built from the parallel
# `nutrients`, `units` and `values` lists.
dvr = pd.DataFrame(data={'Nutrient': nutrients, 'Unit': units, 'Value': values})
print('There are {} nutrients defining the daily reference.'.format(len(dvr)))
dvr.head()

There are 5 nutrients defining the daily reference.


Unnamed: 0,Nutrient,Unit,Value
0,fatContent,g,65
1,cholesterolContent,mg,300
2,sodiumContent,mg,2400
3,carbohydrateContent,g,300
4,proteinContent,g,50


In [11]:
# Keep only the free-text line of every ingredient of the selected recipe.
raw_ingredients = [ingredient['text'] for ingredient in recipe_data['ingredients']]

In [12]:
# Display the ingredient lines extracted above.
raw_ingredients

['1 egg',
 '1 1/3 cups milk',
 '1/4 cup vegetable oil',
 '2 cups self-rising corn meal mix',
 '1 (8 ounce) can cream-style corn',
 '1 cup sour cream']

# Recipe similarity

In [15]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Shared NLTK helpers: `stemmer` is used by get_quantity_unit to normalise unit words,
# `lemmatiser` by the cell that normalises `raw_ingredients`.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

In [16]:
# Lemmatise every token of each ingredient line and write the result back in place
# (raw_ingredients keeps its identity; only its elements are replaced).
for position in range(len(raw_ingredients)):
    tokens = nltk.word_tokenize(raw_ingredients[position])
    lemmas = [lemmatiser.lemmatize(token) for token in tokens]
    raw_ingredients[position] = " ".join(lemmas)


In [17]:
from __future__ import division
from fractions import Fraction

def convert_fraction(measurement):
    """Add up the whitespace-separated numbers of ``measurement``.

    Each piece is parsed with ``fractions.Fraction`` (so ``"1/3"``, ``"2"`` and
    ``"1.5"`` are accepted) and converted to ``float`` before being added, which
    turns a mixed number such as ``"1 1/2"`` into ``1.5``.

    Returns the float total, or the integer ``0`` when ``measurement`` holds no
    pieces. Raises ``ValueError`` when a piece is not a valid ``Fraction`` literal.
    """
    total = 0
    for piece in measurement.split():
        total += float(Fraction(piece))
    return total

In [18]:
# Read one recipe's nutrition file into a DataFrame.
# NOTE(review): the path points at recipe 6676 while recipe_id above is 6675, and
# `nutrition` is not referenced by any other visible cell — confirm it is still needed.
nutrition = pd.read_json('../data/recipe_data/6676/nutrition.json')

In [19]:
# Load the same nutrition file as a plain dict; its 'nutrition' keys are used below
# to pick the nutrient columns of the recipes table.
with open ('../data/recipe_data/6676/nutrition.json') as json_file:
    nutrition_json = json.load(json_file)

In [20]:
# List, alphabetically, the nutrient names present in the sample nutrition file.
sorted(nutrition_json['nutrition'].keys())

['calcium',
 'calories',
 'caloriesFromFat',
 'carbohydrates',
 'cholesterol',
 'fat',
 'fiber',
 'folate',
 'iron',
 'magnesium',
 'niacin',
 'potassium',
 'protein',
 'saturatedFat',
 'sodium',
 'sugars',
 'thiamin',
 'vitaminA',
 'vitaminB6',
 'vitaminC']

In [21]:
# Columns to keep: recipe metadata followed by the nutrient names found in the
# sample nutrition file, in alphabetical order.
columns = ['id','total_time','ingredients', 'instructions', 'health_score'] + sorted(nutrition_json['nutrition'])

# Read every recipe, keep only those columns and index the table by recipe id.
recipes_data = pd.read_json('../data/recipes.json')[columns].set_index('id')
recipes_data.head()

Unnamed: 0_level_0,total_time,ingredients,instructions,health_score,calcium,calories,caloriesFromFat,carbohydrates,cholesterol,fat,...,niacin,potassium,protein,saturatedFat,sodium,sugars,thiamin,vitaminA,vitaminB6,vitaminC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100008,45 m,"[{'text': '8 ounces fresh young ginger root, p...",[Cut the ginger into chunks and place them int...,4,"{'amount': 1.171771, 'hasCompleteData': True, ...","{'amount': 13.6625, 'hasCompleteData': True, '...","{'amount': 0.47250000000000003, 'hasCompleteDa...","{'amount': 3.326817, 'hasCompleteData': True, ...","{'amount': 0.0, 'hasCompleteData': True, 'disp...","{'amount': 0.052500000000000005, 'hasCompleteD...",...,"{'amount': 0.0665, 'hasCompleteData': False, '...","{'amount': 29.76198, 'hasCompleteData': False,...","{'amount': 0.1274, 'hasCompleteData': True, 'd...","{'amount': 0.01421, 'hasCompleteData': True, '...","{'amount': 83.41, 'hasCompleteData': True, 'di...","{'amount': 2.200458, 'hasCompleteData': True, ...","{'amount': 0.00161, 'hasCompleteData': False, ...","{'amount': 0.0, 'hasCompleteData': True, 'disp...","{'amount': 0.0112, 'hasCompleteData': False, '...","{'amount': 0.35000000000000003, 'hasCompleteDa..."
10001,,"[{'id': '16317', 'text': '4 eggs'}, {'id': '14...",[Whisk the eggs with the sugar and salt until ...,8,"{'amount': 96.83675, 'hasCompleteData': True, ...","{'amount': 357.6104, 'hasCompleteData': True, ...","{'amount': 203.0924, 'hasCompleteData': True, ...","{'amount': 35.48761, 'hasCompleteData': True, ...","{'amount': 102.6708, 'hasCompleteData': True, ...","{'amount': 22.56582, 'hasCompleteData': True, ...",...,"{'amount': 2.7157400000000003, 'hasCompleteDat...","{'amount': 152.4793, 'hasCompleteData': True, ...","{'amount': 5.999063, 'hasCompleteData': True, ...","{'amount': 10.78192, 'hasCompleteData': True, ...","{'amount': 196.5283, 'hasCompleteData': True, ...","{'amount': 17.2474, 'hasCompleteData': True, '...","{'amount': 0.2310342, 'hasCompleteData': True,...","{'amount': 555.8644, 'hasCompleteData': True, ...","{'amount': 0.08333621000000001, 'hasCompleteDa...","{'amount': 0.5352042, 'hasCompleteData': True,..."
100011,1 h,"[{'text': '2 tablespoons vegetable oil', 'id':...",[Heat 1 tablespoon vegetable oil in a skillet ...,7,"{'percentDailyValue': '8', 'hasCompleteData': ...","{'percentDailyValue': '24', 'hasCompleteData':...","{'percentDailyValue': '-', 'hasCompleteData': ...","{'percentDailyValue': '9', 'hasCompleteData': ...","{'percentDailyValue': '26', 'hasCompleteData':...","{'percentDailyValue': '41', 'hasCompleteData':...",...,"{'percentDailyValue': '99', 'hasCompleteData':...","{'percentDailyValue': '24', 'hasCompleteData':...","{'percentDailyValue': '66', 'hasCompleteData':...","{'percentDailyValue': '42', 'hasCompleteData':...","{'percentDailyValue': '54', 'hasCompleteData':...","{'percentDailyValue': '0', 'hasCompleteData': ...","{'percentDailyValue': '96', 'hasCompleteData':...","{'percentDailyValue': '< 1', 'hasCompleteData'...","{'percentDailyValue': '46', 'hasCompleteData':...","{'percentDailyValue': '11', 'hasCompleteData':..."
10002,,"[{'id': '16421', 'text': '1/2 teaspoon salt'},...",[Preheat oven to 325 degrees F (170 degrees C)...,6,"{'amount': 13.48675, 'hasCompleteData': True, ...","{'amount': 199.9955, 'hasCompleteData': True, ...","{'amount': 104.309, 'hasCompleteData': True, '...","{'amount': 21.94938, 'hasCompleteData': True, ...","{'amount': 12.44067, 'hasCompleteData': True, ...","{'amount': 11.58989, 'hasCompleteData': True, ...",...,"{'amount': 2.251066, 'hasCompleteData': True, ...","{'amount': 79.8545, 'hasCompleteData': True, '...","{'amount': 3.463482, 'hasCompleteData': True, ...","{'amount': 2.7348, 'hasCompleteData': True, 'd...","{'amount': 127.2791, 'hasCompleteData': True, ...","{'amount': 14.64034, 'hasCompleteData': True, ...","{'amount': 0.07540625000000001, 'hasCompleteDa...","{'amount': 17.19408, 'hasCompleteData': True, ...","{'amount': 0.05833117, 'hasCompleteData': True...","{'amount': 0.001016667, 'hasCompleteData': Tru..."
10003,35 m,"[{'text': '1 cup butter', 'id': '16157'}, {'te...",[Preheat oven to 350 degrees F (175 degrees C)...,6,"{'percentDailyValue': '< 1', 'hasCompleteData'...","{'percentDailyValue': '6', 'hasCompleteData': ...","{'percentDailyValue': '-', 'hasCompleteData': ...","{'percentDailyValue': '4', 'hasCompleteData': ...","{'percentDailyValue': '7', 'hasCompleteData': ...","{'percentDailyValue': '12', 'hasCompleteData':...",...,"{'percentDailyValue': '5', 'hasCompleteData': ...","{'percentDailyValue': '< 1', 'hasCompleteData'...","{'percentDailyValue': '2', 'hasCompleteData': ...","{'percentDailyValue': '24', 'hasCompleteData':...","{'percentDailyValue': '2', 'hasCompleteData': ...","{'percentDailyValue': '0', 'hasCompleteData': ...","{'percentDailyValue': '6', 'hasCompleteData': ...","{'percentDailyValue': '5', 'hasCompleteData': ...","{'percentDailyValue': '< 1', 'hasCompleteData'...","{'percentDailyValue': '0', 'hasCompleteData': ..."


In [22]:
# Multiplier applied to a parsed quantity for each recognised unit. Keys are compared
# against Porter-stemmed tokens (hence spellings such as 'ounc' and 'millilit').
# NOTE(review): the factors mix weights and volumes (grams / millilitres, presumably) and
# several are rough estimates (e.g. 'bunch', 'stalk', 'cup') — confirm the intended scale.
# Fixed: 'ml' was 100, contradicting 'mL' and 'millilit' (both 1); 'quart' was 0.001,
# contradicting 'gallon' (3785) and 'pint' (500) — a quart is a quarter gallon (~946).
unit_converter = {
    'gallon': 3785,
    'bag': 1,
    'package': 1,
    'roll': 1,
    'quart': 946,
    'count': 1,
    'bone': 1,
    'lb': 453,
    'ml': 1,
    'g': 1,
    'oz': 28.3,
    'millilit': 1,
    'no_unit': 1,
    'bunch': 150,
    'stalk': 200,
    'pinch': 1,
    'pint': 500,
    'inch': 1,
    'clove': 1,
    'cup': 340,
    'tablespoon': 14.3,
    'pound': 453,
    'ounc': 28.3,
    'teaspoon': 4.2,
    'kg': 1000,
    'gram': 1,
    'liter': 1000,
    'deciliter': 100,
    'mL': 1,
}


In [23]:
# Set of recognised unit names, read as a global by get_quantity_unit.
# NOTE(review): this rebinds `units`, which an earlier cell uses for the list of
# nutrient units of the daily reference table.
units = unit_converter.keys()

In [25]:
def get_quantity_unit(ingredient):
    """Extract a quantity and a unit name from one ingredient line.

    Parameters
    ----------
    ingredient : str
        Free-text ingredient line; a quantity is only recognised at its very start.

    Returns
    -------
    tuple
        ``(quantity, unit)``:

        * no leading number -> ``(1, 'no_unit')``;
        * leading number directly followed by a parenthesised amount whose last word
          stems to a key of ``unit_converter`` -> the amount inside the parentheses
          and that stemmed unit;
        * parenthesised amount without a known unit -> the value of the parenthesised
          text and the stem of the first word after the parentheses (this stem is
          not checked against ``unit_converter``);
        * otherwise -> the leading number and the first token of the line whose stem
          is a key of ``unit_converter``, or ``'no_unit'`` when there is none.

    Raises
    ------
    ValueError
        From ``convert_fraction`` when a matched piece is not a valid ``Fraction``
        literal.
    IndexError
        When a parenthesised amount without a known unit is the end of the line.
    """
    # Raw strings: the original literals relied on the invalid escapes '\/' and '\(',
    # which newer Python versions warn about. The patterns themselves are unchanged.
    quantity_pattern = r'[-]?[0-9]+[,. ]?[0-9]*([\/][0-9]+[,.]?[0-9]*)*'
    package_pattern = r'\(' + quantity_pattern + r' ?[a-z]*[A-Z]*\)'

    amount = re.match(quantity_pattern, ingredient)
    if amount is None:
        return 1, 'no_unit'

    package = re.match(package_pattern, ingredient[amount.end():])
    if package is None:
        # Membership is tested against unit_converter itself rather than the global
        # `units`, because another cell binds `units` to the nutrient-unit list.
        for token in nltk.word_tokenize(ingredient):
            stem = stemmer.stem(token)
            if stem in unit_converter:
                return convert_fraction(amount.group()), stem
        return convert_fraction(amount.group()), 'no_unit'

    # NOTE(review): from here on the leading number is discarded, so a line such as
    # "2 (8 ounce) cans" yields 8 rather than 16 — confirm this is intended.
    inner = package.group()[1:-1]
    inner_unit = stemmer.stem(inner.split()[-1])
    if inner_unit in unit_converter:
        inner_amount = re.match(quantity_pattern, inner)
        return convert_fraction(inner_amount.group()), inner_unit

    remainder = ingredient[amount.end() + len(package.group()):]
    return convert_fraction(inner), stemmer.stem(remainder.split()[0])


In [28]:
# Flatten the per-recipe ingredient lists into (text, id) pairs, one per ingredient line.
ings = [
    (entry['text'], entry['id'])
    for ingredient_list in recipes_data['ingredients']
    for entry in ingredient_list
]

In [64]:
# Load the cleaned recipes file; later cells read its 'ingredients' column and index.
recipes_copy = pd.read_json('../data/recipes_clean.json')

In [None]:
# NOTE(review): this cell repeats, statement for statement, the earlier cell that loads
# '../data/recipes.json' into `recipes_data`; running it simply rebuilds the same table.
recipes_data = pd.read_json('../data/recipes.json')
columns = ['id','total_time','ingredients', 'instructions', 'health_score'] + sorted(nutrition_json['nutrition'].keys())
recipes_data = recipes_data[columns]
recipes_data = recipes_data.set_index('id')
recipes_data.head()

In [108]:
# Export the cleaned recipes with header and index. The path is handed to pandas
# directly so it controls newline handling and writes UTF-8; a text handle opened
# without newline='' can produce blank lines between rows on Windows, and its
# encoding depends on the locale.
recipes_copy.to_csv('../data/clean_recipes.csv', header=True, index=True)

In [55]:
# Distinct ingredient ids that occur anywhere in the cleaned recipes.
all_ingredients = {
    entry['id']
    for ingredient_list in recipes_copy['ingredients']
    for entry in ingredient_list
}

## Recipes bitmap

In [164]:
# NOTE(review): `full_ingredients` and `build_ingredients_vector` are not defined in the
# visible cells — presumably they come from utilities.helpers or a removed cell.
columns = sorted(full_ingredients)

# newline='' is what the csv module expects for files it writes; without it extra
# blank lines can appear between rows on Windows.
with open('../data/ingredients_bitmap.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['id']+columns)
    writer.writeheader()

    # One reusable row: every ingredient flag starts at 0.
    recipe_bitmap_vector = dict.fromkeys(full_ingredients, 0)

    for recipe_id in tqdm(recipes_data.index):
        recipe_bitmap_vector['id'] = recipe_id

        # Build the recipe's ingredient list once and reuse it for set and reset.
        present_ingredients = list(build_ingredients_vector(recipe_id))
        for ingredient in present_ingredients:
            recipe_bitmap_vector[ingredient] = 1

        writer.writerow(recipe_bitmap_vector)

        # Clear only the flags that were just set, ready for the next recipe.
        for ingredient in present_ingredients:
            recipe_bitmap_vector[ingredient] = 0

100%|██████████| 44065/44065 [03:40<00:00, 199.47it/s]


## Recipes quantity map

In [127]:
# Number of distinct ingredient ids, i.e. of ingredient columns in the quantity map.
len(all_ingredients)

6253

In [None]:
# NOTE(review): these three lists are never filled by this cell; they are kept in case
# other cells rely on them.
to_convert = []
critical = []
rows = []
columns = sorted(all_ingredients)

# newline='' is what the csv module expects for files it writes.
with open('ingredients_quantity_map.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['id']+columns)
    writer.writeheader()

    for recipe_id in tqdm(recipes_copy.index):
        # Fresh accumulator per recipe. The previous version reused one Counter and only
        # reset it after a successful write, so a recipe that raised half-way leaked its
        # partial quantities into the next recipe's row.
        recipe_bitmap_vector = Counter(dict.fromkeys(columns, 0))
        recipe_bitmap_vector['id'] = recipe_id
        ingredient = None
        try:
            for ingredient in recipes_copy.at[recipe_id, 'ingredients']:
                quantity, unit = get_quantity_unit(ingredient['text'])
                # Scale the parsed quantity by the factor registered for its unit.
                recipe_bitmap_vector[ingredient['id']] += quantity*unit_converter[unit]
            writer.writerow(recipe_bitmap_vector)
        except Exception as e:
            # Best effort: a recipe that cannot be parsed or written is reported with the
            # ingredient being processed, and its row is left out of the file.
            print(e, recipe_id, ingredient)


In [9]:
import pandas as pd

In [47]:
# Reload the quantity map written above, indexed by recipe id. The separator is passed
# by keyword: pandas 2.x no longer accepts it as a positional argument.
quantity_map = pd.read_csv('ingredients_quantity_map.csv', sep=',', index_col='id')

In [49]:
# Load the ingredient bitmap, indexed by recipe id. The separator is passed by keyword:
# pandas 2.x no longer accepts it as a positional argument.
# NOTE(review): this reads 'clean_ingredients_bitmap.csv', whereas the bitmap cell in this
# notebook writes '../data/ingredients_bitmap.csv' — confirm which file is intended.
bitmap = pd.read_csv('../data/clean_ingredients_bitmap.csv', sep=',', index_col='id')