Based on:
https://jakevdp.github.io/PythonDataScienceHandbook/03.10-working-with-strings.html

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%config IPCompleter.use_jedi = False

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%%script false --no-raise-error

# The link below is corrupt (see https://github.com/fictivekin/openrecipes/issues/218)
#!curl -O http://openrecipes.s3.amazonaws.com/recipeitems-latest.json.gz
# Try this one:
!curl -O https://s3.amazonaws.com/openrecipes/20170107-061401-recipeitems.json.gz
!gunzip 20170107-061401-recipeitems.json.gz
!mv 20170107-061401-recipeitems.json ./data/20170107-061401-recipeitems.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29.3M  100 29.3M    0     0  46.4M      0 --:--:-- --:--:-- --:--:-- 46.3M


In [3]:
file = './data/20170107-061401-recipeitems.json'
try:
    recipes = pd.read_json(file)
except ValueError as e:
    print("ValueError:", e)

ValueError: Trailing data


In [4]:
with open(file) as f:
    line = f.readline()
    print(line)
try:
    pd.read_json(line).shape
except ValueError as e:
    print("ValueError:", e)

{ "_id" : { "$oid" : "5160756b96cc62079cc2db15" }, "name" : "Drop Biscuits and Sausage Gravy", "ingredients" : "Biscuits\n3 cups All-purpose Flour\n2 Tablespoons Baking Powder\n1/2 teaspoon Salt\n1-1/2 stick (3/4 Cup) Cold Butter, Cut Into Pieces\n1-1/4 cup Butermilk\n SAUSAGE GRAVY\n1 pound Breakfast Sausage, Hot Or Mild\n1/3 cup All-purpose Flour\n4 cups Whole Milk\n1/2 teaspoon Seasoned Salt\n2 teaspoons Black Pepper, More To Taste", "url" : "http://thepioneerwoman.com/cooking/2013/03/drop-biscuits-and-sausage-gravy/", "image" : "http://static.thepioneerwoman.com/cooking/files/2013/03/bisgrav.jpg", "ts" : { "$date" : 1365276011104 }, "cookTime" : "PT30M", "source" : "thepioneerwoman", "recipeYield" : "12", "datePublished" : "2013-03-11", "prepTime" : "PT10M", "description" : "Late Saturday afternoon, after Marlboro Man had returned home with the soccer-playing girls, and I had returned home with the..." }

ValueError: Protocol not known: { "_id" : { "$oid" : "5160756b96cc62079cc2db1

In [5]:
from io import StringIO

In [6]:
# read the entire file into a Python array
with open(file, 'r') as f:
    # Extract each line
    data = (line.strip() for line in f)
    # Reformat so each line is the element of a list
    data_json = "[{0}]".format(','.join(data))
# read the result as a JSON
recipes = pd.read_json(StringIO(data_json))

In [7]:
recipes.columns

Index(['_id', 'name', 'ingredients', 'url', 'image', 'ts', 'cookTime',
       'source', 'recipeYield', 'datePublished', 'prepTime', 'description',
       'totalTime', 'creator', 'recipeCategory', 'dateModified',
       'recipeInstructions'],
      dtype='object')

In [8]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [9]:
recipes.name[np.argmax(recipes.ingredients.str.len())]

'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [10]:
max_ingredients = recipes.ingredients.str.len()==9067
recipes.name[max_ingredients].values

array(['Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'],
      dtype=object)

In [11]:
recipes.description.str.contains('[Bb]reakfast').sum()

3524

In [12]:
recipes.ingredients.str.contains('[Pp]oop').sum()

1

In [13]:
# Oh boy!
poop_recipe = recipes.ingredients.str.contains('[Pp]oop')
print(recipes[poop_recipe].name.values)
print(recipes[poop_recipe].ingredients.values)

['Adam’s Big Kid Hot Chocolate']
['Adam’s Big Kid Hot Chocolate\n\nAs a disclaimer, I should mention that, like many of the better things in life, the Big Kid Hot Chocolate recipe is pretty simple (despite Phoebe’s latent desire to somehow make it into bruschetta or a crostini). But this recipe also involves a lot of gut feeling and improvisation, so if you’re fully sober while you’re making this concoction, you’re missing out on half of the fun.\n1 Full-Sized Nesquik hot chocolate mix carton (or another brand…basically just a poopload of hot chocolate)\n\n1 Handle of Vodka (you’re not going to use the whole thing, but I only measure vodka in handles and you can probably figure out something to do with what’s left over)\n\n1 bottle of Bailey’s']


In [14]:
# Not what you might think!
penis_recipe = recipes.description.str.contains('[Pp]enis').fillna(False)
print(recipes[penis_recipe].name)
print(recipes[penis_recipe].ingredients.values)
print(recipes[penis_recipe].description.values)

7988    Scialatielli Gambine &amp; Zuccheri Recipe
Name: name, dtype: object
['scialatielli 200gr for 2 persons\nthree zucchine small-medium ofr 2 persons\n24 shrimps for 2 persons\na bit of besciamella ready to use']
['Scialatielli is a homemade pasta typical of my beloved Penisola Sorrentina (Italy).As back-up solution you can use "fusilli lunghi" or, if you prefer short pasta...']


### Simple Recipe Recommender

To Do: frequency of some ingredients per year? (how many years are there?)
    