In [1]:
import pandas as pd
import dask.dataframe as dd
import json

In [2]:
# Read every CSV shard matching the glob into one Dask DataFrame, discard the
# bookkeeping columns, and rename two columns to the names used from here on.
recipes1M = (
    dd.read_csv('0-datasets/rec1M/rec1M-*.csv')
    .drop(['Unnamed: 0', 'id', 'partition'], axis=1)
    .rename(columns={"instructions": "directions", "url": "link"})
)

In [3]:
def _texts_from_json(raw):
    """Decode a JSON-encoded sequence of mappings and return their 'text' values.

    Parameters
    ----------
    raw : str
        JSON document; each decoded element is indexed with the key 'text'.

    Returns
    -------
    list
        The 'text' value of every element, in the original order.

    Raises
    ------
    json.JSONDecodeError
        If ``raw`` is not valid JSON.
    KeyError
        If an element has no 'text' key.
    """
    return [entry['text'] for entry in json.loads(raw)]


# Both columns hold the same JSON layout, so decode and extract in a single
# pass per column instead of four copy-pasted map() calls.
for _column in ('ingredients', 'directions'):
    recipes1M[_column] = recipes1M[_column].map(
        _texts_from_json,
        # Tell Dask the result is an object-dtype Series so it does not try to infer it.
        meta=pd.Series([], dtype=object, name=_column),
    )

In [4]:
# Evaluate the Dask DataFrame into `df`, then preview the first five rows.
df = recipes1M.compute()
df.head(5)

Unnamed: 0,ingredients,directions,title,link
0,"[6 ounces penne, 2 cups Beechers Flagship Chee...",[Preheat the oven to 350 F. Butter or oil an 8...,Worlds Best Mac and Cheese,http://www.epicurious.com/recipes/food/views/-...
1,"[1 c. elbow macaroni, 1 c. cubed American chee...",[Cook macaroni according to package directions...,Dilly Macaroni Salad Recipe,http://cookeatshare.com/recipes/dilly-macaroni...
2,"[8 tomatoes, quartered, Kosher salt, 1 red oni...",[Add the tomatoes to a food processor with a p...,Gazpacho,http://www.foodnetwork.com/recipes/gazpacho1.html
3,"[2 12 cups milk, 1 12 cups water, 14 cup butte...","[Preheat oven to 350 degrees Fahrenheit., Spra...",Crunchy Onion Potato Bake,http://www.food.com/recipe/crunchy-onion-potat...
4,"[1 (3 ounce) package watermelon gelatin, 14 cu...","[Dissolve Jello in boiling water., Allow to co...",Cool 'n Easy Creamy Watermelon Pie,http://www.food.com/recipe/cool-n-easy-creamy-...


In [5]:
# Summary of the link column (count / unique / top / freq).
df['link'].describe()

count                                               1029720
unique                                              1028642
top       http://www.kraftrecipes.com/recipes/surejell-a...
freq                                                     13
Name: link, dtype: object

In [6]:
def get_domain(url: str) -> str:
    """Return the host part of ``url``: the text after the first ``//`` up to the next ``/``.

    Parameters
    ----------
    url : str
        A URL such as ``'http://food.com/recipes'``. A URL without ``//``
        (e.g. ``'food.com/recipes'``) is treated as starting with the host.

    Returns
    -------
    str
        The host exactly as written in ``url``; no case folding or ``www.``
        stripping is applied. An empty string yields an empty string.
    """
    # partition() never raises, whereas split('//')[1] raised IndexError
    # for any URL that has no '//' separator.
    _scheme, separator, remainder = url.partition('//')
    if not separator:
        # No scheme separator present: the string already starts with the host.
        remainder = url
    # Everything before the first '/' of the remainder is the host.
    return remainder.split('/', 1)[0]


assert get_domain('http://food.com/recipes') == 'food.com'
assert get_domain('food.com/recipes') == 'food.com'

In [7]:
# Derived per-recipe values: host of the recipe link and the lengths of the
# ingredient and direction lists.
source = df.link.map(get_domain)
ingredients_count = df.ingredients.map(len)
directions_count = df.directions.map(len)

# Assign by label rather than DataFrame.insert(): insert() raises ValueError when
# the column already exists, so the cell could not be run twice. New labels are
# appended after the last column (same placement as positions 4-6 on a
# four-column frame); existing labels are simply overwritten.
df['source'] = source
df['ingredients_count'] = ingredients_count
df['directions_count'] = directions_count

In [8]:
# Per-source summary: number of titles, plus min / mean / max of both count columns.
df.groupby('source').agg({
    'title': 'count',
    **{
        count_column: ['min', 'mean', 'max']
        for count_column in ('ingredients_count', 'directions_count')
    },
})

Unnamed: 0_level_0,title,ingredients_count,ingredients_count,ingredients_count,directions_count,directions_count,directions_count
Unnamed: 0_level_1,count,min,mean,max,min,mean,max
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
allrecipes.com,49006,1,9.078847,39,1,8.488471,52
cookeatshare.com,60628,1,8.290427,49,1,9.645197,126
cooking.nytimes.com,17453,1,9.828797,63,1,12.773964,61
cookpad.com,61438,1,8.205231,99,1,10.522006,163
online-cookbook.com,5763,1,8.816589,44,1,8.68211,58
recipeland.com,27332,1,9.954888,49,1,10.751903,151
tastykitchen.com,75537,1,9.752916,44,1,13.702424,116
www.chowhound.com,6361,1,9.013363,37,1,11.610124,89
www.comidakraft.com,1,6,6.0,6,6,6.0,6
www.cookstr.com,9240,1,10.413636,54,1,14.652165,148


In [9]:
# Whole-dataset summary: number of titles, plus min / mean / median / max of
# both count columns.
df.agg({
    'title': 'count',
    **{
        count_column: ['min', 'mean', 'median', 'max']
        for count_column in ('ingredients_count', 'directions_count')
    },
})

Unnamed: 0,title,ingredients_count,directions_count
count,1029720.0,,
max,,99.0,167.0
mean,,9.328161,10.456821
median,,9.0,9.0
min,,1.0,1.0
