# Introduction
This notebook gathers all scraped recipes into a single dataset.

In [1]:
### imports
import json

import dask.bag as db
import dask.dataframe as dd

## Format verification
Ensure that all recipes have the same JSON structure.

In [2]:
def validate(recipe : dict):
    """Check that a scraped recipe carries every field the dataset needs.

    Parameters
    ----------
    recipe : dict
        A single decoded recipe record.

    Returns
    -------
    None
        Returns silently when the record is well-formed.

    Raises
    ------
    AssertionError
        If `recipe` lacks any of 'title', 'link', 'ingredients' or
        'directions', or does not support membership tests at all.
    """
    required = ('title', 'link', 'ingredients', 'directions')
    try:
        # Explicit membership checks instead of `assert`, so the validation
        # still runs when Python is started with -O.
        missing = [key for key in required if key not in recipe]
    except TypeError:
        # e.g. None or a number: `in` is unsupported, so no field can be present.
        missing = list(required)
    if missing:
        # Show the offending record so it can be located in the source files.
        print(recipe)
        raise AssertionError(
            'Invalid recipe format: missing ' + ', '.join(missing)
        )

# Source I
JSON files in the `packed` directories of the subfolders

In [3]:
%%bash
# List the packed JSON files found in each subfolder's `packed` directory.
ls ./*/packed/packed*

./allrecipes/packed/packed-0.json
./cookbooks/packed/packed-0.json
./cookbooks/packed/packed-1.json
./cookbooks/packed/packed-2.json
./cookbooks/packed/packed-3.json
./cookbooks/packed/packed-4.json
./cookbooks/packed/packed-5.json
./cookbooks/packed/packed-6.json
./cookbooks/packed/packed-7.json
./cookbooks/packed/packed-8.json
./cookbooks/packed/packed-9.json
./foodcom/packed/packed-0.json
./foodcom/packed/packed-1.json
./foodcom/packed/packed-2.json
./recipes-plus/packed/packed-0.json


In [4]:
# Subfolders whose `packed` directory holds the scraped recipes for Source I.
folders = ['cookbooks', 'allrecipes', 'foodcom', 'recipes-plus']

# Each line of a packed file is decoded with json.loads and then flattened,
# so every resulting bag element is a single recipe record.
# The per-folder frames are combined with one dd.concat call: DataFrame.append
# is deprecated/removed in current dask and pandas releases, and concat builds
# the combined frame in one step instead of one append per folder.
s1 = dd.concat([
    db.read_text('./' + folder + '/packed/packed-*.json')
      .map(json.loads)
      .flatten()
      .to_dataframe()
    for folder in folders
])

In [5]:
# Non-null count per column for Source I; compute() runs the lazy dask graph.
s1.count().compute()

title          1332521
ingredients    1332521
directions     1332521
link           1332521
dtype: int64

# Source II
JSON-line files in `0-mongo-dumps` directory

In [6]:
%%bash
# List the JSON-lines dump files in the 0-mongo-dumps directory.
ls ./0-mongo-dumps/scrapy_items*

./0-mongo-dumps/scrapy_items_epicurious.jsonl
./0-mongo-dumps/scrapy_items_food52.jsonl
./0-mongo-dumps/scrapy_items_myrecipes.jsonl
./0-mongo-dumps/scrapy_items_seriouseats.jsonl
./0-mongo-dumps/scrapy_items_tasteofhome.jsonl
./0-mongo-dumps/scrapy_items_tastykitchen.jsonl
./0-mongo-dumps/scrapy_items_yummly.jsonl


In [7]:
# Source II: every line of the dump files is decoded as one JSON record.
# The `_id` column is dropped so the columns line up with Source I
# (presumably it is the MongoDB document id - confirm against the dumps).
s2 = (
    db.read_text('./0-mongo-dumps/scrapy_items*')
    .map(json.loads)
    .to_dataframe()
    .drop(labels='_id', axis=1)
)

In [8]:
# Non-null count per column for Source II; compute() runs the lazy dask graph.
s2.count().compute()

title          422492
ingredients    422492
directions     422492
link           422492
dtype: int64

## Overall

In [9]:
# Stack both sources into one frame. dd.concat replaces DataFrame.append,
# which is deprecated/removed in current dask and pandas releases.
dataset = dd.concat([s1, s2])

In [10]:
# Sanity check: each column count should equal the sum of the s1 and s2 counts.
dataset.count().compute()

title          1755013
ingredients    1755013
directions     1755013
link           1755013
dtype: int64

In [11]:
# Preview the first rows of the combined dataset.
dataset.head()

Unnamed: 0,title,ingredients,directions,link
0,saucy shrimp casserole,"[1/4 c. margarine, 1/4 c. flour, 2 c. milk,...","[Make, cream, sauce, with, margarine, flour, m...",http://www.cookbooks.com/Recipe-Details.aspx?i...
1,no-bake nut cookies,"[1 c. firmly packed brown sugar, 1/2 c. evap...","[In a heavy 2-quart saucepan, mix brown sugar,...",http://www.cookbooks.com/Recipe-Details.aspx?i...
2,jewell ball's chicken,"[1 small jar chipped beef, cut up, 4 boned c...","[Place chipped beef on bottom of baking dish.,...",http://www.cookbooks.com/Recipe-Details.aspx?i...
3,creamy corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg....","[In a slow cooker, combine all ingredients. Co...",http://www.cookbooks.com/Recipe-Details.aspx?i...
4,chicken funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ...","[Boil and debone chicken., Put bite size piece...",http://www.cookbooks.com/Recipe-Details.aspx?i...


## Transform lists of directions and ingredients to JSON strings

In [12]:
# Look at the `ingredients` value of the first previewed row.
dataset.head()['ingredients'].iloc[0]

['1/4  c. margarine',
 '1/4  c. flour',
 '2  c. milk',
 '1  tsp. salt',
 '1/2  tsp. Worcestershire sauce',
 '  dash of pepper',
 '8  oz. Cracker Barrel cheese, shredded',
 '6  hard boiled eggs',
 '1  lb. cooked shrimp']

In [13]:
def _as_json_string(value):
    """Serialize `value` to a JSON string unless it already is a string."""
    # Strings pass through unchanged, so re-running this cell does not
    # double-encode columns that were already serialized.
    # NOTE(review): assumes raw ingredients/directions are never bare strings - confirm.
    return value if isinstance(value, str) else json.dumps(value)


# Safe to run more than once: already-serialized values are left as they are.
# assign() builds a new frame instead of patching columns via attribute assignment.
dataset = dataset.assign(
    ingredients=dataset.ingredients.map(_as_json_string),
    directions=dataset.directions.map(_as_json_string),
)

In [14]:
# Preview again to check how `ingredients` and `directions` look after json.dumps.
dataset.head()

Unnamed: 0,title,ingredients,directions,link
0,saucy shrimp casserole,"[""1/4 c. margarine"", ""1/4 c. flour"", ""2 c. ...","[""Make"", ""cream"", ""sauce"", ""with"", ""margarine,...",http://www.cookbooks.com/Recipe-Details.aspx?i...
1,no-bake nut cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. e...","[""In a heavy 2-quart saucepan, mix brown sugar...",http://www.cookbooks.com/Recipe-Details.aspx?i...
2,jewell ball's chicken,"[""1 small jar chipped beef, cut up"", ""4 bone...","[""Place chipped beef on bottom of baking dish....",http://www.cookbooks.com/Recipe-Details.aspx?i...
3,creamy corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) p...","[""In a slow cooker, combine all ingredients. C...",http://www.cookbooks.com/Recipe-Details.aspx?i...
4,chicken funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) ca...","[""Boil and debone chicken."", ""Put bite size pi...",http://www.cookbooks.com/Recipe-Details.aspx?i...


# Export

In [15]:
# Dataset version tag; it is embedded in the output paths of the export cells.
version = '0.9.2'

In [16]:
# Partitioned export: dask substitutes the `*` in the file name for each
# partition (see the dataset-00.csv, dataset-01.csv, ... paths it returns).
dataset.to_csv(
    './0-datasets/v-{}/dataset-*.csv'.format(version),
    index=False,
)

['/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-00.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-01.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-02.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-03.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-04.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-05.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-06.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-07.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-08.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-09.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-10.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-11.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-12.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9.2/dataset-13.csv',
 '/home/wojtek/recipes/scraping/0-datasets/v-0.9

In [17]:
# Second export of the same frame, this time written out as one CSV file.
dataset.to_csv(
    './0-datasets/v-{}-single.csv'.format(version),
    single_file=True,
    index=False,
)

['/home/wojtek/recipes/scraping/0-datasets/v-0.9.2-single.csv']