# Dataset aggregation into one parquet file

In [None]:
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import hvplot.pandas  # noqa
import json
import ast
import random
from fastparquet import write
hvplot.extension("bokeh")

In [44]:
# Empty accumulator table; the dataset sections below append their recipes to it.
output_table = pd.DataFrame(
    columns=[
        "title",
        "ingredients",
        "directions",
        "keywords",
    ]
)

## Food.com - Recipes

In [None]:
# Load the Food.com recipes and report the frame's size and column dtypes.
food_com_recipes = pd.read_parquet(
    './receipe_datasets/food.com/recipes.parquet'
)
# shape is (rows, columns); both counts are kept as globals for later cells.
nRow = food_com_recipes.shape[0]
nCol = food_com_recipes.shape[1]
print(f'food_com_recipes: There are {nRow} rows and {nCol} columns')
print(food_com_recipes.dtypes)

In [35]:
# Append every Food.com recipe to the shared output table.
#
# The frame is converted column-wise over ALL of its rows. Bounding a row loop
# by nCol (the column count) copies only the first few recipes, and growing
# output_table one `.loc` row at a time is quadratic on a large frame.
# Working from the frame itself also avoids depending on the nRow/nCol globals,
# which another dataset's load cell overwrites.
food_com_rows = (
    food_com_recipes[['Name', 'RecipeIngredientParts', 'RecipeInstructions', 'Keywords']]
    .rename(columns={
        'Name': 'title',
        'RecipeIngredientParts': 'ingredients',
        'RecipeInstructions': 'directions',
        'Keywords': 'keywords',
    })
    # Titles are forced to plain strings, element by element (str(value)).
    .assign(title=lambda rows: rows['title'].map(str))
    # Drop the source index so the rows line up positionally when appended.
    .reset_index(drop=True)
)

# Print the first converted record as a quick sanity check.
if len(food_com_rows) > 0:
    print(food_com_rows.iloc[0].tolist())

# Single append instead of per-row inserts; re-running this cell appends again.
output_table = pd.concat([output_table, food_com_rows], ignore_index=True)

output_table.head()

['Low-Fat Berry Blue Frozen Dessert', ['blueberries', 'granulated sugar', 'vanilla yogurt', 'lemon juice'], ['Toss 2 cups berries with sugar.', 'Let stand for 45 minutes, stirring occasionally.', 'Transfer berry-sugar mixture to food processor.', 'Add yogurt and process until smooth.', "Strain through fine sieve. Pour into baking pan (or transfer to ice cream maker and process according to manufacturers' directions). Freeze uncovered until edges are solid but centre is soft.  Transfer to processor and blend until smooth again.", 'Return to pan and freeze until edges are solid.', 'Transfer to processor and blend until smooth again.', 'Fold in remaining 2 cups of blueberries.', 'Pour into plastic mold and freeze overnight. Let soften slightly to serve.'], ['Dessert', 'Low Protein', 'Low Cholesterol', 'Healthy', 'Free Of...', 'Summer', 'Weeknight', 'Freezer', 'Easy']]


Unnamed: 0,title,ingredients,directions,keywords
0,Low-Fat Berry Blue Frozen Dessert,"[blueberries, granulated sugar, vanilla yogurt...","[Toss 2 cups berries with sugar., Let stand fo...","[Dessert, Low Protein, Low Cholesterol, Health..."
1,Biryani,"[saffron, milk, hot green chili peppers, onion...",[Soak saffron in warm milk for 5 minutes and p...,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ..."
2,Best Lemonade,"[sugar, lemons, rind of, lemon, zest of, fresh...","[Into a 1 quart Jar with tight fitting lid, pu...","[Low Protein, Low Cholesterol, Healthy, Summer..."
3,Carina's Tofu-Vegetable Kebabs,"[extra firm tofu, eggplant, zucchini, mushroom...","[Drain the tofu, carefully squeezing out exces...","[Beans, Vegetable, Low Cholesterol, Weeknight,..."
4,Cabbage Soup,"[plain tomato juice, cabbage, onion, carrots, ...","[Mix everything together and bring to a boil.,...","[Low Protein, Vegan, Low Cholesterol, Healthy,..."


## Epicurious.com - Recipes

In [40]:
# Load the Epicurious recipes CSV; no row limit is passed, so the whole file is read.
epicurious_recipes = pd.read_csv(
    './receipe_datasets/Epicurious Recepies Kagle.csv',
    delimiter=',',
)
# shape is (rows, columns); both counts are kept as globals for later cells.
nRow = epicurious_recipes.shape[0]
nCol = epicurious_recipes.shape[1]
print(f'epicurious_recipes: There are {nRow} rows and {nCol} columns')
print(epicurious_recipes.dtypes)

epicurious_recipes: There are 13501 rows and 6 columns
Unnamed: 0              int64
Title                  object
Ingredients            object
Instructions           object
Image_Name             object
Cleaned_Ingredients    object
dtype: object


In [45]:
# Append every Epicurious recipe to the shared output table.
#
# Each output column is built over ALL rows of the frame. Bounding a row loop
# by nCol (the column count) copies only the first few recipes, and growing
# output_table one `.loc` row at a time is quadratic on a large frame.
epicurious_rows = pd.DataFrame({
    'title': [str(title) for title in epicurious_recipes['Title']],
    # Each Cleaned_Ingredients cell is parsed as a Python literal (a stringified list).
    'ingredients': [
        ast.literal_eval(str(raw))
        for raw in epicurious_recipes['Cleaned_Ingredients']
    ],
    # One direction entry per line of the instruction text.
    'directions': [
        str(text).split('\n')
        for text in epicurious_recipes['Instructions']
    ],
    # No keywords are taken from this dataset; each row gets its own empty list.
    'keywords': [[] for _ in range(len(epicurious_recipes))],
})

# Print the first converted record as a quick sanity check.
if len(epicurious_rows) > 0:
    print(epicurious_rows.iloc[0].tolist())

# Single append instead of per-row inserts; re-running this cell appends again.
output_table = pd.concat([output_table, epicurious_rows], ignore_index=True)

output_table.head()

['Miso-Butter Roast Chicken With Acorn Squash Panzanella', ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher salt, divided, plus more', '2 small acorn squash (about 3 lb. total)', '2 Tbsp. finely chopped sage', '1 Tbsp. finely chopped rosemary', '6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature', '¼ tsp. ground allspice', 'Pinch of crushed red pepper flakes', 'Freshly ground black pepper', '⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups)', '2 medium apples (such as Gala or Pink Lady; about 14 oz. total), cored, cut into 1" pieces', '2 Tbsp. extra-virgin olive oil', '½ small red onion, thinly sliced', '3 Tbsp. apple cider vinegar', '1 Tbsp. white miso', '¼ cup all-purpose flour', '2 Tbsp. unsalted butter, room temperature', '¼ cup dry white wine', '2 cups unsalted chicken broth', '2 tsp. white miso', 'Kosher salt', 'freshly ground pepper'], ['Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twi

Unnamed: 0,title,ingredients,directions,keywords
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...","[Pat chicken dry with paper towels, season all...",[]
1,Crispy Salt and Pepper Potatoes,"[2 large egg whites, 1 pound new potatoes (abo...",[Preheat oven to 400°F and line a rimmed bakin...,[]
2,Thanksgiving Mac and Cheese,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...",[Place a rack in middle of oven; preheat to 40...,[]
3,Italian Sausage and Bread Stuffing,"[1 (¾- to 1-pound) round Italian loaf, cut int...",[Preheat oven to 350°F with rack in middle. Ge...,[]
4,Newton's Law,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",[Stir together brown sugar and hot water in a ...,[]
