In [23]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
# Read recipes.parquet with pyarrow and convert the resulting table to a pandas DataFrame.
df: pd.DataFrame = (
    pq.read_table('recipes.parquet')
    .to_pandas()
)

In [24]:
# Columns retained for the analysis, assembled from three groups (order matters:
# it is the column order of the frames and CSV files derived from this list).
COLUMNS_TO_KEEP = (
    # Recipe identity and descriptive text
    ['RecipeId', 'Name', 'AuthorName', 'Description']
    # Category plus the two list-valued tag columns
    + ['RecipeCategory', 'Keywords', 'RecipeIngredientParts']
    # Per-recipe nutrition values
    + [
        'Calories',
        'FatContent',
        'SaturatedFatContent',
        'CholesterolContent',
        'SodiumContent',
        'CarbohydrateContent',
        'FiberContent',
        'SugarContent',
        'ProteinContent',
    ]
)

# Narrow the raw frame to the columns of interest and preview the first rows.
df_columns = df.loc[:, COLUMNS_TO_KEEP]
df_columns.head()

Unnamed: 0,RecipeId,Name,AuthorName,Description,RecipeCategory,Keywords,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
0,38.0,Low-Fat Berry Blue Frozen Dessert,Dancer,Make and share this Low-Fat Berry Blue Frozen ...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...","[blueberries, granulated sugar, vanilla yogurt...",170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2
1,39.0,Biryani,elly9812,Make and share this Biryani recipe from Food.com.,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[saffron, milk, hot green chili peppers, onion...",1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4
2,40.0,Best Lemonade,Stephen Little,This is from one of my first Good House Keepi...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...","[sugar, lemons, rind of, lemon, zest of, fresh...",311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3
3,41.0,Carina's Tofu-Vegetable Kebabs,Cyclopz,This dish is best prepared a day in advance to...,Soy/Tofu,"[Beans, Vegetable, Low Cholesterol, Weeknight,...","[extra firm tofu, eggplant, zucchini, mushroom...",536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3
4,42.0,Cabbage Soup,Duckie067,Make and share this Cabbage Soup recipe from F...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[plain tomato juice, cabbage, onion, carrots, ...",103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3


In [25]:
# Distinct keywords across all recipes: flatten every recipe's keyword
# collection into a single set, then report how many there are.
keywords = {
    keyword
    for recipe_keywords in df_columns['Keywords']
    for keyword in recipe_keywords
}
len(keywords)

315

In [26]:
# Distinct ingredient names across all recipes: flatten every recipe's
# ingredient collection into a single set, then report how many there are.
ingredients = {
    ingredient
    for recipe_ingredients in df_columns['RecipeIngredientParts']
    for ingredient in recipe_ingredients
}
len(ingredients)

7368

In [27]:
# Distinct values of the (scalar) RecipeCategory column and their count.
categories = set(df_columns['RecipeCategory'])
len(categories)

312

In [28]:
from typing import List

def nutrient_score(
    nutients: List[float],
    a: float = 0.1,
    *,
    low: float = 5,
    high: float = 20,
) -> float:
    """Sum a quadratic penalty over a collection of nutrient values.

    Each value ``x`` contributes ``max(0, a * (x - low) * (x - high))``. For a
    positive ``a`` and ``low <= high`` that is zero while ``x`` lies within
    ``[low, high]`` and grows quadratically the further ``x`` falls outside
    that band, on either side.

    Args:
        nutients: Iterable of numeric nutrient values (for example a list or
            one row of a DataFrame). The parameter name keeps its original
            spelling so existing keyword callers keep working.
        a: Scale factor applied to every penalty term.
        low: Lower edge of the penalty-free band (keyword-only).
        high: Upper edge of the penalty-free band (keyword-only).

    Returns:
        The total penalty; ``0`` for an empty input.

    Notes:
        A NaN value contributes ``0``: ``max(0, nan)`` returns ``0`` because
        every comparison against NaN is false.

        NOTE(review): the defaults 5 and 20 look like the "5% DV is low,
        20% DV is high" rule of thumb, which would imply inputs expressed in
        percent of daily value - confirm against the caller.
    """
    return sum(max(0, a * (x - low) * (x - high)) for x in nutients)

# Daily-value target used as the divisor for each nutrient column.
# SugarContent is deliberately left out: per the original author's notes there
# is no published DV for sugar, and 50g is the recommended daily limit for
# *added* sugar only.
NUTRIENT_TARGET_DV = dict(
    Calories=2000,
    FatContent=78,
    SaturatedFatContent=20,
    CholesterolContent=300,
    SodiumContent=2300,
    CarbohydrateContent=275,
    FiberContent=28,
    ProteinContent=50,
)

# Nutrient columns that take part in DV scaling and scoring: exactly the keys
# of NUTRIENT_TARGET_DV, in insertion order.
NUTRIENT_COLUMNS = list(NUTRIENT_TARGET_DV)

# Add one '<column>DV' column per nutrient holding the value divided by its
# daily-value target; `assign` returns a new frame, so df_columns is untouched.
dv_columns = {
    f'{name}DV': df_columns[name] / NUTRIENT_TARGET_DV[name]
    for name in NUTRIENT_COLUMNS
}
df_nutrients = df_columns.assign(**dv_columns)

# Row-wise penalty score over the nutrient columns.
# NOTE(review): the score is computed from the raw nutrient columns, not from
# the '...DV' columns created above, while nutrient_score's 5/20 cut-offs look
# like percent-of-DV thresholds. Confirm whether the DV columns (scaled to
# percent) were intended here; any MAX_SCORE-style cut-off on this score is
# tied to the current raw-value scale.
df_nutrients['NutrientScore'] = df_nutrients[NUTRIENT_COLUMNS].apply(nutrient_score, axis=1)

In [29]:
# Drop recipes whose nutrient score is greater than or equal to MAX_SCORE,
# printing the row count before and after the cut.
# Despite its name, df_outliers holds the rows that are KEPT.
MAX_SCORE = 400_000
print(len(df_nutrients))
is_below_cutoff = df_nutrients['NutrientScore'] < MAX_SCORE
df_outliers = df_nutrients[is_below_cutoff]
print(len(df_outliers))

522517
479647


In [30]:
# Export the score-filtered recipes to CSV without the two list-valued columns
# and without the index.
(
    df_outliers
    .drop(columns=['Keywords', 'RecipeIngredientParts'])
    .to_csv('recipes-dv.csv', index=False)
)

In [31]:
# Share of recipes whose calorie DV fraction lies strictly between 0.05 and 0.2,
# computed directly on the CaloriesDV column rather than on a new DataFrame.
calories = df_nutrients['CaloriesDV']
above_low = calories > 0.05
below_high = calories < 0.2
calories_filtered = calories[above_low & below_high]
calories_filtered_ratio = len(calories_filtered) / len(calories)
calories_filtered_ratio

0.5014152649578866

In [32]:
# Recipes in the bottom `calories_filtered_ratio` fraction of the NutrientScore
# distribution are labelled healthy: the new boolean 'Healthy' column is True
# where the score is strictly below that quantile of all scores.
healthy_cutoff = df_nutrients['NutrientScore'].quantile(calories_filtered_ratio)
df_healthy = df_nutrients.assign(Healthy=df_nutrients['NutrientScore'] < healthy_cutoff)

In [33]:
# Export the labelled recipes to CSV without the two list-valued columns and
# without the index.
(
    df_healthy
    .drop(columns=['Keywords', 'RecipeIngredientParts'])
    .to_csv('recipes-healthy.csv', index=False)
)