In [28]:
import numpy as np
import pandas as pd

import sys, os

# Short aliases for the path helpers used throughout the notebook
abspath = os.path.abspath
dirname = os.path.dirname
sep = os.sep

# Make the in-house libraries importable: register the folder two levels above
# the current working directory on sys.path (presumably the project root that
# contains the `src` package -- verify if the notebook is moved).
folder_ = dirname(dirname(abspath(os.getcwd())))
# Guard against piling up duplicate entries when this cell is re-run
if folder_ not in sys.path:
    sys.path.append(folder_)

# In-house libraries
import src.utils.mining_data_tb as md
import src.utils.folder_tb as fo

In [29]:
# Folder holding the nutrition data, resolved through the in-house folder helper
nutrition_data_path = fo.path_to_folder(2, "data" + sep + "environment")

# Read the FNDDS spreadsheet into a dataframe, skipping the sheet's first row,
# and preview the first rows
nutrition_file_name = "2017-2018 FNDDS At A Glance - FNDDS Nutrient Values.xlsx"
nutrition_df = pd.read_excel(nutrition_data_path + nutrition_file_name, skiprows=1)
nutrition_df.head()

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),...,20:1\n(g),22:1\n(g),18:2\n(g),18:3\n(g),18:4\n(g),20:4\n(g),20:5 n-3\n(g),22:5 n-3\n(g),22:6 n-3\n(g),Water\n(g)
0,11000000,"Milk, human",9602,Human milk,70,1.03,6.89,6.89,0.0,4.38,...,0.04,0.0,0.374,0.052,0.0,0.026,0.0,0.0,0.0,87.5
1,11100000,"Milk, NFS",1004,"Milk, reduced fat",51,3.34,4.87,4.89,0.0,1.99,...,0.002,0.0,0.069,0.007,0.0,0.003,0.0,0.001,0.0,89.04
2,11111000,"Milk, whole",1002,"Milk, whole",60,3.28,4.67,4.81,0.0,3.2,...,0.004,0.0,0.115,0.012,0.0,0.004,0.001,0.002,0.0,88.1
3,11111100,"Milk, low sodium, whole",1002,"Milk, whole",61,3.1,4.46,4.46,0.0,3.46,...,0.0,0.0,0.078,0.05,0.0,0.0,0.0,0.0,0.0,88.2
4,11111150,"Milk, calcium fortified, whole",1002,"Milk, whole",60,3.28,4.67,4.81,0.0,3.2,...,0.004,0.0,0.115,0.012,0.0,0.004,0.001,0.002,0.0,88.1


In [30]:
# Show every column label of the freshly loaded dataframe
nutrition_df.columns

Index(['Food code', 'Main food description', 'WWEIA Category number',
       'WWEIA Category description', 'Energy (kcal)', 'Protein (g)',
       'Carbohydrate (g)', 'Sugars, total\n(g)', 'Fiber, total dietary (g)',
       'Total Fat (g)', 'Fatty acids, total saturated (g)',
       'Fatty acids, total monounsaturated (g)',
       'Fatty acids, total polyunsaturated (g)', 'Cholesterol (mg)',
       'Retinol (mcg)', 'Vitamin A, RAE (mcg_RAE)', 'Carotene, alpha (mcg)',
       'Carotene, beta (mcg)', 'Cryptoxanthin, beta (mcg)', 'Lycopene (mcg)',
       'Lutein + zeaxanthin (mcg)', 'Thiamin (mg)', 'Riboflavin (mg)',
       'Niacin (mg)', 'Vitamin B-6 (mg)', 'Folic acid (mcg)',
       'Folate, food (mcg)', 'Folate, DFE (mcg_DFE)', 'Folate, total (mcg)',
       'Choline, total (mg)', 'Vitamin B-12 (mcg)',
       'Vitamin B-12, added\n(mcg)', 'Vitamin C (mg)',
       'Vitamin D (D2 + D3) (mcg)', 'Vitamin E (alpha-tocopherol) (mg)',
       'Vitamin E, added\n(mg)', 'Vitamin K (phylloquinone) (

In [31]:
# Dimensions of the loaded dataframe as (rows, columns)
nutrition_df.shape

(7083, 69)

In [32]:
### Keep only the columns needed for the analysis

# Key nutrients for the comparison with recommended daily intake
daily_intake_nutrients = [
    "Protein (g)",
    "Water\n(g)",
    "Fiber, total dietary (g)",
    "Vitamin A, RAE (mcg_RAE)",
    "Thiamin (mg)",
    "Riboflavin (mg)",
    "Niacin (mg)",
    "Vitamin B-6 (mg)",
    "Vitamin B-12 (mcg)",
    "Vitamin B-12, added\n(mcg)",
    "Folate, total (mcg)",
    "Vitamin C (mg)",
    "Calcium (mg)",
    "Iron\n(mg)",
    "Magnesium (mg)",
    "Potassium (mg)",
    "Sodium (mg)",
    "Zinc\n(mg)",
]

# Additional interesting nutrients to explore
additional_nutrients = [
    "Energy (kcal)",
    "Sugars, total\n(g)",
    "Total Fat (g)",
    "Fatty acids, total saturated (g)",
    "Fatty acids, total monounsaturated (g)",
    "Fatty acids, total polyunsaturated (g)",
    "Cholesterol (mg)",
    "Vitamin D (D2 + D3) (mcg)",
]

# Columns used for grouping and categorization
support_columns = [
    "Main food description",
    "WWEIA Category number",
    "WWEIA Category description",
]

# Column order matters: the later renaming cell assigns new names by position
selected_columns = support_columns + daily_intake_nutrients + additional_nutrients
nutrition_df = nutrition_df.loc[:, selected_columns]

In [33]:
# Preview the first two rows after the column selection
nutrition_df.head(2)

Unnamed: 0,Main food description,WWEIA Category number,WWEIA Category description,Protein (g),Water\n(g),"Fiber, total dietary (g)","Vitamin A, RAE (mcg_RAE)",Thiamin (mg),Riboflavin (mg),Niacin (mg),...,Sodium (mg),Zinc\n(mg),Energy (kcal),"Sugars, total\n(g)",Total Fat (g),"Fatty acids, total saturated (g)","Fatty acids, total monounsaturated (g)","Fatty acids, total polyunsaturated (g)",Cholesterol (mg),Vitamin D (D2 + D3) (mcg)
0,"Milk, human",9602,Human milk,1.03,87.5,0.0,61,0.014,0.036,0.177,...,17,0.17,70,6.89,4.38,2.009,1.658,0.497,14,0.1
1,"Milk, NFS",1004,"Milk, reduced fat",3.34,89.04,0.0,59,0.057,0.137,0.11,...,39,0.42,51,4.89,1.99,1.164,0.426,0.065,8,1.1


In [34]:
### With the dataframe filtered, replace the raw headers with clean column names

# Key nutrients for the comparison with recommended daily intake
cleaned_daily_intake_nutrients = [
    "Protein (g)",
    "Water (g)",
    "Fiber, total dietary (g)",
    "Vitamin A, RAE (mcg)",
    "Thiamin (mg)",
    "Riboflavin (mg)",
    "Niacin (mg)",
    "Vitamin B-6 (mg)",
    "Vitamin B-12 (mcg)",
    "Vitamin B-12, added (mcg)",
    "Folate, total (mcg)",
    "Vitamin C (mg)",
    "Calcium (mg)",
    "Iron (mg)",
    "Magnesium (mg)",
    "Potassium (mg)",
    "Sodium (mg)",
    "Zinc (mg)",
]

# Additional interesting nutrients to explore
cleaned_additional_nutrients = [
    "Energy (kcal)",
    "Sugars, total (g)",
    "Total Fat (g)",
    "Fatty acids, total saturated (g)",
    "Fatty acids, total monounsaturated (g)",
    "Fatty acids, total polyunsaturated (g)",
    "Cholesterol (mg)",
    "Vitamin D (D2 + D3) (mcg)",
]

# Columns used for grouping and categorization
cleaned_support_columns = ["Food name", "Category number", "Category name"]

# The new names are assigned by position, so the three lists must stay in the
# same order as the columns currently held by the dataframe
cleaned_columns = (
    cleaned_support_columns
    + cleaned_daily_intake_nutrients
    + cleaned_additional_nutrients
)
nutrition_df.columns = cleaned_columns

In [35]:
# Fold the "added" Vitamin B-12 amount into the main Vitamin B-12 column, then
# discard the "added" column so that a single B-12 column remains.
# NOTE(review): this assumes "Vitamin B-12 (mcg)" holds only the naturally
# occurring amount. If the FNDDS total already includes the added amount, this
# sum double-counts it -- confirm against the FNDDS documentation.
added_b12_column = "Vitamin B-12, added (mcg)"

# The guard makes the cell safe to re-run: once the "added" column is gone,
# nothing is summed or dropped a second time.
if added_b12_column in nutrition_df.columns:
    nutrition_df["Vitamin B-12 (mcg)"] = (
        nutrition_df["Vitamin B-12 (mcg)"] + nutrition_df[added_b12_column]
    )
    nutrition_df = nutrition_df.drop(columns=[added_b12_column])

In [36]:
# Check the column labels after the renaming and the Vitamin B-12 merge
nutrition_df.columns

Index(['Food name', 'Category number', 'Category name', 'Protein (g)',
       'Water (g)', 'Fiber, total dietary (g)', 'Vitamin A, RAE (mcg)',
       'Thiamin (mg)', 'Riboflavin (mg)', 'Niacin (mg)', 'Vitamin B-6 (mg)',
       'Vitamin B-12 (mcg)', 'Folate, total (mcg)', 'Vitamin C (mg)',
       'Calcium (mg)', 'Iron (mg)', 'Magnesium (mg)', 'Potassium (mg)',
       'Sodium (mg)', 'Zinc (mg)', 'Energy (kcal)', 'Sugars, total (g)',
       'Total Fat (g)', 'Fatty acids, total saturated (g)',
       'Fatty acids, total monounsaturated (g)',
       'Fatty acids, total polyunsaturated (g)', 'Cholesterol (mg)',
       'Vitamin D (D2 + D3) (mcg)'],
      dtype='object')

In [42]:
### Positive and negative filters (lists of "Category name" values) for later use

### NEGATIVE FILTERS
others = [
    'Formula, ready-to-feed',
    'Formula, prepared from powder',
    'Formula, prepared from concentrate',
    'Sugar substitutes',
    'Not included in a food category',
]
baby_food = [
    'Baby food: yogurt',
    'Baby food: snacks and sweets',
    'Baby food: meat and dinners',
]
desserts_and_snacks = [
    'Ice cream and frozen dairy desserts',
    'Milk shakes and other dairy drinks',
    'Cakes and pies',
    'Candy not containing chocolate',
    'Doughnuts, sweet rolls, pastries',
    'Crackers, excludes saltines',
    'Cookies and brownies',
    'Biscuits, muffins, quick breads',
    'Pancakes, waffles, French toast',
    'Cereal bars',
    'Nutrition bars',
    'Saltine crackers',
    'Pretzels/snack mix',
    'Potato chips',
    'Candy containing chocolate',
]
drinks = [
    'Soft drinks',
    'Diet soft drinks',
    'Flavored or carbonated water',
    'Other diet drinks',
    'Beer',
    'Liquor and cocktails',
    'Wine',
    'Nutritional beverages',
    'Protein and nutritional powders',
    'Sport and energy drinks',
    'Diet sport and energy drinks',
]
sandwiches = [
    'Burritos and tacos',
    'Other sandwiches (single code)',
    'Burgers (single code)',
    'Egg/breakfast sandwiches (single code)',
    'Frankfurter sandwiches (single code)',
    'Vegetables on a sandwich',
]
prepared_dishes = [
    'Rolls and buns',
    'Egg rolls, dumplings, sushi',
    'Pasta mixed dishes, excludes macaroni and cheese',
    'Macaroni and cheese',
    'Pizza',
    'Meat mixed dishes',
    'Stir-fry and soy-based sauce mixtures',
    'Bean, pea, legume dishes',
    'Seafood mixed dishes',
    'Rice mixed dishes',
    'Fried rice and lo/chow mein',
    'Poultry mixed dishes',
]
# Each category must be its own element: without the comma, Python silently
# concatenates adjacent string literals into a single name that matches nothing
sauces = [
    'Dips, gravies, other sauces',
    'Pasta sauces, tomato-based',
    'Mustard and other condiments',
    'Mayonnaise',
    'Jams, syrups, toppings',
]

full_negative_filter = others + baby_food + desserts_and_snacks + drinks + sandwiches + prepared_dishes + sauces

negative_filters_list = [others, baby_food, desserts_and_snacks, drinks, sandwiches, prepared_dishes, sauces, full_negative_filter]

### POSITIVE FILTERS
milks = [
    'Human milk',
    'Milk, reduced fat',
    'Milk, whole',
    'Milk, lowfat',
    'Milk, nonfat',
    'Flavored milk, whole',
    'Yogurt, regular',
    'Yogurt, Greek',
]
cheese = ['Cheese', 'Cottage/ricotta cheese']
other_animal_products = ['Eggs and omelets', 'Butter and animal fats']
# 'Lamb, goat, game' is a meat category; it was previously listed under milks
meats = [
    'Ground beef',
    'Cold cuts and cured meats',
    'Bacon',
    'Pork',
    'Liver and organ meats',
    'Frankfurters',
    'Sausages',
    'Lamb, goat, game',
]
chicken = ['Turkey, duck, other poultry', 'Chicken, whole pieces', 'Chicken patties, nuggets and tenders']
fish = ['Fish', 'Shellfish']

milk_substitutes = ['Milk substitutes']
beans = ['Beans, peas, legumes']
soy_products = ['Processed soy products']
nuts = ['Nuts and seeds']
other_veggie_products = ['Peanut butter and jelly sandwiches (single code)', 'Oatmeal']

animal_filter = milks + cheese + other_animal_products + meats + chicken + fish
veggie_filter = milk_substitutes + beans + soy_products + nuts + other_veggie_products

full_positive_filter = animal_filter + veggie_filter

# Order matters: the category-assignment cell reads this list by position
# (eleven fine-grained filters first, then animal, veggie and the full filter)
positive_filters_list = [milks, cheese, other_animal_products, meats, chicken, fish, milk_substitutes, beans, soy_products, nuts, other_veggie_products, animal_filter, veggie_filter, full_positive_filter]

In [51]:
category_2 = ["Milks", "Cheese", "Other Animal Products", "Meats", "Chicken", "Fish", "Milk Substitutes", "Beans", "Soy Products", "Nuts", "Other Veggie Products"]
category_3 = ["Animal", "Veggie"]

# Two new support columns to better group the foods; rows whose
# "Category name" matches no filter keep the initial None
nutrition_df["Category 2"] = None
nutrition_df["Category 3"] = None

# positive_filters_list starts with one filter per category_2 label, in the
# same order, so labels and filters can be paired directly
for label, members in zip(category_2, positive_filters_list):
    # Boolean mask of the foods whose "Category name" appears in the filter
    in_group = nutrition_df["Category name"].isin(members)
    nutrition_df.loc[in_group, "Category 2"] = label

# The animal/veggie filters sit right after the fine-grained ones, so the
# offset is derived from the label list instead of being hard-coded
broad_filters = positive_filters_list[len(category_2):]
for label, members in zip(category_3, broad_filters):
    in_group = nutrition_df["Category name"].isin(members)
    nutrition_df.loc[in_group, "Category 3"] = label

In [None]:
# Inspect the column labels, which now include "Category 2" and "Category 3"
nutrition_df.columns

In [None]:
# Print a summary of the dataframe (column dtypes and non-null counts)
nutrition_df.info()

In [59]:
# Persist the processed dataframe as a CSV file in the environment data folder
# (the dataframe index is written too, since to_csv is called with defaults)
environment_data_path = fo.path_to_folder(2, "data" + sep + "environment")
output_file_name = "nutritional_values.csv"
nutrition_df.to_csv(environment_data_path + output_file_name)