In [1]:
# -------------------------------- IMPORT LIBRARIES --------------------------------
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re
from varname import nameof

import requests
from bs4 import BeautifulSoup
import html
import lxml

import sys, os

dir = os.path.dirname
sys.path.append(dir(os.getcwd()))

import src.utils.mining_data_tb as md
import src.utils.visualization_tb as vis
import src.utils.folder_tb as fo

In [2]:
# -------------------------------- IMPORT FILE --------------------------------

# path to file
path = fo.path_to_folder(2,"data")

main = pd.read_excel(path + "2017-2018 FNDDS At A Glance - FNDDS Nutrient Values.xlsx", skiprows = 1)

In [11]:
# -------------------------------- FILTERS --------------------------------

def nutrients_filter(num):
    # Key nutrients for the comparison with recommended daily intake
    daily_intake_nutrients = ["Protein (g)", "Water\n(g)", "Fiber, total dietary (g)", "Vitamin A, RAE (mcg_RAE)", "Thiamin (mg)", "Riboflavin (mg)", "Niacin (mg)", "Vitamin B-6 (mg)", "Vitamin B-12 (mcg)",  "Vitamin B-12, added\n(mcg)", "Folate, total (mcg)", "Vitamin C (mg)", "Calcium (mg)", "Iron\n(mg)", "Magnesium (mg)", "Potassium (mg)", "Sodium (mg)", "Zinc\n(mg)"]

    # Additional interesting nutrients to explore
    additional_nutrients = ["Energy (kcal)", "Total Fat (g)", "Fatty acids, total saturated (g)", "Fatty acids, total monounsaturated (g)", "Fatty acids, total polyunsaturated (g)", "Cholesterol (mg)", "Vitamin D (D2 + D3) (mcg)"]

    # For grouping and categorization
    support_columns = ["Main food description", "WWEIA Category number", "WWEIA Category description"]

    full_column_filter = support_columns + daily_intake_nutrients + additional_nutrients

    if num == 1:
        return daily_intake_nutrients
    elif num == 2:
        return additional_nutrients
    elif num == 3:
        return support_columns
    elif num == 4:
        return full_column_filter
    else:
        print("Number not allowed")

In [20]:
# -------------------------------- RENAMING THE COLUMNS --------------------------------
def column_rename(df):
    # Key nutrients for the comparison with recommended daily intake
    new_daily_intake_nutrients = ["Protein (g)", "Water (g)", "Fiber, total dietary (g)", "Vitamin A, RAE (mcg_RAE)", "Thiamin (mg)", "Riboflavin (mg)", "Niacin (mg)", "Vitamin B-6 (mg)", "Vitamin B-12 (mcg)",  "Vitamin B-12, added (mcg)", "Folate, total (mcg)", "Vitamin C (mg)", "Calcium (mg)", "Iron (mg)", "Magnesium (mg)", "Potassium (mg)", "Sodium (mg)", "Zinc (mg)"]

    # Additional interesting nutrients to explore
    new_additional_nutrients = ["Energy (kcal)", "Total Fat (g)", "Fatty acids, total saturated (g)", "Fatty acids, total monounsaturated (g)", "Fatty acids, total polyunsaturated (g)", "Cholesterol (mg)", "Vitamin D (D2 + D3) (mcg)"]

    # For grouping and categorization
    new_support_columns = ["Food name", "Category number", "Category name"]

    full_column_rename = new_support_columns + new_daily_intake_nutrients + new_additional_nutrients

    df.columns = full_column_rename

    return df

In [29]:
def negative_filters(filter_):
    # NEGATIVE FILTERS
    others = ['Formula, ready-to-feed', 'Formula, prepared from powder', 'Formula, prepared from concentrate', 'Sugar substitutes', 'Not included in a food category']
    baby_food = ['Baby food: yogurt', 'Baby food: snacks and sweets', 'Baby food: meat and dinners', ]
    desserts_and_snacks = ['Ice cream and frozen dairy desserts', 'Milk shakes and other dairy drinks', 'Cakes and pies', 'Candy not containing chocolate', 'Doughnuts, sweet rolls, pastries', 'Crackers, excludes saltines', 'Cookies and brownies', 'Biscuits, muffins, quick breads', 'Pancakes, waffles, French toast', 'Cereal bars', 'Nutrition bars', 'Saltine crackers', 'Pretzels/snack mix', 'Potato chips', 'Candy containing chocolate', 'Pancakes, waffles, French toast']
    drinks = ['Soft drinks', 'Diet soft drinks', 'Flavored or carbonated water', 'Other diet drinks', 'Beer', 'Liquor and cocktails', 'Wine', 'Nutritional beverages', 'Protein and nutritional powders', 'Sport and energy drinks', 'Diet sport and energy drinks']
    sandwiches = ['Burritos and tacos', 'Other sandwiches (single code)', 'Burgers (single code)', 'Egg/breakfast sandwiches (single code)', 'Frankfurter sandwiches (single code)', 'Frankfurter sandwiches (single code)', 'Vegetables on a sandwich']
    prepared_dishes = ['Rolls and buns', 'Egg rolls, dumplings, sushi', 'Pasta mixed dishes, excludes macaroni and cheese', 'Macaroni and cheese', 'Pizza', 'Meat mixed dishes', 'Stir-fry and soy-based sauce mixtures', 'Bean, pea, legume dishes', 'Seafood mixed dishes', 'Rice mixed dishes', 'Fried rice and lo/chow mein', 'Poultry mixed dishes']
    sauces = ['Dips, gravies, other sauces''Pasta sauces, tomato-based', 'Mustard and other condiments', 'Mayonnaise', 'Jams, syrups, toppings']
    full_negative_filter = others + baby_food + desserts_and_snacks + drinks + sandwiches + prepared_dishes + sauces


    if filter_ == 0:
        return others

    elif filter_ == 1:
        return baby_food

    elif filter_ == 2:
        return desserts_and_snacks

    elif filter_ == 3:
        return drinks

    elif filter_ == 4:
        return sandwiches

    elif filter_ == 5:
        return prepared_dishes

    elif filter_ == 6:
        return sauces

    elif filter_ == 7:
        return full_negative_filter

    else:
        return "Filter not available"

In [30]:
def positive_filters(filter_):
    #POSITIVE FILTERS
    milks = ['Lamb, goat, game', 'Human milk', 'Milk, reduced fat', 'Milk, whole', 'Milk, lowfat', 'Milk, nonfat', 'Flavored milk, whole', 'Yogurt, regular', 'Yogurt, Greek']
    cheese = ['Cheese', 'Cottage/ricotta cheese']
    other_animal_products = ['Eggs and omelets', 'Butter and animal fats']
    meats = ['Ground beef', 'Cold cuts and cured meats', 'Bacon', 'Pork', 'Liver and organ meats', 'Frankfurters', 'Sausages']
    chicken = ['Turkey, duck, other poultry', 'Chicken, whole pieces', 'Chicken patties, nuggets and tenders']
    fish = ['Fish', 'Shellfish']
    milk_substitutes = ['Milk substitutes']
    beans = ['Beans, peas, legumes']
    soy_products = ['Processed soy products']
    nuts = ['Nuts and seeds']
    other_veggie_products = ['Peanut butter and jelly sandwiches (single code)', 'Oatmeal']
    animal_filter = milks + cheese + other_animal_products + meats + chicken + fish
    veggie_filter = milk_substitutes + beans + soy_products + nuts + other_veggie_products
    full_positive_filter = animal_filter + veggie_filter

    if filter_ == 0:
        return milks

    elif filter_ == 1:
        return cheese

    elif filter_ == 2:
        return other_animal_products

    elif filter_ == 3:
        return meats

    elif filter_ == 4:
        return chicken

    elif filter_ == 5:
        return fish

    elif filter_ == 6:
        return milk_substitutes

    elif filter_ == 7:
        return beans

    elif filter_ == 8:
        return soy_products

    elif filter_ == 9:
        return nuts

    elif filter_ == 10:
        return other_veggie_products

    ###
    elif filter_ == 11:
        return animal_filter

    elif filter_ == 12:
        return veggie_filter

    elif filter_ == 13:
        return full_positive_filter

    else:
        return "Filter not available" 

In [26]:
def conditional(df, to_filter, negative_filter = True):
    '''
    df : dataframe to filter
    to_filter : filter that will be used
    out : if True, it will filter out and if False, it will simply filter. By default, is True.
    '''
    if negative_filter == True:
        filter_ = negative_filters(to_filter)
        return df[~df["Category name"].isin(filter_)].index

    filter_ = positive_filters(to_filter)
    return df[df["Category name"].isin(filter_)].index

In [45]:
def several_filters(df, to_filter_list, negative_filter = True):

    if negative_filter == False:
        positive_df = pd.DataFrame(columns = df.columns)

        for filter_ in to_filter_list:
            condition = conditional(df, filter_, negative_filter)
            positive_filter = df.loc[condition]
            positive_df = pd.concat([positive_df, positive_filter])

        return positive_df

    else:
        for filter_ in to_filter_list:
            condition = conditional(df, filter_, negative_filter)
            df = df.loc[condition]

        return df

In [64]:
def nutrition_data_prep(df):
    # Step 1: Filtering the columns I need
    df = df[nutrients_filter(4)]

    # Step 2: Column rename
    df = column_rename(df)

    # Step 3: Adding two extra columns
    category_2 = ["milks", "cheese", "other_animal_products", "meats", "chicken", "fish", "milk_substitutes", "beans", "soy_products", "nuts", "other_veggie_products"]

    category_3 = ["animal", "veggie"]

    df["Category 2"] = None
    df["Category 3"] = None

    for ind, val in enumerate(category_2):
        condition = conditional(df, ind, False)
        df.loc[condition, "Category 2"] = val

    for ind, val in enumerate(category_3):
        condition = conditional(df, ind + 11, False)
        df.loc[condition, "Category 3"] = val

    return df

In [68]:
def get_nutrition_data(path, filename):
    df = pd.read_excel(path + filename, skiprows = 1)

    df = nutrition_data_prep(df)

    return df

In [69]:
path = fo.path_to_folder(2,"data")
filename = "2017-2018 FNDDS At A Glance - FNDDS Nutrient Values.xlsx"

get_nutrition_data(path, filename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,Food name,Category number,Category name,Protein (g),Water (g),"Fiber, total dietary (g)","Vitamin A, RAE (mcg_RAE)",Thiamin (mg),Riboflavin (mg),Niacin (mg),...,Zinc (mg),Energy (kcal),Total Fat (g),"Fatty acids, total saturated (g)","Fatty acids, total monounsaturated (g)","Fatty acids, total polyunsaturated (g)",Cholesterol (mg),Vitamin D (D2 + D3) (mcg),Category 2,Category 3
0,"Milk, human",9602,Human milk,1.03,87.50,0.0,61,0.014,0.036,0.177,...,0.17,70,4.38,2.009,1.658,0.497,14,0.1,milks,animal
1,"Milk, NFS",1004,"Milk, reduced fat",3.34,89.04,0.0,59,0.057,0.137,0.110,...,0.42,51,1.99,1.164,0.426,0.065,8,1.1,milks,animal
2,"Milk, whole",1002,"Milk, whole",3.28,88.10,0.0,32,0.056,0.138,0.105,...,0.41,60,3.20,1.860,0.688,0.108,12,1.1,milks,animal
3,"Milk, low sodium, whole",1002,"Milk, whole",3.10,88.20,0.0,29,0.020,0.105,0.043,...,0.38,61,3.46,2.154,0.999,0.128,14,1.3,milks,animal
4,"Milk, calcium fortified, whole",1002,"Milk, whole",3.28,88.10,0.0,32,0.056,0.138,0.105,...,0.41,60,3.20,1.860,0.688,0.108,12,1.1,milks,animal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7078,Tomatoes as ingredient in omelet,9999,Not included in a food category,1.11,92.57,1.6,43,0.045,0.024,0.637,...,0.21,25,0.23,0.038,0.035,0.094,0,0.0,,
7079,Other vegetables as ingredient in omelet,9999,Not included in a food category,3.46,90.37,1.4,1,0.085,0.410,3.678,...,0.58,29,0.38,0.061,0.002,0.175,0,0.2,,
7080,Vegetables as ingredient in curry,9999,Not included in a food category,1.81,85.59,2.2,98,0.066,0.046,0.773,...,0.28,52,0.19,0.051,0.017,0.064,0,0.0,,
7081,Sauce as ingredient in hamburgers,9999,Not included in a food category,1.34,55.97,0.6,21,0.028,0.112,0.917,...,0.21,271,22.85,3.544,5.321,13.522,13,0.1,,
