## Import libraries

In [1]:
import re
import os

from dotenv import load_dotenv
import numpy as np
import pandas as pd
import requests

In [2]:
# load_dotenv comes from python-dotenv; presumably it populates the process
# environment from a local .env file -- confirm against the project setup.
load_dotenv()
# The FoodData Central key is read from the environment (None when it is unset).
api_key = os.environ.get('FOOD_DATA_CENTRAL_API_KEY')

## Functions

In [3]:
def fetch_fdc_data(query: str, api_key: str, data_type: str='Foundation',page_size: int=50,page_number: int=1,timeout: float=30) -> dict:
    '''
    Fetch single page of data from FoodData Central acquired after applying a query.
    
    Args:
        query: query passed to FDC API
        api_key: unique API key required for API request
        data_type: one of four basic data types provided by FDC. Allowed values: Foundation, Branded, Survey (FNDDS), SR Legacy
        page_size: number of results per page
        page_number: number of page to fetch
        timeout: number of seconds to wait for the server before giving up
    
    Return: 
        dict: decoded JSON body of the search response
    
    Raises:
        requests.exceptions.HTTPError: if the API answers with a 4xx/5xx status
    '''
    
    base_url = 'https://api.nal.usda.gov/fdc/v1'
    search_endpoint = '/foods/search'
    
    # Let requests build the query string so that spaces, '&', parentheses etc.
    # in the query or data type (e.g. 'SR Legacy', 'Survey (FNDDS)') are escaped.
    params = {
        'api_key': api_key,
        'query': query,
        'dataType': data_type,
        'pageSize': page_size,
        'pageNumber': page_number,
    }
    
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(f'{base_url}{search_endpoint}', params=params, timeout=timeout)
    
    # Fail loudly on HTTP errors instead of handing an error payload to callers
    # that expect keys such as 'foods' and 'totalPages'.
    response.raise_for_status()
    return response.json()

In [4]:
def fetch_all(query: str,api_key: str,nutrients_list: list,data_type: str='Foundation',page_size: int=50,start_page: int=1,page_limit: int=None) -> pd.DataFrame:
    '''
    Fetch all available data from FDC database for given query.
    
    Args:
        query: query passed to FDC API
        api_key: unique API key required for API request
        nutrients_list: names of the nutrients to keep as columns in the result
        data_type: one of four basic data types provided by FDC. Allowed values: Foundation, Branded, Survey (FNDDS), SR Legacy
        page_size: number of results per page
        start_page: first page to fetch
        page_limit: last page to fetch (inclusive); all available pages are fetched when it is None
    
    Return: 
        pd.DataFrame: fetched data, or None when the user declines the large-query warning
    '''
    
    hit_threshold = 500
    
    # Fetch start_page and retrieve basic info about the query
    first_page = fetch_fdc_data(query=query,api_key=api_key,data_type=data_type,page_number=start_page,page_size=page_size)
    frames = [get_food_info(first_page,nutrients_list)]
    total_pages = first_page['totalPages']
    
    # Fetch all pages if page_limit is not specified; never ask for pages
    # beyond what the API reports as available.
    if page_limit is None:
        page_limit = total_pages
    else:
        page_limit = min(page_limit,total_pages)
    
    # Both start_page and page_limit are fetched, so the range is inclusive.
    # start_page has already been fetched, hence at least one page.
    pages_to_fetch = max(page_limit - start_page + 1, 1)
    total_hits = pages_to_fetch * page_size
    
    # Warning if user tries to fetch very large number of entries
    if total_hits > hit_threshold:
        decision = input(f'WARNING: This query will potentially result in page size ({page_size}) * number of pages ({pages_to_fetch}) = {total_hits} total hits. Do you want to continue? (Y/N): ')
        
        if decision not in ['Y','y']:
            return None
    
    for page_number in range(start_page+1,page_limit+1):
        # page_size must match the first request, otherwise page boundaries shift
        # and results are duplicated or skipped.
        foods = fetch_fdc_data(query=query,api_key=api_key,data_type=data_type,page_number=page_number,page_size=page_size)
        frames.append(get_food_info(foods,nutrients_list))
    
    # Concatenate once at the end instead of re-copying the frame on every page.
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames,ignore_index=True)

In [5]:
def get_nutrients(food_entry: dict) -> dict:
    '''
    Get nutrient-value pairs for specific food.
    Important: for some foods there are two values of energy (corresponding to Atwater General and Specific
    Factors). Only those for General Factors are included in the final dictionary. Also, in some cases the energy is expressed
    both in kcal and kJ. Only values expressed in kcal are included.
    
    Entries without a nutrient name (missing, None, empty or whitespace-only) are skipped.
    
    Args:
        food_entry: dictionary corresponding to a single food sample obtained from FDC database
    
    Return:
        dict: nutrient-value pairs; always contains the key 'Energy' (NaN when no kcal value was found)
    '''
    
    nutrients = dict()
    energy_pattern = re.compile(r'\bEnergy\b')
    # Names (after stripping whitespace) whose kcal value is reported as 'Energy'
    energy_names = ('Energy (Atwater General Factors)', 'Energy')
    
    # Populate dictionary of nutrients for given food
    for nutrient in food_entry['foodNutrients']:
        raw_name = nutrient.get('nutrientName') or ''
        name = raw_name.strip()
        
        # A nameless entry cannot be stored under a meaningful key
        if not name:
            continue
        
        # Everything that does not start with the word "Energy" is stored as is
        if not energy_pattern.match(raw_name):
            nutrients[raw_name] = nutrient.get('value',np.nan)
        
        # Energy is kept only when expressed in kcal; if several entries qualify,
        # the last one in the list wins
        if name in energy_names and nutrient.get('unitName', '') == 'KCAL':
            nutrients['Energy'] = np.round(nutrient.get('value',np.nan))
    
    if 'Energy' not in nutrients:
        nutrients['Energy'] = np.nan
    
    return nutrients

In [17]:
def _create_description_columns(foods):
        description_columns = ['Description']
        if 'Survey (FNDDS)' in foods['foodSearchCriteria']['dataType']:
            description_columns += ['Additional description']
        if 'Branded' in foods['foodSearchCriteria']['dataType']:
            description_columns += ['Food category','Brand owner','Brand name']
        return description_columns
    
def _fill_description_columns(foods,food,filtered_nutrient_list):
    filtered_nutrient_list['Description'] = food['description']
    if 'Survey (FNDDS)' in foods['foodSearchCriteria']['dataType']:
        filtered_nutrient_list['Additional description'] = food.get('additionalDescriptions',np.nan)
    if 'Branded' in foods['foodSearchCriteria']['dataType']:
        filtered_nutrient_list['Food category'] = food.get('foodCategory',np.nan)
        filtered_nutrient_list['Brand owner'] = food.get('brandOwner',np.nan)
        filtered_nutrient_list['Brand name'] = food.get('brandName',np.nan)
    return filtered_nutrient_list

In [6]:
def get_food_info(foods: dict, nutrients_list: list) -> pd.DataFrame:
    '''
    Turn one page of FDC json search results into a pandas DataFrame.
    
    Args:
        foods: json data fetched from FDC 
        nutrients_list: nutrients to keep as columns. The names must be spelled exactly as FDC provides them.
    
    Return:
        pd.DataFrame: one row per food, description columns first, then the selected nutrients
    '''
    
    # Which descriptive columns exist depends on the data type of the query result
    description_columns = _create_description_columns(foods)
    
    rows = []
    for entry in foods['foods']:
        # Keep only the requested nutrients, then attach the food's description fields
        selected = {
            name: amount
            for name, amount in get_nutrients(entry).items()
            if name in nutrients_list
        }
        rows.append(_fill_description_columns(foods,entry,selected))
    
    return pd.DataFrame(rows,columns=description_columns + nutrients_list)

In [7]:
def list_nutrients_and_units(food_entry):
    '''
    Print the description of a food followed by one "name : value : unit" line per nutrient.
    
    Args:
        food_entry: dictionary corresponding to a single food sample obtained from FDC database
    '''
    print(food_entry['description'])
    for item in food_entry['foodNutrients']:
        # Missing name prints as empty text, missing value/unit as nan
        name = item.get('nutrientName','')
        value = item.get('value',np.nan)
        unit = item.get('unitName',np.nan)
        print(f"{name} : {value} : {unit}")

In [8]:
def remove_unnecessary_nutrients(nutrients_list,nutrients_to_remove):
    '''
    Return a copy of a nutrient list with the given nutrients removed.
    
    Args:
        nutrients_list: list of nutrient names; left untouched
        nutrients_to_remove: names to drop; for each name only its first occurrence is removed
    
    Return:
        list: copy of nutrients_list without the removed names
    '''
    remaining = nutrients_list.copy()
    for unwanted in nutrients_to_remove:
        try:
            remaining.remove(unwanted)
        except ValueError:
            # list.remove raises ValueError when the name is absent
            print(f"{unwanted} not found in the list")
    
    return remaining

## Fetching data

### Nutrient categories

In [9]:
# Reusable groups of nutrient names; the query cells combine them into the
# nutrients_list passed to fetch_all.
basic_nutrients = [
    'Energy',
    'Protein',
    'Total lipid (fat)',
    'Carbohydrate, by difference',
]
proximates = [
    'Energy',
    'Water',
    'Protein',
    'Total lipid (fat)',
    'Carbohydrate, by difference',
    'Ash',
]
vitamins = [
    'Vitamin K (Dihydrophylloquinone)',
    'Vitamin K (phylloquinone)',
    'Folate, total',
    'Vitamin C, total ascorbic acid',
    'Niacin',
    'Vitamin B-6',
    'Riboflavin',
    'Thiamin',
    'Pantothenic acid',
    'Vitamin A, RAE',
    'Lutein + zeaxanthin',
    'Vitamin D (D2 + D3)',
]
minerals = [
    'Calcium, Ca',
    'Potassium, K',
    'Zinc, Zn',
    'Selenium, Se',
    'Manganese, Mn',
    'Phosphorus, P',
    'Magnesium, Mg',
    'Copper, Cu',
    'Iron, Fe',
    'Sodium, Na',
]
aminoacids = [
    'Tryptophan',
    'Threonine',
    'Methionine',
    'Tyrosine',
    'Alanine',
    'Glutamic acid',
    'Glycine',
    'Proline',
]
essential_aminoacids = [
    'Histidine',
    'Isoleucine',
    'Leucine',
    'Lysine',
    'Methionine',
    'Phenylalanine',
    'Threonine',
    'Tyrosine',
    'Valine',
]
organic_acids = [
    'Citric acid',
    'Malic acid',
    'Oxalic acid',
    'Quinic acid',
]

### Fruits

Whole fruits and fruit juice

In [26]:
# Fruit columns: proximates, total sugars, vitamins (minus both vitamin K forms),
# minerals and organic acids.
fruits_nutrients = remove_unnecessary_nutrients(
    proximates + ['Sugars, Total'] + vitamins + minerals + organic_acids,
    ['Vitamin K (Dihydrophylloquinone)','Vitamin K (phylloquinone)'],
)

# Query the Foundation data for "fruit", starting from the first page.
fruits = fetch_all(
    query='fruit',
    api_key=api_key,
    nutrients_list=fruits_nutrients,
    data_type='Foundation',
    start_page=1,
)

fruits.to_csv('fruits.csv',index=False)

### Vegetables

Dark green vegetables, red and orange vegetables, starchy vegetables, and other vegetables

In [27]:
# Vegetable columns: proximates, vitamins and minerals.
vegetables_nutrients = proximates + vitamins + minerals

# Query the Foundation data for "vegetable", starting from the first page.
vegetables = fetch_all(
    query='vegetable',
    api_key=api_key,
    nutrients_list=vegetables_nutrients,
    data_type='Foundation',
    start_page=1,
)

vegetables.to_csv('vegetables.csv',index=False)

### Grains

Whole wheat bread, brown rice, popcorn, oatmeal (whole grains); refined grains

In [28]:
# Grain columns: proximates and minerals plus molybdenum, dietary fiber and
# a handful of B vitamins.
grains_nutrients = proximates + minerals + [
    'Molybdenum, Mo',
    'Fiber, total dietary',
    'Thiamin',
    'Riboflavin',
    'Niacin',
    'Vitamin B-6',
    'Biotin',
]

# Query the Foundation data for "grain", starting from the first page.
grains = fetch_all(
    query='grain',
    api_key=api_key,
    nutrients_list=grains_nutrients,
    data_type='Foundation',
    start_page=1,
)
grains.to_csv('grains.csv',index=False)

### Protein Foods

Beans and peas, seafood, meat, poultry and eggs, nuts, seeds and soy

In [67]:
# Fish columns: proximates, minerals, vitamins, essential amino acids, omega-3
# fatty acids and fat/cholesterol totals, minus vitamin K forms and lutein + zeaxanthin.
fish_nutrients = remove_unnecessary_nutrients(
    (
        proximates
        + minerals
        + vitamins
        + essential_aminoacids
        + ['PUFA 18:3 n-3 c,c,c (ALA)','PUFA 20:5 n-3 (EPA)','PUFA 22:6 n-3 (DHA)']
        + [
            'Fatty acids, total saturated',
            'Fatty acids, total monounsaturated',
            'Fatty acids, total polyunsaturated',
            'Fatty acids, total trans',
            'Cholesterol',
        ]
    ),
    ['Vitamin K (Dihydrophylloquinone)','Vitamin K (phylloquinone)','Lutein + zeaxanthin'],
)

# Unlike the other categories, fish is queried from the SR Legacy data.
fish = fetch_all(
    query='fish',
    api_key=api_key,
    nutrients_list=fish_nutrients,
    data_type='SR Legacy',
    start_page=1,
)

# Keep only the entries whose description contains "raw".
fish = fish.loc[fish['Description'].str.contains('raw')]

fish.to_csv('../Foods/fish.csv',index=False)

In [66]:
# Bean columns: proximates and minerals plus a few additional elements.
beans_nutrients = proximates + minerals + [
    'Sulfur, S',
    'Nickel, Ni',
    'Molybdenum, Mo',
    'Cobalt, Co',
    'Boron, B',
]

# Query the Foundation data for "bean", starting from the first page.
beans = fetch_all(
    query='bean',
    api_key=api_key,
    nutrients_list=beans_nutrients,
    data_type='Foundation',
    start_page=1,
)

# Duplicate from vegetables: the rows labelled 0 and 1 are dropped
beans = beans.drop(index=[0,1])
beans.to_csv('beans.csv',index=False)

In [31]:
# Nut columns: proximates and minerals.
nuts_nutrients = proximates + minerals

# Query the Foundation data for "nut", starting from the first page.
nuts = fetch_all(
    query='nut',
    api_key=api_key,
    nutrients_list=nuts_nutrients,
    data_type='Foundation',
    start_page=1,
)
nuts.to_csv('nuts.csv',index=False)

In [32]:
# Poultry columns: proximates and minerals plus fatty-acid totals and cholesterol.
poultry_nutrients = proximates + minerals + [
    'Fatty acids, total saturated',
    'Fatty acids, total monounsaturated',
    'Fatty acids, total polyunsaturated',
    'Fatty acids, total trans',
    'Cholesterol',
]

# Query the Foundation data for "poultry", starting from the first page.
poultry = fetch_all(
    query='poultry',
    api_key=api_key,
    nutrients_list=poultry_nutrients,
    data_type='Foundation',
    start_page=1,
)
poultry.to_csv('poultry.csv',index=False)

In [33]:
# Meat columns: proximates and minerals plus fatty-acid totals and cholesterol.
meat_nutrients = proximates + minerals + [
    'Fatty acids, total saturated',
    'Fatty acids, total monounsaturated',
    'Fatty acids, total polyunsaturated',
    'Fatty acids, total trans',
    'Cholesterol',
]

# Query the Foundation data with the search string 'meat -restaurant'.
meat = fetch_all(
    query='meat -restaurant',
    api_key=api_key,
    nutrients_list=meat_nutrients,
    data_type='Foundation',
    start_page=1,
)
meat.to_csv('meat.csv',index=False)

In [34]:
# Stack the poultry rows on top of the meat rows and keep each distinct row once.
meat_and_poultry = pd.concat([poultry,meat]).drop_duplicates()
meat_and_poultry.to_csv('meat_and_poultry.csv',index=False)

### Dairy

Milk and yogurt, cheese

In [35]:
# Cheese columns: proximates, vitamins and minerals plus fatty-acid totals.
cheese_nutrients = proximates + vitamins + minerals + [
    'Fatty acids, total saturated',
    'Fatty acids, total monounsaturated',
    'Fatty acids, total polyunsaturated',
    'Fatty acids, total trans',
]

# Query the Foundation data for "cheese", starting from the first page.
cheese = fetch_all(
    query='cheese',
    api_key=api_key,
    nutrients_list=cheese_nutrients,
    data_type='Foundation',
    start_page=1,
)
cheese.to_csv('cheese.csv',index=False)

In [36]:
# Egg/milk/yogurt columns: proximates, vitamins, minerals, fatty-acid totals
# and the amino acid group.
egg_milk_yogurt_nutrients = (
    proximates
    + vitamins
    + minerals
    + [
        'Fatty acids, total saturated',
        'Fatty acids, total monounsaturated',
        'Fatty acids, total polyunsaturated',
        'Fatty acids, total trans',
    ]
    + aminoacids
)

# Query the Foundation data with the search string 'egg -cheese'.
egg_milk_yogurt = fetch_all(
    query='egg -cheese',
    api_key=api_key,
    nutrients_list=egg_milk_yogurt_nutrients,
    data_type='Foundation',
    start_page=1,
)
egg_milk_yogurt.to_csv('egg_milk_yogurt.csv',index=False)