In [1]:
import pandas as pd
import numpy as np
from utils import *

# Install a blanket 'ignore' filter: every warning raised by later cells is silenced.
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Food - Ingredient Connection

In [4]:
# Read the three FNDDS ingredient workbooks (2015-2016, 2017-2018, 2019-2020).
# The first spreadsheet row is skipped in every file.
ingredient_paths = (
    '../data/2015-2016 Ingredients.xlsx',
    '../data/2017-2018 Ingredients.xlsx',
    '../data/2019-2020 Ingredients.xlsx',
)
df_1516, df_1718, df_1920 = (
    pd.read_excel(path, skiprows=1) for path in ingredient_paths
)

# The 2015-2016 release labels the category column differently; align it with the later releases.
df_1516 = df_1516.rename(columns={'WWEIA Category code': 'WWEIA Category number'})

In [5]:
# Codes and descriptions drift slightly between FNDDS releases. The releases are
# stacked oldest-to-newest so that keep='last' retains the most recent row for
# each (food, category, ingredient) combination.
source_columns = ['Food code', 'Main food description', 'WWEIA Category number',
                  'WWEIA Category description', 'Ingredient code', 'Ingredient description']
dedup_keys = ['Food code', 'WWEIA Category number', 'Ingredient code']

# Short snake_case names used by the rest of the notebook.
column_aliases = {
    'Food code': 'food_id',
    'Main food description': 'food_desc',
    'WWEIA Category number': 'WWEIA_id',
    'WWEIA Category description': 'WWEIA_desc',
    'Ingredient code': 'ingredient_id',
    'Ingredient description': 'ingredient_desc',
}

# Resulting table: one row per food-ingredient connection.
df = (
    pd.concat([df_1516, df_1718, df_1920])
    .loc[:, source_columns]
    .drop_duplicates(subset=dedup_keys, keep='last')
    .sort_values(by='Food code')
    .rename(columns=column_aliases)
)

df.to_csv('../processed_data/food_ingredients.csv', index=False)

In [6]:
# Count the distinct foods in the food-ingredient table (9260 in the recorded run).
df['food_id'].nunique()

9260

In [7]:
# Display the food-ingredient table (columns: food_id, food_desc, WWEIA_id, WWEIA_desc, ingredient_id, ingredient_desc).
df

Unnamed: 0,food_id,food_desc,WWEIA_id,WWEIA_desc,ingredient_id,ingredient_desc
0,11000000,"Milk, human",9602,Human milk,1107,"Milk, human, mature, fluid (For Reference Only)"
4,11100000,"Milk, NFS",1004,"Milk, reduced fat",1085,"Milk, nonfat, fluid, with added vitamin A and ..."
3,11100000,"Milk, NFS",1004,"Milk, reduced fat",1082,"Milk, lowfat, fluid, 1% milkfat, with added vi..."
2,11100000,"Milk, NFS",1004,"Milk, reduced fat",1079,"Milk, reduced fat, fluid, 2% milkfat, with add..."
1,11100000,"Milk, NFS",1004,"Milk, reduced fat",1077,"Milk, whole, 3.25% milkfat, with added vitamin D"
...,...,...,...,...,...,...
19301,99998130,Sauce as ingredient in hamburgers,9999,Not included in a food category,11935,Catsup
19305,99998210,Industrial oil as ingredient in food,9999,Not included in a food category,4701,"Oil, industrial, soy, fully hydrogenated"
19306,99998210,Industrial oil as ingredient in food,9999,Not included in a food category,4699,"Oil, industrial, soy, low linolenic"
19304,99998210,Industrial oil as ingredient in food,9999,Not included in a food category,4698,"Oil, industrial, canola, high oleic"


## User - Food Connection

In [7]:
# Arguments handed to concat_data_across_years in the cells below.
# NOTE(review): the tags look like two-digit start/end years of survey cycles — confirm against utils.
years = [
    '0304', '0506', '0708', '0910', '1112',
    '1314', '1516', '1718', '1720',
]
year_char, type_dietary = 'C', 'dietary'

In [34]:
# Load the 'DR1IFF' and 'DR2IFF' dietary files, each concatenated across all configured years.
df_IFF1, df_IFF2 = (
    concat_data_across_years(type_dietary, file_tag, years, year_char)
    for file_tag in ('DR1IFF', 'DR2IFF')
)

### We first create a table that contains the nutrition information of each food.

In [35]:
# Columns kept from the day-1 frame: SEQN, food_id, the gram amount (DR1IGRMS),
# followed by the nutrient columns that are later rescaled by DR1IGRMS.
food_columns_1 = ['SEQN', 'food_id', 'DR1IGRMS',
 'DR1IKCAL', 'DR1IPROT', 'DR1ICARB', 'DR1ISUGR', 'DR1IFIBE', 'DR1ITFAT',
 'DR1ISFAT', 'DR1IMFAT', 'DR1IPFAT', 'DR1ICHOL', 'DR1IATOC', 'DR1IATOA',
 'DR1IRET', 'DR1IVARA', 'DR1IACAR', 'DR1IBCAR', 'DR1ICRYP', 'DR1ILYCO',
 'DR1ILZ', 'DR1IVB1', 'DR1IVB2', 'DR1INIAC', 'DR1IVB6', 'DR1IFOLA',
 'DR1IFA', 'DR1IFF', 'DR1IFDFE', 'DR1ICHL', 'DR1IVB12', 'DR1IB12A',
 'DR1IVC', 'DR1IVD', 'DR1IVK', 'DR1ICALC', 'DR1IPHOS', 'DR1IMAGN',
 'DR1IIRON', 'DR1IZINC', 'DR1ICOPP', 'DR1ISODI', 'DR1IPOTA', 'DR1ISELE',
 'DR1ICAFF', 'DR1ITHEO', 'DR1IALCO', 'DR1IMOIS'
]

# The day-2 list is derived from the day-1 list instead of being typed out a
# second time. The two frames are stacked positionally further down, so the
# lists must stay in the same order; deriving one from the other guarantees it.
# Only the 'DR1I' prefix is swapped; 'SEQN' and 'food_id' pass through unchanged.
food_columns_2 = [
    'DR2I' + name[len('DR1I'):] if name.startswith('DR1I') else name
    for name in food_columns_1
]

In [68]:
# Give both days a common 'food_id' key, keep only the selected columns and cast them to float.
df_IFF1 = df_IFF1.rename(columns={'DR1IFDCD': 'food_id'})[food_columns_1].astype(float)
df_IFF2 = df_IFF2.rename(columns={'DR2IFDCD': 'food_id'})[food_columns_2].astype(float)

# Stack the day-2 rows under the day-1 rows purely by position; the combined
# frame carries the day-1 column labels for both days.
stacked_values = np.vstack([df_IFF1.to_numpy(), df_IFF2.to_numpy()])
df_food = pd.DataFrame(stacked_values, columns=df_IFF1.columns)

In [69]:
# Start the output table with one row per distinct food id seen in the stacked records.
processed_df = pd.DataFrame({'food_id': df_food['food_id'].unique()})

In [70]:
# Keep only the records that report a gram amount.
df_food = df_food[df_food['DR1IGRMS'].notna()]

In [71]:
# Rescale every nutrient column to "amount per 100 g": value / DR1IGRMS * 100.
# The frame is laid out as SEQN, food_id, DR1IGRMS, <nutrients...>, so the
# nutrient columns start at position 3.
nutrient_cols = df_food.columns.tolist()[3:]
per_100g = df_food[nutrient_cols].div(df_food['DR1IGRMS'], axis=0) * 100

# A record with DR1IGRMS == 0 divides to +/-inf, which would turn the whole
# per-food mean into inf. Treat such values as missing so the mean skips them.
per_100g = per_100g.replace([np.inf, -np.inf], np.nan)

# Put food_id back in front; SEQN and DR1IGRMS are not carried over.
per_100g.insert(0, 'food_id', df_food['food_id'])

# Average the per-100 g profile over all records of the same food.
df_food = per_100g.groupby('food_id').mean().reset_index()
# -1 marks nutrients with no usable value for a food.
df_food = df_food.fillna(-1)
df_food['food_id'] = df_food['food_id'].astype(int)

In [73]:
# Left join: every food id in processed_df is kept, with the averaged nutrient columns attached where available.
processed_df = pd.merge(processed_df, df_food, how='left', on='food_id')

In [77]:
# Write the per-food nutrition table to disk without the index column.
processed_df.to_csv(path_or_buf='../processed_data/food_nutrition.csv', index=False)

### Now go back to the user-food connection.

In [75]:
# Reduce both days to integer (SEQN, food_id) pairs, then append day 2 below day 1.
df_IFF1, df_IFF2 = (
    day_frame[['SEQN', 'food_id']].astype(int)
    for day_frame in (df_IFF1, df_IFF2)
)
df_food = pd.concat([df_IFF1, df_IFF2])

In [76]:
# Number of distinct foods appearing in the user-food pairs.
df_food['food_id'].nunique()

9640

In [12]:
# Food ids reported by users vs. food ids present in the food-ingredient table.
food_eaten = set(df_food['food_id'])
food_fndds_have = set(df['food_id'])

In [13]:
"""
There are 9640 food items show up in the users' diet. But from 15-20 FNDDS data, we only get 9260 food items that contains ingredients. And the overlap is only 8119.
This is not necessary a problem, because some food items, such as white sugar, don't have ingredients.
But we need to address this issue.
"""
len(food_eaten.intersection(food_fndds_have))

8119

In [14]:
"""
We use the food code NHANES provided, which is more complete than FNDDS. For duplications, we also keep the latest records.
In this way, every food users reported has its corresponding food description.
We use this as the connections between users and food.
"""

food_dictionary = concat_data_across_years(type_dietary, 'DRXFCD', years, year_char)
food_dictionary = food_dictionary.rename(columns={'DRXFDCD': 'food_id', 'DRXFCLD': 'food_desc'})

food_dictionary = food_dictionary[['food_id', 'food_desc', 'years']]
food_dictionary['food_id'] = food_dictionary['food_id'].astype(int)
food_dictionary = food_dictionary.drop_duplicates(subset='food_id', keep='last')

In [15]:
# Distinct food ids described by the NHANES food dictionary, and how many there are.
food_nhanes_have = set(food_dictionary['food_id'])
len(food_nhanes_have)

11338

In [16]:
# How many of the foods users reported are covered by the NHANES food dictionary.
len(food_eaten & food_nhanes_have)

9640

In [18]:
# Make sure both key columns of the user-food pairs are integers.
df_food = df_food.astype({'SEQN': int, 'food_id': int})

In [23]:
# Draw a sample of up to 10000 rows from the user-food pairs with SEQN removed.
# At this point df_food holds only (SEQN, food_id), so the sample contains food
# ids and no nutrient columns.
#
# The sample is deliberately NOT written to '../processed_data/food_nutrition.csv'.
# That path holds the per-100 g nutrition table exported from processed_df, and
# saving this frame there would replace it with a random list of food ids when
# the notebook is run top to bottom.
#
# n is capped at the frame length so small inputs do not raise, and the sample
# is seeded so re-runs give the same rows.
food_nutrition = df_food.drop('SEQN', axis=1).sample(
    n=min(10000, len(df_food)), random_state=0
)

In [38]:
# Attach the food_dictionary columns to every user-food pair; pairs without a match are kept.
df_food = pd.merge(df_food, food_dictionary, how='left', on='food_id')

In [39]:
# Write the user-food connection table to disk without the index column.
df_food.to_csv(path_or_buf='../processed_data/user_food.csv', index=False)

### Todo: Food - Category Connections

In [8]:
"""
A food doesn't necessarily have ingredients, but it should always belong to a food category.
This can be retrieved from FNDDS food tables. Turns out that all foods are assigned at least one ingredient and category.
"""
df_food_1516 = pd.read_excel('../data/2015-2016 Foods and Beverages.xlsx', skiprows=1)
df_food_1718 = pd.read_excel('../data/2017-2018 Foods and Beverages.xlsx', skiprows=1)
df_food_1920 = pd.read_excel('../data/2019-2020 Foods and Beverages.xlsx', skiprows=1)

In [14]:
# The 2015-2016 release labels the category column differently; align it with the later releases.
df_food_1516 = df_food_1516.rename(columns={'WWEIA Category code': 'WWEIA Category number'})

# Codes and descriptions drift slightly between FNDDS releases. The releases are
# stacked oldest-to-newest so that keep='last' retains the most recent row for
# each (food, category) combination.
category_columns = ['Food code', 'Main food description',
                    'WWEIA Category number', 'WWEIA Category description']
category_aliases = {
    'Food code': 'food_id',
    'Main food description': 'food_desc',
    'WWEIA Category number': 'WWEIA_id',
    'WWEIA Category description': 'WWEIA_desc',
}

# Resulting table: one row per food-category connection.
df_food_main = (
    pd.concat([df_food_1516, df_food_1718, df_food_1920])
    .loc[:, category_columns]
    .drop_duplicates(subset=['Food code', 'WWEIA Category number'], keep='last')
    .sort_values(by='Food code')
    .rename(columns=category_aliases)
)

In [15]:
# The number of distinct food ids equals the count found in the ingredient table,
# which means that table already covers the food-category relations.
pd.unique(df_food_main['food_id']).shape

(9260,)