# Table of Contents
- [Load food log DataFrames](#Load-food-log-DataFrames)
- [Inspect food logs](#Inspect-food-logs)
- [Fix column names for Patient_003 food log](#Fix-column-names-for-Patient_003-food-log)
- [Fix time of day column names for Patients 007, 013, 015, 016](#Fix-time-of-day-column-names-for-Patients-007-013-015-016)
- [Concatenate all food logs into one DataFrame](#Concatenate-all-food-logs-into-one-DataFrame)
- [Inspect final DataFrame with all food log data](#Inspect-final-DataFrame-with-all-food-log-data)
- [Convert columns to suitable data types](#Convert-columns-to-suitable-data-types)
- [Save final DataFrame as food_df CSV](#Save-final-DataFrame-as-food_df-CSV)


In [5]:
import pandas as pd
import os
import numpy as np

### Load food log dfs

In [6]:
def read_data(file_path):
    """Read CSV file and return DataFrame"""
    # Use os.path.basename and split text to get filename without extension
    # This is a more robust method compared to splitting on '/' and '.'
    key = os.path.splitext(os.path.basename(file_path))[0]
    df = pd.read_csv(file_path)

    return key, df


# Create a list of str-type patient IDs ranging from '001' to '016'
patient_ids = [str(i).zfill(3) for i in range(1, 17)]

# Filepaths in the local directory
template_filepaths = [
    'Data/001/Food_Log_001.csv',
]

# Dictionary to hold the dataframes for all patients
all_dfs = {}

# Iterate through patient ids
for patient_id in patient_ids:
    # Create a new dictionary for each patient
    patient_dfs = {}

    # Update the patient_id in filepaths
    # Use f-string for cleaner string formatting
    filepaths = [path.replace('001', patient_id) for path in template_filepaths]

    # Read each CSV file into a pandas DataFrame and store it in the dictionary
    for file_path in filepaths:
        key, df = read_data(file_path)
        patient_dfs[key] = df

    # Add this patient's dataframes to the overall dictionary
    all_dfs[patient_id] = patient_dfs


### Inspect food logs

In [7]:
# Iterate over all patients in the dictionary
for patient_id, patient_dfs in all_dfs.items():
    print(f"Patient ID: {patient_id}\n")

    # Iterate over all dataframes for the current patient
    for df_name, df in patient_dfs.items():
        print(f"    DataFrame: {df_name}")
        print("    Columns:", df.columns.tolist())
        print()


Patient ID: 001

    DataFrame: Food_Log_001
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']

Patient ID: 002

    DataFrame: Food_Log_002
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']

Patient ID: 003

    DataFrame: Food_Log_003
    Columns: ['2020-02-22', '10:30:00', '2020-02-22 10:30:00', 'Chicken Nuggets', '8.0', 'piece', 'Chicken Nuggets.1', '393.0', '19.0', '0.1', '20.0']

Patient ID: 004

    DataFrame: Food_Log_004
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']

Patient ID: 005

    DataFrame: Food_Log_005
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amoun

### Fix column names for Patient_003 food log

In [8]:
# Get the dataframe
df_003 = all_dfs['003']['Food_Log_003']

# Set the column names
df_003.columns = ['date', 'time', 'time_begin', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie',
                  'total_carb', 'sugar', 'protein']

# Now print out the columns to verify the change
print(df_003.columns.tolist())


['date', 'time', 'time_begin', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'sugar', 'protein']


### Fix time of day column names for Patients 007, 013, 015, 016

In [9]:
# Get the relevant DataFrames
df_007 = all_dfs['007']['Food_Log_007']
df_013 = all_dfs['013']['Food_Log_013']
df_015 = all_dfs['015']['Food_Log_015']
df_016 = all_dfs['016']['Food_Log_016']

# Rename the column 'time_of_day' to 'time' in each DataFrame
df_007.rename(columns={'time_of_day': 'time'}, inplace=True)
df_013.rename(columns={'time_of_day': 'time'}, inplace=True)
df_015.rename(columns={'time_of_day': 'time'}, inplace=True)
df_016.rename(columns={'time_of_day': 'time'}, inplace=True)


In [10]:
# Iterate over all patients in the dictionary
for patient_id, patient_dfs in all_dfs.items():
    print(f"Patient ID: {patient_id}\n")

    # Iterate over all dataframes for the current patient
    for df_name, df in patient_dfs.items():
        print(f"    DataFrame: {df_name}")
        print("    Columns:", df.columns.tolist())
        print()


Patient ID: 001

    DataFrame: Food_Log_001
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']

Patient ID: 002

    DataFrame: Food_Log_002
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']

Patient ID: 003

    DataFrame: Food_Log_003
    Columns: ['date', 'time', 'time_begin', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'sugar', 'protein']

Patient ID: 004

    DataFrame: Food_Log_004
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']

Patient ID: 005

    DataFrame: Food_Log_005
    Columns: ['date', 'time', 'time_begin', 'time_end', 'logged_food', 'amount', 'unit', 

In [12]:
# Add missing columns and fill with NaN or desired value
df_003['time_end'] = np.nan
df_003['dietary_fiber'] = np.nan
df_003['total_fat'] = np.nan

# Validate the columns have been added
print(df_003.columns.tolist())

['date', 'time', 'time_begin', 'logged_food', 'amount', 'unit', 'searched_food', 'calorie', 'total_carb', 'sugar', 'protein', 'time_end', 'dietary_fiber', 'total_fat']


### Concatenate all food logs into one DataFrame

In [13]:
def process_patient_dataframe(patient_dfs, patient_id):
    """Combine all dataframes of a patient, collapse them, set datetime as index,
    resample them and return the final dataframe."""
    patient_df = pd.concat(patient_dfs.values(), ignore_index=True)

    # Set 'datetime' as index of the DataFrame
    patient_df.set_index(['time_begin'], inplace=True)

    # Sort DataFrame by index
    patient_df.sort_index(inplace=True)

    # Group by 'datetime' and take the first entry for each group 
    collapsed_df = patient_df.groupby('time_begin').first()


    # Add new column 'patient_id' with a value equal to the index in the list + 1
    patient_df['patient_id'] = patient_id

    return patient_df


# process each patient's dataframes and store in a list
resampled_df_list = [process_patient_dataframe(dfs, id) for id, dfs in all_dfs.items()]


### Inspect final DataFrame with all food log data

In [14]:
final_df = pd.concat(resampled_df_list)

In [15]:
final_df.head()


Unnamed: 0_level_0,date,time,time_end,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,patient_id
time_begin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-02-13 18:00:00,2020-02-13,18:00:00,,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3,1
2020-02-13 20:30:00,2020-02-13,20:30:00,,Chicken Leg,1.0,,chicken leg,475.0,0.0,0.0,0.0,62.0,23.0,1
2020-02-13 20:30:00,2020-02-13,20:30:00,,Asparagus,4.0,,Asparagus,13.0,2.5,1.2,0.8,1.4,0.1,1
2020-02-14 07:10:00,2020-02-14,07:10:00,,Natrel Lactose Free 2 Percent,8.0,fluid ounce,(Natrel) Lactose Free 2% Partly Skimmed Milk,120.0,9.0,,8.0,12.0,,1
2020-02-14 07:10:00,2020-02-14,07:10:00,,Standard Breakfast,0.75,cup,"(Kellogg's) Frosted Flakes, Cereal",110.0,26.0,,10.0,1.0,,1


In [16]:
final_df['searched_food'].value_counts()


searched_food
Coffee                                                52
(Natrel) Lactose Free 2% Partly Skimmed Milk          36
(Kellogg's) Frosted Flakes                            28
Stevia                                                19
Coffee Cream                                          18
                                                      ..
(Outback Steakhouse) Grilled Asparagus                 1
(Outback Steakhouse) Aussie Tacos, Chicken             1
Mixed Drink                                            1
(Outback Steakhouse) Honey Wheat Bread with Butter     1
Pound Cake                                             1
Name: count, Length: 440, dtype: int64

In [17]:
final_df.drop(['date', 'time'], axis=1, inplace=True)

In [18]:
# Remove the 'time_end' column
final_df.drop('time_end', axis=1, inplace=True)


In [19]:
# Verify the change
final_df.columns

Index(['logged_food', 'amount', 'unit', 'searched_food', 'calorie',
       'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat',
       'patient_id'],
      dtype='object')

### Convert columns to suitable data types

In [20]:
# Convert columns to suitable data types
for col in ['amount', 'calorie', 'total_carb', 'dietary_fiber', 'sugar', 'protein', 'total_fat']:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

# Group by 'patient_id' and 'time_begin' and apply sum to numerical columns
final_df_grouped = final_df.groupby(['patient_id', 'time_begin']).sum()

# Reset index and sort by 'patient_id' and 'time_begin'
final_df_grouped.reset_index(inplace=True)
final_df_grouped.sort_values(['patient_id', 'time_begin'], inplace=True)

# Check the result
final_df_grouped.head(20)


Unnamed: 0,patient_id,time_begin,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat
0,1,2020-02-13 18:00:00,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3
1,1,2020-02-13 20:30:00,Chicken LegAsparagus,5.0,0,chicken legAsparagus,488.0,2.5,1.2,0.8,63.4,23.1
2,1,2020-02-14 07:10:00,Natrel Lactose Free 2 PercentStandard Breakfast,8.75,fluid ouncecup,(Natrel) Lactose Free 2% Partly Skimmed Milk(K...,230.0,35.0,0.0,18.0,13.0,0.0
3,1,2020-02-14 09:38:00,Breakfast Trail Mix,0.5,cup,"(Giant) Breakfast Blend, Trail Mix",280.0,30.0,0.0,22.0,4.0,0.0
4,1,2020-02-14 12:38:00,Spinach Salad w/ strawberries and cheeseEgg,201.0,grams,Spinach And Strawberry SaladLarge Egg,358.0,14.4,0.0,8.7,13.9,0.0
5,1,2020-02-14 19:30:00,Acai Smoothie,20.0,fluid ounce,(Smoothie King) Acai Adventure Smoothie,440.0,92.0,0.0,75.0,5.0,0.0
6,1,2020-02-14 20:00:00,(Trader Joe's) Mac and CheeseCoconut Shrimp,5.5,cup,(Trader Joe's) Macaroni and CheeseCoconut Shrimp,452.0,53.5,0.0,1.6,16.0,0.0
7,1,2020-02-15 07:30:00,Spinach Smoothie,20.0,fluid ounce,Spinach Smoothie,308.0,69.0,0.0,38.0,7.2,0.0
8,1,2020-02-15 11:02:00,Breakfast Trail Mix,0.5,cup,"(Giant) Breakfast Blend, Trail Mix",280.0,30.0,0.0,22.0,4.0,0.0
9,1,2020-02-15 12:38:00,"Spinach Salad w/ blueberries, egg, and cheeseEgg",201.0,grams,Spinach And Strawberry SaladLarge Egg,358.0,14.4,0.0,8.7,13.9,0.0


In [21]:
food_df = final_df_grouped.copy()

### Save final DataFrame as food_df CSV

In [22]:
food_df.to_csv('food_df.csv')

In [23]:
food_df.head()

Unnamed: 0,patient_id,time_begin,logged_food,amount,unit,searched_food,calorie,total_carb,dietary_fiber,sugar,protein,total_fat
0,1,2020-02-13 18:00:00,Berry Smoothie,20.0,fluid ounce,Strawberry Smoothie,456.0,85.0,1.7,83.0,16.0,3.3
1,1,2020-02-13 20:30:00,Chicken LegAsparagus,5.0,0,chicken legAsparagus,488.0,2.5,1.2,0.8,63.4,23.1
2,1,2020-02-14 07:10:00,Natrel Lactose Free 2 PercentStandard Breakfast,8.75,fluid ouncecup,(Natrel) Lactose Free 2% Partly Skimmed Milk(K...,230.0,35.0,0.0,18.0,13.0,0.0
3,1,2020-02-14 09:38:00,Breakfast Trail Mix,0.5,cup,"(Giant) Breakfast Blend, Trail Mix",280.0,30.0,0.0,22.0,4.0,0.0
4,1,2020-02-14 12:38:00,Spinach Salad w/ strawberries and cheeseEgg,201.0,grams,Spinach And Strawberry SaladLarge Egg,358.0,14.4,0.0,8.7,13.9,0.0
