In [None]:
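# Prerequisites: install the required libraries into the kernel's environment.
# fastparquet is needed because the dataset is read with engine="fastparquet".
# %pip install pandas pyarrow fastparquet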

import os
import sys

import numpy as np
import pandas as pd


In [2]:
# Read the dataset from a local parquet file (for example after downloading
# it from Hugging Face).
# The path is relative to the notebook's working directory; adjust it to
# your project layout if needed.
file_path = "../raw_data/data.parquet"
# engine="fastparquet" requires the fastparquet package to be installed.
df = pd.read_parquet(file_path, engine="fastparquet")

# Quick look at the result: row count and the first rows.
print(f"Dataset loaded with {len(df)} rows.")
print("Sample of the data:")
display(df.head())

Dataset loaded with 501887 rows.
Sample of the data:


Unnamed: 0,fdc_id,food_item,data_type,ingredients,serving_description,serving_amount,serving_unit,serving_gram_weight,Caffeine,"Calcium, Ca",...,Energy,"Fatty acids, total saturated","Fatty acids, total trans","Fiber, total dietary","Iron, Fe","Potassium, K",Protein,"Sodium, Na",Total lipid (fat),Vitamin D (D2 + D3)
0,1104647,"Garlic, raw",foundation_food,,Standard 100g serving,100.0,g,100.0,,,...,597.0,,,2.7,,,6.62,,0.38,
1,1104705,"Flour, soy, defatted",foundation_food,,Standard 100g serving,100.0,g,100.0,,338.0,...,366.0,,,,7.34,2480.0,51.1,2.0,3.33,
2,1104766,"Flour, soy, full-fat",foundation_food,,Standard 100g serving,100.0,g,100.0,,258.0,...,1890.0,,,,9.51,1860.0,38.6,2.0,20.7,
3,1104812,"Flour, rice, brown",foundation_food,,Standard 100g serving,100.0,g,100.0,,10.0,...,365.25,,,,1.5,265.0,7.19,1.0,3.85,
4,1104867,"Flour, rice, glutinous",foundation_food,,Standard 100g serving,100.0,g,100.0,,10.0,...,1500.0,,,,0.34,80.0,6.69,6.0,1.16,


In [5]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(501887, 22)

In [3]:
# Count the rows that have no value in the "Energy" column.
df["Energy"].isna().sum()

np.int64(51860)

In [6]:
# Columns carried over from the raw frame into the cleaned frame.
# The order here is the column order of the resulting frame.
COLUMNS_TO_KEEP = [
    # Identification
    "food_item", "data_type",
    # Energy as provided by the dataset
    "Energy",
    # Fat
    "Total lipid (fat)", "Fatty acids, total saturated",
    # Carbohydrate, protein and fibre
    "Carbohydrate, by difference", "Protein", "Fiber, total dietary",
]

In [7]:
# Keep only the columns listed in COLUMNS_TO_KEEP. The explicit copy makes
# df_clean independent of the raw frame, so later edits never touch `df`.
df_clean = df[COLUMNS_TO_KEEP].copy()

# Selecting columns does not drop rows, so this should equal len(df).
# (A former mid-cell `df_clean.head(20)` was removed: only the last
# expression of a cell is displayed, so its result was silently discarded.)
len(df_clean)

501887

In [8]:
# Keep every row whose data_type is not "branded_food" and renumber the
# remaining rows from 0.
df_clean = (
    df_clean
    .loc[df_clean["data_type"].ne("branded_food")]
    .reset_index(drop=True)
)

len(df_clean)

8028

In [9]:
# Derive energy in kcal from the macronutrients using the factors
# 9 (fat), 4 (carbohydrate) and 4 (protein), rounded to one decimal.
# A NaN in any of the three inputs makes the result NaN.
# NOTE(review): the reported "Energy" column looks inconsistent in the sample
# rows (e.g. 597.0 reported vs. 142.7 computed for raw garlic) — presumably a
# kJ/kcal mix, which would explain the recomputation; confirm.
df_clean["energy_kcal_calculated"] = (
    df_clean["Total lipid (fat)"].mul(9)
    .add(df_clean["Carbohydrate, by difference"].mul(4))
    .add(df_clean["Protein"].mul(4))
    .round(1)
)

df_clean.head(50)

Unnamed: 0,food_item,data_type,Energy,Total lipid (fat),"Fatty acids, total saturated","Carbohydrate, by difference",Protein,"Fiber, total dietary",energy_kcal_calculated
0,"Garlic, raw",foundation_food,597.0,0.38,,28.2,6.62,2.7,142.7
1,"Flour, soy, defatted",foundation_food,366.0,3.33,,32.9,51.1,,366.0
2,"Flour, soy, full-fat",foundation_food,1890.0,20.7,,27.9,38.6,,452.3
3,"Flour, rice, brown",foundation_food,365.25,3.85,,75.5,7.19,,365.4
4,"Flour, rice, glutinous",foundation_food,1500.0,1.16,,80.1,6.69,,357.6
5,"Flour, pastry, unenriched, unbleached",foundation_food,1500.0,1.64,,77.2,8.75,,358.6
6,"Onions, white, raw",foundation_food,35.0,0.13,,7.68,0.89,1.2,35.4
7,"Apples, red delicious, with skin, raw",foundation_food,258.0,0.21,,14.8,0.19,2.0,61.8
8,"Apples, honeycrisp, with skin, raw",foundation_food,60.0,0.1,,14.7,0.1,1.7,60.1
9,"Apples, granny smith, with skin, raw",foundation_food,59.0,0.14,,14.2,0.27,2.5,59.1


In [None]:
# Fill missing saturated fat with 0 where it is likely 0.
# Saturated fat is part of total fat, so for rows whose total fat is below 3
# (presumably g per 100 g — confirm units) a missing value is set to 0.
# Rows with total fat of 3 or more, or with missing total fat, keep their NaN.
df_clean.loc[
    df_clean["Total lipid (fat)"].lt(3)
    & df_clean["Fatty acids, total saturated"].isna(),
    "Fatty acids, total saturated"
] = 0

df_clean.head(50)

Unnamed: 0,food_item,data_type,Energy,Total lipid (fat),"Fatty acids, total saturated","Carbohydrate, by difference",Protein,"Fiber, total dietary",energy_kcal_calculated
0,"Garlic, raw",foundation_food,597.0,0.38,0.0,28.2,6.62,2.7,142.7
1,"Flour, soy, defatted",foundation_food,366.0,3.33,,32.9,51.1,,366.0
2,"Flour, soy, full-fat",foundation_food,1890.0,20.7,,27.9,38.6,,452.3
3,"Flour, rice, brown",foundation_food,365.25,3.85,,75.5,7.19,,365.4
4,"Flour, rice, glutinous",foundation_food,1500.0,1.16,0.0,80.1,6.69,,357.6
5,"Flour, pastry, unenriched, unbleached",foundation_food,1500.0,1.64,0.0,77.2,8.75,,358.6
6,"Onions, white, raw",foundation_food,35.0,0.13,0.0,7.68,0.89,1.2,35.4
7,"Apples, red delicious, with skin, raw",foundation_food,258.0,0.21,0.0,14.8,0.19,2.0,61.8
8,"Apples, honeycrisp, with skin, raw",foundation_food,60.0,0.1,0.0,14.7,0.1,1.7,60.1
9,"Apples, granny smith, with skin, raw",foundation_food,59.0,0.14,0.0,14.2,0.27,2.5,59.1


In [None]:
# Fill missing fibre with 0 where it is likely 0.
# For rows whose carbohydrate value is below 5 (presumably g per 100 g —
# confirm units) a missing fibre value is set to 0, on the assumption that
# such foods contain negligible fibre.
# Rows with carbohydrate of 5 or more, or with missing carbohydrate, keep
# their NaN.
df_clean.loc[
    df_clean["Carbohydrate, by difference"].lt(5)
    & df_clean["Fiber, total dietary"].isna(),
    "Fiber, total dietary"
] = 0

df_clean.head(50)

Unnamed: 0,food_item,data_type,Energy,Total lipid (fat),"Fatty acids, total saturated","Carbohydrate, by difference",Protein,"Fiber, total dietary",energy_kcal_calculated
0,"Garlic, raw",foundation_food,597.0,0.38,0.0,28.2,6.62,2.7,142.7
1,"Flour, soy, defatted",foundation_food,366.0,3.33,,32.9,51.1,,366.0
2,"Flour, soy, full-fat",foundation_food,1890.0,20.7,,27.9,38.6,,452.3
3,"Flour, rice, brown",foundation_food,365.25,3.85,,75.5,7.19,,365.4
4,"Flour, rice, glutinous",foundation_food,1500.0,1.16,0.0,80.1,6.69,,357.6
5,"Flour, pastry, unenriched, unbleached",foundation_food,1500.0,1.64,0.0,77.2,8.75,,358.6
6,"Onions, white, raw",foundation_food,35.0,0.13,0.0,7.68,0.89,1.2,35.4
7,"Apples, red delicious, with skin, raw",foundation_food,258.0,0.21,0.0,14.8,0.19,2.0,61.8
8,"Apples, honeycrisp, with skin, raw",foundation_food,60.0,0.1,0.0,14.7,0.1,1.7,60.1
9,"Apples, granny smith, with skin, raw",foundation_food,59.0,0.14,0.0,14.2,0.27,2.5,59.1


In [None]:
# Columns that are mandatory for modelling: a row with NaN in at least one
# of them is dropped.
# NOTE(review): fibre and saturated fat are part of this list, so rows still
# missing those values after the zero-fill steps are removed as well, and no
# NaN remains in these columns afterwards. Take the two columns out of the
# list if such rows should be kept instead.
cols_required = [
    "food_item",
    "energy_kcal_calculated",
    "Total lipid (fat)",
    "Carbohydrate, by difference",
    "Protein",
    "Fiber, total dietary",
    "Fatty acids, total saturated"
]

df_clean = df_clean.dropna(subset=cols_required)

In [10]:
# Number of rows currently in the cleaned frame.

len(df_clean)

7521

In [63]:
# Make the parent folder of the current working directory importable so that
# packages located there can be imported by later cells.
# The membership check keeps the cell idempotent: re-running it does not
# append the same path to sys.path a second time.
if os.path.abspath("..") not in sys.path:
    sys.path.append(os.path.abspath(".."))

In [None]:
from nutrimap_app.category_mapping import assign_food_group

# Label every row with a food group: assign_food_group is called once per row
# (axis=1) and its result is stored in the "food_category" column.
df_clean = df_clean.assign(
    food_category=df_clean.apply(assign_food_group, axis=1)
)
df_clean.head(50)