# Data generation

After multiple failed requests for access to the data used in the paper, and no answer from the authors, I decided to generate the data myself. The generated data should have a structure close to the one used in the paper, although it is less complex and less accurate. Nevertheless, I tried to keep the data generation process logical, basing it on common sense, information from the paper and LLM knowledge.

# Imports

In [1]:
import numpy as np
import pandas as pd

# Generation

In [2]:
np.random.seed(0) # To keep consistency

In [3]:
num_users: int = 10000 # number of samples in the dataset

In [4]:
type(np.random.rand() < 0.1)

bool

In [5]:
def _sample_meal_classes(BMI: float, num_meals: int = 6) -> list:
    """Draw ``num_meals`` meal class labels (0-9), biased by BMI category.

    Classes 0-4 are treated as high-calorie and 5-9 as low-calorie.
    Underweight users (BMI < 18.5) get a high-calorie class with 80% chance,
    overweight users (BMI > 25) get a low-calorie class with 80% chance, and
    everyone else gets a uniformly random class (no bias draw is made).
    """
    high_calorie, low_calorie = (0, 5), (5, 10)  # half-open randint bounds
    if BMI < 18.5:
        preferred, fallback = high_calorie, low_calorie
    elif BMI > 25:
        preferred, fallback = low_calorie, high_calorie
    else:
        return [np.random.randint(0, 10) for _ in range(num_meals)]

    meal_classes: list = []
    for _ in range(num_meals):
        # The bias draw comes first, then the class draw, so the random
        # stream is consumed in a fixed order for a given seed.
        low, high = preferred if np.random.rand() < 0.8 else fallback
        meal_classes.append(np.random.randint(low, high))
    return meal_classes


def generate_data(num_users: int) -> tuple:
    """Generate a synthetic nutrition dataset of ``num_users`` random users.

    All randomness comes from NumPy's global random state (``np.random.*``),
    so call ``np.random.seed`` beforehand to get reproducible output.

    For each user the function samples weight (kg), height (cm), age and
    physical activity level (PAL), derives BMI and BMR (Mifflin-St Jeor,
    male variant), samples three medical-condition flags (10% chance each),
    and computes a target daily energy intake ``BMR * PAL * D`` where ``D``
    is 1.1 for BMI < 18.5, 0.9 for BMI > 25 and 1.0 otherwise. Age is only
    used for the BMR and is not part of the returned feature vector.

    Args:
        num_users: Number of user samples to generate.

    Returns:
        A tuple ``(X_features, Y_meals, min_macros, max_macros, target_EIs)``
        of lists with one entry per user:

        * ``X_features``: ``[weight, height, BMI, BMR, PAL, has_CVD,
          has_T2D, has_iron_def]`` (flags encoded as 0/1 ints).
        * ``Y_meals``: list of 6 meal class labels in the range 0-9.
        * ``min_macros`` / ``max_macros``: ``[protein, carbs, fat, SFA]``
          bounds in grams, derived from fixed fractions of the target
          energy intake.
        * ``target_EIs``: the target daily energy intake.
    """
    # (min fraction of energy, max fraction of energy, kcal per gram),
    # in the order protein, carbs, fat, SFA.
    macro_ranges = (
        (0.10, 0.35, 4.0),
        (0.45, 0.65, 4.0),
        (0.20, 0.35, 9.0),
        (0.00, 0.10, 9.0),
    )

    X_features: list = []    # user feature vectors
    Y_meals: list = []       # meal class sequences (length 6 for each user)
    target_EIs: list = []    # target daily energy intakes
    min_macros: list = []    # minimum recommended macronutrient amounts (g)
    max_macros: list = []    # maximum recommended macronutrient amounts (g)

    for _ in range(num_users):
        # Random user profile. The order of these draws must not change,
        # otherwise seeded runs produce different data.
        weight: float = round(np.random.uniform(50, 100), 2)   # kg
        height: float = np.random.uniform(150, 200)            # cm
        BMI: float = weight / ((height/100)**2)
        age: int = np.random.randint(18, 60)
        BMR: float = 10*weight + 6.25*height - 5*age + 5
        PAL: float = np.random.uniform(1.2, 2.0)
        has_CVD: bool = np.random.rand() < 0.1
        has_T2D: bool = np.random.rand() < 0.1
        has_iron_def: bool = np.random.rand() < 0.1

        X_features.append(
            [weight, height, BMI, BMR, PAL, int(has_CVD), int(has_T2D), int(has_iron_def)]
        )

        # Factorial method (BMR * PAL), adjusted by D to encourage weight
        # gain for underweight users and weight loss for overweight users.
        D: float = 1.0
        if BMI < 18.5:
            D = 1.1
        elif BMI > 25:
            D = 0.9
        target_EI = BMR * PAL * D
        target_EIs.append(target_EI)

        # Convert the energy fractions to grams using each macro's energy density.
        min_macros.append([lo * target_EI / kcal_per_g for lo, _, kcal_per_g in macro_ranges])
        max_macros.append([hi * target_EI / kcal_per_g for _, hi, kcal_per_g in macro_ranges])

        Y_meals.append(_sample_meal_classes(BMI, num_meals=6))

    return X_features, Y_meals, min_macros, max_macros, target_EIs

In [6]:
X_features, Y_meals, min_macros, max_macros, target_EIs = generate_data(num_users=num_users)

In [7]:
type(X_features), type(Y_meals), type(min_macros), type(max_macros), type(target_EIs)

(list, list, list, list, list)

In [8]:
# Convert the Python lists returned by generate_data into NumPy arrays.
# The annotations use np.ndarray (the array type); np.array is the
# constructor function, not a type.
X: np.ndarray = np.array(X_features, dtype=float)
Y: np.ndarray = np.array(Y_meals, dtype=int)
min_macros: np.ndarray = np.array(min_macros, dtype=float)
max_macros: np.ndarray = np.array(max_macros, dtype=float)
target_EIs: np.ndarray = np.array(target_EIs, dtype=float)

In [9]:
type(X), type(Y), type(min_macros), type(max_macros), type(target_EIs)

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

# Data Exploration

Let's start by exploring the data generated. We will see if we can improve it for further training.

In [10]:
X[0]

array([8.57600000e+01, 1.80138169e+02, 2.64285469e+01, 1.88346356e+03,
       1.87780139e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [11]:
X.shape, Y.shape, min_macros.shape, max_macros.shape, target_EIs.shape

((10000, 8), (10000, 6), (10000, 4), (10000, 4), (10000,))

The shapes are correct. `X` has 8 features as in the paper, `Y` has 6 columns for the 6 meals in a day, `min_macros` and `max_macros` have 4 columns for the 4 macros selected, and `target_EIs` is a 1-D array holding one target energy intake per user.

In [12]:
user_features = ["weight","height","BMI","BMR","PAL","has_CVD","has_T2D","has_iron_def"]
df_features = pd.DataFrame(X, columns=user_features)
df_features.head(2)

Unnamed: 0,weight,height,BMI,BMR,PAL,has_CVD,has_T2D,has_iron_def
0,85.76,180.138169,26.428547,1883.463555,1.877801,0.0,0.0,0.0
1,98.93,189.957928,27.416573,1946.537051,1.824423,0.0,0.0,0.0


Normalize the data for better training and to avoid exploding gradients.

In [13]:
# Standardize every feature column (the 0/1 has_* flag columns included)
# by subtracting the column mean and dividing by the column standard deviation.
df_features_normalized = (df_features - df_features.mean())/df_features.std() # Normalize data
df_features_normalized.head(2)

Unnamed: 0,weight,height,BMI,BMR,PAL,has_CVD,has_T2D,has_iron_def
0,0.737059,0.369419,0.211762,1.243814,1.208554,-0.329601,-0.334242,-0.348845
1,1.648117,1.050903,0.366467,1.588776,0.977632,-0.329601,-0.334242,-0.348845


Convert boolean columns to boolean type.

In [14]:
# Cast the medical-condition flag columns (those prefixed with "has") to bool.
flag_columns = [name for name in df_features.columns if name.startswith("has")]
df_features[flag_columns] = df_features[flag_columns].astype(bool)
df_features.head(2)

Unnamed: 0,weight,height,BMI,BMR,PAL,has_CVD,has_T2D,has_iron_def
0,85.76,180.138169,26.428547,1883.463555,1.877801,False,False,False
1,98.93,189.957928,27.416573,1946.537051,1.824423,False,False,False


In [15]:
meal_columns = [f"meal_{i+1}" for i in range(Y.shape[1])]
df_meals = pd.DataFrame(Y, columns=meal_columns)
df_meals.head(2)

Unnamed: 0,meal_1,meal_2,meal_3,meal_4,meal_5,meal_6
0,9,5,6,6,5,0
1,2,8,7,7,9,5


In [16]:
# Gather the distinct meal class labels that occur in any meal column.
unique_values = [df_meals[column].unique() for column in df_meals.columns]
my_set = {int(label) for column_labels in unique_values for label in column_labels}
my_set

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

In [17]:
min_macro_columns = ["min_prot", "min_carb", "min_fat", "min_sfa"]
df_min_macros = pd.DataFrame(min_macros, columns=min_macro_columns)
df_min_macros.head(2)

Unnamed: 0,min_prot,min_carb,min_fat,min_sfa
0,79.577336,358.098011,70.73541,0.0
1,79.904422,359.569898,71.026153,0.0


Before standardizing, we need to check the `min_sfa` values. If they are all zeros, we can leave the column out of the standardization.

In [18]:
df_min_macros["min_sfa"].unique()

array([0.])

In [19]:
# min_sfa is all zeros, so it is left out of the standardization
# and re-attached unchanged afterwards.
df_min_macros_temp = df_min_macros.drop(columns="min_sfa")
df_min_macros_temp = (df_min_macros_temp - df_min_macros_temp.mean()) / df_min_macros_temp.std()

df_min_macros_normalized = pd.concat([df_min_macros_temp, df_min_macros["min_sfa"]], axis=1)
df_min_macros_normalized.head(2)

Unnamed: 0,min_prot,min_carb,min_fat,min_sfa
0,1.320618,1.320618,1.320618,0.0
1,1.348376,1.348376,1.348376,0.0


In [20]:
max_macro_columns = ["max_prot", "max_carb", "max_fat", "max_sfa"]
df_max_macros = pd.DataFrame(max_macros, columns=max_macro_columns)
df_max_macros.head(2)

Unnamed: 0,max_prot,max_carb,max_fat,max_sfa
0,278.520676,517.252683,123.786967,35.367705
1,279.665476,519.378741,124.295767,35.513076


In [21]:
df_max_macros_normalized = (df_max_macros - df_max_macros.mean())/df_max_macros.std()
df_max_macros_normalized.head(2)

Unnamed: 0,max_prot,max_carb,max_fat,max_sfa
0,1.320618,1.320618,1.320618,1.320618
1,1.348376,1.348376,1.348376,1.348376


In [22]:
df_target_EIs = pd.DataFrame(target_EIs, columns=["target_EI"])
df_target_EIs.head(2)

Unnamed: 0,target_EI
0,3183.093435
1,3196.176868


In [23]:
df_target_EIs_normalized = (df_target_EIs - df_target_EIs.mean())/ df_target_EIs.std()
df_target_EIs_normalized.head(2)

Unnamed: 0,target_EI
0,1.320618
1,1.348376


In [24]:
# Assemble the final dataset column-wise: normalized user features, the raw
# meal class labels, the normalized macro bounds and the normalized target EI.
df_full = pd.concat([df_features_normalized, df_meals, df_min_macros_normalized, df_max_macros_normalized, df_target_EIs_normalized], axis=1)
df_full.head()

Unnamed: 0,weight,height,BMI,BMR,PAL,has_CVD,has_T2D,has_iron_def,meal_1,meal_2,...,meal_6,min_prot,min_carb,min_fat,min_sfa,max_prot,max_carb,max_fat,max_sfa,target_EI
0,0.737059,0.369419,0.211762,1.243814,1.208554,-0.329601,-0.334242,-0.348845,9,5,...,0,1.320618,1.320618,1.320618,0.0,1.320618,1.320618,1.320618,1.320618,1.320618
1,1.648117,1.050903,0.366467,1.588776,0.977632,-0.329601,-0.334242,-0.348845,2,8,...,5,1.348376,1.348376,1.348376,0.0,1.348376,1.348376,1.348376,1.348376,1.348376
2,0.397401,1.552624,-0.67062,0.518913,-0.166482,-0.329601,-0.334242,2.866316,9,3,...,0,0.362253,0.362253,0.362253,0.0,0.362253,0.362253,0.362253,0.362253,0.362253
3,-0.645786,-0.460085,-0.285705,-0.722933,0.471452,-0.329601,-0.334242,-0.348845,4,4,...,7,0.087511,0.087511,0.087511,0.0,0.087511,0.087511,0.087511,0.087511,0.087511
4,-0.567616,0.619218,-0.823533,0.280668,-1.043371,-0.329601,-0.334242,2.866316,2,4,...,2,-0.516128,-0.516128,-0.516128,0.0,-0.516128,-0.516128,-0.516128,-0.516128,-0.516128


In [25]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   weight        10000 non-null  float64
 1   height        10000 non-null  float64
 2   BMI           10000 non-null  float64
 3   BMR           10000 non-null  float64
 4   PAL           10000 non-null  float64
 5   has_CVD       10000 non-null  float64
 6   has_T2D       10000 non-null  float64
 7   has_iron_def  10000 non-null  float64
 8   meal_1        10000 non-null  int64  
 9   meal_2        10000 non-null  int64  
 10  meal_3        10000 non-null  int64  
 11  meal_4        10000 non-null  int64  
 12  meal_5        10000 non-null  int64  
 13  meal_6        10000 non-null  int64  
 14  min_prot      10000 non-null  float64
 15  min_carb      10000 non-null  float64
 16  min_fat       10000 non-null  float64
 17  min_sfa       10000 non-null  float64
 18  max_prot      10000 non-nul

In [26]:
df_full.describe()

Unnamed: 0,weight,height,BMI,BMR,PAL,has_CVD,has_T2D,has_iron_def,meal_1,meal_2,...,meal_6,min_prot,min_carb,min_fat,min_sfa,max_prot,max_carb,max_fat,max_sfa,target_EI
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.730101e-14,2.250218e-14,1.56831e-14,-1.058424e-14,-3.748291e-15,-2.842171e-17,-3.1974420000000004e-17,4.6540550000000006e-17,4.9648,4.9453,...,4.9725,-4.567369e-15,-2.813749e-16,-4.649081e-15,0.0,1.738769e-14,-4.810374e-16,-1.543867e-14,-4.649081e-15,1.182343e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.836823,2.83921,...,2.867258,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
min,-1.736704,-1.721929,-1.963093,-2.717928,-1.723661,-0.3296006,-0.3342418,-0.348845,0.0,0.0,...,0.0,-2.52702,-2.52702,-2.52702,0.0,-2.52702,-2.52702,-2.52702,-2.52702,-2.52702
25%,-0.8733774,-0.8648403,-0.764437,-0.7348211,-0.8706945,-0.3296006,-0.3342418,-0.348845,3.0,3.0,...,3.0,-0.7481879,-0.7481879,-0.7481879,0.0,-0.7481879,-0.7481879,-0.7481879,-0.7481879,-0.7481879
50%,0.01450693,-0.006056629,-0.1056404,-0.004988837,0.003276508,-0.3296006,-0.3342418,-0.348845,5.0,5.0,...,5.0,-0.06967191,-0.06967191,-0.06967191,0.0,-0.06967191,-0.06967191,-0.06967191,-0.06967191,-0.06967191
75%,0.8567346,0.8726734,0.6619641,0.7459397,0.8644255,-0.3296006,-0.3342418,-0.348845,7.0,7.0,...,7.0,0.698215,0.698215,0.698215,0.0,0.698215,0.698215,0.698215,0.698215,0.698215
max,1.721445,1.747689,2.999132,2.696972,1.736572,3.033671,2.991547,2.866316,9.0,9.0,...,9.0,3.358114,3.358114,3.358114,0.0,3.358114,3.358114,3.358114,3.358114,3.358114


# Save Data

In [None]:
# df_full.to_csv("../../../../ressources/datasets/synthetic_nutrition_data.csv", index=False)