In [19]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load dataset
# NOTE(review): machine-specific absolute path; the cell only runs where the CSV
# exists at exactly this location. Consider a configurable data directory.
data_path = r"D:\Projects\SmartFit-SmartDiet\data\raw\daily_food_nutrition_dataset.csv"
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print(df.head())

# Step 1: Check and handle missing values
print(df.isnull().sum())

# Rows missing a grouping key (Date, User_ID) or one of the four core nutrient
# columns are dropped; the other nutrient columns may still contain NaN.
required_cols = ['Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)', 'Date', 'User_ID']

# Step 2: Convert 'Date' to datetime
# Drop + conversion are chained so `df` is rebuilt as a fresh frame instead of
# being mutated in place (keeps the cell safe to re-run).
df = (
    df.dropna(subset=required_cols)
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Step 3: Aggregate nutrient intake per user per day
# Each (User_ID, Date) pair becomes one row holding the summed nutrients.
# `sum` skips NaN, so missing values in the optional columns count as 0.
nutrient_cols = [
    'Calories (kcal)',
    'Protein (g)',
    'Carbohydrates (g)',
    'Fat (g)',
    'Fiber (g)',
    'Sugars (g)',
    'Sodium (mg)',
    'Cholesterol (mg)',
    'Water_Intake (ml)',
]
daily_totals = df.groupby(['User_ID', 'Date'])[nutrient_cols].sum().reset_index()

print(daily_totals.head())

# Step 4: Create synthetic user profiles if no real profile data available
# A local, seeded generator makes the synthetic profiles (and therefore the
# saved dataset) reproducible across kernel restarts without touching the
# global numpy random state.
PROFILE_SEED = 42
rng = np.random.default_rng(PROFILE_SEED)

# Compute the id array once and size every synthetic column from it, so the
# column lengths can never disagree with the number of ids.
user_ids = daily_totals['User_ID'].unique()
n_users = len(user_ids)

user_profiles = pd.DataFrame({
    'User_ID': user_ids,
    'Age': rng.integers(18, 60, size=n_users),          # upper bound exclusive: 18..59
    'Gender': rng.choice(['Male', 'Female'], size=n_users),
    'Weight_kg': rng.uniform(50, 90, size=n_users),
    'Height_cm': rng.uniform(150, 190, size=n_users),
})

print(user_profiles.head())

# Step 5: Merge daily nutrient totals with user profiles
# `validate` asserts that each User_ID appears at most once in user_profiles,
# so the left join cannot silently duplicate daily rows.
data = daily_totals.merge(user_profiles, on='User_ID', how='left', validate='many_to_one')

# Step 6: Calculate BMI
# BMI = weight (kg) / height (m)^2; Height_cm is converted to metres first.
data['BMI'] = data['Weight_kg'] / (data['Height_cm'] / 100) ** 2

# Step 7: Calculate BMR using Mifflin-St Jeor formula
def calc_bmr(row):
    """Return the basal metabolic rate for one profile row.

    `row` must support key lookup for 'Gender', 'Weight_kg', 'Height_cm' and
    'Age' (e.g. a DataFrame row or a dict). The weight/height/age term is
    shared; the result is shifted by +5 when 'Gender' equals 'Male' and by
    -161 for any other value.
    """
    shared_term = 10 * row['Weight_kg'] + 6.25 * row['Height_cm'] - 5 * row['Age']
    sex_offset = 5 if row['Gender'] == 'Male' else -161
    return shared_term + sex_offset

# BMR is evaluated row by row with calc_bmr.
data['BMR'] = data.apply(calc_bmr, axis=1)

# Step 8: Estimate TDEE assuming sedentary activity
SEDENTARY_ACTIVITY_FACTOR = 1.2
data['TDEE'] = data['BMR'].mul(SEDENTARY_ACTIVITY_FACTOR)

# Step 9: Calculate nutrient ratios (% of calories)
# Each macro's grams are multiplied by the per-gram factor used here
# (4 for protein and carbohydrates, 9 for fat) and divided by the day's calories.
total_kcal = data['Calories (kcal)']
macro_ratio_spec = [
    ('protein_ratio', 'Protein (g)', 4),
    ('fat_ratio', 'Fat (g)', 9),
    ('carbs_ratio', 'Carbohydrates (g)', 4),
]
for ratio_col, grams_col, kcal_per_gram in macro_ratio_spec:
    data[ratio_col] = data[grams_col] * kcal_per_gram / total_kcal

# Step 10: Define calorie adjustment target
# Positive values mean intake was below the estimated expenditure.
data['calorie_adjustment'] = data['TDEE'].sub(data['Calories (kcal)'])

# Step 11: Encode gender
# drop_first=True keeps one indicator column fewer than there are categories.
data = pd.get_dummies(data, columns=['Gender'], drop_first=True)

# Step 12: Handle infinite or NaN values
# Division by zero calories yields inf/NaN ratios; both are mapped to 0.
data = data.replace([np.inf, -np.inf], 0).fillna(0)

# Step 13: Scale numeric features
profile_features = ['Age', 'Weight_kg', 'Height_cm', 'BMI', 'BMR', 'TDEE']
intake_features = ['Calories (kcal)', 'Protein (g)', 'Fat (g)', 'Carbohydrates (g)']
derived_features = ['protein_ratio', 'fat_ratio', 'carbs_ratio', 'calorie_adjustment']
features_to_scale = profile_features + intake_features + derived_features

# The scaler is fitted on, and applied to, the same full frame.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[features_to_scale])
data[features_to_scale] = scaled_values

# Step 14: Save preprocessed data using os.path
processed_dir = r"D:\Projects\SmartFit-SmartDiet\data\processed"
os.makedirs(processed_dir, exist_ok=True)  # no error if the folder already exists

save_path = os.path.join(processed_dir, 'preprocessed_diet_data.csv')
data.to_csv(save_path, index=False)
print(f"Preprocessed diet data saved at {save_path}")


Dataset shape: (10000, 14)
         Date  User_ID       Food_Item Category  Calories (kcal)  Protein (g)  \
0  2024-09-11      496            Eggs     Meat              173         42.4   
1  2024-12-17      201           Apple   Fruits               66         39.2   
2  2024-06-09      776  Chicken Breast     Meat              226         27.1   
3  2024-08-27      112          Banana   Fruits              116         43.4   
4  2024-07-28      622          Banana   Fruits              500         33.9   

   Carbohydrates (g)  Fat (g)  Fiber (g)  Sugars (g)  Sodium (mg)  \
0               83.7      1.5        1.5        12.7          752   
1               13.8      3.2        2.6        12.2          680   
2               79.1     25.8        3.2        44.7          295   
3               47.1     16.1        6.5        44.1          307   
4               75.8     47.0        7.8        19.4          358   

   Cholesterol (mg)  Meal_Type  Water_Intake (ml)  
0               125