In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
try:
    import catboost as cb
    HAVE_CATBOOST = True
except ImportError:
    HAVE_CATBOOST = False
    print("CatBoost not available - will skip CatBoost model")
import joblib


In [None]:
DATA_DIR = os.path.dirname(os.path.abspath("__file__")) 
foods_path = os.path.join(DATA_DIR, 'foods.csv')
health_path = os.path.join(DATA_DIR, 'health_activity_data.csv')

if not os.path.exists(foods_path) or not os.path.exists(health_path):
    raise FileNotFoundError('Ensure foods.csv and health_activity_data.csv are present in the project directory')

foods = pd.read_csv(foods_path)
health = pd.read_csv(health_path)


In [None]:

def find_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

col_weight = find_col(health, ['weight_kg', 'weight', 'Weight_kg', 'body_weight'])
col_height = find_col(health, ['height_cm', 'height', 'Height_cm', 'height_m'])
col_age = find_col(health, ['age', 'Age'])
col_sex = find_col(health, ['sex', 'gender', 'Sex', 'Gender'])
col_activity = find_col(health, ['activity_level', 'activity', 'Activity', 'activity_factor'])
possible_goal_col = find_col(health, ['calorie_goal', 'daily_calories', 'calories', 'maintenance_calories', 'calories_per_day'])


In [None]:
df = health.copy()

if col_height is not None:
    if df[col_height].dropna().size>0 and df[col_height].dropna().max()>3:
        df['height_m'] = df[col_height] / 100.0
    else:
        df['height_m'] = df[col_height]
else:
    df['height_m'] = np.nan

if col_weight is not None:
    df['weight_kg'] = df[col_weight]
else:
    df['weight_kg'] = np.nan


In [None]:

df['BMI'] = df.apply(lambda r: (r['weight_kg'] / (r['height_m']**2)) if pd.notnull(r['weight_kg']) and pd.notnull(r['height_m']) and r['height_m']>0 else np.nan, axis=1)
df['bmi_category'] = pd.cut(df['BMI'], bins=[-np.inf, 18.5, 25, 30, 35, np.inf], labels=[0, 1, 2, 3, 4]).astype(int)


if col_age is not None:
    df['age'] = df[col_age]
else:
    df['age'] = df.get('age', 30)

df['age_squared'] = df['age'] ** 2
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 30, 50, 70, np.inf], labels=[0, 1, 2, 3, 4]).astype(int)

if col_sex is not None:
    df['sex'] = df[col_sex].fillna('other').astype(str).str.lower()
else:
    df['sex'] = df.get('sex', 'other')

def map_activity(x):
    if pd.isnull(x): return 1.2
    s = str(x).lower()
    if any(k in s for k in ['sedentary','low', '1.2']): return 1.2
    if any(k in s for k in ['light','lightly','1.375']): return 1.375
    if any(k in s for k in ['moderate','moderately','2-3','1.55']): return 1.55
    if any(k in s for k in ['active','very active','hard','1.725']): return 1.725
    if any(k in s for k in ['extreme','athlete','1.9']): return 1.9
    try:
        val = float(x)
        if 1.0 <= val <= 3.0: return val
    except Exception: pass
    return 1.2

if col_activity is not None:
    df['activity_factor'] = df[col_activity].apply(map_activity)
else:
    df['activity_factor'] = 1.2

df['activity_category'] = pd.cut(df['activity_factor'], bins=[0, 1.3, 1.5, 1.7, np.inf], labels=[0, 1, 2, 3]).astype(int)


In [None]:
def compute_bmr_row(row):
    w = row.get('weight_kg', np.nan)
    h_m = row.get('height_m', np.nan)
    age = row.get('age', 30)
    sex = str(row.get('sex', 'other')).lower()
    if pd.isnull(w) or pd.isnull(h_m): return np.nan
    h_cm = h_m * 100.0
    if 'female' in sex or sex in ['f','woman','female']:
        return 10*w + 6.25*h_cm - 5*age - 161
    if 'male' in sex or sex in ['m','man','male']:
        return 10*w + 6.25*h_cm - 5*age + 5
    return 10*w + 6.25*h_cm - 5*age

df['BMR'] = df.apply(compute_bmr_row, axis=1)
df['TDEE'] = df['BMR'] * df['activity_factor']

np.random.seed(42)
noise_sd_fraction = 0.05
if possible_goal_col is not None:
    df['calorie_goal_observed'] = pd.to_numeric(df[possible_goal_col], errors='coerce')
else:
    df['calorie_goal_observed'] = np.nan

df['calorie_goal_synthetic'] = df['TDEE'] * (1 + np.random.normal(0, noise_sd_fraction, size=len(df)))
df['calorie_goal'] = df['calorie_goal_observed'].fillna(df['calorie_goal_synthetic']).astype(float)


In [7]:
model_df = df.dropna(subset=['weight_kg','height_m']).copy()
model_df['sex_cat'] = model_df['sex'].astype(str).str.lower().map(lambda s: 1 if s in ['male','m','man'] else (0 if s in ['female','f','woman'] else 2))

for c in ['age','weight_kg','height_m','BMI','activity_factor']:
    if c in model_df.columns:
        model_df[c] = pd.to_numeric(model_df[c], errors='coerce')
        model_df[c] = model_df[c].fillna(model_df[c].median())

model_df['weight_height_ratio'] = model_df['weight_kg'] / model_df['height_m']
model_df['bmi_activity'] = model_df['BMI'] * model_df['activity_factor']
model_df['age_bmi'] = model_df['age'] * model_df['BMI'] / 100.0
model_df['weight_class'] = pd.qcut(model_df['weight_kg'], q=5, labels=[0,1,2,3,4])
model_df['height_class'] = pd.qcut(model_df['height_m'], q=5, labels=[0,1,2,3,4])

feature_cols = ['age','age_squared','sex_cat','weight_kg','height_m','BMI','activity_factor','weight_height_ratio',
                'bmi_activity','age_bmi','bmi_category','age_group','activity_category','weight_class','height_class']

X = model_df[feature_cols].copy()
y = model_df['calorie_goal'].astype(float).copy()

print('Data prepared for modeling. X shape:', X.shape, 'y shape:', y.shape)


Data prepared for modeling. X shape: (1000, 15) y shape: (1000,)


In [8]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
feature_names = poly.get_feature_names_out(feature_cols)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)
joblib.dump(scaler, os.path.join(DATA_DIR, 'scaler.pkl'))
print('Saved scaler to scaler.pkl')

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)


Saved scaler to scaler.pkl


In [9]:
models = {}

def add_model_metrics(name, model, X_train, X_test, y_train, y_test, cv):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    cv_r2 = cross_val_score(model, X_scaled, y, cv=cv, scoring='r2')
    cv_mae = -cross_val_score(model, X_scaled, y, cv=cv, scoring='neg_mean_absolute_error')
    
    models[name] = {'model': model, 'mae_holdout': mae, 'r2_holdout': r2,
                    'cv_r2_mean': cv_r2.mean(), 'cv_mae_mean': cv_mae.mean()}
    print(f'{name} -> MAE: {mae:.2f}, R2: {r2:.4f}')
    print(f'{name} CV mean R2: {cv_r2.mean():.4f} CV mean MAE: {cv_mae.mean():.2f}')

# Train 4 models
en = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42)
add_model_metrics('ElasticNet', en, X_train, X_test, y_train, y_test, cv)

rf = RandomForestRegressor(n_estimators=500, max_depth=15, min_samples_split=3,
                          min_samples_leaf=2, max_features=0.8, random_state=42)
add_model_metrics('RandomForest', rf, X_train, X_test, y_train, y_test, cv)

gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, 
                              max_depth=4, min_samples_split=4,
                              min_samples_leaf=2, subsample=0.8, random_state=42)
add_model_metrics('GradientBoosting', gb, X_train, X_test, y_train, y_test, cv)

if HAVE_CATBOOST:
    cb_model = cb.CatBoostRegressor(iterations=300, learning_rate=0.05, depth=4,
                                    l2_leaf_reg=3, random_seed=42, verbose=False)
    add_model_metrics('CatBoost', cb_model, X_train, X_test, y_train, y_test, cv)


ElasticNet -> MAE: 77.47, R2: 0.9006
ElasticNet CV mean R2: 0.9015 CV mean MAE: 77.29
RandomForest -> MAE: 85.22, R2: 0.8826
RandomForest CV mean R2: 0.8887 CV mean MAE: 82.18
GradientBoosting -> MAE: 84.45, R2: 0.8842
GradientBoosting CV mean R2: 0.8933 CV mean MAE: 80.82
CatBoost -> MAE: 83.65, R2: 0.8863
CatBoost CV mean R2: 0.8960 CV mean MAE: 79.92


In [10]:
best_name = max(models.items(), key=lambda x: x[1]['cv_r2_mean'])[0]
best_model = models[best_name]['model']
print('\nBest model by CV mean R2:', best_name)

joblib.dump(best_model, os.path.join(DATA_DIR, 'trained_model.pkl'))
print('Saved model to trained_model.pkl')

cal_col = None
for c in ['calories','cal_per_serving','calories_per_serving','calories_per_100g','kcal']:
    if c in foods.columns:
        cal_col = c
        break
if cal_col is None:
    prot = next((c for c in foods.columns if 'protein' in c.lower()), None)
    carbs = next((c for c in foods.columns if 'carb' in c.lower()), None)
    fat = next((c for c in foods.columns if 'fat' in c.lower()), None)
    if prot and carbs and fat:
        foods['calories_calc'] = foods[prot].fillna(0)*4 + foods[carbs].fillna(0)*4 + foods[fat].fillna(0)*9
        cal_col = 'calories_calc'
    else:
        raise ValueError('Could not find or compute calories column in foods.csv')

X_all_scaled = X_scaled
preds_all = best_model.predict(X_all_scaled)
users = model_df.reset_index(drop=True).copy()
users['pred_calorie_need'] = preds_all

rows_out = []
for idx, row in users.iterrows():
    pred_cal = row['pred_calorie_need']
    foods_copy = foods.copy()
    foods_copy['cal_diff'] = (foods_copy[cal_col].astype(float).fillna(0) - pred_cal).abs()
    top5 = foods_copy.nsmallest(5, 'cal_diff')
    for rank, frow in enumerate(top5.itertuples(), start=1):
        rows_out.append({
            'user_index': idx,
            'pred_calorie_need': float(pred_cal),
            'rank': rank,
            'food_name': getattr(frow, 'name', str(frow.Index)),
            'food_calories': float(getattr(frow, cal_col))
        })

recs_df = pd.DataFrame(rows_out)
recs_path = os.path.join(DATA_DIR, 'final_recommendations.csv')
recs_df.to_csv(recs_path, index=False)
print('Saved recommendations to', recs_path)

print('\nPerformance summary:')
summary = pd.DataFrame({
    'model': list(models.keys()),
    'MAE_holdout': [models[m]['mae_holdout'] for m in models],
    'R2_holdout': [models[m]['r2_holdout'] for m in models],
    'CV_R2_mean': [models[m]['cv_r2_mean'] for m in models],
    'CV_MAE_mean': [models[m]['cv_mae_mean'] for m in models]
})
print(summary.to_string(index=False))

print('\nDone.')



Best model by CV mean R2: ElasticNet
Saved model to trained_model.pkl
Saved recommendations to C:\Users\megha\NUTRITION PROJECT\final_recommendations.csv

Performance summary:
           model  MAE_holdout  R2_holdout  CV_R2_mean  CV_MAE_mean
      ElasticNet    77.473263    0.900561    0.901488    77.290721
    RandomForest    85.223761    0.882589    0.888666    82.183256
GradientBoosting    84.453622    0.884246    0.893283    80.818677
        CatBoost    83.649447    0.886335    0.896014    79.921163

Done.
