In [6]:
import pandas as pd
import numpy as np
import joblib
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.utils import resample

In [7]:
df = pd.read_csv('nutrition_dataset_processed (1).csv')
df.head()

Unnamed: 0,name,calories,fat,proteins,carbohydrate,Meal Type,Carb_Level,Protein_Level,Nutrient_Density
0,Ikan Gabus segar,0.073267,0.017,0.144,0.0,3,Low-Carb,High-Protein,1.733355
1,Teh,0.130693,0.007,0.111429,0.104791,2,High-Carb,High-Protein,1.600837
2,Kerbau daging segar,0.078218,0.005,0.106857,0.0,3,Low-Carb,High-Protein,1.302208
3,Ikan kayu kering,0.29901,0.019,0.404,0.000618,3,Low-Carb,High-Protein,1.289646
4,Telur Ayam bagian putih,0.049505,0.0,0.061714,0.001236,3,Low-Carb,High-Protein,1.27158


# **Preprocess Data & Training**

In [8]:

# 🏷️ Define Drink Keywords
drink_keywords = {
    "juice", "coffee", "tea", "milk", "soda", "beer", "wine", "drink", "beverage",
    "kopi", "teh", "sirup", "sari kedelai", "es", "jus", "susu", "cendol", "daun teh",
    "bir", "limun", "bandrek", "kelapa muda", "nectar", "fruit juice", "cola", "coke", "tonic",
    "water", "coconut water", "sprite", "limeade", "lemonade", "yogurt", "matcha", "chocolate",
    "smoothie", "shake", "isotonic", "infused", "bubble tea", "espresso", "latte",
    "americano", "mocha", "kombucha", "energy drink", "sherbet", "fizz", "flavored water"
}

# ❌ Define Non-Drink Exclusion List
non_drink_keywords = {
    "cheese", "udang", "shrimp", "chicken", "salmon", "daging", "tuna", "crab", "ikan",
    "beef", "meat", "duck", "lamb", "tofu", "tempeh", "egg", "telur", "almond", "peanut",
    "seed", "nuts", "cashew", "pecan", "walnut", "pistachio", "hazelnut", "butter",
    "bread", "rice", "pasta", "noodle", "roti", "potato", "yam", "cassava", "oncom", "nasi", "pempek", "amaranth"
}

meal_type_names = {0: 'Breakfast', 1: 'Carbs', 2: 'Drink', 3: 'Lunch/Dinner', 4: 'Snack'}

# 📥 Load Dataset
df = pd.read_csv('nutrition_dataset_processed (1).csv')


# 📌 Strictly Apply Drink Filtering to Meal Type 2
df.loc[df['Meal Type'] == 2, :] = df.loc[df['Meal Type'] == 2, :][
    df['name'].str.contains('|'.join(drink_keywords), case=False, na=False)
]
df.loc[df['Meal Type'] == 2, :] = df.loc[df['Meal Type'] == 2, :][
    ~df['name'].str.contains('|'.join(non_drink_keywords), case=False, na=False)
]

df["Nutrient_Density"] = (df["proteins"] + df["carbohydrate"] - df["fat"]) / (df["calories"] + 1e-6)

# 📌 Step 2: Handle Missing Values
numeric_cols = ['calories', 'fat', 'proteins', 'carbohydrate', 'Nutrient_Density']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# 🔄 Step 3: Balance Meal Type Distribution
meal_counts = df['Meal Type'].value_counts()
max_count = meal_counts.max()

balanced_df = pd.DataFrame()
for meal_type, count in meal_counts.items():
    meal_subset = df[df['Meal Type'] == meal_type]

    # 🔹 Keep Meal Type 2 (Drinks) as is, don't upsample
    if meal_type == 2:
        upsampled_meal = meal_subset
    else:
        upsampled_meal = resample(meal_subset, replace=True, n_samples=max_count, random_state=42)

    balanced_df = pd.concat([balanced_df, upsampled_meal])

df = balanced_df.reset_index(drop=True)

# 🔄 Step 4: Data Augmentation (Adding Small Random Variations)
df_augmented = df.copy()
df_augmented[numeric_cols] += np.random.uniform(-0.05, 0.05, size=df[numeric_cols].shape)
df = pd.concat([df, df_augmented], ignore_index=True)

df = df.drop_duplicates(subset=['name'], keep='first').reset_index(drop=True)

# 🔄 Step 5: Re-Normalize Features After Augmentation
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[numeric_cols]), columns=numeric_cols)

# ✅ Step 6: Enforce Drink Filtering Again to Prevent Mixed Items
df.loc[df['Meal Type'] == 2, :] = df.loc[df['Meal Type'] == 2, :][
    df['name'].str.contains('|'.join(drink_keywords), case=False, na=False)
]

df = df.dropna().reset_index(drop=True)  # Remove any invalid rows before similarity

# 📊 Step 7: Compute Cosine Similarity
similarity_matrix = cosine_similarity(df_scaled)

# 🍽 Step 8: Find Similar Foods Based on Nutritional Input
def get_recommendations(calories, fat, proteins, carbohydrate, top_n=5):
    input_features = np.array([[calories, fat, proteins, carbohydrate, (proteins + carbohydrate - fat) / (calories + 1e-6)]])  # Set Nutrient_Density to 0 for now
    input_scaled = scaler.transform(input_features)
    sim_scores = cosine_similarity(input_scaled, df_scaled).flatten()

    recommendations = {(meal_type_names[int(meal_type)]): [] for meal_type in df['Meal Type'].unique()}  # Ensure meal type is integer

    for meal_type, meal_category in meal_type_names.items():
        meal_indices = df[df['Meal Type'] == meal_type].index
        meal_similarities = [(idx, sim_scores[idx]) for idx in meal_indices]
        meal_similarities = sorted(meal_similarities, key=lambda x: x[1], reverse=True)[:top_n]

        recommendations[meal_category] = [{**df.iloc[idx][['name', 'calories', 'fat', 'proteins', 'carbohydrate', 'Nutrient_Density', 'Meal Type']].to_dict(), 'Meal Type': meal_category} for idx, _ in meal_similarities]

    # 🎯 Display recommendations by meal type
    print("\n🔹 Top Recommendations per Meal Type:")
    for meal_category, items in recommendations.items():
        if items:
            print(f"\n🍽 Meal Type {meal_category}:")
            print(pd.DataFrame(items))

    # ✅ Construct an optimal daily meal plan by picking the best match for each meal type
    daily_meal_plan = [items[0] for items in recommendations.values() if items]
    print("\n✅ Optimal Daily Meal Plan:")
    print(pd.DataFrame(daily_meal_plan))

# 🎯 Example Usage
get_recommendations(0.1, 0.02, 0.15, 0.05)  # Example input values



🔹 Top Recommendations per Meal Type:

🍽 Meal Type Lunch/Dinner:
                     name  calories    fat  proteins  carbohydrate  \
0  Ikan bulan-bulan segar  0.132673  0.013  0.145714      0.007728   
1             Ikan Layang  0.107921  0.017  0.125714      0.000000   
2            Ikan Kembung  0.101980  0.010  0.125714      0.000000   
3    Ikan bambangan segar  0.110891  0.013  0.114286      0.005719   
4        Sapi dideh/darah  0.097030  0.011  0.125143      0.000000   

   Nutrient_Density     Meal Type  
0          1.058549  Lunch/Dinner  
1          1.007343  Lunch/Dinner  
2          1.134663  Lunch/Dinner  
3          0.964942  Lunch/Dinner  
4          1.176358  Lunch/Dinner  

🍽 Meal Type Carbs:
                        name  calories    fat  proteins  carbohydrate  \
0  Daun singkong ambon segar  0.142574  0.012  0.072571      0.038949   
1       Kacang kedelai rebus  0.187129  0.082  0.115429      0.019629   
2         Kacang ercis segar  0.127723  0.007  0.070857    

  df.loc[df['Meal Type'] == 2, :] = df.loc[df['Meal Type'] == 2, :][
  df.loc[df['Meal Type'] == 2, :] = df.loc[df['Meal Type'] == 2, :][
  df.loc[df['Meal Type'] == 2, :] = df.loc[df['Meal Type'] == 2, :][


# **Evaluate Model**

In [9]:
# Load Dataset
df = pd.read_csv('nutrition_dataset_processed (1).csv')

# Select relevant numeric columns
numeric_cols = ['calories', 'fat', 'proteins', 'carbohydrate', 'Nutrient_Density']

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Normalize Data
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[numeric_cols]), columns=numeric_cols)

df_scaled['Meal Type'] = df['Meal Type']
df_scaled['name'] = df['name']

# Split into Train & Test Sets
train_df, test_df = train_test_split(df_scaled, test_size=0.2, random_state=42)

# Compute Similarity Matrix
similarity_matrix = cosine_similarity(train_df[numeric_cols])

# Function to Evaluate Model Performance
def evaluate_model(test_data, train_data, top_n=5):
    errors = []
    similarity_scores = []

    for _, test_row in test_data.iterrows():
        input_features = test_row[numeric_cols].values.reshape(1, -1)
        sim_scores = cosine_similarity(input_features, train_data[numeric_cols]).flatten()

        top_indices = np.argsort(sim_scores)[-top_n:][::-1]
        top_foods = train_data.iloc[top_indices]

        similarity_scores.append(sim_scores[top_indices].mean())

        # Compute Mean Absolute Error (MAE) for top recommendation
        mae = mean_absolute_error(test_row[numeric_cols], top_foods[numeric_cols].mean())
        errors.append(mae)

    avg_similarity = np.mean(similarity_scores)
    avg_mae = np.mean(errors)

    print(f"\n📊 Model Evaluation Results:")
    print(f"✅ Average Similarity Score: {avg_similarity:.4f}")
    print(f"✅ Mean Absolute Error (MAE): {avg_mae:.4f}")

    return avg_similarity, avg_mae

# Run Evaluation
evaluate_model(test_df, train_df)


📊 Model Evaluation Results:
✅ Average Similarity Score: 0.9992
✅ Mean Absolute Error (MAE): 0.0067


(np.float64(0.9992488545163173), np.float64(0.0067361047219366135))

# **Testing Model**

In [10]:
!pip install joblib



In [11]:
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(similarity_matrix, 'similarity_model.pkl')

['similarity_model.pkl']

In [12]:

# Load dataset and model
print("🔄 Loading dataset and model...")
df = pd.read_csv('nutrition_dataset_processed (1).csv')
scaler = joblib.load('scaler.pkl')  # Ensure you save the scaler when training
similarity_matrix = joblib.load('similarity_model.pkl')  # Save similarity model

# Ensure Nutrient Density is properly calculated
df["Nutrient_Density"] = (df["proteins"] + df["carbohydrate"] - df["fat"]) / (df["calories"] + 1e-6)

# Select relevant columns
numeric_cols = ['calories', 'fat', 'proteins', 'carbohydrate', 'Nutrient_Density']
df_scaled = pd.DataFrame(scaler.transform(df[numeric_cols]), columns=numeric_cols)

def get_recommendations(calories, fat, proteins, carbohydrate, top_n=5):
    input_features = np.array([[calories, fat, proteins, carbohydrate, (proteins + carbohydrate - fat) / (calories + 1e-6)]])
    input_scaled = scaler.transform(input_features)
    sim_scores = cosine_similarity(input_scaled, df_scaled).flatten()

    recommendations = {int(meal_type): [] for meal_type in df['Meal Type'].unique()}
    for meal_type in recommendations.keys():
        meal_indices = df[df['Meal Type'] == meal_type].index
        meal_similarities = sorted([(idx, sim_scores[idx]) for idx in meal_indices], key=lambda x: x[1], reverse=True)[:top_n]
        recommendations[meal_type] = [df.iloc[idx][['name', 'calories', 'fat', 'proteins', 'carbohydrate', 'Nutrient_Density', 'Meal Type']] for idx, _ in meal_similarities]

    return recommendations

# Interactive input
print("\n🔹 Enter your nutritional preferences:")
calories = float(input("Calories: "))
fat = float(input("Fat: "))
proteins = float(input("Proteins: "))
carbohydrate = float(input("Carbohydrate: "))

# Get recommendations
recommendations = get_recommendations(calories, fat, proteins, carbohydrate)

# Display results
print("\n🔹 Top Recommendations per Meal Type:")
for meal_type, items in recommendations.items():
    if items:
        print(f"\n🍽 Meal Type {meal_type}:")
        print(pd.DataFrame(items))

# Construct an optimal daily meal plan
daily_meal_plan = [items[0] for items in recommendations.values() if items]
print("\n✅ Optimal Daily Meal Plan:")
print(pd.DataFrame(daily_meal_plan))


🔄 Loading dataset and model...

🔹 Enter your nutritional preferences:
Calories: 0.3
Fat: 0.2
Proteins: 0.3
Carbohydrate: 0.4

🔹 Top Recommendations per Meal Type:

🍽 Meal Type 3:
                            name  calories    fat  proteins  carbohydrate  \
382                    Sapi abon  0.209901  0.106  0.102857      0.091654   
169              Koro Wedus biji  0.334653  0.015  0.126857      0.094281   
189             Koro Benguk biji  0.328713  0.030  0.137143      0.085008   
252           Koro andong kering  0.352475  0.041  0.117143      0.095209   
90   Buaya daging dendeng mentah  0.361386  0.048  0.283429      0.047604   

     Nutrient_Density  Meal Type  
382          0.421677          3  
169          0.615974          3  
189          0.584553          3  
252          0.486136          3  
90           0.783185          3  

🍽 Meal Type 2:
                       name  calories    fat  proteins  carbohydrate  \
135   Teh hijau daun kering  0.297030  0.048  0.161714      



# **Saved Model**

In [13]:

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

df.to_csv("processed_dataset.csv", index=False)