In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
# Read the raw food/nutrient table from the Excel workbook into a DataFrame.
RAW_DATA_PATH = "data/food_data.xlsx"
df = pd.read_excel(RAW_DATA_PATH)

In [3]:
# Preview the first five rows to check that the workbook loaded as expected.
df.head()

Unnamed: 0,description,sub_category,main_category,category,energy (kJ),carbohydrate_G,vitamin_A_UG,calcium_MG,potassium_MG,zinc_MG,...,folate_total_UG,vitamin B_12_UG,thiamin_MG,riboflavin_MG,cholesterol_MG,Niacin_MG,vitamin_B_6_MG,selenium_UG,vitamin D _UG,choline_total_MG
0,"Abiyuch, raw",Fruit,Veg,Fruits and Fruit Juices,290.0,17.6,5.0,8.0,304.0,0.31,...,,,,,,,,,,
1,"Acerola juice, raw",Juice,Veg,Fruits and Fruit Juices,96.0,4.8,25.0,10.0,97.0,0.1,...,14.0,0.0,0.02,0.06,0.0,0.4,0.004,0.1,,
2,"Acerola, (west indian cherry), raw",Fruit,Veg,Fruits and Fruit Juices,134.0,7.69,38.0,12.0,146.0,0.1,...,14.0,0.0,0.02,0.06,0.0,0.4,0.009,0.6,,
3,"Alfalfa seeds, sprouted, raw",Vegetable,Veg,Vegetables and Vegetable Products,96.0,2.1,8.0,32.0,79.0,0.92,...,36.0,0.0,0.076,0.126,0.0,0.481,0.034,0.6,0.0,14.4
4,"Amaranth leaves, raw",Vegetable,Veg,Vegetables and Vegetable Products,97.0,4.02,146.0,215.0,611.0,0.9,...,85.0,0.0,0.027,0.158,0.0,0.658,0.192,0.9,0.0,


In [4]:
# Repair the two headers that contain stray spaces.
# The result is reassigned rather than applied with inplace=True, so the cell
# produces a new frame instead of mutating the one earlier cells displayed.
df = df.rename(columns={'vitamin_K_ UG': 'vitamin_K_UG', 'vitamin D _UG': 'vitamin_D_UG'})

In [5]:
# Report the table size as (rows, columns).
df.shape

(1166, 36)

In [6]:
# Summarise each column's dtype and non-null count to see where data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1166 entries, 0 to 1165
Data columns (total 36 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   description                          1166 non-null   object 
 1   sub_category                         1166 non-null   object 
 2   main_category                        1166 non-null   object 
 3   category                             1166 non-null   object 
 4   energy (kJ)                          1048 non-null   float64
 5   carbohydrate_G                       1162 non-null   float64
 6   vitamin_A_UG                         994 non-null    float64
 7   calcium_MG                           1164 non-null   float64
 8   potassium_MG                         1150 non-null   float64
 9   zinc_MG                              1132 non-null   float64
 10  water_G                              1164 non-null   float64
 11  total_lipid_fat_G             

In [7]:
# Count the missing values in every column.
df.isnull().sum()

description                              0
sub_category                             0
main_category                            0
category                                 0
energy (kJ)                            118
carbohydrate_G                           4
vitamin_A_UG                           172
calcium_MG                               2
potassium_MG                            16
zinc_MG                                 34
water_G                                  2
total_lipid_fat_G                        0
protein_G                                2
fatty_acids_total_saturated_G          162
vitamin_C_MG                           125
fiber_total_dietary_G                  140
iron_MG                                  4
magnesium_MG                            26
phosphorus_MG                           15
sodium_MG                                6
copper_MG                               37
total_sugars_G                         463
fatty_acids_total_monounsaturated_G    176
vitamin_K_U

In [8]:
# List the column names (unit suffixes such as _MG / _G / _UG are still present here).
df.columns

Index(['description', 'sub_category', 'main_category', 'category',
       'energy (kJ)', 'carbohydrate_G', 'vitamin_A_UG', 'calcium_MG',
       'potassium_MG', 'zinc_MG', 'water_G', 'total_lipid_fat_G', 'protein_G',
       'fatty_acids_total_saturated_G', 'vitamin_C_MG',
       'fiber_total_dietary_G', 'iron_MG', 'magnesium_MG', 'phosphorus_MG',
       'sodium_MG', 'copper_MG', 'total_sugars_G',
       'fatty_acids_total_monounsaturated_G', 'vitamin_K_UG', 'vitamin_E_MG',
       'fatty_acids_total_polyunsaturated_G', 'folate_total_UG',
       'vitamin B_12_UG', 'thiamin_MG', 'riboflavin_MG', 'cholesterol_MG',
       'Niacin_MG', 'vitamin_B_6_MG', 'selenium_UG', 'vitamin_D_UG',
       'choline_total_MG'],
      dtype='object')

### Preprocessing

In [9]:
# Treat missing nutrient measurements as 0.
# Only numeric columns are filled, so a text column with gaps can never end up
# holding a numeric 0, and the result is reassigned instead of using inplace=True.
# NOTE(review): zero-imputation makes "not measured" look like "none present";
# confirm this is acceptable for the recommendations built on this data.
numeric_columns = df.select_dtypes(include="number").columns
df[numeric_columns] = df[numeric_columns].fillna(0)

In [10]:

# Column groups, keyed by the unit encoded in each column's suffix.
in_mg = [
    'calcium_MG', 'potassium_MG', 'zinc_MG', 'vitamin_C_MG',
    'iron_MG', 'magnesium_MG', 'phosphorus_MG', 'sodium_MG',
    'copper_MG', 'vitamin_E_MG', 'thiamin_MG', 'riboflavin_MG',
    'cholesterol_MG', 'Niacin_MG', 'vitamin_B_6_MG', 'choline_total_MG',
]

in_grams = [
    'carbohydrate_G', 'water_G', 'total_lipid_fat_G', 'protein_G',
    'fatty_acids_total_saturated_G', 'fiber_total_dietary_G',
    'total_sugars_G', 'fatty_acids_total_monounsaturated_G',
    'fatty_acids_total_polyunsaturated_G',
]

in_ug = [
    'vitamin_A_UG', 'vitamin_K_UG', 'folate_total_UG',
    'vitamin B_12_UG', 'selenium_UG', 'vitamin_D_UG',
]

# Columns that carry no mass unit (labels plus the energy column).
others = ['description', 'sub_category', 'main_category', 'category', 'energy (kJ)']

# Put every mass column on a milligram basis; the *_MG columns already are.
MG_PER_GRAM = 1000
UG_PER_MG = 1000

# g -> mg
df[in_grams] = df[in_grams].mul(MG_PER_GRAM)

# µg -> mg
df[in_ug] = df[in_ug].div(UG_PER_MG)




In [11]:

# Drop the trailing unit suffix (_UG, _MG or _G) from every column name.
# The pattern is anchored at the end of the name, so columns without such a
# suffix (e.g. 'energy (kJ)') are left unchanged.
df.columns = df.columns.str.replace(r'_(UG|MG|G)$', '', regex=True)

In [12]:
# Confirm that the unit suffixes were stripped from the column names.
df.columns

Index(['description', 'sub_category', 'main_category', 'category',
       'energy (kJ)', 'carbohydrate', 'vitamin_A', 'calcium', 'potassium',
       'zinc', 'water', 'total_lipid_fat', 'protein',
       'fatty_acids_total_saturated', 'vitamin_C', 'fiber_total_dietary',
       'iron', 'magnesium', 'phosphorus', 'sodium', 'copper', 'total_sugars',
       'fatty_acids_total_monounsaturated', 'vitamin_K', 'vitamin_E',
       'fatty_acids_total_polyunsaturated', 'folate_total', 'vitamin B_12',
       'thiamin', 'riboflavin', 'cholesterol', 'Niacin', 'vitamin_B_6',
       'selenium', 'vitamin_D', 'choline_total'],
      dtype='object')

In [13]:
# Spot-check the largest iron value before the nutrient columns are scaled.
df['iron'].max()

41.9

In [13]:
# Nutrient columns used as model features: the former *_MG and *_UG columns,
# now all expressed in milligrams.
nutrients = [
    'calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium',
    'phosphorus', 'sodium', 'copper', 'vitamin_E', 'thiamin', 'riboflavin',
    'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total',
    'vitamin_A', 'vitamin_K', 'folate_total', 'vitamin B_12', 'selenium',
    'vitamin_D',
]

# Min-max scale each nutrient column to the [0, 1] range so that no single
# nutrient dominates the distance computation because of its magnitude.
scaler = MinMaxScaler()
scaler.fit(df[nutrients])
df[nutrients] = scaler.transform(df[nutrients])

# Persist the processed table (without the index) for the modelling step.
df.to_csv("data/processed_food_data.csv", index=False)
print("✅ Data preprocessing complete! File saved as 'processed_food_data.csv'.")


✅ Data preprocessing complete! File saved as 'processed_food_data.csv'.


In [14]:
# Reload the processed table written by the preprocessing step.
df = pd.read_csv("data/processed_food_data.csv")

# Feature columns: the scaled nutrient values.
nutrients = [
    'calcium', 'potassium', 'zinc', 'vitamin_C', 'iron', 'magnesium',
    'phosphorus', 'sodium', 'copper', 'vitamin_E', 'thiamin', 'riboflavin',
    'cholesterol', 'Niacin', 'vitamin_B_6', 'choline_total',
    'vitamin_A', 'vitamin_K', 'folate_total', 'vitamin B_12', 'selenium',
    'vitamin_D',
]
X = df.loc[:, nutrients]

# Index the foods for nearest-neighbour lookup (10 neighbours, Euclidean distance).
knn = NearestNeighbors(metric='euclidean', n_neighbors=10).fit(X)

def recommend_food(deficiency, top_n=None):
    """Recommend food items based on a user's nutrient deficiency.

    Builds a one-hot query vector over the module-level ``nutrients`` list
    (1 for the deficient nutrient, 0 elsewhere) and returns the rows of ``df``
    that the fitted ``knn`` model reports as nearest to that vector.

    Parameters
    ----------
    deficiency : str
        Name of the nutrient column to target; must be an entry of ``nutrients``.
    top_n : int, optional
        Number of foods to return. When omitted, the ``n_neighbors`` value the
        ``knn`` model was constructed with is used.

    Returns
    -------
    list of dict or str
        One ``{'description': ..., <deficiency>: ...}`` record per recommended
        food, in the order returned by the neighbour search. If ``deficiency``
        is not a known nutrient, an explanatory message string is returned
        instead.
    """
    if deficiency not in nutrients:
        return "Invalid deficiency. Choose from: " + ", ".join(nutrients)

    # One-hot query: 1 for the deficient nutrient, 0 for every other nutrient.
    sample = np.zeros(len(nutrients))
    sample[nutrients.index(deficiency)] = 1

    # The model was fitted on a DataFrame, so the query carries the same column
    # names; a bare list/array triggers scikit-learn's feature-name mismatch warning.
    query = pd.DataFrame(sample.reshape(1, -1), columns=nutrients)

    # Only the neighbour indices are needed; distances are not used.
    indices = knn.kneighbors(query, n_neighbors=top_n, return_distance=False)
    recommendations = df.iloc[indices[0]][['description', deficiency]]

    return recommendations.to_dict(orient="records")

# Smoke-test the recommender on two example deficiencies.
print("✅ Model trained. Testing recommendations...\n")

demo_cases = (
    ("🥗 Vitamin C Deficiency:", "vitamin_C"),
    ("\n🥩 Iron Deficiency:", "iron"),
)
for heading, nutrient_name in demo_cases:
    print(heading)
    print(recommend_food(nutrient_name))


✅ Model trained. Testing recommendations...

🥗 Vitamin C Deficiency:




[{'description': 'Acerola, (west indian cherry), raw', 'vitamin_C': 1.0}, {'description': 'Acerola juice, raw', 'vitamin_C': 0.9523809523809524}, {'description': 'Beverages, tea, green, instant, decaffeinated, lemon, unsweetened, fortified with vitamin C', 'vitamin_C': 0.7976190476190477}, {'description': 'Guavas, common, raw', 'vitamin_C': 0.1357142857142857}, {'description': 'Peppers, hot chili, green, raw', 'vitamin_C': 0.144047619047619}, {'description': 'Peppers, sweet, yellow, raw', 'vitamin_C': 0.1095238095238095}, {'description': 'Mango, Ataulfo, peeled, raw', 'vitamin_C': 0.1}, {'description': 'Currants, european black, raw', 'vitamin_C': 0.1077380952380952}, {'description': 'Kiwifruit, ZESPRI SunGold, raw', 'vitamin_C': 0.0958333333333333}, {'description': 'Peppers, bell, orange, raw', 'vitamin_C': 0.094047619047619}]

🥩 Iron Deficiency:
[{'description': 'Lamb, variety meats and by-products, spleen, raw', 'iron': 1.0}, {'description': 'Pork, fresh, variety meats and by-produc



In [18]:
# Re-check iron's maximum on the reloaded data; after min-max scaling it should be 1.0.
df['iron'].max()

1.0