In [1]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import re

In [60]:
data = pd.read_csv('Food and Calories - Sheet1.csv')

In [61]:
data

Unnamed: 0,Food,Serving,Calories
0,Artichoke,1 artichoke (128 g),60 cal
1,Arugula,1 leaf (2 g),1 cal
2,Asparagus,1 spear (12 g),2 cal
3,Aubergine,1 aubergine (458 g),115 cal
4,Beetroot,1 beet (82 g),35 cal
...,...,...,...
557,Wendy’s Jr. Cheeseburger,1 burger (129 g),290 cal
558,Wendy’s Son of Baconator,1 burger (218 g),700 cal
559,Whopper,1 burger (291 g),672 cal
560,Zinger,1 sandwich (202 g),517 cal


In [62]:
data.head()

Unnamed: 0,Food,Serving,Calories
0,Artichoke,1 artichoke (128 g),60 cal
1,Arugula,1 leaf (2 g),1 cal
2,Asparagus,1 spear (12 g),2 cal
3,Aubergine,1 aubergine (458 g),115 cal
4,Beetroot,1 beet (82 g),35 cal


In [63]:
data.info

<bound method DataFrame.info of                          Food              Serving Calories
0                   Artichoke  1 artichoke (128 g)   60 cal
1                     Arugula         1 leaf (2 g)    1 cal
2                   Asparagus       1 spear (12 g)    2 cal
3                   Aubergine  1 aubergine (458 g)  115 cal
4                    Beetroot        1 beet (82 g)   35 cal
..                        ...                  ...      ...
557  Wendy’s Jr. Cheeseburger     1 burger (129 g)  290 cal
558  Wendy’s Son of Baconator     1 burger (218 g)  700 cal
559                   Whopper     1 burger (291 g)  672 cal
560                    Zinger   1 sandwich (202 g)  517 cal
561             Zinger Burger   1 sandwich (202 g)  517 cal

[562 rows x 3 columns]>

In [64]:
data.describe()

Unnamed: 0,Food,Serving,Calories
count,562,562,562
unique,541,381,307
top,Olives,1 tbsp (14 ml),124 cal
freq,2,30,15


In [65]:
def extract_weight(serving):
    match = re.search(r'\(([\d.]+) g\)', serving)
    return float(match.group(1)) if match else 0

In [66]:
data['Weight'] = data['Serving'].apply(extract_weight)

In [67]:
X = data[['Food', 'Weight']]
y = data['Calories'].str.replace(' cal', '').astype(int)

In [68]:
X

Unnamed: 0,Food,Weight
0,Artichoke,128.0
1,Arugula,2.0
2,Asparagus,12.0
3,Aubergine,458.0
4,Beetroot,82.0
...,...,...
557,Wendy’s Jr. Cheeseburger,129.0
558,Wendy’s Son of Baconator,218.0
559,Whopper,291.0
560,Zinger,202.0


In [69]:
y

0       60
1        1
2        2
3      115
4       35
      ... 
557    290
558    700
559    672
560    517
561    517
Name: Calories, Length: 562, dtype: int32

In [74]:
preprocessor = ColumnTransformer(
    transformers=[('food', OneHotEncoder(handle_unknown='ignore'), ['Food'])],
    remainder='passthrough'
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
pipeline.fit(X_train, y_train)

In [77]:
y_pred = pipeline.predict(X_test)

In [78]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 54828.68135486726
R-squared: 0.24955791035031638


In [79]:
new_food_item = pd.DataFrame({'Food': ['Artichoke'], 'Weight': [128]})
predicted_calories = pipeline.predict(new_food_item)
print(f"Predicted Calories: {predicted_calories[0]} cal")

Predicted Calories: 87.21 cal


In [81]:
new_food_item = pd.DataFrame({'Food': ['Banana'], 'Weight': [118]})
predicted_calories = pipeline.predict(new_food_item)
print(f"Predicted Calories: {predicted_calories[0]} cal")

Predicted Calories: 267.43 cal


In [82]:
new_food_item = pd.DataFrame({'Food': ['chicken Breast'], 'Weight': [200]})
predicted_calories = pipeline.predict(new_food_item)
print(f"Predicted Calories: {predicted_calories[0]} cal")

Predicted Calories: 284.43 cal
