## Obtaining the dataset

In [1]:
!wget https://raw.githubusercontent.com/hsu-ai-course/hsu.ai/master/code/datasets/ml/nutrition.csv -O nutrition.csv

--2020-05-12 14:00:34--  https://raw.githubusercontent.com/hsu-ai-course/hsu.ai/master/code/datasets/ml/nutrition.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8103 (7.9K) [text/plain]
Saving to: ‘nutrition.csv’


2020-05-12 14:00:34 (77.1 MB/s) - ‘nutrition.csv’ saved [8103/8103]



In [5]:
import pandas as pd

# Read the CSV saved by the download cell into a DataFrame and preview the first five rows.
nutrition = pd.read_csv(filepath_or_buffer="nutrition.csv")
nutrition.head(n=5)

Unnamed: 0,Food and Serving,Calories,Calories from Fat,Total Fat 1,Total Fat 2,Sodium 1,Sodium 2,Potassium 1,Potassium 2,Total Carbo-hydrate 1,Total Carbo-hydrate 2,Dietary Fiber 1,Dietary Fiber 2,Sugars,Protein,Vitamin A,Vitamin C,Calcium,Iron,Saturated Fat,Saturated Fat 2,Chole-sterol 1,Chole-sterol 2,Food Type
0,,,,(g),(%DV),(g),(%DV),(g),(%DV),(g),(%DV),(g),(%DV),(g),(g),(%DV),(%DV),(%DV),(%DV),(%DV),(mg),(%DV),(mg),
1,"Asparagus, 5 spears (93 g/3.3 oz)",20.0,0.0,0,0,0,0,230,7,4,1,2,8,2,2,10,15,2,2,,,,,"Vegetables, Serving Size (gram weight/ ounce w..."
2,"Bell Pepper, 1 medium (148 g/5.3 oz)",25.0,0.0,0,0,40,2,220,6,6,2,2,8,4,1,4,190,2,4,,,,,"Vegetables, Serving Size (gram weight/ ounce w..."
3,"Broccoli, 1 medium stalk (148 g/5.3 oz)",45.0,0.0,0.5,1,80,3,460,13,8,3,3,12,2,4,6,220,6,6,,,,,"Vegetables, Serving Size (gram weight/ ounce w..."
4,"Carrot, 1 carrot, 7"" long, 1 1/4"" diameter (78...",30.0,0.0,0,0,60,3,250,7,7,2,2,8,5,1,110,10,2,2,,,,,"Vegetables, Serving Size (gram weight/ ounce w..."


## Filling NaNs + replacing Food Type labels with numeric codes

In [6]:
# Numeric code assigned to each "Food Type" label.
# The nested form {column: {old_value: new_value}} restricts the replacement to that column.
types = {"Food Type": {
    "Vegetables, Serving Size (gram weight/ ounce weight)": 1,
    "Fruits Serving Size (gram weight/ounce weight)": 2,
    "Seafood, Serving Size (84 g/3 oz)": 3,
}}

# Guard the two drops so that re-running this cell is harmless. Without the guard,
# a second run would first discard a real data row and then fail with a KeyError,
# because "Food and Serving" has already been removed.
if "Food and Serving" in nutrition.columns:
    nutrition = nutrition.drop(nutrition.index[0])  # removing row with measuring units
    nutrition = nutrition.drop(columns=["Food and Serving"])  # removing names

# Missing values become 0.0, then the food-type labels are swapped for their codes.
# The result is rebound to the name instead of mutating with inplace=True, so every
# step of the cleaning is an explicit assignment.
nutrition = nutrition.fillna(0.0).replace(types)
nutrition.head()

Unnamed: 0,Calories,Calories from Fat,Total Fat 1,Total Fat 2,Sodium 1,Sodium 2,Potassium 1,Potassium 2,Total Carbo-hydrate 1,Total Carbo-hydrate 2,Dietary Fiber 1,Dietary Fiber 2,Sugars,Protein,Vitamin A,Vitamin C,Calcium,Iron,Saturated Fat,Saturated Fat 2,Chole-sterol 1,Chole-sterol 2,Food Type
1,20.0,0.0,0.0,0,0,0,230,7,4,1,2,8,2,2,10,15,2,2,0,0,0,0,1
2,25.0,0.0,0.0,0,40,2,220,6,6,2,2,8,4,1,4,190,2,4,0,0,0,0,1
3,45.0,0.0,0.5,1,80,3,460,13,8,3,3,12,2,4,6,220,6,6,0,0,0,0,1
4,30.0,0.0,0.0,0,60,3,250,7,7,2,2,8,5,1,110,10,2,2,0,0,0,0,1
5,25.0,0.0,0.0,0,30,1,270,8,5,2,2,8,2,2,0,100,2,2,0,0,0,0,1


## Splitting into features (source) and target + scaling

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Renumber the rows from 0; drop=True discards the old index labels instead of
# keeping them as a column.
nutrition = nutrition.reset_index(drop=True)

# Target: the column the model will predict.
target_cals = nutrition["Calories"]
# Features: every remaining column, used as input for the prediction.
source = nutrition.drop(columns=["Calories"])

# The feature columns live on very different scales (some range from 0 to 3,
# others from 70 to 450), so each of them is rescaled with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# performed later in the notebook - confirm this is acceptable for the evaluation.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(source)
source_scaled = pd.DataFrame(scaled_values, columns=source.columns)
source_scaled.head()

Unnamed: 0,Calories from Fat,Total Fat 1,Total Fat 2,Sodium 1,Sodium 2,Potassium 1,Potassium 2,Total Carbo-hydrate 1,Total Carbo-hydrate 2,Dietary Fiber 1,Dietary Fiber 2,Sugars,Protein,Vitamin A,Vitamin C,Calcium,Iron,Saturated Fat,Saturated Fat 2,Chole-sterol 1,Chole-sterol 2,Food Type
0,0.0,0.0,0.0,0.0,0.0,0.290909,0.3125,0.117647,0.090909,0.333333,0.333333,0.08,0.074074,0.076923,0.0625,0.2,0.044444,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.121212,0.142857,0.272727,0.25,0.176471,0.181818,0.333333,0.333333,0.16,0.037037,0.030769,0.791667,0.2,0.088889,0.0,0.0,0.0,0.0,0.0
2,0.0,0.05,0.066667,0.242424,0.214286,0.709091,0.6875,0.235294,0.272727,0.5,0.5,0.08,0.148148,0.046154,0.916667,0.6,0.133333,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.181818,0.214286,0.327273,0.3125,0.205882,0.181818,0.333333,0.333333,0.2,0.037037,0.846154,0.041667,0.2,0.044444,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.090909,0.071429,0.363636,0.375,0.147059,0.181818,0.333333,0.333333,0.08,0.074074,0.0,0.416667,0.2,0.044444,0.0,0.0,0.0,0.0,0.0


## Train-test splitting

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    source_scaled,
    target_cals,
    test_size=0.2,
    random_state=42,
)
X_train.head()

Unnamed: 0,Calories from Fat,Total Fat 1,Total Fat 2,Sodium 1,Sodium 2,Potassium 1,Potassium 2,Total Carbo-hydrate 1,Total Carbo-hydrate 2,Dietary Fiber 1,Dietary Fiber 2,Sugars,Protein,Vitamin A,Vitamin C,Calcium,Iron,Saturated Fat,Saturated Fat 2,Chole-sterol 1,Chole-sterol 2,Food Type
3,0.0,0.0,0.0,0.181818,0.214286,0.327273,0.3125,0.205882,0.181818,0.333333,0.333333,0.2,0.037037,0.846154,0.041667,0.2,0.044444,0.0,0.0,0.0,0.0,0.0
53,0.166667,0.2,0.2,0.212121,0.214286,0.672727,0.6875,0.0,0.0,0.0,0.0,0.0,0.777778,0.030769,0.0,0.2,0.044444,0.0,0.0,0.235294,0.22807,1.0
17,0.222222,0.25,0.266667,0.0,0.0,0.327273,0.3125,0.529412,0.545455,0.333333,0.333333,0.2,0.148148,0.015385,0.041667,0.0,0.044444,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.060606,0.071429,0.218182,0.1875,0.147059,0.181818,0.333333,0.333333,0.12,0.037037,0.0,0.291667,0.4,0.044444,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.127273,0.125,0.058824,0.090909,0.166667,0.166667,0.04,0.037037,0.030769,0.041667,0.2,0.044444,0.0,0.0,0.0,0.0,0.0


## Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training rows.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Evaluate on the held-out rows; score() is reported below as the R^2 of the prediction.
score = reg.score(X_test, y_test)
print("R^2 score of the prediction: ", score)

0.9890617332873107

In [21]:
# Compare the model's estimate with the true value for one test row (position 3).
sample_features = X_test.iloc[[3]]  # double brackets keep a one-row DataFrame for predict()
pred_cals = reg.predict(sample_features)
real_cals = y_test.iloc[[3]].tolist()[0]  # single true value as a plain Python scalar

print("Predicted calories: ", pred_cals[0])
print("Real calories: ", real_cals)

Predicted calories:  73.25807984334209
Real calories:  80.0
