In this notebook, we will go through common  models provided by scikit-learn, including

- Linear Regression
- Decision Tree  
- Support Vector Machine
- Random Forest

We will use a modified obesity data file found on Kaggle and the UCI Machine Learning Reppository.

# Problem to Solve
Our goal is to Predict the Obesity Level based on Hrs of Daily Tech Usage


<h4>Import and Preprocess Data</h4>

We will first import the data, drop columns that are not needed and preprocess the data.

In [1]:
import pandas as pd
import numpy as np
obesity = pd.read_csv('Obesity In the US.csv') 

The attributes are a wide range from age, gender, eating habits and physical abilities.  The data is also mixed with numerical and continous data for a broder analysis.

In [2]:
obesity.head()

Unnamed: 0,Gender,Age,Height (cm),Weight (lbs),Family History,FAVC,# of Veggies,# of Daily Meals,Food Between Meals,Smoke,Daily Water,Monitor Calories,Physical Activity Level,Hrs of Daily Tech Usage,Alcohol Consumption,Transportation Used,Obesity Level
0,Female,15,168,189.6,Yes,Yes,3,3,Sometimes,No,1,No,3,2,No,Walking,5
1,Female,16,166,127.87,No,No,2,1,Sometimes,No,1,No,0,1,No,Walking,2
2,Female,16,157,108.03,No,Yes,2,4,Always,No,2,No,0,1,Sometimes,Public_Transportation,2
3,Female,16,166,127.87,No,No,2,1,Sometimes,No,1,No,0,1,No,Walking,2
4,Female,16,160,125.66,No,Yes,3,3,Sometimes,No,1,No,3,0,No,Public_Transportation,2


In [3]:
obesity.describe()

Unnamed: 0,Age,Height (cm),Weight (lbs),# of Veggies,# of Daily Meals,Daily Water,Physical Activity Level,Hrs of Daily Tech Usage,Obesity Level
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.315964,169.982946,190.889621,2.423496,2.687826,2.014685,1.006632,0.664614,4.112269
std,6.357078,9.432006,57.741815,0.583905,0.80968,0.688616,0.895462,0.674009,1.985062
min,14.0,145.0,85.98,1.0,1.0,1.0,0.0,0.0,1.0
25%,20.0,163.0,144.345,2.0,3.0,2.0,0.0,0.0,2.0
50%,23.0,170.0,182.98,2.0,3.0,2.0,1.0,1.0,4.0
75%,26.0,177.0,236.845,3.0,3.0,2.0,2.0,1.0,6.0
max,61.0,198.0,381.4,3.0,4.0,3.0,3.0,2.0,7.0


In [4]:
obesity.dtypes

Gender                      object
Age                          int64
Height (cm)                  int64
Weight (lbs)               float64
Family History              object
FAVC                        object
# of Veggies                 int64
# of Daily Meals             int64
Food Between Meals          object
Smoke                       object
Daily Water                  int64
Monitor Calories            object
Physical Activity Level      int64
Hrs of Daily Tech Usage      int64
Alcohol Consumption         object
Transportation Used         object
Obesity Level                int64
dtype: object

In [5]:
#drop not need categorical columns
obesity.drop(["Family History","FAVC","Monitor Calories","Alcohol Consumption","Smoke","Transportation Used "], axis = 1, inplace = True)
obesity.head()

Unnamed: 0,Gender,Age,Height (cm),Weight (lbs),# of Veggies,# of Daily Meals,Food Between Meals,Daily Water,Physical Activity Level,Hrs of Daily Tech Usage,Obesity Level
0,Female,15,168,189.6,3,3,Sometimes,1,3,2,5
1,Female,16,166,127.87,2,1,Sometimes,1,0,1,2
2,Female,16,157,108.03,2,4,Always,2,0,1,2
3,Female,16,166,127.87,2,1,Sometimes,1,0,1,2
4,Female,16,160,125.66,3,3,Sometimes,1,3,0,2


# Split data into training and testing

In [6]:
import numpy as np

# Convert numerical attribute to categorical

# We believe that there is a strong correlation between Hours of Daily Tech Usage and Obesity Level. 
# These are our X, Y 

# Set limit - ceiling the Hrs of Daily Tech Usage value
obesity["Hrs Tech Usage_Cat"] = np.ceil(obesity["Hrs of Daily Tech Usage"])


# Label those above 3 as 3 categories
da = obesity["Hrs Tech Usage_Cat"].where(obesity["Hrs Tech Usage_Cat"] < 3, 3.0, inplace=True)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(obesity, obesity["Hrs of Daily Tech Usage"]):
    strat_train_set = obesity.loc[train_index]
    strat_test_set = obesity.loc[test_index]

In [8]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Hrs Tech Usage_Cat", axis=1, inplace=True)

In [9]:
obesitylevel = strat_train_set.drop("Obesity Level", axis=1) # drop labels for training set
obesity_labels = strat_train_set["Obesity Level"].copy()

# Preprocessing Data

In [10]:
#Preprocessing
from sklearn.preprocessing import FunctionTransformer

# get the right column indices: safer than hard-coding indices 3, 4, 7, 8
age_ix, height_ix, weight_ix, ActivityLevel_ix = [
    list(obesity.columns).index(col)
    for col in ("Age", "Height (cm)","Weight (lbs)","Physical Activity Level" )]


attr_adder = FunctionTransformer( validate=False,
                                 kw_args={'Age': False})

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', FunctionTransformer(validate=False)),
        ('std_scaler', StandardScaler()),
    ])

In [12]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [13]:
from sklearn.preprocessing import OneHotEncoder

attribs = list(obesitylevel)

# Transform categorical data to numerical
cat_attribs = ["Gender", "Food Between Meals"]

num_attribs = [x for x in attribs if x not in cat_attribs]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

obesity_prepared = full_pipeline.fit_transform(obesitylevel)

In [14]:
obesity_prepared

array([[-0.06097873,  0.40949622,  1.77753495, ...,  0.        ,
         0.        ,  1.        ],
       [-0.99814505,  0.40949622, -0.0582892 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.84195066,  1.24913134, -0.442175  , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.34477074,  2.08876646,  0.51057244, ...,  0.        ,
         0.        ,  1.        ],
       [-0.52956189,  0.51445061,  1.80871261, ...,  0.        ,
         0.        ,  1.        ],
       [-0.52956189,  0.09463305, -0.442175  , ...,  0.        ,
         0.        ,  1.        ]])

# Select and train a model among Linear Regression, Decision Tree, SVM, Random Forest to decide best fit model

In [15]:
from sklearn.metrics import mean_squared_error

In [16]:
# Linear Regression model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(obesity_prepared, obesity_labels)

obesity_predictions = lin_reg.predict(obesity_prepared)
lin_mse = mean_squared_error(obesity_labels, obesity_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.44507373348089163

In [17]:
# Decision Tree model
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(obesity_prepared, obesity_labels)

obesity_predictions = tree_reg.predict(obesity_prepared)
tree_mse = mean_squared_error(obesity_labels, obesity_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [18]:
# Support Vector Machine model
from sklearn.svm import SVR

svm_reg = SVR()
svm_reg.fit(obesity_prepared, obesity_labels)

obesity_predictions = svm_reg.predict(obesity_prepared)
svm_mse = mean_squared_error(obesity_labels, obesity_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

0.23343194395089734

In [19]:
# Random Forest model
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(obesity_prepared, obesity_labels)

obesity_predictions = forest_reg.predict(obesity_prepared)
forest_mse = mean_squared_error(obesity_labels, obesity_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

# RandomForestRegressor gives us a optimized modal and we want to use cross validation to verify our selection

0.07354432522303708

# cross_val on different models

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [22]:
# cross validation on decision tree regression with 10 cross validations
tree_scores = cross_val_score(tree_reg, obesity_prepared, obesity_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_scores)
#Note: we specify n_estimators=10 to avoid a warning about the fact that the default value is going to change to 100 
#in Scikit-Learn 0.22.

Scores: [-0.07692308 -0.0591716  -0.08284024 -0.0887574  -0.04142012 -0.04733728
 -0.0591716  -0.07692308 -0.06547619 -0.05357143]
Mean: -0.0651591997745844
Standard deviation: 0.014944952747477062


In [23]:
# cross validation on linear regression with 10 cross validations
lin_scores = cross_val_score(lin_reg, obesity_prepared, obesity_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)


Scores: [0.46759318 0.46001993 0.46936883 0.43217782 0.48071472 0.39290174
 0.41888072 0.47352764 0.44724114 0.4290324 ]
Mean: 0.44714581140868825
Standard deviation: 0.026821730336544558


In [24]:
# cross validation on support vector machine regression with 10 cross validations
svm_scores = cross_val_score(svm_reg, obesity_prepared, obesity_labels,
                             scoring="neg_mean_squared_error", cv=10)
svm_rmse_scores = np.sqrt(-svm_scores)
display_scores(svm_rmse_scores)

Scores: [0.28291441 0.35869245 0.32887391 0.2802332  0.2855264  0.25906544
 0.28414713 0.31542363 0.3192969  0.31095363]
Mean: 0.3025127107934425
Standard deviation: 0.027865193690004827


In [25]:
# cross validation on random forest regression with 10 cross validations
forest_scores = cross_val_score(forest_reg, obesity_prepared, obesity_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [0.17574861 0.18873606 0.24203281 0.20754981 0.16853002 0.16853002
 0.19611614 0.1572696  0.16128207 0.17576364]
Mean: 0.18415587723746457
Standard deviation: 0.02436507986265066


After training different models with cross validation, Random Forest model seems to fit the best to this training dataset. 

With this, we picked Random Forest to be our model for fine tune because it gave us a good median result.

# Fine tune our model

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
   ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(obesity_prepared, obesity_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(obesity_prepared, obesity_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001641A0468B0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001641A042F70>},
                   random_state=42, scoring='neg_mean_squared_error')

# Apply to test data

In [29]:
final_model1 = grid_search.best_estimator_
final_model2 = rnd_search.best_estimator_

X_test = strat_test_set.drop("Obesity Level", axis=1)
y_test = strat_test_set["Obesity Level"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions1 = final_model1.predict(X_test_prepared)
final_predictions2 = final_model2.predict(X_test_prepared)

final_mse1 = mean_squared_error(y_test, final_predictions1)
final_rmse1 = np.sqrt(final_mse1)
print(final_rmse1)

final_mse2 = mean_squared_error(y_test, final_predictions2)
final_rmse2 = np.sqrt(final_mse2)
print(final_rmse2)

0.20496208494444876
0.20718724242814124


In [30]:
import joblib
joblib.dump(final_model1, "final_model1.pkl") # DIFF
#...
my_model_loaded = joblib.load("final_model1.pkl") # DIFF

This project was completed by: Tu Vu, Qiana Debeb & Simisola Babatunde

Reference 
Palechor, F. M., & de la Hoz Manotas, A. (2019). Dataset for estimation of obesity levels based on eating habits and physical condition in individuals from Colombia, Peru and Mexico. Data in Brief, 104344