In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
import warnings
warnings.filterwarnings("ignore") #ignoring some deprication warnings

In [10]:
# load the dataset
data = pd.read_csv("bodyfat.csv", 
                   na_values="?")
data.head()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,1.0708,12.3,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.034,28.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


In [11]:
# check if any null values in dataset
print(data.isnull().sum())

Density    0
BodyFat    0
Age        0
Weight     0
Height     0
Neck       0
Chest      0
Abdomen    0
Hip        0
Thigh      0
Knee       0
Ankle      0
Biceps     0
Forearm    0
Wrist      0
dtype: int64


In [33]:
# Create feature matrix and target vector
X = data.drop("BodyFat", axis=1)
y = data["BodyFat"]

y

0      12.3
1       6.1
2      25.3
3      10.4
4      28.7
       ... 
247    11.0
248    33.6
249    29.3
250    26.0
251    31.9
Name: BodyFat, Length: 252, dtype: float64

In [None]:
# Split data into training and testing set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

In [48]:
# create pipeline with scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso # linear model

pipe = Pipeline([("preprocessing", StandardScaler()), ("model", Lasso())])

pipe

In [50]:
# grid search for all 3 models 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor # non-linear model
from sklearn.svm import SVR # non-linear model
from sklearn.preprocessing import MinMaxScaler


param_grid = [{"model": [Lasso(max_iter=2000)], 
               "model__alpha": [0.001, 0.01, 0.1, 1.0, 10.0],
               "model__fit_intercept": [True, False],
               "preprocessing": [StandardScaler(), MinMaxScaler()]
              },
              {"model": [RandomForestRegressor(random_state=0)], 
               "model__max_depth": [3, 5, 7, 9],
               "model__min_samples_split": [2, 5, 7],
               "model__max_features": ["log2", "sqrt"],
               "preprocessing": [None]
              }, 
              {"model": [SVR(kernel="rbf")], 
               "model__C": [0.01, 0.1, 1.0, 10.0, 100.0],
               "model__gamma": [0.001, 0.01, 0.1, 1.0, 10.0],
               "preprocessing": [StandardScaler(), MinMaxScaler()]
              }]

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

In [51]:
# print the top 20 best performing combinations
grid_results = pd.DataFrame.from_dict(grid.cv_results_)
grid_results.sort_values(by="rank_test_score").head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__alpha,param_model__fit_intercept,param_preprocessing,param_model__max_depth,param_model__max_features,...,param_model__gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
86,0.007799,0.000746,0.002198,0.000401,SVR(),,,StandardScaler(),,,...,0.01,"{'model': SVR(), 'model__C': 100.0, 'model__ga...",0.893249,0.993495,0.979569,0.998678,0.941387,0.961275,0.039484,1
89,0.005203,0.000397,0.001797,0.000398,SVR(),,,MinMaxScaler(),,,...,0.1,"{'model': SVR(), 'model__C': 100.0, 'model__ga...",0.8491,0.997356,0.993174,0.999403,0.942495,0.956306,0.057594,2
84,0.004201,0.000404,0.0014,0.00049,SVR(),,,StandardScaler(),,,...,0.001,"{'model': SVR(), 'model__C': 100.0, 'model__ga...",0.855519,0.983483,0.994118,0.995195,0.942797,0.954222,0.052914,3
91,0.005996,0.000628,0.001799,0.0004,SVR(),,,MinMaxScaler(),,,...,1.0,"{'model': SVR(), 'model__C': 100.0, 'model__ga...",0.962906,0.968969,0.890918,0.995945,0.938062,0.95136,0.035387,4
76,0.005398,0.000486,0.002797,0.000399,SVR(),,,StandardScaler(),,,...,0.01,"{'model': SVR(), 'model__C': 10.0, 'model__gam...",0.924643,0.947182,0.933938,0.983783,0.929659,0.943841,0.021327,5
9,0.003399,0.00049,0.001799,0.000748,Lasso(max_iter=2000),0.1,True,MinMaxScaler(),,,...,,"{'model': Lasso(max_iter=2000), 'model__alpha'...",0.822037,0.98443,0.984991,0.990851,0.928095,0.942081,0.064217,6
8,0.003199,0.0004,0.002199,0.0004,Lasso(max_iter=2000),0.1,True,StandardScaler(),,,...,,"{'model': Lasso(max_iter=2000), 'model__alpha'...",0.815334,0.98101,0.984274,0.994286,0.935104,0.942001,0.066535,7
5,0.003799,0.000748,0.0016,0.00049,Lasso(max_iter=2000),0.01,True,MinMaxScaler(),,,...,,"{'model': Lasso(max_iter=2000), 'model__alpha'...",0.811505,0.976921,0.984183,0.994125,0.934681,0.940283,0.067504,8
81,0.003801,0.000748,0.001801,0.000401,SVR(),,,MinMaxScaler(),,,...,1.0,"{'model': SVR(), 'model__C': 10.0, 'model__gam...",0.969711,0.92741,0.864449,0.985142,0.936705,0.936684,0.041816,9
12,0.003594,0.000804,0.002003,0.000627,Lasso(max_iter=2000),1.0,True,StandardScaler(),,,...,,"{'model': Lasso(max_iter=2000), 'model__alpha'...",0.837337,0.966794,0.977349,0.977695,0.91425,0.934685,0.054028,10


In [53]:
# print test score
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Test-set score: 0.99
