## Importing Libraries

In [1]:
# Import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import essential models and functions from sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

## Importing Data Set

In [2]:
dataset = pd.read_csv('dataset/pca-sleep-efficiency.csv')
dataset.head()

Unnamed: 0,id,age,gender,bed_time,wakeup_time,sleep_duration,sleep_efficiency,rem_sleep_percentage,deep_sleep_percentage,light_sleep_percentage,...,smoking_status,exercise_frequency,exercise_frequency_filled,bed_time_edited,bed_time_encoded,wakeup_time_edited,wakeup_time_encoded,smoking,gender_type,sleep_quality
0,1,65,Female,01:00,07:00,6.0,0.88,18,70,12,...,Yes,3.0,3.0,1,1.0,7,7.0,1,0,-1.038628
1,2,69,Male,02:00,09:00,7.0,0.66,19,28,53,...,Yes,3.0,3.0,2,2.0,9,9.0,1,1,1.705723
2,3,40,Female,21:30,05:30,8.0,0.89,20,70,10,...,No,3.0,3.0,21,21.0,5,5.0,0,0,-0.876008
3,4,40,Female,02:30,08:30,6.0,0.51,23,25,52,...,Yes,1.0,1.0,2,2.0,8,8.0,1,0,2.028781
4,5,57,Male,01:00,09:00,8.0,0.76,27,55,18,...,No,3.0,3.0,1,1.0,9,9.0,0,1,0.192627


## Train Test Split

In [3]:
X = dataset[['age','gender_type','wakeup_time_encoded','bed_time_encoded','sleep_duration', 'exercise_frequency_filled', 'caffeine_consumption_filled', 'alcohol_consumption_filled', 'smoking']]
y = dataset['sleep_quality']

# Perform first split
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Perform the second split
X_train, X_valid, y_train, y_valid = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=100)

In [4]:
# Create the scaler with object range of 0-1
scaler = MinMaxScaler()

# Fit and transform using the training data
scaler.fit_transform(X_train)

# Transform the validation and test features
scaler.transform(X_valid)
scaler.transform(X_test)

array([[0.37288136, 0.        , 0.55555556, 0.04347826, 0.4       ,
        0.        , 0.125     , 0.2       , 0.        ],
       [0.3220339 , 0.        , 0.55555556, 0.        , 0.6       ,
        0.2       , 0.125     , 0.8       , 0.        ],
       [0.52542373, 0.        , 0.22222222, 0.91304348, 0.5       ,
        0.6       , 0.375     , 0.4       , 1.        ],
       [0.33898305, 0.        , 0.22222222, 0.        , 0.        ,
        0.4       , 0.375     , 0.        , 1.        ],
       [0.15254237, 1.        , 0.66666667, 0.04347826, 0.5       ,
        0.6       , 0.        , 0.        , 0.        ],
       [0.25423729, 0.        , 0.22222222, 0.95652174, 0.4       ,
        0.        , 0.125     , 0.        , 0.        ],
       [0.66101695, 0.        , 0.22222222, 0.95652174, 0.4       ,
        0.8       , 0.        , 0.4       , 0.        ],
       [0.54237288, 1.        , 0.22222222, 0.95652174, 0.4       ,
        0.6       , 0.        , 0.4       , 1.        ],


In [5]:
# Create a list of the columns to drop
cols_to_drop = ['gender_type', 'wakeup_time_encoded', 'sleep_duration','caffeine_consumption_filled','smoking']

# Drop these columns from training, validation and test data
X_train_temp.drop(columns=cols_to_drop,inplace=True)
X_train.drop(columns=cols_to_drop,inplace=True)
X_valid.drop(columns=cols_to_drop,inplace=True)
X_test.drop(columns=cols_to_drop,inplace=True)

## Hyperparameter Tuning

In [8]:
# Number of trees in Random Forest
rf_n_estimators = [100,200,300,400,500,600,700,800,900,1000]

# Maximum number of levels in tree
rf_max_depth = [3,4,5,6,7,8,9,10]

# Number of features to consider at every split
rf_max_features = ['sqrt', 'log2', None]

# Criterion to split on
rf_criterion = ['squared_error','absolute_error', 'friedman_mse']

# Minimum number of samples required to split a node
rf_min_samples_split = [2,3,4,5,6,7,8,9,10]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap}

print(rf_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'max_features': ['sqrt', 'log2', None], 'criterion': ['squared_error', 'absolute_error', 'friedman_mse'], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_impurity_decrease': [0.0, 0.05, 0.1], 'bootstrap': [True, False]}


In [9]:
# Create the model to be tuned
rf_base = RandomForestRegressor()

# Create the random search Random Forest
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, 
                               n_iter = 200, cv = 3, verbose = 2, random_state = 100, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train_temp, y_train_temp)

# View the best parameters from the random search
rf_random.best_params_

Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END bootstrap=False, criterion=friedman_mse, max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_split=8, n_estimators=900; total time=   0.6s
[CV] END bootstrap=False, criterion=friedman_mse, max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_split=8, n_estimators=900; total time=   0.6s
[CV] END bootstrap=False, criterion=friedman_mse, max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_split=8, n_estimators=900; total time=   0.6s
[CV] END bootstrap=True, criterion=friedman_mse, max_depth=4, max_features=log2, min_impurity_decrease=0.05, min_samples_split=3, n_estimators=800; total time=   0.7s
[CV] END bootstrap=True, criterion=friedman_mse, max_depth=4, max_features=log2, min_impurity_decrease=0.05, min_samples_split=3, n_estimators=800; total time=   0.7s
[CV] END bootstrap=True, criterion=friedman_mse, max_depth=4, max_features=log2, min_impurity_decre

{'n_estimators': 100,
 'min_samples_split': 6,
 'min_impurity_decrease': 0.0,
 'max_features': 'sqrt',
 'max_depth': 9,
 'criterion': 'friedman_mse',
 'bootstrap': True}

In [29]:
# Number of trees to be used
xgb_n_estimators = [200,400,600,800,1000,1200,1400,1600,1800,2000]

# Maximum number of levels in tree
xgb_max_depth = [2,4,6,8,10,12,14,16,18,20]

# Minimum number of instaces needed in each node
xgb_min_child_weight = [1,2,3,4,5,6,7,8,9,10]

# Tree construction algorithm used in XGBoost
xgb_tree_method = ['auto', 'exact', 'approx', 'hist']

# Learning rate
xgb_eta = [0.1,0.2,0.3,0.4,0.5,0.6]

# Minimum loss reduction required to make further partition
xgb_gamma = [0,0.1,0.2,0.3,0.4,0.5]

# Learning objective used
xgb_sampling = ['uniform','gradient_based']

# Create the grid
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'tree_method': xgb_tree_method,
            'eta': xgb_eta,
            'gamma': xgb_gamma,
            'sampling_method': xgb_sampling}

print(xgb_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], 'min_child_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'tree_method': ['auto', 'exact', 'approx', 'hist'], 'eta': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], 'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5], 'sampling_method': ['uniform', 'gradient_based']}


In [30]:
# Create the model to be tuned
xgb_base = xgb.XGBRegressor(random_state=100)

# Create the random search Random Forest
xgb_random = RandomizedSearchCV(estimator = xgb_base, param_distributions = xgb_grid, 
                                n_iter = 200, cv = 3, verbose = 2, 
                                random_state = 100, n_jobs = -1)

# Fit the random search model
xgb_random.fit(X_train_temp, y_train_temp)

# Get the optimal parameters
xgb_random.best_params_

Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] END eta=0.2, gamma=0.1, max_depth=2, min_child_weight=2, n_estimators=400, sampling_method=uniform, tree_method=auto; total time=   0.0s
[CV] END eta=0.2, gamma=0.1, max_depth=2, min_child_weight=2, n_estimators=400, sampling_method=uniform, tree_method=auto; total time=   0.0s
[CV] END eta=0.2, gamma=0.1, max_depth=2, min_child_weight=2, n_estimators=400, sampling_method=uniform, tree_method=auto; total time=   0.1s
[CV] END eta=0.5, gamma=0.2, max_depth=8, min_child_weight=5, n_estimators=1000, sampling_method=uniform, tree_method=hist; total time=   0.1s
[CV] END eta=0.2, gamma=0.2, max_depth=6, min_child_weight=1, n_estimators=400, sampling_method=gradient_based, tree_method=hist; total time=   0.0s
[CV] END eta=0.5, gamma=0.2, max_depth=8, min_child_weight=5, n_estimators=1000, sampling_method=uniform, tree_method=hist; total time=   0.1s
[CV] END eta=0.6, gamma=0.1, max_depth=16, min_child_weight=3, n_estimators=

{'tree_method': 'hist',
 'sampling_method': 'uniform',
 'n_estimators': 1800,
 'min_child_weight': 3,
 'max_depth': 4,
 'gamma': 0.4,
 'eta': 0.2}

## Final Model Comparison

In [10]:
# Create the final Multiple Linear Regression
mlr_final = LinearRegression()

# Create the final Random Forest
rf_final = RandomForestRegressor(n_estimators = 100,
                                 min_samples_split = 6,
                                 min_impurity_decrease = 0.0,
                                 max_features = 'sqrt',
                                 max_depth = 9,
                                 criterion = 'friedman_mse',
                                 bootstrap = True,
                                 random_state = 100)

# Create the fnal Extreme Gradient Booster
xgb_final = xgb.XGBRegressor(tree_method = 'exact',
                         objective = 'reg:squarederror',
                         n_estimators = 200,
                         min_child_weight = 2,
                         max_depth = 2,
                         gamma = 0,
                         eta = 0.2,
                         random_state = 100)

# Train the models using 80% of the original data
mlr_final.fit(X_train_temp, y_train_temp)
rf_final.fit(X_train_temp, y_train_temp)
xgb_final.fit(X_train_temp, y_train_temp)

In [11]:
# Define a function that compares all final models
def final_comparison(models, test_features, test_labels):
    scores = pd.DataFrame()
    for model in models:
        predictions = model.predict(test_features)
        mae = round(mean_absolute_error(test_labels, predictions), 4)
        mse = round(mean_squared_error(test_labels, predictions), 4)
        r2 = round(r2_score(test_labels, predictions), 4)
        scores[str(model)] = [mae, mse, r2]
    scores.index = ['Mean Absolute Error', 'Mean Squared Error', 'R^2']
    return scores

In [12]:
# Call the comparison function with the three final models
final_scores = final_comparison([mlr_final, rf_final, xgb_final], X_test, y_test)

# Adjust the column headers
final_scores.columns  = ['Multivariate Linear Regression', 'Random Forest', 'Extreme Gradient Boosting']

final_scores.head()

Unnamed: 0,Multivariate Linear Regression,Random Forest,Extreme Gradient Boosting
Mean Absolute Error,0.7404,0.6831,0.7441
Mean Squared Error,0.8645,0.7938,0.9445
R^2,0.1717,0.2394,0.095
