In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing function to split sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate

# Importing regressors
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Importing metric
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import auc, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Importing Grid Search
from sklearn.model_selection import GridSearchCV

In [2]:
# Importing data
data = pd.read_csv('/content/drive/MyDrive/ObesityDataSet_raw_and_data_sinthetic.csv', sep = ',') # When data separator needed

data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Preprocessing
# Getting rid of synthetically generated data
data = data[data['FAF'].isin([0, 1, 2, 3])]
data = data[data['FCVC'].isin([1, 2, 3])]

# Importing OriginalEncoder and OneHotEncoder to manage categorical and binary variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Identify categorical features
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']

# Using OrdinalEncoder:
ordinal_encoder = OrdinalEncoder()
data[categorical_features] = ordinal_encoder.fit_transform(data[categorical_features])

In [4]:
# PREPROCESSING

# MLP with all values
# Creating two sets
X_all = data.drop(columns=['FAF'], axis = 1)
y_all = data['FAF']

# Splitting dataset into train, validation and test sets
X_train_all, X_temp_all, y_train_all, y_temp_all = train_test_split(X_all, y_all, test_size=0.4, random_state=42)
X_val_all, X_test_all, y_val_all, y_test_all = train_test_split(X_temp_all, y_temp_all, test_size=0.5, random_state=42)


# MLP with selected values
# Creating two sets
X = data.drop(columns=['FAF', 'Gender', 'Age', 'Height', 'family_history_with_overweight', 'NCP', 'TUE', 'MTRANS', 'NObeyesdad'], axis = 1)
y = data['FAF']

# Splitting dataset into train, validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
# Importing MLP
from sklearn.neural_network import MLPRegressor

# Creating models
reg_all = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')
reg_mlp = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')

# Training models
reg_all.fit(X_train_all, y_train_all)
reg_mlp.fit(X_train, y_train)

# Making predictions
y_pred_train_all = reg_all.predict(X_train_all)
y_pred_all = reg_all.predict(X_val_all)

y_pred_train = reg_mlp.predict(X_train)
y_pred = reg_mlp.predict(X_val)

# Calculating RMSE for training set
rmse_mlp_all_train = root_mean_squared_error(y_train_all, y_pred_train_all)
rmse_mlp_train = root_mean_squared_error(y_train, y_pred_train)
print(f'RMSE train with all features: {rmse_mlp_all_train}')
print(f'\nRMSE train with selected features: {rmse_mlp_train}')

# Calculating RMSE for validation set
rmse_mlp_all = root_mean_squared_error(y_val_all, y_pred_all)
rmse_mlp = root_mean_squared_error(y_val, y_pred)
print(f'\nRMSE with all features: {rmse_mlp_all}')
print(f'\nRMSE with selected features: {rmse_mlp}')


RMSE train with all features: 0.8087054398732667

RMSE train with selected features: 0.9774950904151459

RMSE with all features: 0.9320928379889755

RMSE with selected features: 1.0163442965680676


In [5]:
from sklearn.preprocessing import (MinMaxScaler,StandardScaler,)

standard_scaler = StandardScaler().set_output(transform="pandas")
data_standardized = standard_scaler.fit_transform(data)

minmax_scaler = MinMaxScaler().set_output(transform="pandas")
data_normalized = minmax_scaler.fit_transform(data)

In [6]:
# PREPROCESSING

# MLP with selected values
# Creating two sets
X_standarized = data_standardized.drop(columns=['FAF', 'Gender', 'Age', 'Height', 'family_history_with_overweight', 'NCP', 'TUE', 'MTRANS', 'NObeyesdad'], axis = 1)
y_standarized = data_standardized['FAF']

# Splitting dataset into train, validation and test sets
X_train_standarized, X_temp_standarized, y_train_standarized, y_temp_standarized = train_test_split(X_standarized, y_standarized, test_size=0.4, random_state=42)
X_val_standarized, X_test_standarized, y_val_standarized, y_test_standarized = train_test_split(X_temp_standarized, y_temp_standarized, test_size=0.5, random_state=42)

# Use the normalized data
X_normalized = data_normalized.drop(columns=['FAF', 'Gender', 'Age', 'Height', 'family_history_with_overweight', 'NCP', 'TUE', 'MTRANS', 'NObeyesdad'], axis = 1)
y_normalized = data_normalized['FAF']

# Splitting dataset into train, validation and test sets
X_train_normalized, X_temp_normalized, y_train_normalized, y_temp_normalized = train_test_split(X_normalized, y_normalized, test_size=0.4, random_state=42)
X_val_normalized, X_test_normalized, y_val_normalized, y_test_normalized = train_test_split(X_temp_normalized, y_temp_normalized, test_size=0.5, random_state=42)



In [None]:
# Importing MLP
from sklearn.neural_network import MLPRegressor

# Creating models
reg_standarized = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')
reg_mlp = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')
reg_normalized = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')


# Training models
reg_standarized.fit(X_train_standarized, y_train_standarized)
reg_mlp.fit(X_train, y_train)
reg_normalized.fit(X_train_normalized, y_train_normalized)

# Making predictions
y_pred_train_standarized = reg_standarized.predict(X_train_standarized)
y_pred_standarized = reg_standarized.predict(X_val_standarized)

y_pred_train = reg_mlp.predict(X_train)
y_pred = reg_mlp.predict(X_val)

y_pred_train_normalized = reg_normalized.predict(X_train_normalized)
y_pred_normalized = reg_normalized.predict(X_val_normalized)

# Calculating RMSE for training set
rmse_mlp_standarized_train = root_mean_squared_error(y_train_standarized, y_pred_train_standarized)
rmse_mlp_train = root_mean_squared_error(y_train, y_pred_train)
rmse_mlp_normalized_train = root_mean_squared_error(y_train_normalized, y_pred_train_normalized)
print(f'\nRMSE train with selected features: {rmse_mlp_train}')
print(f'\nRMSE train with standarized features: {rmse_mlp_standarized_train}')
print(f'\nRMSE train with normalized features: {rmse_mlp_normalized_train}')

# Calculating RMSE for validation set
rmse_mlp_standarized = root_mean_squared_error(y_val_standarized, y_pred_standarized)
rmse_mlp = root_mean_squared_error(y_val, y_pred)
rmse_mlp_normalized = root_mean_squared_error(y_val_normalized, y_pred_normalized)
print(f'\nRMSE with selected features: {rmse_mlp}')
print(f'\nRMSE with standarized features: {rmse_mlp_all}')
print(f'\nRMSE with normalized features: {rmse_mlp_normalized}')


RMSE train with selected features: 0.8799259991602478

RMSE train with standarized features: 0.6943495563681574

RMSE train with normalized features: 0.2590099981413185

RMSE with selected features: 0.9952019003334249

RMSE with standarized features: 0.9320928379889755

RMSE with normalized features: 0.35518180573390484


In [7]:
def predict(model_name, X_train, X_val, y_train, y_val, X_test=None, y_test=None, sample_weight=None, predict_on_test_set=False, calc_rmse=False, calc_r2=False, calc_mae=False, **kwargs):

  # Creating model
  model = model_name(**kwargs)

  if sample_weight is not None:
    # Training model
    model.fit(X_train, y_train, sample_weight=sample_weight)
    model.fit(X_val, y_val, sample_weight=sample_weight)
  else:
    # Training model
    model.fit(X_train, y_train)
    model.fit(X_val, y_val)

  # Making train and validation predictions
  y_pred_train = model.predict(X_train)
  y_pred_val = model.predict(X_val)

  if predict_on_test_set == True:
    # Retraining model
    model.fit(X_train, y_train)
    model.fit(X_val, y_val)

    # Making test predictions
    y_pred = model.predict(X_test)

  print(f"Calculations for {model}")

  # Calculating metrics
  if calc_rmse == True:
    rmse_train = root_mean_squared_error(y_train, y_pred_train)
    rmse_validation = root_mean_squared_error(y_val, y_pred_val)
    print(f'\nRMSE train: {rmse_train}')
    print(f'RMSE validation: {rmse_validation}')
    if predict_on_test_set == True:
      rmse_test = root_mean_squared_error(y_test, y_pred)
      print(f'RMSE test: {rmse_test}')

  if calc_r2 == True:
    r2_train = r2_score(y_train, y_pred_train)
    r2_validation = r2_score(y_val, y_pred_val)
    print(f'\nR2 train: {r2_train}')
    print(f'R2 validation: {r2_validation}')
    if predict_on_test_set == True:
      r2_test = r2_score(y_test, y_test)
      print(f'R2 test: {r2_test}')

  if calc_mae == True:
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_validation = mean_absolute_error(y_val, y_pred_val)
    print(f'\nMAE train: {mae_train}')
    print(f'MAE validation: {mae_validation}')
    if predict_on_test_set == True:
      mae_test = mean_absolute_error(y_test, y_pred)
      print(f'MAE test: {mae_test}')

  return model

In [8]:
# Function to print RMSE
def print_scores(scores):
  print(f"Mean: {np.abs(np.mean(scores))}")

# Function to cross-validation
def cross_validation(model, X_normalized, y_normalized, cv=5, scoring='neg_root_mean_squared_error'):
  scores = cross_val_score(model, X_normalized, y_normalized, cv=cv, scoring=scoring)
  return scores


In [45]:
reg = predict(model_name=MLPRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   hidden_layer_sizes=(5, 5),
                   max_iter=1000,
                   solver='lbfgs'
                   )

Calculations for MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000, solver='lbfgs')

RMSE train: 0.31856372124283006
RMSE validation: 0.28614133499144145
RMSE test: 0.42354808272435257

R2 train: 0.04411428473163459
R2 validation: 0.28212396617970315
R2 test: 1.0

MAE train: 0.2629384868777493
MAE validation: 0.23286994055017057
MAE test: 0.315252793059838


In [None]:
# regressor part
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(reg_mlp, X_train_normalized, y_train_normalized, cv=iter, scoring='neg_root_mean_squared_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)


Regression scores for 3-fold:
Mean: 0.32329399027810196

Regression scores for 5-fold:
Mean: 0.3225392163459206

Regression scores for 10-fold:
Mean: 0.31618845461314327


In [None]:
# R2 metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(reg_mlp, X_train_normalized, y_train_normalized, cv=iter, scoring='r2')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)


Regression scores for 3-fold:
Mean: 0.11033369900581143

Regression scores for 5-fold:
Mean: 0.009351993263133917

Regression scores for 10-fold:
Mean: 0.06841757714809014


In [None]:
# Mean absolute error metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(reg_mlp, X_train_normalized, y_train_normalized, cv=iter, scoring='neg_mean_absolute_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.25440320977503933

Regression scores for 5-fold:
Mean: 0.24689158076225315

Regression scores for 10-fold:
Mean: 0.2433033986147731


In [None]:
reg = MLPRegressor()
parameters = {'hidden_layer_sizes': [(5, 5), (5, 10), (10, 10)],
              'activation': ['relu', 'tanh'],
              'max_iter': [3000, 4000],
              'solver': [ 'lbfgs', 'sgd', 'adam'],
              }

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_normalized, y_train_normalized)
print(grid_search.best_params_) # to get the best parameters

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

{'activation': 'relu', 'hidden_layer_sizes': (5, 10), 'max_iter': 4000, 'solver': 'lbfgs'}


In [None]:
print(grid_search.best_params_)

{'activation': 'relu', 'hidden_layer_sizes': (5, 10), 'max_iter': 4000, 'solver': 'lbfgs'}


In [None]:
# Training model with the tuned hypermarameters
reg = predict(model_name=MLPRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   activation='relu',
                   hidden_layer_sizes=(5, 10),
                   max_iter=4000,
                   solver='lbfgs'
                   )

Calculations for MLPRegressor(hidden_layer_sizes=(5, 10), max_iter=4000, solver='lbfgs')

RMSE train: 0.3932664686144688
RMSE validation: 0.24563108539147335
RMSE test: 0.4045203588610539

R2 train: -0.45675721147437054
R2 validation: 0.47100092668015503
R2 test: 1.0

MAE train: 0.28307057102839556
MAE validation: 0.1817694176752165
MAE test: 0.315024834091948


In [40]:
# Linear regressor
lr = predict(model_name=LinearRegression,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   )

Calculations for LinearRegression()

RMSE train: 0.3099608099430981
RMSE validation: 0.3099143372358131
RMSE test: 0.3423633728288618

R2 train: 0.09504514569544364
R2 validation: 0.15788463258038454
R2 test: 1.0

MAE train: 0.2583132627008037
MAE validation: 0.25731029461158056
MAE test: 0.28733890378972576


In [None]:
#weighted linear regressor

#adding weights
sample_weight = np.ones(len(y_train_normalized))
sample_weight[:10] = 2

#training
reg = LinearRegression()
reg.fit(X_train_normalized, y_train_normalized, sample_weight=sample_weight)
y_pred_train_normalized = reg.predict(X_train_normalized)
y_pred_normalized = reg.predict(X_val_normalized)
y_pred_val_normalized  = reg.predict(X_val_normalized)

#calculating rmse
rmse_train_normalized = np.sqrt(np.mean((y_pred_train_normalized - y_train_normalized)**2))
rmse_validation_normalized = np.sqrt(np.mean((y_pred_normalized - y_val_normalized)**2))

print(rmse_train_normalized, rmse_validation_normalized)


0.2950194033215973 0.3252042067301469


In [None]:
reg.fit(X_train_normalized, y_train_normalized)
reg.fit(X_val_normalized, y_val_normalized)

rmse_train = root_mean_squared_error(y_train_normalized, y_pred_train_normalized)
rmse_validation = root_mean_squared_error(y_val_normalized, y_pred_val_normalized)
print(f'\nRMSE train: {rmse_train}')
print(f'RMSE validation: {rmse_validation}')

rmse_test = root_mean_squared_error(y_test_normalized, y_pred_normalized)
print(f'RMSE test: {rmse_test}')

# Making test predictions
y_pred_normalized = reg.predict(X_test_normalized)

r2_train = r2_score(y_train_normalized, y_pred_train_normalized)
r2_validation = r2_score(y_val_normalized, y_pred_val_normalized)
print(f'\nR2 train: {r2_train}')
print(f'R2 validation: {r2_validation}')

r2_test = r2_score(y_test_normalized, y_test_normalized)
print(f'R2 test: {r2_test}')

mae_train = mean_absolute_error(y_train_normalized, y_pred_train_normalized)
mae_validation = mean_absolute_error(y_val_normalized, y_pred_val_normalized)
print(f'\nMAE train: {mae_train}')
print(f'MAE validation: {mae_validation}')

mae_test = mean_absolute_error(y_test_normalized, y_pred_normalized)
print(f'MAE test: {mae_test}')


RMSE train: 0.2950194033215973
RMSE validation: 0.3252042067301469
RMSE test: 0.3685307558938234

R2 train: 0.18018756678829706
R2 validation: 0.07274204818218388
R2 test: 1.0

MAE train: 0.2428559703101832
MAE validation: 0.26637236671308195
MAE test: 0.28733890378972576


In [39]:
# SVM with linear kernel
svm_linear = predict(model_name=SVR,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   kernel='linear')

Calculations for SVR(kernel='linear')

RMSE train: 0.320403033043375
RMSE validation: 0.3206069475578251
RMSE test: 0.3596145373619021

R2 train: 0.0330443015955737
R2 validation: 0.09877316411824266
R2 test: 1.0

MAE train: 0.25432746628951564
MAE validation: 0.2538213043072977
MAE test: 0.2913989245458545


In [38]:
# SVM with rbf kernel
svm_rbf = predict(model_name=SVR,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   kernel='rbf')

Calculations for SVR()

RMSE train: 0.3205885132207753
RMSE validation: 0.2634252600883089
RMSE test: 0.3543523551297008

R2 train: 0.03192444311211484
R2 validation: 0.3915805464301124
R2 test: 1.0

MAE train: 0.25403944137539786
MAE validation: 0.20489175042676472
MAE test: 0.27986414882165256


In [41]:
# SVM with poly kernel
svm_poly = predict(model_name=SVR,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   kernel='poly')

Calculations for SVR(kernel='poly')

RMSE train: 0.34631331330432835
RMSE validation: 0.272860538470692
RMSE test: 0.3790722129309095

R2 train: -0.12967030991592554
R2 validation: 0.34721567048235646
R2 test: 1.0

MAE train: 0.2701770933251739
MAE validation: 0.2093070490169022
MAE test: 0.295137890161221


In [None]:
# regressor part
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(svm_linear, X_train_normalized, y_train_normalized, cv=iter, scoring='neg_root_mean_squared_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)


Regression scores for 3-fold:
Mean: 0.30362082595833356

Regression scores for 5-fold:
Mean: 0.30329528076662077

Regression scores for 10-fold:
Mean: 0.2999329240237262


In [None]:
# R2 metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(svm_linear, X_train_normalized, y_train_normalized, cv=iter, scoring='r2')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)


Regression scores for 3-fold:
Mean: 0.12463352824782799

Regression scores for 5-fold:
Mean: 0.11760635614157067

Regression scores for 10-fold:
Mean: 0.1254540416484454


In [None]:
# Mean absolute error metric
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(svm_linear, X_train_normalized, y_train_normalized, cv=iter, scoring='neg_mean_absolute_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)



Regression scores for 3-fold:
Mean: 0.2408616905868691

Regression scores for 5-fold:
Mean: 0.2421527930835281

Regression scores for 10-fold:
Mean: 0.24021267260516344


In [None]:
reg = SVR()
parameters = [
              {'kernel':('linear', 'rbf',), 'tol':[1e-1, 1e-2, 1e-3, 1e-4], 'C':[0.01, 0.1, 0.5, 1, 1.5, 10]},
              {'kernel':['poly'], 'tol':[1e-1, 1e-2, 1e-3, 1e-4], 'C':[0.01, 0.1, 0.5, 1, 1.5, 10], 'degree':[2,3,4]}
              ]

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_normalized, y_train_normalized)
print(grid_search.best_params_) # to get the best parameters

{'C': 0.1, 'kernel': 'rbf', 'tol': 0.0001}


In [37]:
reg = predict(model_name=SVR,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   kernel='rbf',
                   tol=0.001,
                   C=0.1
                   )

Calculations for SVR(C=0.1)

RMSE train: 0.3104102868455534
RMSE validation: 0.3106429022843228
RMSE test: 0.34694355477896305

R2 train: 0.09241867672150661
R2 validation: 0.15392058885631577
R2 test: 1.0

MAE train: 0.25241364025076624
MAE validation: 0.25056780874790796
MAE test: 0.2821207302610123


In [46]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   random_state=42,
                   n_estimators=100)

Calculations for RandomForestRegressor(random_state=42)

RMSE train: 0.31611525989035855
RMSE validation: 0.11805580243982906
RMSE test: 0.335743445097482

R2 train: 0.058751575801820444
R2 validation: 0.8778023281448446
R2 test: 1.0

MAE train: 0.23982924107142858
MAE validation: 0.09297222222222233
MAE test: 0.2549411111111111


In [None]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   calc_rmse=True,
                   random_state=42,
                   n_estimators=200)

Calculations for RandomForestRegressor(random_state=42)

RMSE train: 0.31611525989035855
RMSE validation: 0.11805580243982906


In [None]:
# regressor part
for iter in [3, 5, 10]:
  scores_regressor = cross_validation(rfr, X_train_normalized, y_train_normalized, cv=iter, scoring='neg_root_mean_squared_error')
  print(f"\nRegression scores for {iter}-fold:")
  print_scores(scores_regressor)


Regression scores for 3-fold:
Mean: 0.30668425081622314

Regression scores for 5-fold:
Mean: 0.29436526935180457

Regression scores for 10-fold:
Mean: 0.2877768426109054


In [9]:
reg = RandomForestRegressor()
parameters = {'n_estimators':[10, 50, 100, 200, 350, 500, 650, 850, 1000],
              'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
              }

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_normalized, y_train_normalized)
print(grid_search.best_params_) # to get the best parameters

  _data = np.array(data, dtype=dtype, copy=copy,


{'criterion': 'absolute_error', 'n_estimators': 200}


In [26]:
reg = RandomForestRegressor()
parameters = {'n_estimators':[10, 50, 100, 200, 350, 500, 650, 850, 1000, 2000],
              'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
              }

grid_search = GridSearchCV(reg, parameters, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_normalized, y_train_normalized)
print(grid_search.best_params_) # to get the best parameters

  _data = np.array(data, dtype=dtype, copy=copy,


{'criterion': 'absolute_error', 'n_estimators': 850}


In [10]:
# Creating model
reg = RandomForestRegressor(n_estimators=200, criterion='absolute_error', random_state=42)

# Training model
reg.fit(X_train_normalized, y_train_normalized)
reg.fit(X_val_normalized, y_val_normalized)

# Making train and validation predictions
y_pred_train_normalized = reg.predict(X_train_normalized)
y_pred_val_normalized = reg.predict(X_val_normalized)
reg.fit(X_train_normalized, y_train_normalized)

# Predicting value on test set
y_pred_normalized = reg.predict(X_test_normalized)

# RMSE, R2, MAE score
rmse = root_mean_squared_error(y_test_normalized, y_pred_normalized)
r2 = r2_score(y_test_normalized, y_pred_normalized)
mae = mean_absolute_error(y_test_normalized, y_pred_normalized)


print(f"RMSE: {rmse}")
print(f"R2: {r2}")
print(f"MAE: {mae}")

RMSE: 0.30742090220268226
R2: 0.2215346871949233
MAE: 0.2304666666666667


In [23]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   random_state=42,
                   n_estimators=200,
                   criterion='absolute_error')

Calculations for RandomForestRegressor(criterion='absolute_error', n_estimators=200,
                      random_state=42)

RMSE train: 0.3138344301397962
RMSE validation: 0.12540392146387805
RMSE test: 0.3346231711442988

R2 train: 0.07228513651627966
R2 validation: 0.8621170738439559
R2 test: 1.0

MAE train: 0.23938430059523808
MAE validation: 0.09913888888888872
MAE test: 0.2601666666666667


In [24]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   random_state=42,
                   n_estimators=200,
                   criterion='squared_error')

Calculations for RandomForestRegressor(n_estimators=200, random_state=42)

RMSE train: 0.3158931556153219
RMSE validation: 0.12006190549114125
RMSE test: 0.3371414636739268

R2 train: 0.0600737635207681
R2 validation: 0.8736140724407622
R2 test: 1.0

MAE train: 0.23845037910997732
MAE validation: 0.094202222222222
MAE test: 0.2566994973544973


In [27]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   random_state=42,
                   n_estimators=850,
                   criterion='absolute_error')

Calculations for RandomForestRegressor(criterion='absolute_error', n_estimators=850,
                      random_state=42)

RMSE train: 0.31285273550985515
RMSE validation: 0.12596248849155195
RMSE test: 0.33487316961546737

R2 train: 0.07807996459463384
R2 validation: 0.8608860377266916
R2 test: 1.0

MAE train: 0.23781512605042004
MAE validation: 0.0993816993464043
MAE test: 0.25929803921568617


In [28]:
# Random forest regressor
rfr = predict(model_name=RandomForestRegressor,
                   X_train=X_train_normalized,
                   X_val=X_val_normalized,
                   y_train=y_train_normalized,
                   y_val=y_val_normalized,
                   X_test=X_test_normalized,
                   y_test=y_test_normalized,
                   predict_on_test_set=True,
                   calc_rmse=True,
                   calc_r2=True,
                   calc_mae=True,
                   random_state=42,
                   n_estimators=850,
                   criterion='squared_error')

Calculations for RandomForestRegressor(n_estimators=850, random_state=42)

RMSE train: 0.31435265760418957
RMSE validation: 0.11747129018887352
RMSE test: 0.33488533258726544

R2 train: 0.06921877913099717
R2 validation: 0.879009371156291
R2 test: 1.0

MAE train: 0.23611217403628101
MAE validation: 0.09225049486461148
MAE test: 0.2562832586367878
