# First part: Comparison of classifiers on simulated data

In [None]:
from sklearn.datasets import make_moons, make_circles, make_classification, make_blobs
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier #Multilayer perceptron classifier
from matplotlib.colors import ListedColormap

The following are two helper functions: `plot_dataset` plots a dataset (only the training set, or both the training and test sets), while `plot_model` plots the decision boundary of a model together with the data.

In [None]:
def plot_dataset(X_train, y_train, X_test=None, y_test=None):
    """Scatter-plot a 2-D dataset, optionally overlaying the test points.

    Parameters
    ----------
    X_train : array whose first two columns are the plotted coordinates.
    y_train : labels used to color the training points.
    X_test, y_test : optional test points and labels; they are drawn (with
        ``alpha=0.2``) only when both are given.

    The axis limits depend on the training data only (padded by 0.5).
    """
    h = 0.02 # -- h is the step length of the grid
    pad = 0.5

    # -- 1-D grids are enough here: only their first and last values are needed,
    # -- and they give the same axis limits as the 2-D mesh built in plot_model
    x_grid = np.arange(X_train[:, 0].min() - pad, X_train[:, 0].max() + pad, h)
    y_grid = np.arange(X_train[:, 1].min() - pad, X_train[:, 1].max() + pad, h)

    # -- two-color map: one color per class label
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])
    ax = plt.subplot(1,1,1)
    ax.set_title("Input data")

    # -- Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")

    if X_test is not None and y_test is not None:
        # -- Plot the testing points (more transparent than the training ones)
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.2, edgecolors="k")

    ax.set_xlim(x_grid[0], x_grid[-1])
    ax.set_ylim(y_grid[0], y_grid[-1])
    # -- hide the ticks
    ax.set_xticks(())
    ax.set_yticks(())

In [None]:
def plot_model(input_model, X_train, y_train, X_test, y_test):
    """Draw the decision surface of a fitted model together with the data points.

    The model is evaluated on a regular grid (step 0.02) spanning the first two
    columns of the training data, padded by 0.5 on each side. The grid values
    come from ``decision_function`` when the model has that attribute, and
    from the second column of ``predict_proba`` otherwise. Training points are
    drawn opaque, test points with ``alpha = 0.2``.
    """
    step = 0.02
    margin = 0.5
    first_feature = X_train[:, 0]
    second_feature = X_train[:, 1]
    grid_x, grid_y = np.meshgrid(
        np.arange(first_feature.min() - margin, first_feature.max() + margin, step),
        np.arange(second_feature.min() - margin, second_feature.max() + margin, step),
    )

    # -- color maps: a continuous one for the surface, a two-color one for the labels
    surface_cmap = plt.cm.RdBu
    point_cmap = ListedColormap(["#FF0000", "#0000FF"])

    axes = plt.subplot(1, 1, 1)
    axes.set_title("Model decision boundary")

    # -- evaluate the model on every node of the mesh [x_min, x_max] x [y_min, y_max]
    if not hasattr(input_model, "decision_function"):
        grid_values = input_model.predict_proba(np.c_[grid_x.ravel(), grid_y.ravel()])[:, 1]
    else:
        grid_values = input_model.decision_function(np.c_[grid_x.ravel(), grid_y.ravel()])

    # -- back to the shape of the mesh, then draw the filled contour plot
    grid_values = grid_values.reshape(grid_x.shape)
    axes.contourf(grid_x, grid_y, grid_values, cmap = surface_cmap, alpha = 0.8)

    # -- training points first, then the (more transparent) testing points
    axes.scatter(first_feature, second_feature, c = y_train, cmap = point_cmap, edgecolors = "k")
    axes.scatter(X_test[:, 0], X_test[:, 1], c = y_test, cmap = point_cmap, edgecolors = "k", alpha = 0.2)

    axes.set_xlim(grid_x.min(), grid_x.max())
    axes.set_ylim(grid_y.min(), grid_y.max())
    axes.set_xticks(())
    axes.set_yticks(())

Let's generate an almost linearly separable dataset and run the perceptron first, then SVM, then a NN with default parameters

In [None]:
# -- generate a random n-classification dataset
X, y = make_classification(
    n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
)

# -- add noise to points exploiting a uniform distribution
# -- the aim is to get closer to a non-linearly separable dataset
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size = X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Let's plot the training dataset.

In [None]:
plot_dataset(X_train_scaled, y_train)

Let's now plot all the data (i.e., both train and test). The points of the test set are drawn with a higher transparency.

In [None]:
plot_dataset(X_train_scaled, y_train, X_test_scaled, y_test)

Now let's learn a perceptron, plot its decision boundary, and print the train error and the test error.

In [None]:
# -- learn a perceptron on the standardized training data
perceptron = Perceptron(random_state = 11)
perceptron.fit(X_train_scaled, y_train)

plot_model(perceptron, X_train_scaled, y_train, X_test_scaled, y_test)

# -- print the train and test errors, computed as 1 - score
print(f'Training error: {(1.0 - perceptron.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - perceptron.score(X_test_scaled, y_test)):.5f}')

Let's do the same for SVM.

In [None]:
svm = SVC(kernel = "linear", C = 1)
svm.fit(X_train_scaled, y_train)

plot_model(svm, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - svm.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - svm.score(X_test_scaled, y_test)):.5f}')

Let's try with a NN.

In [None]:
# -- one hidden layer with size= 100, activation function = ReLU (see documentation)
mlp = MLPClassifier(max_iter = 1000)
mlp.fit(X_train_scaled, y_train)

plot_model(mlp, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - mlp.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - mlp.score(X_test_scaled, y_test)):.5f}')

---

Let's now try a more complex dataset.

In [None]:
X, y = make_moons(noise = 0.3, random_state = 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Let's plot the training data.

In [None]:
plot_dataset(X_train_scaled, y_train)

Let's plot all the data.

In [None]:
plot_dataset(X_train_scaled, y_train, X_test_scaled, y_test)

Let's run the perceptron.

In [None]:
perceptron = Perceptron(random_state = 11)
perceptron.fit(X_train_scaled, y_train)

plot_model(perceptron, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - perceptron.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - perceptron.score(X_test_scaled, y_test)):.5f}')

Let's run the SVM

In [None]:
# -- fit on the standardized features: the model is plotted and scored on scaled data below,
# -- so it must be trained in the same feature space
svm = SVC(kernel = "linear")
svm.fit(X_train_scaled, y_train)

plot_model(svm, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - svm.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - svm.score(X_test_scaled, y_test)):.5f}')

Let's try the NN

In [None]:
mlp = MLPClassifier(max_iter = 1500)
# -- Note that with max_iter = 1000 the model is not converging. (see 'tol' parameter). Try to re-train with max_iter = 1500
mlp.fit(X_train_scaled, y_train)

plot_model(mlp, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - mlp.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - mlp.score(X_test_scaled, y_test)):.5f}')

---

Another interesting dataset

In [None]:
X, y = make_circles(noise = 0.2, factor = 0.5, random_state = 1)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.4, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Let's plot the training data.

In [None]:
plot_dataset(X_train_scaled, y_train)

Let's plot all the data.

In [None]:
plot_dataset(X_train_scaled, y_train, X_test_scaled, y_test)

Let's run the perceptron

In [None]:
perceptron = Perceptron(random_state = 11)
perceptron.fit(X_train_scaled, y_train)

plot_model(perceptron, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - perceptron.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - perceptron.score(X_test_scaled, y_test)):.5f}')

Let's run the SVM

In [None]:
# -- fit on the standardized features: the model is plotted and scored on scaled data below,
# -- so it must be trained in the same feature space
svm = SVC(kernel = "linear")
svm.fit(X_train_scaled, y_train)

plot_model(svm, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - svm.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - svm.score(X_test_scaled, y_test)):.5f}')

Let's run the NN

In [None]:
mlp = MLPClassifier(max_iter = 1000)
mlp.fit(X_train_scaled, y_train)

plot_model(mlp, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - mlp.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - mlp.score(X_test_scaled, y_test)):.5f}')

---

Let's now consider the blobs dataset used in the last Lab.

In [None]:
# -- make_blobs dataset

# -- generate the dataset
X, y = make_blobs(n_samples = 1000, centers = 2, n_features = 2, center_box=(-7.5, 7.5), random_state = 37, cluster_std = 2.8)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# -- scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Let's plot all the data.

In [None]:
plot_dataset(X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# -- perceptron
perceptron = Perceptron(random_state = 11)
perceptron.fit(X_train_scaled, y_train)

plot_model(perceptron, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - perceptron.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - perceptron.score(X_test_scaled, y_test)):.5f}')

In [None]:
# -- svm
# -- fit on the standardized features: the model is plotted and scored on scaled data below,
# -- so it must be trained in the same feature space
svm = SVC(kernel = "linear")
svm.fit(X_train_scaled, y_train)

plot_model(svm, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - svm.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - svm.score(X_test_scaled, y_test)):.5f}')

In [None]:
# -- NN (mlp)
mlp = MLPClassifier(max_iter = 1000)
mlp.fit(X_train_scaled, y_train)

plot_model(mlp, X_train_scaled, y_train, X_test_scaled, y_test)

print(f'Training error: {(1.0 - mlp.score(X_train_scaled, y_train)):.5f}')
print(f'Test error: {(1.0 - mlp.score(X_test_scaled, y_test)):.5f}')

# Second part: Regression on House Pricing Dataset
We consider a reduced version of a dataset containing house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

https://www.kaggle.com/harlfoxem/housesalesprediction

For each house we know 18 house features (e.g., number of bedrooms, number of bathrooms, etc.) plus its price, which is what we would like to predict.

In [None]:
# -- put here your ID_Number  (numero di matricola)
numero_di_matricola = 1

In [None]:
#import all packages needed
# %matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# -- avoid convergence warnings from sklearn library
import warnings
warnings.filterwarnings("ignore")

Load the data, remove data samples/points with missing values (NaN) and take a look at them.

In [None]:
# -- load the dataset
df = pd.read_csv('kc_house_data.csv', sep = ',')
# -- remove the data samples with missing values (NaN)
df = df.dropna()

In [None]:
# -- print the column names and the first 5 rows of the dataframe
print(df.columns)
print('\n')
print(df.head())

Extract input and output data. We want to predict the price by using features other than id as input.

In [None]:
# -- raw values of the dataframe as a 2-D array (one row per sample)
Data = df.values
# -- m = number of input samples (rows)
m = len(Data)
print("Amount of data:", m)

# -- the target is column 2, the inputs are all the columns from 3 onwards
# -- NOTE(review): this assumes columns 0-1 are not features and column 2 is the price;
# -- confirm against the output of df.columns
X = Data[:, 3:]
Y = Data[:, 2]

# -- print shapes
print("X shape: ", X.shape)
print("Y shape: ", Y.shape)

## Data Pre-Processing

We split the data into 3 parts: one will be used for training and choosing the parameters, one for choosing among different models, and one for testing. The part for training and choosing the parameters will consist of $2/3$ of all samples, the one for choosing among different models will consist of $1/6$ of all samples, while the part for testing consists of the remaining $1/6$ of all samples.

In [None]:
# -- Split data into train (2/3 of samples), validation (1/6 of samples), and test data (the rest)
m_train = int(2/3*m)
m_val = int((m-m_train)/2)
m_test = m - m_train - m_val
print("Amount of data for training and deciding parameters:", m_train)
print("Amount of data for validation (choosing among different models):", m_val)
print("Amount of data for test:", m_test)

from sklearn.model_selection import train_test_split

# -- the sizes are passed as integers (absolute number of samples): a float fraction is
# -- rounded internally, so the actual split could differ by one sample from the amounts
# -- printed above
X_train_and_val, X_test, Y_train_and_val, Y_test = train_test_split(X, Y, test_size = m_test, random_state = numero_di_matricola)
X_train, X_val, Y_train, Y_val = train_test_split(X_train_and_val, Y_train_and_val,
                                                  test_size = m_val, random_state = numero_di_matricola)

Let's standardize the data.

In [None]:
# -- Data pre-processing
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_train_and_val_scaled = scaler.transform(X_train_and_val)

## Neural Networks
Let's start by learning a simple neural network with 1 hidden node.
Note: we are going to use the input parameter solver='lbfgs' and random_state=numero_di_matricola to fix the random seed (so results are reproducible).

We hereby define a function to train an MLPRegressor on the (already scaled) training data and (optionally) print its parameters at the end of the training.

In [None]:
# -- look at **kwargs in Python

In the function definition, a special syntax called `**kwargs` is used to pass a keyworded, variable-length argument list into the configuration settings of the `MLPRegressor` model. This syntax enables you to provide flexible and customizable parameters for the model. 

These keyword arguments allow you to specify various settings such as:
- **`hidden_layer_sizes`**, to define the number and size of hidden layers (e.g., `(1,)` for a single hidden layer with one neuron).
- **`solver`** (e.g., `'lbfgs'`).
- **`random_state`**

If you're unfamiliar with this syntax, refer to Python's documentation on `kwargs` for more details.


In [None]:
from sklearn.neural_network import MLPRegressor

def train_model(X_train, Y_train, X_val, Y_val, print_weights = True, **params):
    """Fit an MLPRegressor and print its training and validation errors.

    Parameters
    ----------
    X_train, Y_train : inputs and targets used to fit the model.
    X_val, Y_val : inputs and targets used only to compute the validation error.
    print_weights : when True, every weight and bias is also printed, layer by layer.
    **params : keyword arguments forwarded unchanged to ``MLPRegressor``.

    The function only prints: the fitted model is not returned.
    """
    regressor = MLPRegressor(**params)
    regressor.fit(X_train, Y_train)

    # -- error (1 - R^2) on training data
    training_error = 1.0 - regressor.score(X_train, Y_train)
    print(f'Training error: {training_error:.5f}')
    # -- error (1 - R^2) on validation data
    validation_error = 1.0 - regressor.score(X_val, Y_val)
    print(f'Validation error: {validation_error:.5f}')

    if not print_weights:
        return

    # -- one weight matrix and one bias vector per layer
    all_weights = regressor.coefs_
    all_biases = regressor.intercepts_

    print('\n--- Weights of NN ---')

    for layer_number, (layer_weights, layer_biases) in enumerate(zip(all_weights, all_biases), start = 1):
        print(f'\n# Layer {layer_number}')

        # -- entry (row, col) of the weight matrix, with 1-based indices
        print(f'--- Weights, with shape {layer_weights.shape} ---')
        for row, weight_row in enumerate(layer_weights, start = 1):
            for col, weight in enumerate(weight_row, start = 1):
                print(f'w_({row}, {col})^({layer_number}): {weight:.3f}')

        print(f'--- Biases, with shape {layer_biases.shape} ---')
        for position, bias in enumerate(layer_biases, start = 1):
            print(f'b_{position}: {bias:.3f}')

In [None]:
# -- let's define the model
# -- Look up how to set hidden_layer_sizes in the documentation
params = {'hidden_layer_sizes': (1, ),
          'solver' : 'lbfgs',
          'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

## Neural Networks vs Linear Models

Let's learn a linear model on the same data and compare the results with the simple NN above.

In [None]:
from sklearn import linear_model

LR = linear_model.LinearRegression()

LR.fit(X_train_scaled, Y_train)

# -- let's print the error (1 - R^2) on training data
print(f'Training error: {(1.0 - LR.score(X_train_scaled, Y_train)):.5f}')
# -- let's print the error (1 - R^2) on validation data
print(f'Validation error: {(1.0 - LR.score(X_val_scaled, Y_val)):.5f}')

print(f'\n--- Weights, with shape {LR.coef_.shape} ---\n{LR.coef_}')
print(f'\n--- Bias --- \n{LR.intercept_}')

Is there a way to make a NN learn a linear model?

Let's first check what is the activation function used by MLPRegressor...

In [None]:
# -- let's write the code to learn a linear model with NN: how?
params = {'hidden_layer_sizes': (1, ),
          'solver' : 'lbfgs',
          'random_state' : numero_di_matricola,
          'activation' : 'identity'
         }
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

In [None]:
# -- Example of handmade computations: with null input vector:
# -- linear model output = bias ~ 536.831,9203
# -- NN: w_(1, 1)^(2) * b_1 + b_2 ~ 536.829,396
# -- why the above tiny difference? Because of l2 default regularization

Note that there is an $\ell_2$ regularization term in MLPRegressor. What about making it smaller?

In [None]:
# -- you can try to change alpha (e.g., huge value to see the model is forcing null vector w)
params = {'hidden_layer_sizes': (1, ),
          'solver' : 'lbfgs',
          'random_state' : numero_di_matricola,
          'activation' : 'identity',
          'alpha' : 1e-20
         }
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

In [None]:
# -- with alpha = 1e-20: w_(1, 1)^(2) * b_1 + b_2 is 536.832,298621 (the two values are even closer,
# -- though not exactly the same because of rounding)

## More Complex NNs

Let's try more complex NN, for example increasing the number of nodes in the only hidden layer, or increasing the number of hidden layers.

Let's build a NN with 2 nodes in the only hidden layer

In [None]:
# -- let's build a NN with 2 nodes in the only hidden layer
params = {'hidden_layer_sizes': (2, ), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

Let's build a NN with 5 nodes in the only hidden layer

In [None]:
# -- let's build a NN with 5 nodes in the only hidden layer
params = {'hidden_layer_sizes': (5, ), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

Let's build a NN with 10 nodes in the only hidden layer

In [None]:
# -- let's build a NN with 10 nodes in the only hidden layer
params = {'hidden_layer_sizes': (10, ), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

Let's build a NN with 100 nodes in the only hidden layer. Note that this is the default!

In [None]:
# -- let's build a NN with 100 nodes in the only hidden layer
params = {'hidden_layer_sizes': (100, ), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, print_weights=False, **params)

Let's try 2 layers, 1 node each

In [None]:
# -- let's build a NN with 2 hidden layers, each with one node
params = {'hidden_layer_sizes': (1, 1), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

Let's try 2 layers, 2 nodes each

In [None]:
# -- let's build a NN with 2 hidden layers each with two nodes
params = {'hidden_layer_sizes': (2, 2), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

Try other architectures! 

In [None]:
# -- let's build a NN with 2 hidden layers each with 10 nodes
params = {'hidden_layer_sizes': (10, 10), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, **params)

In [None]:
# -- let's build a NN with 2 hidden layers each with 100 nodes
params = {'hidden_layer_sizes': (100, 100), 'solver' : 'lbfgs', 'random_state' : numero_di_matricola}
train_model(X_train_scaled, Y_train, X_val_scaled, Y_val, print_weights=False, **params)

How can we find the best architecture?

### $k$-Fold Cross Validation

Let's try 5-fold cross-validation with number of nodes in the hidden layer between 1 and 20. Note that we use train and validation data together, since we are doing cross-validation.

Note: you can also try to change the maximum number of iterations to see what happens (see documentation for the max_iter parameter)

In [None]:
from sklearn.model_selection import KFold
from itertools import product


def k_fold_cross_validation(X_train, Y_train, random_state, num_folds = 5, param_grid = None):
    """Select MLPRegressor hyperparameters by k-fold cross-validation.

    Every combination of the grid is evaluated on ``num_folds`` folds of the
    given data. Inside each fold the features are standardized with a scaler
    fitted on the training part of that fold only. The error of a model is
    computed as 1 - score.

    Parameters
    ----------
    X_train, Y_train : arrays with the samples and the targets to split into folds.
    random_state : seed given to every ``MLPRegressor`` of the default grid
        (not used when ``param_grid`` is passed explicitly).
    num_folds : number of folds (default 5).
    param_grid : optional dict mapping an ``MLPRegressor`` parameter name to the
        list of values to try. When None, the default grid is used: 1 to 20
        nodes in the hidden layer, 'relu' activation, 'lbfgs' solver and
        max_iter in [150, 175, 200].

    Returns
    -------
    dict : the combination of parameters with the smallest mean validation error.

    Raises
    ------
    ValueError : if the grid produces no combination of parameters.
    """
    # -- grid of hyperparams
    if param_grid is None:
        param_grid = {'hidden_layer_sizes': list(range(1, 21)),
                      'activation': ['relu'],
                      'solver': ['lbfgs'],
                      'random_state': [random_state],
                      'max_iter': [150, 175, 200]
                     }

    # -- expand the grid into the list of all the combinations of parameters
    # -- (the last key of the grid varies fastest)
    param_names = list(param_grid)
    param_list = [dict(zip(param_names, values)) for values in product(*param_grid.values())]

    if not param_list:
        raise ValueError('param_grid does not contain any combination of parameters')

    err_train_kfold = np.zeros(len(param_list))
    err_val_kfold = np.zeros(len(param_list))

    kf = KFold(n_splits = num_folds)

    # -- perform k-fold validation for model selection
    for i, params in enumerate(param_list):

        print(f'#{i+1}  {params}...')
        mlp_model = MLPRegressor(**params)

        for train_index, validation_index in kf.split(X_train):

            X_train_kfold, X_val_kfold = X_train[train_index], X_train[validation_index]
            Y_train_kfold, Y_val_kfold = Y_train[train_index], Y_train[validation_index]

            # -- data scaling: standardize features with respect to the current folds
            scaler_kfold = preprocessing.StandardScaler().fit(X_train_kfold)
            X_train_kfold_scaled = scaler_kfold.transform(X_train_kfold)
            X_val_kfold_scaled = scaler_kfold.transform(X_val_kfold)

            # -- learn the model using the training data from the k-fold
            mlp_model.fit(X_train_kfold_scaled, Y_train_kfold)

            # -- accumulate the errors of the current fold
            err_train_kfold[i] += (1 - mlp_model.score(X_train_kfold_scaled, Y_train_kfold))
            err_val_kfold[i] += (1 - mlp_model.score(X_val_kfold_scaled, Y_val_kfold))

        # -- turn the accumulated sums into means over the folds
        err_train_kfold[i] /= num_folds
        err_val_kfold[i] /= num_folds
        print("train cv error", err_train_kfold[i])
        print("val cv error", err_val_kfold[i])
        print()

    # -- choose the combination of parameters that minimizes the mean validation error
    print('\n---\n')
    best_param = param_list[np.argmin(err_val_kfold)]
    print('Best value of the parameters:', best_param)
    print('Min validation error:', np.min(err_val_kfold))

    return best_param

In [None]:
# -- obtain the best parameters by running k_fold_cross_validation on training and validation data together
# -- (the unscaled data are passed, since the function standardizes the features inside each fold)
best_param = k_fold_cross_validation(X_train_and_val, Y_train_and_val, random_state = numero_di_matricola)

Note that with a smaller number of iterations we had a larger error on training set but a smaller error on validation data -> "early stopping is a form of regularization"

In [None]:
# -- let's train the model with best_param on train and validation
final_model = MLPRegressor(**best_param)
final_model.fit(X_train_and_val_scaled, Y_train_and_val)
training_error = 1.0 - final_model.score(X_train_and_val_scaled, Y_train_and_val)
print("Training error of best model: ", training_error)

In [None]:
# -- let's compute the test error
test_error = 1.0 - final_model.score(X_test_scaled, Y_test)
print("Test error of best model: ", test_error)