# Validation and cross-validation

In this exercise you will implement a validation pipeline.

At the end of the MSLE exercise you tested your model against the training and test datasets. As you should have observed, there is a gap between the two results. Validating your model lets you anticipate its test-time performance and also gives you a method to compare different models.

Implement the basic validation method, i.e. a random split. Test it with your model from Exercise MSLE.

In [51]:
%matplotlib inline

#!wget -O mieszkania.csv https://www.dropbox.com/s/zey0gx91pna8irj/mieszkania.csv?dl=1
#!wget -O mieszkania_test.csv https://www.dropbox.com/s/dbrj6sbxb4ayqjz/mieszkania_test.csv?dl=1

In [52]:
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn import preprocessing

# Seed NumPy's global RNG so that the np.random calls below give the same
# results on every run.
np.random.seed(357)

In [53]:
def load(name: str, target: str = 'cena') -> Tuple[np.ndarray, np.ndarray]:
    """Read a CSV file and split it into a feature matrix and a target vector.

    Args:
        name: Path (or anything else ``pd.read_csv`` accepts) of the CSV file.
        target: Name of the column holding the value to predict.

    Returns:
        ``(x, y)`` where ``x`` holds every column except ``target`` and ``y``
        holds the ``target`` column, both as NumPy arrays.

    Raises:
        KeyError: If the file has no ``target`` column.
    """
    data = pd.read_csv(name)
    # Boolean column mask keeps the original column order of the features.
    x = data.loc[:, data.columns != target].to_numpy()
    y = data[target].to_numpy()

    return x, y

In [54]:
# Load the training and test splits as (features, prices) pairs.
x_train, y_train = load('mieszkania.csv')
x_test, y_test = load('mieszkania_test.csv')

# Display the raw test split (last expression of the cell).
x_test, y_test

(array([[71, 'wolowo', 2, 2, 1912, 1],
        [45, 'mokotowo', 1, 1, 1938, 0],
        [38, 'mokotowo', 1, 1, 1999, 1],
        ...,
        [89, 'wolowo', 2, 2, 1922, 1],
        [40, 'wolowo', 1, 1, 1959, 0],
        [68, 'grodziskowo', 2, 1, 1927, 0]], dtype=object),
 array([ 322227,  295878,  306530,  553641,  985348,  695726,   99751,
         891261,  536499,  527093,  861472,  701472,  429776,  547725,
         669560,  318362, 1140170,  341242,  113580,  456093,  470730,
         421012,  617318,  796117,  138901,  857820,  939450,  398165,
         944399, 1025413,  522440,  344346,  145702,  246712,  574154,
         807608,  568048,  412494,  588840,  766040,  979540, 1044803,
         742235,  758936,  388672,  178238,  530053, 1150687,  587013,
         269316,  270969, 1008103,  299708,  393925,  511106,  947932,
         127717,  752428, 1185932,  330988,  330699,  403778,  584561,
         795392,  602356,  680512,  202121,  888872,  456054,  227841,
         343730,  

In [55]:
# Replace the string-valued column 1 with integer labels. The encoder is
# fitted on the training data only and then reused for the test data.
labelencoder = preprocessing.LabelEncoder()
x_train[:, 1] = labelencoder.fit_transform(x_train[:, 1])
x_test[:, 1] = labelencoder.transform(x_test[:, 1])

# Every column is numeric now, so switch from object dtype to float64.
x_train, x_test = x_train.astype(np.float64), x_test.astype(np.float64)

# Targets become column vectors so they line up with (n, 1) predictions.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
x_train

array([[1.040e+02, 1.000e+00, 2.000e+00, 2.000e+00, 1.940e+03, 1.000e+00],
       [4.300e+01, 2.000e+00, 1.000e+00, 1.000e+00, 1.970e+03, 1.000e+00],
       [1.280e+02, 0.000e+00, 3.000e+00, 2.000e+00, 1.916e+03, 1.000e+00],
       ...,
       [1.070e+02, 0.000e+00, 2.000e+00, 2.000e+00, 1.935e+03, 0.000e+00],
       [1.170e+02, 0.000e+00, 3.000e+00, 2.000e+00, 1.978e+03, 1.000e+00],
       [5.600e+01, 3.000e+00, 2.000e+00, 1.000e+00, 1.923e+03, 0.000e+00]])

In [56]:
import plotly.express as px

def plot_loss(l):
    """Show a line chart of the loss values in ``l`` (one point per entry)."""
    px.line(y=l, labels={'y':'loss'}).show()

def mse(xs, ps):
    """Return the mean squared error between targets ``xs`` and predictions ``ps``.

    Both arguments must have the same length; the squared differences are
    summed over all elements and divided by ``len(xs)``.
    """
    count = len(xs)
    assert count == len(ps)

    residuals = xs - ps
    total = np.sum(residuals * residuals)

    return (1/count) * total

def msle(xs, ps):
    """Return the mean squared logarithmic error between ``xs`` and ``ps``.

    Computes the mean of ``(log(1 + xs) - log(1 + ps)) ** 2``. ``np.log1p`` is
    used instead of ``np.log(1 + x)`` so that values too small to survive the
    ``1 + x`` addition are not rounded away, and to match the ``np.log1p``
    transform used elsewhere in this notebook.

    Both arguments must have the same length (checked with ``assert``).
    """
    assert len(xs) == len(ps)

    n = len(xs)
    log_diff = np.log1p(xs) - np.log1p(ps)
    total = np.sum(log_diff ** 2)

    return (1/n) * total

In [57]:
# Gradient-descent settings for the baseline run.
lr = 0.01  # step size
n_epochs = 80

# Number of training rows and number of feature columns.
n, features = x_train.shape


In [58]:
def predict(w, xs):
    """Return the linear model's predictions ``xs @ w`` for weights ``w``."""
    predictions = xs @ w
    return predictions

def evaluate(w, xs, ys):
    """Return the MSE between targets ``ys`` and the predictions for ``xs``."""
    predictions = predict(w, xs)
    return mse(ys, predictions)

def evaluate_msle(w, xs, ys):
    """Return the MSLE between targets ``ys`` and the predictions for ``xs``."""
    predictions = predict(w, xs)
    return msle(ys, predictions)

def linear_regression(n_epochs, lr, x, y, eval=evaluate, *, verbose=True):
    """Fit a linear model (no intercept) with full-batch gradient descent.

    Args:
        n_epochs: Number of gradient steps to take.
        lr: Step size.
        x: Feature matrix of shape ``(n, features)``.
        y: Targets as a column vector of shape ``(n, 1)``. A flat ``(n,)``
            array would broadcast against the ``(n, 1)`` predictions into an
            ``(n, n)`` matrix, so reshape it before calling.
        eval: Callable ``eval(w, x, y)`` returning the loss to record.
        verbose: When true (the default), print the loss and weights after
            the first step and after every fifth step. Pass ``False`` to
            train silently instead of redirecting stdout.

    Returns:
        ``(w, losses)``: the final ``(features, 1)`` weight vector and the
        list of losses, starting with the loss of the all-zero weights
        followed by one entry per epoch.
    """
    n, features = x.shape
    w = np.zeros((features, 1))
    losses = [eval(w, x, y)]

    for i in range(n_epochs):
        y_hat = predict(w, x)
        # Per-feature sum over samples of residual * feature value; the
        # 2/n factor of the MSE gradient is applied in the update below.
        grad = np.sum((y_hat - y) * x, axis=0)

        w = w - (2/n) * (lr * grad).reshape(-1, 1)

        loss = eval(w, x, y)
        losses.append(loss)

        if verbose and (i == 0 or (i+1) % 5 == 0):
            weights = [f"{row[0]:.2f}" for row in w]
            print(f'Iter: {i:>3} Loss: {loss:8.8f} w: {weights}')

    return (w, losses)

def linear_regression_log(n_epochs, lr, x, y):
    """Fit ``linear_regression`` on ``log(1 + x)`` features and ``log(1 + y)`` targets.

    Minimising MSE in this log space is what makes the recorded loss an MSLE
    with respect to the original targets. ``np.log1p`` is used so training
    applies exactly the same transform as the evaluation code in this
    notebook, which also calls ``np.log1p``.

    Returns:
        The ``(w, losses)`` tuple produced by ``linear_regression``.
    """
    return linear_regression(n_epochs, lr, np.log1p(x), np.log1p(y), eval=evaluate)

# Train the baseline model in log space on the full training set and plot
# how the loss evolves over the epochs.
w_msle, losses = linear_regression_log(n_epochs, lr, x_train, y_train)

plot_loss(losses)

Iter:   0 Loss: 54.70327051 w: ['1.11', '0.21', '0.28', '0.25', '1.97', '0.09']
Iter:   4 Loss: 0.64753697 w: ['0.75', '0.15', '0.19', '0.17', '1.33', '0.06']
Iter:   9 Loss: 0.04654517 w: ['0.71', '0.14', '0.18', '0.16', '1.25', '0.06']
Iter:  14 Loss: 0.04404905 w: ['0.72', '0.14', '0.18', '0.16', '1.25', '0.06']
Iter:  19 Loss: 0.04371263 w: ['0.72', '0.14', '0.18', '0.16', '1.25', '0.06']
Iter:  24 Loss: 0.04340348 w: ['0.72', '0.15', '0.18', '0.16', '1.24', '0.06']
Iter:  29 Loss: 0.04311280 w: ['0.72', '0.15', '0.18', '0.16', '1.24', '0.06']
Iter:  34 Loss: 0.04283945 w: ['0.73', '0.15', '0.19', '0.16', '1.24', '0.06']
Iter:  39 Loss: 0.04258237 w: ['0.73', '0.15', '0.19', '0.16', '1.24', '0.06']
Iter:  44 Loss: 0.04234058 w: ['0.73', '0.15', '0.19', '0.16', '1.24', '0.06']
Iter:  49 Loss: 0.04211316 w: ['0.73', '0.15', '0.19', '0.16', '1.24', '0.06']
Iter:  54 Loss: 0.04189923 w: ['0.73', '0.16', '0.19', '0.16', '1.23', '0.06']
Iter:  59 Loss: 0.04169798 w: ['0.74', '0.16', '0.1

In [59]:
#######################################################
# TODO: Implement the basic validation method,        #
# compare MSLE on training, validation, and test sets #
#######################################################
# Candidate hyperparameters: 10 epoch counts and 60 learning rates, each
# kept as a column vector.
v_epochs = np.linspace(80, 400, num=10).astype(int).reshape(-1, 1)
v_lr = np.linspace(0.012, 0.005, num=60).reshape(-1, 1)

print(v_epochs.shape, v_lr.shape)

# With 'ij' indexing, entry [i, j] of both grids pairs epoch count i with
# learning rate j.
xepochs, ylr = np.meshgrid(v_epochs, v_lr, indexing='ij')

def linreg_w(n_epochs, lr, x, y):
    """Train in log space and return only the fitted weights (losses are dropped)."""
    weights, _losses = linear_regression_log(n_epochs, lr, x, y)
    return weights

def shuffle_and_divide(length, ratio):
    """Randomly split the indices ``0..length-1`` into two disjoint groups.

    The first group holds ``int(length * ratio)`` indices and the second group
    holds the rest. The shuffle draws from NumPy's global random state.
    """
    order = np.random.permutation(length)
    cut = int(length * ratio)
    return order[:cut], order[cut:]

# Random split: 80% of the training rows are used for fitting, the remaining
# 20% are held out for validation.
idx_train, idx_val = shuffle_and_divide(n, 0.8)

x_train_, x_val = x_train[idx_train], x_train[idx_val]
y_train_, y_val = y_train[idx_train], y_train[idx_val]

# Wrap training as a 2-input / 1-output ufunc so it can be applied
# element-wise over the (epochs, learning rate) grids.
run_linreg = np.frompyfunc(lambda ne, lr: linreg_w(ne, lr, x_train_, y_train_), 2, 1)

# linear_regression prints progress on every run; capture stdout so the
# grid search does not flood the cell output.
import contextlib, io
with contextlib.redirect_stdout(io.StringIO()):
    w_matrix = run_linreg(xepochs, ylr)

# Score each fitted weight vector on the held-out rows: MSE in log1p space,
# i.e. MSLE with respect to the original prices.
check_loss = np.frompyfunc(lambda w : evaluate(w, np.log1p(x_val), np.log1p(y_val)), 1, 1)

loss_matrix = check_loss(w_matrix)

(10, 1) (60, 1)


In [60]:
# np.argmin returns a flat index; convert it back to (epoch index, lr index)
# on the hyperparameter grid.
idx = np.unravel_index(np.argmin(loss_matrix), xepochs.shape)
found_w = w_matrix[idx]

# Report the best validation loss, the same metric on the test set, and the
# (epochs, learning rate) pair that achieved it.
print(f"found min validation loss: {np.min(loss_matrix)}")
print(f"loss on test data: {evaluate(found_w, np.log1p(x_test), np.log1p(y_test))}")
print(xepochs[idx], ylr[idx])

found min validation loss: 0.04478994825727809
loss on test data: 0.05922318144581269
400 0.012


To make random-split validation reliable, a huge chunk of training data may be needed. To get around this problem, one may apply cross-validation.

![alt-text](https://chrisjmccormick.files.wordpress.com/2013/07/10_fold_cv.png)

Let's now implement the method. Make sure that:
* number of partitions is a parameter,
* the method is not limited to `mieszkania.csv`,
* the method is not limited to one specific model.

In [91]:
####################################
# TODO: Implement cross-validation #
####################################
# xepochs, ylr = np.meshgrid(v_epochs, v_lr, indexing='ij')
# def linreg_w(n_epochs, lr, x, y):
# def shuffle_and_divide(length, ratio):

def ccv_eval(w, x, y):
    """Score weights ``w`` on raw ``x``/``y`` by evaluating in log1p space."""
    log_x = np.log1p(x)
    log_y = np.log1p(y)
    return evaluate(w, log_x, log_y)

def swap_diagonal_and_last_column(arr):
    """Swap, in place, entry ``[i, i]`` with entry ``[i, size - 1]`` for every row ``i``.

    ``size`` is ``arr.shape[0]``. Any trailing axes are carried along with
    each swapped entry. The same (mutated) array is returned.
    """
    size = arr.shape[0]
    rows = np.arange(size)
    last_col = np.full(size, size - 1)

    # Fancy indexing on the right-hand side yields a copy, so this snapshot
    # of the diagonal survives the overwrite on the next line.
    diagonal = arr[rows, rows]
    arr[rows, rows] = arr[rows, last_col]
    arr[rows, last_col] = diagonal

    return arr

def prepare_ccv_tensor(x, partition_num):
    """Arrange the rows of ``x`` into one fold layout per cross-validation round.

    ``x`` is cut into ``partition_num`` equally sized, contiguous folds (the
    row count must be divisible by ``partition_num``). The result has shape
    ``(partition_num, partition_num, fold_size, features)``: round ``i`` holds
    all folds, with fold ``i`` swapped into the last position.
    """
    n, feature_count = x.shape

    assert(n % partition_num == 0)
    fold_size = n // partition_num

    # Consecutive row blocks become the folds: (partition_num, fold_size, features).
    folds = x.reshape(partition_num, fold_size, feature_count)

    # One full copy of the fold stack per round.
    rounds = np.tile(folds[np.newaxis, :], (partition_num, 1, 1, 1))

    return swap_diagonal_and_last_column(rounds)


def cross_check_validation(partition_num, regression_fun, eval_fun, x, y):
    """Build the per-round training folds of ``x`` for k-fold cross-validation.

    NOTE(review): this is unfinished. ``regression_fun``, ``eval_fun`` and
    ``y`` are accepted but never used, ``val_sets`` is computed but not
    returned, and no model is trained or scored. Only the training folds are
    returned.
    """
    n, _ = x.shape

    # last column of partitions is validation
    partitions = prepare_ccv_tensor(x, partition_num)
    val_sets = partitions[:, partition_num - 1]
    # The remaining partition_num - 1 folds of each round are for training.
    train_sets = partitions[:, np.arange(partition_num - 1)]

    return train_sets

# Smoke test: show the shape of the training-fold tensor for 4 partitions.
cross_check_validation(4, linreg_w, ccv_eval, x_train, y_train).shape
#cross_check_validation(2, linreg_w, ccv_eval, np.array([[1,2,3,4], [5,6,7,8]]), y_train).shape

# idx_train, idx_val = shuffle_and_divide(n, 0.8)

# x_train_, x_val = x_train[idx_train], x_train[idx_val]
# y_train_, y_val = y_train[idx_train], y_train[idx_val]

# run_linreg = np.frompyfunc(lambda ne, lr: linreg_w(ne, lr, x_train_, y_train_), 2, 1)

# import contextlib, io
# with contextlib.redirect_stdout(io.StringIO()):
#     w_matrix = run_linreg(xepochs, ylr)

# check_loss = np.frompyfunc(lambda w : evaluate(w, np.log1p(x_val), np.log1p(y_val)), 1, 1)

# loss_matrix = check_loss(w_matrix)

(4, 3, 50, 6)

Recall that sometimes validation may be tricky, e.g. significant class imbalance, having a small number of subjects, geographically clustered instances...

What could in theory go wrong here with random, unstratified partitions? Think about potential solutions and investigate the data in order to check whether these problems arise here.

In [62]:
##############################
# TODO: Investigate the data #
##############################
