# Ridge Regression / Polynomial Expansion / k-fold Cross Validation / Validation Curve

In this programming exercise, you will implement polynomial ridge regression and k-fold cross-validation using only NumPy for the numerical work (Matplotlib is used solely for plotting). You will also plot a validation curve. DO NOT use libraries like scikit-learn or SciPy.

Use the template provided in this notebook to implement ridge regression, polynomial expansion, k-fold cross validation, and plot a validation curve.

When done, paste the code into the quiz on Moodle and answer the questions.

In [None]:
# Importing libraries

import numpy as np
from numpy.typing import ArrayLike
import matplotlib.pyplot as plt
from typing import Optional

In [None]:
def load_data(name: str) -> tuple[ArrayLike, ArrayLike]:
    """Read a two-column .npy file and split it into inputs and targets.
    Args:
        name (str): Path of the .npy file to read.
    Returns:
        tuple[ArrayLike, ArrayLike]: The first and second column of the stored
            array, i.e. the x values and the y values.
    """
    # Transposing turns the stored columns into rows, so unpacking the result
    # yields one array per column.
    inputs, targets = np.load(name).T
    return inputs, targets

In [None]:
def shuffle_data(x: ArrayLike, y: ArrayLike) -> tuple[ArrayLike, ArrayLike]:
    """Reorder x and y with one shared random permutation.

    The generator is seeded with 42 on every call, so the resulting order is
    the same each time for inputs of the same length.
    Args:
        x (ArrayLike): The input values of the data.
        y (ArrayLike): The target values of the data.
    Returns:
        tuple[ArrayLike, ArrayLike]: The shuffled x and y values.
    """
    generator = np.random.default_rng(seed=42)
    # One index array is applied to both inputs so that pairs stay aligned.
    order = generator.permutation(x.shape[0])
    shuffled_x = x[order]
    shuffled_y = y[order]
    return shuffled_x, shuffled_y

In [None]:
def plot_regression(x: ArrayLike, y: ArrayLike, w: Optional[ArrayLike] = None) -> None:
    """Plot the samples and, optionally, the fitted polynomial model.
    Only for plotting 2D data.
    Args:
        x (ArrayLike): The input data of shape (n,).
        y (ArrayLike): The output data of shape (n,).
        w (ArrayLike, optional): The model coefficients, ordered from the
            highest power of x down to the constant term (the column order of
            np.vander); for a plain linear regression this is [weight, bias].
            Defaults to None, in which case only the samples are drawn.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Plot the data
    plt.plot(x, y, ".", markersize=8, color="#D81B60", label="Samples")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.grid()

    # Plot the prediction
    if w is not None:
        w = np.asarray(w)
        # Number of coefficients, i.e. polynomial degree + 1.
        n_coef = w.shape[0]
        x_plot = np.linspace(x.min(), x.max(), 100)
        X_plot = np.vander(x_plot, n_coef)

        # Fix the y-range from the samples only, with a 20% margin on each
        # side. The margin is added away from the data using the magnitude of
        # each bound: scaling the bounds by 1.2 directly would move the lower
        # bound upwards for positive targets (and the upper bound downwards
        # for negative ones) and cut off samples.
        y_low = np.min(y)
        y_high = np.max(y)
        plt.ylim((y_low - 0.2 * abs(y_low), y_high + 0.2 * abs(y_high)))

        plt.plot(x_plot, X_plot @ w, linewidth=2.5, color="#0BA462", label="Model")

    # A single legend call after all artists exist covers both cases.
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
def plot_validation_curve(
    lambdas: ArrayLike,
    train_losses: ArrayLike,
    val_losses: Optional[ArrayLike] = None,
) -> None:
    """Draw the losses against the regularization strength on a log x-axis.
    Args:
        lambdas (ArrayLike): The regularization values.
        train_losses (ArrayLike): The training losses, one per lambda.
        val_losses (ArrayLike, optional): The validation losses, one per
            lambda. Defaults to None, in which case only the training curve
            is drawn.
    """
    # Each entry is (losses, line color, legend label); training comes first.
    curves = [(train_losses, "#D81B60", "Train loss")]
    if val_losses is not None:
        curves.append((val_losses, "#1E88E5", "Valid loss"))

    for losses, line_color, legend_label in curves:
        plt.plot(
            lambdas, losses, color=line_color, linewidth=2.5, label=legend_label
        )

    plt.legend()
    plt.xscale("log")
    plt.xlabel("Lambda")
    plt.ylabel("Loss")
    plt.grid()
    plt.show()

In [None]:
def ridge_regression(x: ArrayLike, y: ArrayLike, lam: float = 0) -> ArrayLike:
    """Calculates the Ridge Regression (linear least-squares regression with l2-regularization) coefficients.

    Minimizes ``||x @ w - y||^2 + lam * ||w||^2`` over w. Every coefficient is
    penalized, including the one that multiplies a constant column.

    NOTE(review): the Task 1 problem statement is not part of this file. If it
    divides the data term by the number of samples n, pass ``n * lam`` here to
    obtain its minimizer - confirm against the task sheet.

    Args:
        x (ArrayLike): The input data of shape (n, d).
        y (ArrayLike): The output data of shape (n,).
        lam (float, optional): The non-negative regularization strength.
            Defaults to 0, which gives ordinary least squares.
    Returns:
        ArrayLike: The ridge regression coefficients of shape (d,).
    Raises:
        ValueError: If x is not two-dimensional or lam is negative.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.ndim != 2:
        raise ValueError(f"x must have shape (n, d), got {x.shape}")
    if lam < 0:
        raise ValueError(f"lam must be non-negative, got {lam}")

    n_features = x.shape[1]

    # Ridge regression is ordinary least squares on an augmented system:
    # stacking sqrt(lam) * I below x and zeros below y adds exactly
    # lam * ||w||^2 to the squared residual. Solving it with lstsq avoids
    # forming x.T @ x and still returns a (minimum-norm) solution when lam is
    # 0 and the features are rank deficient.
    x_aug = np.vstack([x, np.sqrt(lam) * np.eye(n_features)])
    y_aug = np.concatenate([y, np.zeros((n_features,) + y.shape[1:])])
    w, *_ = np.linalg.lstsq(x_aug, y_aug, rcond=None)
    return w

In [None]:
def get_polynomial_features(x: ArrayLike, deg: int) -> ArrayLike:
    """Generates polynomial features of the input data to the specified degree.

    The result contains every power of x from degree 0 up to degree deg. The
    columns are arranged as ``[x**deg, ..., x**1, x**0]`` (the default order
    of np.vander), which is the order plot_regression assumes when it
    evaluates the fitted coefficients.

    Args:
        x (ArrayLike): The input data of shape (n,).
        deg (int): The degree of the polynomial features. Must be >= 0.
    Returns:
        ArrayLike: The polynomial features of the input data of shape (n, deg + 1).
    Raises:
        ValueError: If deg is negative or x is not one-dimensional.
    """
    if deg < 0:
        raise ValueError(f"deg must be non-negative, got {deg}")

    # Converting to float first keeps high powers of integer inputs from
    # overflowing the integer dtype.
    x = np.asarray(x, dtype=float)
    return np.vander(x, deg + 1)

In [None]:
def empirical_loss(x: ArrayLike, y: ArrayLike, w: ArrayLike) -> float:
    """Return the mean squared residual of the linear model ``x @ w``.
    Args:
        x (ArrayLike): The input data of shape (n, d).
        y (ArrayLike): The output data of shape (n,).
        w (ArrayLike): The weights of the model.
    Returns:
        float: The squared norm of ``x @ w - y`` divided by the number of
            rows of x.
    """
    residuals = x @ w - y
    squared_error = np.linalg.norm(residuals) ** 2
    n_samples = x.shape[0]
    return squared_error / n_samples

In [None]:
def kfold(x: ArrayLike, y: ArrayLike, lam: float, k: int = 10) -> tuple[float, float]:
    """Performs k-fold cross-validation to evaluate the model's performance.

    The samples are split, in their given order, into k contiguous folds
    whose sizes differ by at most one. Each fold serves once as the
    validation set while a ridge regression is fitted on the remaining
    folds. No shuffling happens here, so shuffle the data beforehand if its
    order is not random.

    Args:
        x (ArrayLike): The input data of shape (n, d).
        y (ArrayLike): The output data of shape (n,).
        lam (float): The regularization parameter.
        k (int, optional): The number of folds. Defaults to 10.
    Returns:
        tuple[float, float]: Train and validation loss per data point, each
            averaged over the k folds. Every per-fold loss is already a mean
            over that fold's samples (see empirical_loss), so the results do
            not scale with the number of samples.
    Raises:
        ValueError: If k is smaller than 2 or larger than the number of samples.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    n = x.shape[0]
    if k < 2 or k > n:
        raise ValueError(f"k must satisfy 2 <= k <= n (n={n}), got k={k}")

    # array_split tolerates n not being divisible by k: the first n % k folds
    # receive one extra sample.
    folds = np.array_split(np.arange(n), k)

    train_losses = np.empty(k)
    val_losses = np.empty(k)
    for i, val_idx in enumerate(folds):
        # Everything outside the current fold is training data.
        train_idx = np.concatenate(folds[:i] + folds[i + 1 :])
        x_fit, y_fit = x[train_idx], y[train_idx]

        w = ridge_regression(x_fit, y_fit, lam)
        train_losses[i] = empirical_loss(x_fit, y_fit, w)
        val_losses[i] = empirical_loss(x[val_idx], y[val_idx], w)

    return float(train_losses.mean()), float(val_losses.mean())

In [None]:
def get_validation_curve_data(
    x: ArrayLike, y: ArrayLike, lambdas: ArrayLike
) -> tuple[int, ArrayLike, ArrayLike]:
    """Computes the best lambda and returns its index and train and validation losses for lambdas.

    Each lambda is evaluated with kfold using its default number of folds.

    Args:
        x (ArrayLike): The input data of shape (n, d).
        y (ArrayLike): The output data of shape (n,).
        lambdas (ArrayLike): The range of lambda values to evaluate.
    Returns:
        int: The best lambda index based on the validation loss (the first
            one if several lambdas share the minimum).
        ArrayLike: The training losses for each lambda value.
        ArrayLike: The validation losses for each lambda value.
    Raises:
        ValueError: If lambdas is empty.
    """
    lambdas = np.asarray(lambdas, dtype=float).ravel()
    if lambdas.size == 0:
        raise ValueError("lambdas must contain at least one value")

    # One (train_loss, val_loss) row per lambda.
    losses = np.array([kfold(x, y, float(lam)) for lam in lambdas])
    train_losses = losses[:, 0]
    val_losses = losses[:, 1]

    best_idx = int(np.argmin(val_losses))
    return best_idx, train_losses, val_losses

In [None]:
# Data loading and preprocessing
x_train, y_train = load_data("dataset_poly_train.npy")
# Shuffle once up front so that the cross-validation folds are random subsets.
x_train, y_train = shuffle_data(x_train, y_train)

# Polynomial degree of the model; passed on instead of repeating the literal.
deg = 6
X_train = get_polynomial_features(x_train, deg=deg)

# Validation curve: 100 lambdas spaced logarithmically from 1e-9 to 1e3.
lambdas = np.logspace(-9, 3, num=100, base=10)
best_lam_idx, train_losses, val_losses = get_validation_curve_data(
    X_train, y_train, lambdas
)

plot_validation_curve(
    lambdas,
    train_losses,
    val_losses,
)
print(f"Best validation error {val_losses[best_lam_idx]:.4f} ")
print(f"Corresponding train error {train_losses[best_lam_idx]:.4f}")
# Scientific notation: the grid reaches down to 1e-9, which a fixed four-decimal
# format would print as 0.0000.
print(f"Corresponding lambda {lambdas[best_lam_idx]:.4e}")

In [None]:
# Train on all training data with the best lambda
w = ridge_regression(X_train, y_train, lam=lambdas[best_lam_idx])
plot_regression(x_train, y_train, w)

# Compute train and test error
x_test, y_test = load_data("dataset_poly_test.npy")
# Reuse the degree chosen for the training features (deg is set in the data
# loading cell) so that train and test expansions cannot drift apart.
X_test = get_polynomial_features(x_test, deg=deg)

train_loss = empirical_loss(X_train, y_train, w)
test_loss = empirical_loss(X_test, y_test, w)
print(f"Train loss: {train_loss:.4f}")
print(f"Test loss: {test_loss:.4f}")