In [1]:
import numpy as np
from nptyping import NDArray, Shape, Int, Float, Bool
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from rich import print

In [11]:
import numpy as np
import math
import plotly.express as px
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# Seed NumPy's global random generator so that the np.random.randn noise drawn
# in generate_poly_data() is the same on every run.
np.random.seed(0)

def target_function(x, coef):
    """Evaluate the quadratic ``coef[0] * x + coef[1] * x**2`` (no constant term).

    Parameters
    ----------
    x : numpy.ndarray or float
        Point(s) at which the polynomial is evaluated.
    coef : list or numpy.ndarray
        Polynomial coefficients; ``coef[0]`` multiplies ``x`` and ``coef[1]``
        multiplies ``x**2``. Only these first two entries are read.

    Returns
    -------
    numpy.ndarray or float
        The polynomial value(s); for array input the result has the same
        shape as ``x``.

    Examples
    --------
    >>> x = np.array([0, 1, 2, 3])
    >>> coef = [2, 3]
    >>> target_function(x, coef)
    array([ 0,  5, 16, 33])
    """
    linear_term = coef[0] * x
    quadratic_term = coef[1] * np.square(x)
    return linear_term + quadratic_term


def generate_poly_data():
    """Build the noisy 1D quadratic regression data set (train and test splits).

    The noise-free targets come from ``target_function`` with coefficients
    ``[-1.5, 1/9]``. Gaussian noise drawn with ``np.random.randn`` (the global
    NumPy random state) is added to the training targets first and to the
    test targets second.

    Returns
    -------
    tuple
        train_data : numpy.ndarray
            21 evenly spaced inputs on [0, 20], shape (21,).
        train_targets : numpy.ndarray
            Noisy training targets, shape (21, 1).
        test_data : numpy.ndarray
            Inputs ``np.arange(0, 20.1, 0.1)``, shape (201,).
        test_targets_noise_free : numpy.ndarray
            Noise-free test targets, shape (201, 1).
        test_targets_noisy : numpy.ndarray
            Noisy test targets, shape (201, 1).
        noise_stddev : int
            The noise level constant, 4. Note that the noise is multiplied by
            ``math.sqrt(noise_stddev)``, so this value acts as the noise
            variance and the effective standard deviation is 2.

    Examples
    --------
    >>> out = generate_poly_data()
    >>> [a.shape for a in out[:5]]
    [(21,), (21, 1), (201,), (201, 1), (201, 1)]
    >>> out[5]
    4
    """
    noise_stddev = 4
    n_train = 21
    true_coef = np.array([-1.5, 1 / 9])
    # The noise multiplier is the square root of the constant above, i.e. the
    # constant is used like a variance despite its name.
    noise_scale = math.sqrt(noise_stddev)

    train_data = np.linspace(0, 20, n_train)
    test_data = np.arange(0, 20.1, 0.1)

    # Training noise is drawn before test noise; keeping this order keeps the
    # random stream (and therefore the data) unchanged for a given seed.
    clean_train = target_function(train_data, true_coef).reshape(-1, 1)
    train_noise = np.random.randn(train_data.shape[0], 1)
    train_targets = clean_train + noise_scale * train_noise

    test_targets_noise_free = target_function(test_data, true_coef).reshape(-1, 1)
    test_noise = np.random.randn(test_data.shape[0], 1)
    test_targets_noisy = test_targets_noise_free + noise_scale * test_noise

    return (
        train_data,
        train_targets,
        test_data,
        test_targets_noise_free,
        test_targets_noisy,
        noise_stddev,
    )


def rescale_data(X):
    """Linearly map a 1D array onto the interval [-1, 1].

    A new ``MinMaxScaler`` is fitted on every call, so the minimum of ``X``
    maps to -1 and its maximum maps to 1 for that particular input.

    Parameters
    ----------
    X : numpy.ndarray
        Values to rescale, shape (n_samples,).

    Returns
    -------
    numpy.ndarray
        The rescaled values as a single column, shape (n_samples, 1).

    Examples
    --------
    >>> rescale_data(np.array([2, 4, 6, 8, 10]))
    array([[-1. ],
           [-0.5],
           [ 0. ],
           [ 0.5],
           [ 1. ]])
    """
    # scikit-learn transformers expect 2D input: one column, one row per sample.
    column = X.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(column)
    return scaler.transform(column)


def create_poly_features(X, degree):
    """Expand each feature into its powers 1, 2, ..., ``degree``.

    No constant (power 0) column and no cross terms are produced. Columns are
    grouped by power: first every feature to the power 1, then every feature
    squared, and so on.

    Parameters
    ----------
    X : numpy.ndarray
        Input data, an array of shape (n_samples, n_features).
    degree : int
        The highest power to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, n_features * degree) holding the powers of
        the input features.

    Examples
    --------
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> create_poly_features(X, degree=3)
    array([[  1,   2,   1,   4,   1,   8],
           [  3,   4,   9,  16,  27,  64],
           [  5,   6,  25,  36, 125, 216]])
    """
    n_features = X.shape[1]
    # One exponent per output column: [1]*n_features, [2]*n_features, ...
    # The range is driven by `degree` so the exponent row always has the same
    # width as the tiled input below.
    exponents = np.repeat(np.arange(1, degree + 1), n_features)
    # Repeat the feature block `degree` times side by side; the 1D exponent
    # row then broadcasts across all samples.
    repeated = np.tile(X, degree)
    return np.power(repeated, exponents)


# Draw the noisy training set and the dense test grid.
(
    train_data,
    train_targets,
    test_data,
    test_targets_noise_free,
    test_targets,
    noise_stddev,
) = generate_poly_data()

# Highest power used in the polynomial feature expansion.
degree = 14

# Rescale the inputs to [-1, 1] first (each set with its own scaler), then
# expand them into polynomial features.
poly_train = create_poly_features(rescale_data(train_data), degree)
poly_test = create_poly_features(rescale_data(test_data), degree)

# Centre each target set on its own mean.
train_targets = train_targets - np.mean(train_targets)
test_targets = test_targets - np.mean(test_targets)

# Regularisation strengths to sweep: 10 values, log-spaced from 1e-10 to 10**1.3.
lambdas = np.logspace(-10, 1.3, 10)
# Mean squared error on the train and test sets, one entry per lambda.
train_mse, test_mse = [], []

# Fit one ridge model per lambda and record its train and test MSE.
# The models are fitted without an intercept; the targets were mean-centred above.
for lam in lambdas:
    reg = Ridge(alpha=lam, fit_intercept=False).fit(poly_train, train_targets)
    train_prediction = reg.predict(poly_train)
    test_prediction = reg.predict(poly_test)
    # scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
    # targets go first. The value is the same either way for plain MSE.
    train_mse.append(mean_squared_error(train_targets, train_prediction))
    test_mse.append(mean_squared_error(test_targets, test_prediction))

# Collect the sweep results in one table for plotting.
mse_df = pd.DataFrame({'lambdas': lambdas, 'train_mse': train_mse, 'test_mse': test_mse})
# Train and test MSE against lambda, with lambda on a logarithmic axis.
fig1 = px.line(
    mse_df,
    x='lambdas',
    y=['train_mse', 'test_mse'],
    log_x=True,
    title='MSE vs log(lambda)',
    markers=True,
    line_shape='linear',
)
fig1.show()

# Cross-validated MSE as a function of lambda
cv_means = []
cv_stand_errors = []
num_samples = len(train_data)
num_folds = 5

# For every lambda, score a ridge model with num_folds-fold cross-validation on
# the training set and keep the mean and the standard error of the fold MSEs.
for lam in lambdas:
    fold_scores = cross_val_score(
        Ridge(alpha=lam, fit_intercept=False),
        poly_train,
        train_targets,
        cv=num_folds,
        scoring='neg_mean_squared_error',
    )
    # The scorer returns negated MSE values; flip the sign to get plain MSE.
    cross_validations = -np.array(fold_scores)
    cv_means.append(np.mean(cross_validations))
    cv_stand_errors.append(np.std(cross_validations) / np.sqrt(num_folds))

# Table of lambda, log of the mean CV error, and the error-bar column.
# NOTE(review): the error bars are log(standard error) / 2, which is negative
# whenever the standard error is below 1 - confirm this is the intended
# quantity for error bars on a log scale.
cv_df = pd.DataFrame(
    {
        'lambdas': lambdas,
        'cv_means': np.log(cv_means),
        'cv_stand_errors': np.log(np.array(cv_stand_errors)) / 2,
    }
)
# Log mean CV error with error bars against lambda (logarithmic x axis).
fig2 = px.line(
    cv_df,
    x='lambdas',
    y='cv_means',
    log_x=True,
    error_y='cv_stand_errors',
    title='CV vs log(lambda)',
    markers=True,
    line_shape='linear',
)
fig2.show()
