In [None]:
import pandas as pd
import numpy as np
#import tensorflow as tf
#from tensorflow.keras import layers
#from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.special
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats #used to compute z score and remove outliers
from tqdm import tqdm

In [None]:
# Experiment configuration: square waves at several frequencies.
# Each frequency is turned into the float parameter (1 / frequency) that the
# target function is parameterised by.
frequency_list = [1, 5, 30]
param_list = list(map(lambda freq: 1 / freq, frequency_list))

# Sweep settings shared by every target function:
#   num_data          - value passed on as the number of training samples
#   num_features_list - every feature count from 1 to 400 inclusive
#   num_repeats       - value passed on as the number of repeats per feature count
num_data = 200
num_features_list = np.arange(1, 401, dtype=int)
num_repeats = 300

def linear_fitting(function, num_data, num_features_list, num_repeats, seed=None):
    """Fit Legendre-polynomial regressions of increasing size and record their errors.

    For every feature count in ``num_features_list`` the experiment is repeated
    ``num_repeats`` times. Each repeat draws ``num_data`` training inputs
    uniformly from [-1, 1], expands them into Legendre polynomials of degree
    1..num_features (the constant degree-0 term is not included), and solves
    for the coefficients with ``np.linalg.pinv``. The train MSE is measured on
    the sampled points; the test MSE on a fixed grid of 2000 evenly spaced
    points in [-1, 1].

    Parameters
    ----------
    function : callable
        Target function. It is called with a column array of shape (n, 1) and
        must return the targets for those inputs.
    num_data : int
        Number of training points sampled per repeat.
    num_features_list : iterable of int
        Feature counts (highest Legendre degree used) to sweep over.
    num_repeats : int
        Number of independent training samples drawn per feature count.
    seed : int or None, optional
        When given, training inputs come from ``np.random.default_rng(seed)``
        so the run is reproducible. When ``None`` (default) the global NumPy
        random state is used.

    Returns
    -------
    pandas.DataFrame
        One row per (feature count, repeat) with the columns "Num. Data",
        "Num. Parameters (Num Features)", "repeat_idx", "Train MSE" and
        "Test MSE".
    """
    low, high = -1.0, 1.0
    # The test grid is dense so that rapidly varying targets are sampled properly.
    num_test_points = 2000

    # np.random (module) and Generator both expose uniform(low=, high=, size=).
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # Fixed evaluation grid shared by every fit.
    X_test = np.linspace(start=low, stop=high, num=num_test_points).reshape(-1, 1)
    y_test = function(X_test)

    mse_list = []
    for num_features in tqdm(num_features_list):
        feature_degrees = 1 + np.arange(num_features).astype(int)
        # The test design matrix depends only on the feature count, so it is
        # built once here instead of once per repeat.
        X_test_poly = scipy.special.eval_legendre(feature_degrees, X_test)

        for repeat_idx in range(num_repeats):
            # Sample training data.
            X_train = sampler.uniform(low=low, high=high, size=(num_data, 1))
            y_train = function(X_train)

            # Broadcasting (num_data, 1) against (num_features,) yields the
            # (num_data, num_features) design matrix.
            X_train_poly = scipy.special.eval_legendre(feature_degrees, X_train)
            beta_hat = np.linalg.pinv(X_train_poly) @ y_train

            train_mse = mean_squared_error(y_train, X_train_poly @ beta_hat)
            test_mse = mean_squared_error(y_test, X_test_poly @ beta_hat)
            mse_list.append(
                {
                    "Num. Data": num_data,
                    "Num. Parameters (Num Features)": num_features,
                    "repeat_idx": repeat_idx,
                    "Train MSE": train_mse,
                    "Test MSE": test_mse,
                }
            )

    return pd.DataFrame(mse_list)

def _square_wave(X: np.ndarray, period: float) -> np.ndarray:
    """Return a +1/-1 square wave of the given period, evaluated element-wise at X."""
    # np.mod takes the sign of the divisor, so for period > 0 the phase lies
    # in [0, period) for negative inputs as well.
    phase = X % period
    return np.where(phase < period / 2, 1, -1)


def func_compare(param_list, num_data, num_features_list, num_repeats):
    """Run the regression sweep for several square waves and plot one panel per wave.

    For every period in ``param_list`` a square wave with that period is
    passed to ``linear_fitting``. The resulting train and test MSE are drawn
    against the number of features on log-log axes, with a dashed vertical
    line at ``num_data`` labelled "Interpolation Threshold". The figure is
    shown with ``plt.show()``.

    Parameters
    ----------
    param_list : sequence of float
        Square-wave periods (1 / frequency), one subplot each. Must be non-empty.
    num_data : int
        Forwarded to ``linear_fitting``; also the x position of the threshold line.
    num_features_list : iterable of int
        Forwarded to ``linear_fitting``.
    num_repeats : int
        Forwarded to ``linear_fitting``.

    Returns
    -------
    list of pandas.DataFrame
        The frame returned by ``linear_fitting`` for each period, in the order
        of ``param_list``.

    Raises
    ------
    ValueError
        If ``param_list`` is empty.
    """
    num_panels = len(param_list)
    if num_panels == 0:
        raise ValueError("param_list must contain at least one period")

    # squeeze=False keeps `axs` two-dimensional even for a single panel;
    # otherwise plt.subplots returns a bare Axes that cannot be indexed.
    fig, axs = plt.subplots(
        1, num_panels, figsize=(5 * num_panels, 8), squeeze=False
    )

    mse_dfs = []
    for index, param in enumerate(param_list):

        # Bind the current period as a default so the target function does not
        # depend on the loop variable after this iteration.
        def compute_y_from_x(X: np.ndarray, period=param) -> np.ndarray:
            return _square_wave(X, period)

        mse_df = linear_fitting(compute_y_from_x, num_data, num_features_list, num_repeats)
        mse_dfs.append(mse_df)

        # Draw both error curves on this period's subplot.
        ax = axs[0, index]
        for column, label in (("Test MSE", "Test"), ("Train MSE", "Train")):
            sns.lineplot(
                data=mse_df,
                x="Num. Parameters (Num Features)",
                y=column,
                label=label,
                ax=ax,
            )
        ax.set_ylabel("Mean Squared Error")
        ax.set_xlabel("Parameter Number")
        ax.set_ylim(bottom=1e-3)
        ax.set_yscale("log")
        ax.set_xscale("log")
        # float() keeps np.power from rejecting an integer period with a
        # negative exponent.
        ax.set_title(f"Polynomial Regression, Step Function with Frequency = {np.power(float(param), -1)}")
        ax.axvline(
            x=num_data, color="black", linestyle="--", label="Interpolation Threshold"
        )
        ax.legend()

    # Adjust the layout of subplots, then display the figure.
    plt.tight_layout()
    plt.show()
    return mse_dfs

# Run the comparison for every configured parameter and keep the returned
# list of result frames (one per entry of param_list).
mse_dfs = func_compare(
    param_list=param_list,
    num_data=num_data,
    num_features_list=num_features_list,
    num_repeats=num_repeats,
)