# Regression Coefficients Nullspace Perspective
Examples and visualizations of the nullspace perspective for regression in high dimensions on the Lithium-Iron-Phosphate (LFP) battery data set.

The LFP Data was originally published with: Data-driven prediction of battery cycle life before capacity degradation
https://www.nature.com/articles/s41560-019-0356-8

Source of data: 
https://data.matr.io/

License of LFP data: lfpdatalicense.txt

Copyright and Contact: Joachim Schaeffer, joachim.schaeffer@posteo.de

In [None]:
import numpy as np 
import pandas as pd
import scipy 

import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
import matplotlib.transforms as mtransforms

import random
# Seed both the stdlib and the NumPy global random number generators with a fixed value.
random.seed(42)
np.random.seed(42)

# Custom functions
from src.hd_data import HD_Data
from src.basis_function_data import construct_y_data
from src.nullspace import Nullspace

from src.utils import optimize_pls_cv, predict_LFP_based_on_coef, scatter_LFP_based_on_coef
from src.utils import project_reg_coeff_onto_nulls, project_reg_coeff_onto_space_by_basis
from src.plotting_utils import vis_reg_coef, plot_x_tt2, set_axis_label

# IPython magics: enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2
# Apply the project's Matplotlib style sheet to all following figures.
plt.style.use('./src/plots.mplstyle')

In [None]:
# Variables to set.
# Paths are OS dependent.

# Data path
data_path = './data/'
# Shall plots be saved?
save_plots = True

# Path to save plots
save_plot_path = './results/Nullspace/'

# Plot additional visualizations.
extra_plots = 0

# Load data that was used for the publication.
# If set to True, the data for the plots will be loaded from csv files.
# If set to False, the data will be generated and figures might differ slightly due to the random structure of the noise.
load_data = True

# If load_data=False and save_data=True, the csv files will be replaced with the data generated during this run.
save_data = False

In [None]:
# Build one PLSRegression model per component count (1 through 20) together
# with a parallel list of display names ('PLS 1 Comp.', ..., 'PLS 20 Comp.').
component_counts = range(1, 21)
models = [
    PLSRegression(n_components=n_comp, tol=1e-7, scale=False)
    for n_comp in component_counts
]
model_names = ['PLS ' + str(n_comp) + ' Comp.' for n_comp in component_counts]


In [None]:
# Load the LFP dataset.
lfp_df = pd.read_csv(data_path + 'lfp_slim.csv', index_col=0)

# The first 1000 columns are the features; their order is flipped
# (presumably so that it matches the increasing grid d_lfp -- confirm).
X_lfp = np.array(lfp_df.iloc[:, 0:1000])[:, ::-1]
y_cl = np.array(lfp_df.iloc[:, 1000])
d_lfp = np.linspace(2.0, 3.5, 1000)

# Column 1002 holds the split flag: 0 -> training, 1 -> test, 2 -> second test set.
split_flag = lfp_df.iloc[:, 1002]
is_train = split_flag == 0
is_test = split_flag == 1
is_test2 = split_flag == 2

X_lfp_train, y_cl_train = np.array(X_lfp[is_train, :]), np.array(y_cl[is_train])
X_lfp_test, y_cl_test = np.array(X_lfp[is_test, :]), np.array(y_cl[is_test])
X_lfp_test2, y_cl_test2 = np.array(X_lfp[is_test2, :]), np.array(y_cl[is_test2])

# Remove single outlier in the test set: the row with the smallest mean value.
# Reason: Very different shape and a lot lower cycle life than all other cells.
# Degradation is not linear.
test_row_means = np.mean(X_lfp_test, axis=1)
id_outlier_test = np.where(test_row_means == np.min(test_row_means))
X_lfp_test = np.delete(X_lfp_test, id_outlier_test, axis=0)
y_cl_test = np.delete(y_cl_test, id_outlier_test, axis=0)

# Column statistics come from the training set only and are reused for both
# test sets (mean centering, then division by the sample standard deviation).
meanx = np.mean(X_lfp_train, axis=0)
stdx = np.std(X_lfp_train, axis=0, ddof=1)

X_ = X_lfp_train - meanx
X_lfp_train_std = X_ / stdx

X_test1_ = X_lfp_test - meanx
X_test1_std = X_test1_ / stdx

X_test2_ = X_lfp_test2 - meanx
X_test2_std = X_test2_ / stdx

In [None]:
# Three-panel overview figure: raw data for all splits, mean-centered training
# data, and z-scored training data, all over the same voltage axis.
figsize = [29, 7]
fig, ax = plt.subplots(1, 3, figsize=figsize, constrained_layout=True, sharex=True)
x_label = 'Voltage (V)'
y_label = r'$\Delta Q_{100-10}$ (Ah)'
y_label_ = r'$\Delta \widetilde{Q}_{100-10}$ (Ah)'
y_label_std_ = r'$\Delta \widetilde{Q}_{100-10}^{STD}$'
# IBM colorguide colors and other colors!
colors_IBM_ao = ["#0051a2", "#97964a", "#f4777f", "#93003a", "#648fff", "#785ef0", "#dc267f", "#fe6100", "#ffb000", "#000000"]
# Panel a): training, test and second test set drawn on top of each other
# (zorder puts the training curves in front).
ax[0] = plot_x_tt2(X_lfp_train, d_lfp, ax[0], colors_IBM_ao[0], x_label, y_label, zorder=3)
ax[0] = plot_x_tt2(X_lfp_test, d_lfp, ax[0], colors_IBM_ao[3],  x_label, y_label, label_data='Test', zorder=2, linestyle='--')
ax[0] = plot_x_tt2(X_lfp_test2, d_lfp, ax[0], colors_IBM_ao[1], x_label, y_label, label_data='Test 2', zorder=1, linestyle='-.')
ax[0].set_title('Data')
ax[0].set_xlim(2.0, 3.5)


# Panel b): training data with the column means subtracted.
ax[1] = plot_x_tt2(X_lfp_train-np.mean(X_lfp_train, axis=0), d_lfp, ax[1], colors_IBM_ao[0], x_label, y_label_, label_data='Training')
ax[1].set_title('Mean Centered Training Data')
ax[1].set_xlim(2.0, 3.5)

# Panel c): z-scored training data.
ax[2] = plot_x_tt2(X_lfp_train_std, d_lfp, ax[2], colors_IBM_ao[0], x_label, y_label_std_, label_data='Training')
ax[2].set_title('Z-Scored Training Data')
ax[2].set_xlim(2.0, 3.5)

# Panel letters; the translation is given in points (1/72 inch) via fig.dpi_scale_trans.
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
fig, ax[0] = set_axis_label(fig, ax[0], trans, label_str="a)", loc=(-0.04, 1.0))
fig, ax[1] = set_axis_label(fig, ax[1], trans, label_str="b)", loc=(-0.05, 1.0))
fig, ax[2] = set_axis_label(fig, ax[2], trans, label_str="c)", loc=(-0.03, 1.0))

if save_plots:
    plt.savefig(save_plot_path + 'data_lfp.pdf')
plt.show()

In [None]:
# Wrap the training data (no response, y=[]) in an HD_Data object and run its
# spline-based SNR analysis on the raw X; the helper returns the figure and axes.
lfp_train_hdata = HD_Data(X_lfp_train, d_lfp, y=[])
fig, ax = lfp_train_hdata.analyze_snr_by_splines(s=1e-5, k=3, mode="X_raw", x_label='Voltage (V)')
fig.suptitle('LFP Training Data SNR Perspective')
fig.subplots_adjust(top=0.91)

# Panel letters for the two axes returned by the helper.
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
fig, ax[0] = set_axis_label(fig, ax[0], trans, label_str="a)", loc=(-0.03, 1.03))
fig, ax[1] = set_axis_label(fig, ax[1], trans, label_str="b)", loc=(-0.03, 0.9))

if save_plots:
    plt.savefig(save_plot_path + 'SNR_TrainingData.pdf')
# Noise is heteroscedastic, violates the usual assumption!

In [None]:
# Smooth the SNR estimate (window_length=51, polyorder=3). The plots below read
# the raw and smoothed values from the object's snr / snr_smooth attributes and
# their dB counterparts.
lfp_train_hdata.smooth_snr(window_length=51, polyorder=3)

# SNR on a linear scale: raw vs. smoothed.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.plot(lfp_train_hdata.d, lfp_train_hdata.snr, label="SNR")
ax.plot(lfp_train_hdata.d, lfp_train_hdata.snr_smooth, label="SNR Sav-Golay filtered")
ax.set_xlabel('Voltage (V)')
ax.set_ylabel('SNR')
ax.set_xlim(2.0, 3.5)
ax.legend()
plt.show()

# SNR in dB: raw vs. smoothed.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.plot(lfp_train_hdata.d, lfp_train_hdata.snr_dB, label="SNR [dB]")
ax.plot(lfp_train_hdata.d, lfp_train_hdata.snr_smooth_dB, label="SNR Sav-Golay filtered [dB]")
ax.set_xlabel('Voltage (V)')
ax.set_ylabel('SNR [dB]')
ax.set_xlim(2.0, 3.5)
ax.legend()
plt.show()

# Save the smoothed SNR arrays to csv. The files go to the configured data
# directory (data_path) instead of a hard-coded "data/" folder, so that this
# export follows the same path setting as every other file access.
# NOTE: the export is unconditional (not gated by save_data).
np.savetxt(data_path + "lfp_snr_smooth_dB.csv", lfp_train_hdata.snr_smooth_dB, delimiter=",")
np.savetxt(data_path + "lfp_snr_smooth.csv", lfp_train_hdata.snr_smooth, delimiter=",")

In [None]:
# Synthetic response based on the mean of each sample. The matching "true"
# regression coefficients are constant: 1/p for each of the p columns.
n_samples, n_features = X_lfp_train.shape
y_lfp_train_syn = np.zeros(n_samples)


def mean_function(a):
    """Return the mean of ``a``."""
    return np.mean(a)


mean_reg_coef_lfp = np.ones(n_features) / n_features

if load_data:
    # Reuse the stored response (first line of the csv file is a header).
    y = np.loadtxt(data_path + 'lfp_y_mean.csv', skiprows=1, delimiter=',')
    data_lfp_mean = HD_Data(X=X_lfp_train, d=d_lfp, y=y)
else:
    # Generate the response from the data and add noise via add_wgn(snr_y=50).
    y_lfp_train_mean_function = construct_y_data(X_lfp_train, mean_function, per_range=[0,1])
    data_lfp_mean = HD_Data(
        X=X_lfp_train, d=d_lfp, y=y_lfp_train_mean_function
    ).add_wgn(snr_y=50)

    if save_data:
        np.savetxt(
            data_path + 'lfp_y_mean.csv',
            data_lfp_mean.y,
            delimiter=",",
            header='Sample Mean reposne for the LFP Data',
        )

## LFP dataset synthetic y

In [None]:
# Nullspace analysis object for the synthetic mean response; learn_weights is
# given the PLS models and their names defined above.
nulls_lfp_mean = Nullspace(data_lfp_mean)
nulls_lfp_mean = nulls_lfp_mean.learn_weights(models, model_names)
# Start with the non-standardized setting.
nulls_lfp_mean.set_standardization(std=False)

In [None]:
# PLS component-count search (up to 10 components) for the mean response, std=False.
optimize_pls_cv(data_lfp_mean.X_, data_lfp_mean.y_, max_comps=10, plot_components=True, std=False)

In [None]:
# Same component-count search for the mean response, now with std=True.
optimize_pls_cv(data_lfp_mean.X_, data_lfp_mean.y_, max_comps=10, plot_components=True, std=True)

In [None]:
# Nullspace analysis on non-standardized data: compare the learned
# 'PLS 2 Comp.' coefficients (alpha) with the constant mean coefficients (beta).
nulls_lfp_mean.set_standardization(std=False)
nulls_lfp_mean.set_nullspace_weights(
    key_alpha = 'PLS 2 Comp.', 
    w_alpha_name=r'$\beta_{2}^{PLS}$',
    w_beta = mean_reg_coef_lfp, 
    w_beta_name=r'$\beta^*$'
    )

# The analysis returns the (updated) object together with the figure and axes.
nulls_lfp_mean, fig, ax = nulls_lfp_mean.nullspace_analysis(
    plot_results=True, 
    save_plot=0,
    opt_gamma_method='NRMSE',
    con_thres=0.01,
    nullspace_path = False,
    ax_labelstr = ("", "a)"),
    )

ax[0].set_ylabel(r'$\Delta \widetilde{Q}_{100-10}$')
ax[0].set_xlabel('Voltage (V)')
ax[1].set_xlabel('Voltage (V)')    
ax[1].legend(loc=3)
plt.show()
# The figure is saved here (save_plot=0 was passed to the analysis call).
if save_plots: 
    fig.savefig(save_plot_path + 'LFP_example_2PLS_mean.pdf')

nulls_lfp_mean.scatter_predictions()

In [None]:
# Same comparison with standardization switched on: 'PLS 4 Comp.' (alpha) vs.
# the constant mean coefficients (beta).
nulls_lfp_mean.set_standardization(std=True)

nulls_lfp_mean.set_nullspace_weights(
    key_alpha = 'PLS 4 Comp.', 
    w_alpha_name=r'$\beta_{4}^{PLS}$',
    w_beta = mean_reg_coef_lfp, 
    w_beta_name=r'$\beta^*$',
    )

# inset_axes_ce / zoom_coords configure an inset with a zoomed view
# (presumably [x0, y0, width, height] and [x_min, x_max, y_min, y_max] -- confirm in Nullspace).
nulls_lfp_mean, fig, ax = nulls_lfp_mean.nullspace_analysis(
    plot_results=True,
    save_plot=0,
    opt_gamma_method='NRMSE',
    con_thres=0.01,
    nullspace_path = False,
    ax_labelstr = ("", "b)"),
    inset_axes_ce = [0.8, 0.77, 0.18, 0.15],
    zoom_coords = [3.03, 3.18, 1e-5*1.9, 1e-5*2.08],
    )

ax[0].set_ylabel(r'$\Delta \widetilde{Q}_{100-10}$')
ax[1].set_xlabel('Voltage (V)')   
ax[1].set_title('Nullspace Perspective, Z-Scored Data')
ax[1].legend(loc=3)
plt.show()
if save_plots: 
    fig.savefig(save_plot_path + 'LFP_example_std_4PLS_mean.pdf')


nulls_lfp_mean.scatter_predictions()

## Column mean weights

In [None]:
# Construct the data object.
# Synthetic LFP response: the true response is X @ column_mean.

# "True" regression coefficients: the column means of the training data.
rcoef_lfp_cm =  np.mean(X_lfp_train, axis=0)

if not load_data: 
    y_lfps_train_cm = np.dot(X_lfp_train, rcoef_lfp_cm)

    dml_lfps_cm = HD_Data(X=X_lfp_train, d=d_lfp, y=y_lfps_train_cm)
    # Add noise to target values
    dml_lfps_cm = dml_lfps_cm.add_wgn(snr_y=50)
    if save_data: 
        np.savetxt(
            data_path + 'lfp_y_cm.csv', dml_lfps_cm.y, delimiter=',', 
            header='Column Mean response for the LFP Data')
else:
    # Reuse the stored response (first line of the csv file is a header).
    y=np.loadtxt(data_path + 'lfp_y_cm.csv', skiprows=1, delimiter=',')
    dml_lfps_cm = HD_Data(X=X_lfp_train, d=d_lfp, y=y)

# Nullspace analysis object for the column-mean response, with the PLS models defined above.
nulls_lfp_cm = Nullspace(dml_lfps_cm)
nulls_lfp_cm = nulls_lfp_cm.learn_weights(models, model_names)

In [None]:
# PLS component-count search (up to 10 components) for the column-mean response, std=False.
optimize_pls_cv(dml_lfps_cm.X_, dml_lfps_cm.y_, max_comps=10, plot_components=True, std=False)

In [None]:
# Same component-count search for the column-mean response, now with std=True.
optimize_pls_cv(dml_lfps_cm.X_, dml_lfps_cm.y_, max_comps=10, plot_components=True, std=True)

In [None]:
# Nullspace analysis on non-standardized data: 'PLS 3 Comp.' (alpha) vs. the
# column-mean coefficients (beta).
nulls_lfp_cm.set_standardization(std=False)
nulls_lfp_cm.set_nullspace_weights(
    key_alpha = 'PLS 3 Comp.', 
    w_alpha_name=r'$\beta_{3}^{PLS}$',
    w_beta = rcoef_lfp_cm, 
    w_beta_name=r'$\beta^*$',
    )

# The analysis returns the (updated) object together with the figure and axes.
nulls_lfp_cm, fig, ax = nulls_lfp_cm.nullspace_analysis(
    plot_results=True,
    save_plot=0,
    opt_gamma_method='NRMSE',
    con_thres=0.005,
    nullspace_path = False,
    ax_labelstr = ("", "a)"),
    inset_axes_ce = [0.01, 0.01, 0.38, 0.34],
    zoom_coords = [2.84, 3.12, -0.048, -0.04],
    )
    
ax[0].set_ylabel(r'$\Delta \widetilde{Q}_{100-10}$')
ax[1].set_xlabel('Voltage (V)')    

plt.show()
if save_plots: 
    fig.savefig(save_plot_path + 'LFP_3PLS_cm.pdf')


nulls_lfp_cm.scatter_predictions()

In [None]:
# Same comparison with standardization switched on: 'PLS 6 Comp.' (alpha) vs.
# the column-mean coefficients (beta).
nulls_lfp_cm.set_standardization(std=True)
nulls_lfp_cm.set_nullspace_weights(
    key_alpha = 'PLS 6 Comp.', 
    w_alpha_name=r'$\beta_{6}^{PLS}$',
    w_beta = rcoef_lfp_cm, 
    w_beta_name=r'$\beta^*$',
    )

nulls_lfp_cm, fig, ax = nulls_lfp_cm.nullspace_analysis(
    plot_results=True,
    save_plot=0,
    opt_gamma_method='NRMSE',
    con_thres=0.005,
    nullspace_path = False,
    ax_labelstr = ("", "b)"),
    inset_axes_ce = [0.8, 0.05, 0.18, 0.5],
    zoom_coords = [3.17, 3.3, -0.00018, 0.00014],
    )
    
ax[0].set_ylabel(r'$\Delta \widetilde{Q}_{100-10}$')
ax[1].set_xlabel('Voltage (V)')    
ax[1].set_title('Nullspace Perspective, Z-Scored Data')

plt.show()
if save_plots: 
    fig.savefig(save_plot_path + 'LFP_6PLS_cm_std.pdf')


nulls_lfp_cm.scatter_predictions()

## Cycle Life Response

In [None]:
# Check the results: one-feature model that regresses log10(cycle life) on the
# log10 of each cell's sample variance, evaluated on all three splits.


def _log10_row_variance(X):
    """Return log10 of the per-row sample variance (ddof=1) of ``X``."""
    return np.log10(np.var(X, axis=1, ddof=1))


def _predict_cycle_life(x_centered, coef, offset):
    """Map a centered log-variance feature back to cycle life (original scale)."""
    return 10**(x_centered.reshape(-1, 1)@coef + offset).reshape(-1)


# Feature: log10 variance, centered with the training mean.
X_train_var = _log10_row_variance(X_lfp_train)
X_test1_var = _log10_row_variance(X_lfp_test)
X_test2_var = _log10_row_variance(X_lfp_test2)
mean_x = np.mean(X_train_var)
X_train_var_ = X_train_var - mean_x
X_test1_var_ = X_test1_var - mean_x
X_test2_var_ = X_test2_var - mean_x

# Response: log10 cycle life, centered with the training mean.
log10_y_train = np.log10(y_cl_train)
mean_y_train = np.mean(log10_y_train)
y_cl_train_ = log10_y_train - mean_y_train
y_cl_test_ = np.log10(y_cl_test) - mean_y_train
y_cl_test2_ = np.log10(y_cl_test2) - mean_y_train

# Ordinary least squares for a single centered feature, stored as a 2D array.
inverse_gram = 1 / (X_train_var_ @ X_train_var_)
beta = np.reshape((inverse_gram * X_train_var_) @ y_cl_train_, (1, -1))

# Predict the cycle life of each set.
pred_train = _predict_cycle_life(X_train_var_, beta, mean_y_train)
pred_test = _predict_cycle_life(X_test1_var_, beta, mean_y_train)
pred_test2 = _predict_cycle_life(X_test2_var_, beta, mean_y_train)

# Print RMSE on the original cycle-life scale.
for split_name, prediction, truth in (
    ('Train', pred_train, y_cl_train),
    ('Test', pred_test, y_cl_test),
    ('Test2', pred_test2, y_cl_test2),
):
    print('RMSE ' + split_name + ': ', np.sqrt(np.mean((prediction - truth)**2)))

In [None]:
# Response for the cycle-life models: natural log of the training cycle life.
y_log_cl = np.log(y_cl_train).copy()
# NOTE: this rebinds y_cl (assigned in the data-loading cell from column 1000
# of lfp_df) to the training subset only.
y_cl = y_cl_train
# Centered log response (not referenced again in the cells visible here).
y_log_cl_ = y_log_cl - np.mean(y_log_cl)

# Optimize the number of components for the PLS model based on standardized data and not standardized data.
# cv_dict = optimize_pls_cv(X_, y_log_cl, max_comps=12, folds=10, plot_components=True, std=False, neg_rmse_exp_scorer=False)
# cv_dict_pls_std = optimize_pls_cv(X_, y_log_cl, max_comps=12, folds=10, plot_components=True, std=True, neg_rmse_exp_scorer=False)

# Estimate the generalization error on the original scale of the response variable.
cv_dict = optimize_pls_cv(X_, y_log_cl, max_comps=12, folds=10, plot_components=True, std=False, neg_rmse_exp_scorer=True)
cv_dict_pls_std = optimize_pls_cv(X_, y_log_cl, max_comps=12, folds=10, plot_components=True, std=True, neg_rmse_exp_scorer=True)

In [None]:
# Data and nullspace objects for the (natural-log) cycle-life response.
dml_lfps_cl = HD_Data(X=X_lfp_train, d=d_lfp, y=y_log_cl)
nulls_lfp_cl = Nullspace(dml_lfps_cl)
nulls_lfp_cl.set_standardization(std=False)
nulls_lfp_cl = nulls_lfp_cl.learn_weights(models, model_names)
# NOTE(review): later cells read nulls_lfp_cl.weights['PLS 9 Comp. std'];
# presumably learn_weights stores the '... std' entries as well -- confirm in Nullspace.
# nulls_lfp_cl.set_standardization(std=True)
# nulls_lfp_cl = nulls_lfp_cl.learn_weights(models, model_names)

In [None]:
# Load regression coefficients from the csv files in regression_in_R/
# (first csv column is used as the index). The cells below use column 1 of each table.
file_name = "regression_in_R/lfp_cl_D1_cv_reg_coeff.csv"
reg_coef_d1 = pd.read_csv(file_name, index_col=0)

file_name = "regression_in_R/lfp_cl_D2_cv_reg_coeff.csv"
reg_coef_d2 = pd.read_csv(file_name, index_col=0)

# D1 coefficients for the standardized ("std") setting.
file_name = "regression_in_R/lfp_cl_D1_std_cv_reg_coeff.csv"
reg_coef_d1_std = pd.read_csv(file_name, index_col=0)

In [None]:
# Two-panel figure with shared axes: a) PLS coefficients (5 components),
# b) the D1 coefficients loaded from csv.
figsize = [20, 5]
fig, ax = plt.subplots(1, 2, figsize=figsize, constrained_layout=True, sharex=True, sharey=True)

nb_comp = 5
fig, ax[0] = vis_reg_coef(
    beta = nulls_lfp_cl.weights[f'PLS {nb_comp} Comp.'],
    d = np.linspace(2.0, 3.5, 1000),
    y = y_log_cl,
    label = r"$\beta_{5}^{PLS}$",
    fig = fig, 
    ax = ax[0],
    cid=0)

fig, ax[1] = vis_reg_coef(
    beta = reg_coef_d1.iloc[:, 1],
    d = np.linspace(2.0, 3.5, 1000),
    y = y_log_cl,
    label = r"$\beta_{D1}^{FL}$",
    fig = fig, 
    ax = ax[1],
    cid=1)

# Panel letters.
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
fig, ax[0] = set_axis_label(fig, ax[0], trans, label_str="a)", loc=(-0.03, 1.0))
fig, ax[1] = set_axis_label(fig, ax[1], trans, label_str="b)", loc=(-0.03, 1.0))

ax[0].set_xlabel("Voltage (V)")
ax[1].set_xlabel("Voltage (V)")
# The y-axis is shared, so only the left panel gets a y-label.
ax[0].set_ylabel("Coefficients")

# Save figure
if save_plots:
    plt.savefig(save_plot_path + 'LFP_CL_CV_Reg_Coeff_genlasso_s.pdf')

In [None]:
# 2x2 figure with shared axes: a) D1 coefficients, b) D2 coefficients,
# c) PLS coefficients (5 components), d) all three overlaid in one panel.
figsize = [20, 10]
fig, ax = plt.subplots(2, 2, figsize=figsize, constrained_layout=True, sharex=True, sharey=True)

nb_comp = 5
coef_fl_d1 = reg_coef_d1.iloc[:, 1]
coef_fl_d2 = reg_coef_d2.iloc[:, 1]
coef_pls = nulls_lfp_cl.weights[f'PLS {nb_comp} Comp.']

# One entry per curve: (axes index, coefficients, legend label, color id).
curve_specs = [
    ((0, 0), coef_fl_d1, r"$\beta_{D1}^{FL}$", 0),
    ((0, 1), coef_fl_d2, r"$\beta_{D2}^{FL}$", 1),
    ((1, 0), coef_pls, r"$\beta_{5}^{PLS}$", 3),
    ((1, 1), coef_fl_d1, r"$\beta_{D1}^{FL}$", 0),
    ((1, 1), coef_fl_d2, r"$\beta_{D2}^{FL}$", 1),
    ((1, 1), coef_pls, r"$\beta_{5}^{PLS}$", 3),
]
for panel, coef, coef_label, color_id in curve_specs:
    fig, ax[panel] = vis_reg_coef(
        beta=coef,
        d=np.linspace(2.0, 3.5, 1000),
        y=y_log_cl,
        label=coef_label,
        fig=fig,
        ax=ax[panel],
        cid=color_id,
    )

# Panel letters; the bottom row uses a slightly lower vertical position.
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
letter_specs = [
    ((0, 0), "a)", 1.0),
    ((0, 1), "b)", 1.0),
    ((1, 0), "c)", 0.96),
    ((1, 1), "d)", 0.96),
]
for panel, letter, y_loc in letter_specs:
    fig, ax[panel] = set_axis_label(fig, ax[panel], trans, label_str=letter, loc=(-0.03, y_loc))

# Axes are shared: x-labels on the bottom row, y-labels on the left column.
for bottom_axis in (ax[1, 0], ax[1, 1]):
    bottom_axis.set_xlabel("Voltage (V)")
for left_axis in (ax[0, 0], ax[1, 0]):
    left_axis.set_ylabel("Coefficients")
ax[1, 1].legend(loc=3)

# Save figure
if save_plots:
    plt.savefig(save_plot_path + 'LFP_CL_CV_Reg_Coeff_genlasso.pdf')

In [None]:
# Compare coefficients for the z-scored setting: 'PLS 9 Comp. std' vs. the D1
# coefficients loaded from lfp_cl_D1_std_cv_reg_coeff.csv.
# With plot_rescaled_regression_coefficients=True a second row is added that
# shows both coefficient vectors divided by the column standard deviation.
plot_rescaled_regression_coefficients = False

if plot_rescaled_regression_coefficients:
    # 2x2 layout: top row z-scored coefficients, bottom row rescaled coefficients.
    figsize = [20, 10]
    fig = plt.figure(figsize=figsize, constrained_layout=True)
    gs = fig.add_gridspec(2, 2)
    ax11 = plt.subplot(gs[1, 1])
    ax10 = plt.subplot(gs[1, 0])
    ax00 = plt.subplot(gs[0, 0])
    ax01 = plt.subplot(gs[0, 1], sharey=ax00)
else:
    # 1x2 layout: only the z-scored coefficients.
    figsize = [20, 5]
    fig = plt.figure(figsize=figsize, constrained_layout=True)
    gs = fig.add_gridspec(1, 2)
    ax00 = plt.subplot(gs[0, 0])
    ax01 = plt.subplot(gs[0, 1], sharey=ax00)

fig, ax00 = vis_reg_coef(
    beta = nulls_lfp_cl.weights['PLS 9 Comp. std'],
    d = np.linspace(2.0, 3.5, 1000),
    y = y_log_cl,
    label = r"$\beta_{9}^{PLS}$ z-scored",
    fig = fig, 
    ax = ax00,
    cid=0)

fig, ax01 = vis_reg_coef(
    beta = reg_coef_d1_std.iloc[:, 1],
    d = np.linspace(2.0, 3.5, 1000),
    y = y_log_cl,
    label = r"$\beta_{D1}^{FL}$ z-scored",
    fig = fig, 
    ax = ax01,
    cid=1)
# The right panel shares its y-axis with the left one, so hide its tick labels.
plt.setp(ax01.get_yticklabels(), visible=False)
ax00.set_ylabel("Coefficients")  # \n z-scored columns")
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
fig, ax00 = set_axis_label(fig, ax00, trans, label_str="a)", loc=(-0.03, 1.0))
fig, ax01 = set_axis_label(fig, ax01, trans, label_str="b)", loc=(-0.03, 1.0))
if plot_rescaled_regression_coefficients:
    # The x-labels go on the bottom row only.
    ax00.set_xticklabels([])
    ax01.set_xticklabels([])

    # NOTE(review): np.std defaults to ddof=0, whereas stdx in the data-loading
    # cell uses ddof=1 -- confirm which convention the '... std' weights assume.
    fig, ax10 = vis_reg_coef(
        beta = nulls_lfp_cl.weights['PLS 9 Comp. std']/np.std(X_, axis=0),
        d = np.linspace(2.0, 3.5, 1000),
        y = y_log_cl,
        label = r"$\beta_{9}^{PLS}$ z-scored rescaled",
        fig = fig, 
        ax = ax10,
        cid=0)

    # Fix: the closing '$' now directly follows the math expression, so that
    # "z-scored rescaled" is rendered as regular text like in the PLS label above
    # (previously it was inside math mode and lost its spacing).
    fig, ax11 = vis_reg_coef(
        beta = reg_coef_d1_std.iloc[:, 1]/np.std(X_, axis=0),
        d = np.linspace(2.0, 3.5, 1000),
        y = y_log_cl,
        label = r"$\beta_{D1}^{FL}$ z-scored rescaled",
        fig = fig, 
        ax = ax11,
        cid=1)

    trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
    fig, ax10 = set_axis_label(fig, ax10, trans, label_str="c)", loc=(-0.03, 1.0))
    fig, ax11 = set_axis_label(fig, ax11, trans, label_str="d)", loc=(-0.03, 1.0))

    ax10.set_xlabel("Voltage (V)")
    ax11.set_xlabel("Voltage (V)")
    ax10.set_ylabel("Coefficients \n original scale columns")
    if save_plots:
        plt.savefig(save_plot_path + 'LFP_CL_CV_Reg_Coeff_PLS_genlasso_std.pdf')
else:
    ax00.set_xlabel("Voltage (V)")
    ax01.set_xlabel("Voltage (V)")
    if save_plots:
        plt.savefig(save_plot_path + 'LFP_CL_CV_Reg_Coeff_PLS_genlasso_std_s.pdf')



In [None]:
# Run the variance model on the LFP data: regress the centered natural-log
# cycle life on the centered natural-log variance of each cell.


def _log_row_variance(X):
    """Return the natural log of the per-row variance (NumPy default ddof) of ``X``."""
    return np.log(np.var(X, axis=1))


# Feature for each split; centering always uses the training mean.
log_var_train = _log_row_variance(X_lfp_train)
log_var_test1 = _log_row_variance(X_lfp_test)
log_var_test2 = _log_row_variance(X_lfp_test2)
mean_log_var_train = np.mean(log_var_train)
log_var_train_ = log_var_train - mean_log_var_train
log_var_test1_ = log_var_test1 - mean_log_var_train
log_var_test2_ = log_var_test2 - mean_log_var_train

# Centered log response of the training set.
log_y_cl_train_mean = np.mean(np.log(y_cl_train))
log_y_cl_train_ = np.log(y_cl_train) - log_y_cl_train_mean

# Fit the model: least squares with a single feature column.
design_matrix = log_var_train_.reshape(-1, 1)
w = np.linalg.lstsq(design_matrix, log_y_cl_train_, rcond=None)[0]

In [None]:
# Collect the prediction errors of all models in one DataFrame. Each call gets
# the matching train/test/test2 inputs, the cycle lives, a coefficient vector,
# the mean of the log response, the table so far and a row name, and returns
# the updated table.
split_eval_cycle = 1200
df_rmse = pd.DataFrame()
print("D1")
df_rmse = predict_LFP_based_on_coef(
    X_, X_test1_, X_test2_, y_cl_train, y_cl_test, y_cl_test2, reg_coef_d1.iloc[:, 1], np.mean(y_log_cl), df_rmse, 'D1', split_eval_cycle=split_eval_cycle)
# print("D2")
# df_rmse = predict_LFP_based_on_coef(
#     X_, X_test1_, X_test2_, y_cl_train, y_cl_test, y_cl_test2, reg_coef_d2.iloc[:, 1], np.mean(y_log_cl), df_rmse, 'D2')
print("PLS 5 Comp.")
df_rmse = predict_LFP_based_on_coef(
    X_, X_test1_, X_test2_, y_cl_train, y_cl_test, y_cl_test2, nulls_lfp_cl.weights['PLS 5 Comp.'], np.mean(y_log_cl), df_rmse, 'PLS 5 Comp.', split_eval_cycle=split_eval_cycle)
# The z-scored models are evaluated on the z-scored inputs.
print("D1 Z-Scored")
df_rmse = predict_LFP_based_on_coef(
    X_lfp_train_std, X_test1_std, X_test2_std, y_cl_train, y_cl_test, y_cl_test2, reg_coef_d1_std.iloc[:, 1], np.mean(y_log_cl), df_rmse, 'D1 Z-Scored', split_eval_cycle=split_eval_cycle)
print("PLS 9 Comp. Z-Scored")
df_rmse = predict_LFP_based_on_coef(
    X_lfp_train_std, X_test1_std, X_test2_std, y_cl_train, y_cl_test, y_cl_test2, nulls_lfp_cl.weights['PLS 9 Comp. std'], np.mean(y_log_cl), df_rmse, 'PLS 9 Comp. Z-Scored', split_eval_cycle=split_eval_cycle)

# Variance model: a single feature column (centered log variance) with weight w.
df_rmse = predict_LFP_based_on_coef(
    log_var_train_.reshape(-1, 1), 
    log_var_test1_.reshape(-1, 1), 
    log_var_test2_.reshape(-1, 1), 
    y_cl_train, 
    y_cl_test, 
    y_cl_test2, 
    w, 
    np.mean(np.log(y_cl_train)), 
    df_rmse, 
    "Variance Model", 
    split_eval_cycle=split_eval_cycle
)

In [None]:
# Print the error table as LaTeX, with floats formatted without decimals.
print(df_rmse.to_latex(float_format="%.0f"))

In [None]:
# Nullspace analysis for the cycle-life response on non-standardized data:
# 'PLS 5 Comp.' (alpha) vs. the D1 coefficients loaded from csv (beta).
nulls_lfp_cl.set_standardization(std=False)
nulls_lfp_cl.set_nullspace_weights(
    key_alpha = 'PLS 5 Comp.', 
    w_alpha_name=r'$\beta_{5}^{PLS}$',
    w_beta = reg_coef_d1.iloc[:, 1],
    w_beta_name=r"$\beta_{D1}^{FL}$",
    )

nulls_lfp_cl, fig, ax = nulls_lfp_cl.nullspace_analysis(
    plot_results=True, 
    save_plot=0,
    opt_gamma_method='NRMSE',
    con_thres=0.2,
    nullspace_path = False,
    ax_labelstr = ("", "a)"),
    )


ax[0].set_ylabel(r'$\Delta \widetilde{Q}_{100-10}$')
ax[0].set_xlabel('Voltage (V)')
ax[1].set_xlabel('Voltage (V)')    
ax[1].legend(loc=3)
plt.show()

# The response is the natural log of the cycle life, so np.exp is passed as y_transform.
nulls_lfp_cl.scatter_predictions(y_transform=np.exp)

In [None]:
# 1.: The nullspace object always assumes that the regression coefficients it is
# given refer to X in the original scale. Therefore, we have to rescale the
# regression coefficients and set std=True so that the object knows that we want
# to perform the nullspace analysis on the standardized data.
# NOTE(review): np.std defaults to ddof=0, whereas stdx in the data-loading cell
# uses ddof=1 -- confirm which convention the '... std' coefficients assume.

nulls_lfp_cl.set_standardization(std=True)
nulls_lfp_cl.set_nullspace_weights(
    w_alpha = nulls_lfp_cl.weights['PLS 9 Comp. std']/np.std(X_, axis=0), 
    w_alpha_name=r'$\beta_{9}^{PLS}$',
    w_beta = reg_coef_d1_std.iloc[:, 1]/np.std(X_, axis=0),
    w_beta_name=r"$\beta_{D1}^{FL}$",
    )

# NOTE(review): con_thres is negative here, unlike the positive values used in
# the other analyses -- confirm that this is intended.
nulls_lfp_cl, fig, ax = nulls_lfp_cl.nullspace_analysis(
    plot_results=True, 
    save_plot=0,
    opt_gamma_method='NRMSE',
    con_thres=-0.5,
    nullspace_path = False,
    ax_labelstr = ("", "a)"),
    )

ax[0].set_ylabel(r'$\Delta \widetilde{Q}_{100-10}$')
ax[0].set_xlabel('Voltage (V)')
ax[1].set_xlabel('Voltage (V)')    
ax[1].legend(loc=3)
plt.show()

# The response is the natural log of the cycle life, so np.exp is passed as y_transform.
nulls_lfp_cl.scatter_predictions(y_transform=np.exp)

In [None]:
# Make a figure with two subplots to report the scatter plots of the two
# z-scored models from the error table: a) the D1 coefficients loaded from
# lfp_cl_D1_std_cv_reg_coeff.csv, b) PLS with 9 components.
figsize = [17, 8.5]
fig, ax = plt.subplots(1, 2, figsize=figsize, constrained_layout=True, sharex=True)

ax[0] = scatter_LFP_based_on_coef(
    X_lfp_train_std, X_test1_std, X_test2_std, y_cl_train, y_cl_test, y_cl_test2, reg_coef_d1_std.iloc[:, 1], np.mean(y_log_cl),
    split_eval_cycle=split_eval_cycle,
    ax = ax[0],
    title="Fused Lasso Regression, Z-Scored Data")

# Now for the PLS 9 Comp. model on standardized data.
# Fix: this panel previously carried the fused-lasso title of panel a).
ax[1] = scatter_LFP_based_on_coef(
    X_lfp_train_std, X_test1_std, X_test2_std, y_cl_train, y_cl_test, y_cl_test2, nulls_lfp_cl.weights['PLS 9 Comp. std'], np.mean(y_log_cl),
    split_eval_cycle=split_eval_cycle,
    ax = ax[1],
    title="PLS Regression (9 Components), Z-Scored Data")

# Panel letters.
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
fig, ax[0] = set_axis_label(fig, ax[0], trans, label_str="a)", loc=(-0.03, 1.0))
fig, ax[1] = set_axis_label(fig, ax[1], trans, label_str="b)", loc=(-0.03, 1.0))

# Save figure
if save_plots:
    plt.savefig(save_plot_path + 'Scatter_plot_predictions.pdf')

# Nullspace Projection For the Cycle Life Response

In [None]:
# Orthonormal basis of the nullspace of the raw training data (columns of NX).
NX = scipy.linalg.null_space(X_lfp_train) #, rcond=10**-5)
# Changing the rcond value is related to the singular values that are considered zero and 
# thus impact the dimensionality of the nullspace. 
# Same for the z-scored training data.
NX_std = scipy.linalg.null_space(X_lfp_train_std)
print(f"Shape of the nullspace basis vector matrix: {NX.shape}")

In [None]:
# Plot the individual basis vectors of the nullspace and see how the space changes after z-scoring.
# Fix: the rows of NX / NX_std correspond to the columns of X_lfp_train, which
# live on the increasing grid d_lfp (2.0 V -> 3.5 V). The basis vectors were
# previously plotted against a decreasing grid (3.5 V -> 2.0 V), which mirrored
# them along the voltage axis relative to every other figure.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].plot(d_lfp, NX)
ax[0].set_title('Nullspace of the Original Data')
ax[0].set_xlabel('Voltage (V)')
ax[0].set_ylabel('Nullspace Basis Vector')
ax[1].plot(d_lfp, NX_std)
ax[1].set_title('Nullspace of the Z-Scored Data')
ax[1].set_xlabel('Voltage (V)')
# plt.show() as in the other cells; Figure.show() warns on non-GUI backends.
plt.show()

# It's difficult to interpret when visualized this way. 
# Scipy constructs the nullspace basis vectors in a particular way such that they
# are orthogonal unit vectors which can be difficult to visualize (and interpret).
# Consider plotting the diagonal z-scoring matrix.

In [None]:
# Split the D1 coefficients into a nullspace part and the remainder, using two
# different projection helpers, and compare the results.
beta_d1 = reg_coef_d1.iloc[:, 1]
x_lfp = np.linspace(2.0, 3.5, len(beta_d1))
# Variant 1: projection helper that works directly on the data matrix.
beta_proj_nulls_ = project_reg_coeff_onto_nulls(beta_d1, X_lfp_train)
plt.plot(x_lfp, beta_proj_nulls_, label='Nullspace Projection')

# Variant 2: projection via the nullspace basis NX computed with scipy.
beta_proj_nulls = project_reg_coeff_onto_space_by_basis(beta_d1, NX)
plt.plot(x_lfp, beta_proj_nulls, label="Basis Vector Nullspace Projection")
plt.legend()
plt.show() 
# Remainder after removing the nullspace part; the printed norm measures how
# much nullspace content is left (ideally zero).
beta_orth = beta_d1-beta_proj_nulls_
print(f"||NX.T@beta_orth|| {np.linalg.norm(NX.T@beta_orth):.2e}")

# beta_orth is overwritten here: later cells use the variant based on NX.
beta_orth = beta_d1-beta_proj_nulls
print(f"||NX.T@beta_orth|| {np.linalg.norm(NX.T@beta_orth):.2e}")
# The scipy implementation is better

# Now let's do the same for coefficients learned from the z-scored data
beta_d1_std = reg_coef_d1_std.iloc[:, 1]
beta_proj_nulls_std = project_reg_coeff_onto_space_by_basis(beta_d1_std, NX_std)
plt.plot(x_lfp, beta_proj_nulls_std, label="Basis Vector Nullspace Projection")
plt.legend()
plt.show()
beta_orth_std = beta_d1_std-beta_proj_nulls_std
print(np.linalg.norm(NX_std.T@beta_orth_std))

In [None]:
# Compare the training predictions of the projected coefficients (beta_orth)
# with those of the original coefficients (beta_d1).
print("Predition difference on  trainign data of beta_orth and beta_d1")
print(X_lfp_train@beta_orth - X_lfp_train@beta_d1)
# The predictions are unchanged, as expected/designed

# Now the same check for the z-scored data and coefficients.
print("Predition difference z-scored trainign data of beta_orth and beta_d1")
print(X_lfp_train_std@beta_orth_std - X_lfp_train_std@beta_d1_std)

In [None]:
# Two-panel figure: original D1 coefficients vs. their counterpart with the
# nullspace part removed, a) for the original data and b) for the z-scored data.
figsize = [20, 5]
fig, ax = plt.subplots(1, 2, figsize=figsize, constrained_layout=True, sharex=True, sharey=False)

# Markers are drawn on every tenth of the grid; the second curve of each panel
# starts half a step later so that the markers of both curves do not coincide.
markevery = int(len(x_lfp)/10)
shifted_markevery = (int(markevery/2), markevery)

# One entry per curve: (panel, coefficients, label, color id, marker, marker spacing).
curve_specs = [
    (0, beta_orth, r"$\beta_{D1\perp}^{FL}$", 4, 'o', markevery),
    (0, reg_coef_d1.iloc[:, 1], r"$\beta_{D1}^{FL}$", 1, 's', shifted_markevery),
    (1, beta_orth_std, r"$\beta_{D1\perp}^{FL}$ z-scored", 4, 'o', markevery),
    (1, reg_coef_d1_std.iloc[:, 1], r"$\beta_{D1}^{FL}$ z-scored", 1, 's', shifted_markevery),
]
for panel, coef, coef_label, color_id, marker_style, marker_spacing in curve_specs:
    fig, ax[panel] = vis_reg_coef(
        beta=coef,
        d=np.linspace(2.0, 3.5, 1000),
        y=y_log_cl,
        label=coef_label,
        fig=fig,
        ax=ax[panel],
        cid=color_id,
        marker=marker_style,
        markevery=marker_spacing,
        linewidth=2.0,
    )

# Panel letters.
trans = mtransforms.ScaledTranslation(-55 / 72, 20 / 72, fig.dpi_scale_trans)
for panel, letter in enumerate(("a)", "b)")):
    fig, ax[panel] = set_axis_label(fig, ax[panel], trans, label_str=letter, loc=(-0.03, 1.0))

for axis in ax:
    axis.set_xlabel("Voltage (V)")
ax[0].set_ylabel("Coefficients")

# Save figure
if save_plots:
    plt.savefig(save_plot_path + 'Fused_Lasso_Orthogonal_Nulls_Comp.pdf')

In [None]:
# Check the prediction accuracies of the projected vectors.
# NOTE(review): unlike the earlier error-table calls, split_eval_cycle is not
# passed here, so the helper's default applies -- confirm that this is intended.
df_rmse = predict_LFP_based_on_coef(
    X_, X_test1_, X_test2_, y_cl_train, y_cl_test, y_cl_test2, beta_orth, np.mean(y_log_cl), df_rmse, 'D1, Orthogonal')

df_rmse = predict_LFP_based_on_coef(
    X_lfp_train_std, X_test1_std, X_test2_std, y_cl_train, y_cl_test, y_cl_test2, beta_orth_std, np.mean(y_log_cl), df_rmse, 'D1 Z-Scored, Orthogonal')
# The prediction accuracies of the orthogonal regression coefficients are almost identical to the original regression coefficients.

# The original coefficients are evaluated again with the same call signature
# ('D1' and 'D1 Z-Scored' were already added to df_rmse in the error-table cell).
df_rmse = predict_LFP_based_on_coef(
    X_, X_test1_, X_test2_, y_cl_train, y_cl_test, y_cl_test2, reg_coef_d1.iloc[:, 1], np.mean(y_log_cl), df_rmse, 'D1')

df_rmse = predict_LFP_based_on_coef(
    X_lfp_train_std, X_test1_std, X_test2_std, y_cl_train, y_cl_test, y_cl_test2, reg_coef_d1_std.iloc[:, 1], np.mean(y_log_cl), df_rmse, 'D1 Z-Scored')

In [None]:
# PLS Coefficients are orthogonal to the nullspace by construction, so the norm
# of their projection onto the nullspace basis is expected to be (close to) zero.
beta_pls_1_nulls_projected = project_reg_coeff_onto_space_by_basis(nulls_lfp_cl.weights['PLS 1 Comp.'], NX)
print(np.linalg.norm(beta_pls_1_nulls_projected))

# Same check for the z-scored setting, using the nullspace basis of the z-scored data.
beta_pls_9std_nulls_projected = project_reg_coeff_onto_space_by_basis(nulls_lfp_cl.weights['PLS 9 Comp. std'], NX_std)
print(np.linalg.norm(beta_pls_9std_nulls_projected))

In [None]:
# Check the condition numbers of the parabolic data and the real data.


def _condition_numbers(X):
    """Return the condition numbers of ``X``, ``X.T@X`` and ``X@X.T`` (in that order)."""
    return np.linalg.cond(X), np.linalg.cond(X.T@X), np.linalg.cond(X@X.T)


# Parabolic data set: the first csv line is a header and the last column is
# dropped (presumably the response -- confirm against the file).
temp_n = np.loadtxt(data_path + "poly_hd_data_n.csv", skiprows=1, delimiter=",")
X_parab = temp_n[:, :-1]

cond_x_parab, cond_xtx_parab, cond_xxT_parab = _condition_numbers(X_parab)
cond_x_lfp, cond_xtx_lfp, cond_xxT_lfp = _condition_numbers(X_lfp_train)

# Report: parabolic data first, then the LFP training data.
print(f"Condition number of X_parab: {cond_x_parab:.2e}")
print(f"Condition number of X_parab.T@X_parab: {cond_xtx_parab:.2e}")
print(f"Condition number of X_parab@X_parab.T: {cond_xxT_parab:.2e}")
print(f"Condition number of X_lfp_train: {cond_x_lfp:.2e}")
print(f"Condition number of X_lfp_train.T@X_lfp_train: {cond_xtx_lfp:.2e}")
print(f"Condition number of X_lfp_train@X_lfp_train.T: {cond_xxT_lfp:.2e}")