# Function for GPR Plot
## Example Parameter: Avg Leaf Nitrogen Concentration (LNC) vs Leaf Carbon Nitrogen Ratio (LCN)
#### Author: Sofia Ingersoll

This is the list of the top 10 most common variables:
GPP, NBP, TOTVEGC, TLAI, EFLX_LH_TOT, SOILWATER_10CM, QRUNOFF, FSR, FAREA_BURNED, SNOWDP

In [1]:
# moved the libraries that were here into ml_utils.py because they're essential
# xarray is required to run the utils import line
import xarray as xr

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# import libraries & data pre-processing functions from utils.py
from ml_utils import *

In [4]:
# Request an additional 10 cores of power for processing from the server
client = get_cluster("UCSB0021", cores = 40)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36831 instead


In [5]:
# apply peer2peer network communication across multiple devices
client.cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/36831/status,Workers: 40
Total threads: 40,Total memory: 372.40 GiB

0,1
Comm: tcp://128.117.208.103:35731,Workers: 40
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/36831/status,Total threads: 40
Started: Just now,Total memory: 372.40 GiB

0,1
Comm: tcp://128.117.208.88:34499,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/43963/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:42843,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-x8prsydz,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-x8prsydz

0,1
Comm: tcp://128.117.208.100:41657,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/36799/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:34607,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-5krc3sic,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-5krc3sic

0,1
Comm: tcp://128.117.208.100:43773,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37257/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:37431,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-vodrv_je,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-vodrv_je

0,1
Comm: tcp://128.117.208.100:42243,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/38251/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:41119,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-y4_8_nmn,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-y4_8_nmn

0,1
Comm: tcp://128.117.208.88:43241,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/39693/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:36569,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-h5dngfjd,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-h5dngfjd

0,1
Comm: tcp://128.117.208.88:40587,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/38951/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:35213,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-71c72w6k,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-71c72w6k

0,1
Comm: tcp://128.117.208.103:37005,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/45511/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.103:40971,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-dmij7s3p,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-dmij7s3p

0,1
Comm: tcp://128.117.208.88:41865,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/42175/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:43823,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-049ka2dk,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-049ka2dk

0,1
Comm: tcp://128.117.208.100:37679,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/41685/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:33535,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-i_z6bfja,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-i_z6bfja

0,1
Comm: tcp://128.117.208.88:36795,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37673/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:32901,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-vu7e9cp6,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-vu7e9cp6

0,1
Comm: tcp://128.117.208.100:43373,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/38421/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:37423,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-zrevp95q,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-zrevp95q

0,1
Comm: tcp://128.117.208.112:33797,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/39621/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.112:33915,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-0fahr40p,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-0fahr40p

0,1
Comm: tcp://128.117.208.88:39959,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/43887/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:38311,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-jfs2ni_w,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-jfs2ni_w

0,1
Comm: tcp://128.117.208.88:44611,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/41873/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:38771,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-u53ha5bt,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-u53ha5bt

0,1
Comm: tcp://128.117.208.88:36579,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/40949/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:40307,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ym32wdth,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ym32wdth

0,1
Comm: tcp://128.117.208.88:41569,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/41433/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:43809,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-h51k__gq,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-h51k__gq

0,1
Comm: tcp://128.117.208.88:46629,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/41431/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:39027,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-f1x1ieb4,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-f1x1ieb4

0,1
Comm: tcp://128.117.208.88:44083,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37475/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:45669,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-atco0ucb,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-atco0ucb

0,1
Comm: tcp://128.117.208.88:44425,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/35549/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:42823,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-jbw8wqnt,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-jbw8wqnt

0,1
Comm: tcp://128.117.208.88:43431,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/44041/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:36277,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-qql82yb9,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-qql82yb9

0,1
Comm: tcp://128.117.208.100:42607,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/44565/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:43793,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-i58y8hhd,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-i58y8hhd

0,1
Comm: tcp://128.117.208.112:33433,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/44457/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.112:44441,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ab43zurr,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ab43zurr

0,1
Comm: tcp://128.117.208.109:40277,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37821/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.109:37561,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-0g99gfsa,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-0g99gfsa

0,1
Comm: tcp://128.117.208.100:36989,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/40465/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:32877,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ox9zndf6,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ox9zndf6

0,1
Comm: tcp://128.117.208.100:43705,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/45001/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:39427,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-r7ebg49v,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-r7ebg49v

0,1
Comm: tcp://128.117.208.103:40579,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/43435/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.103:39663,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-nwz7rk1b,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-nwz7rk1b

0,1
Comm: tcp://128.117.208.88:35627,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37391/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:38033,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-kbdsmeu1,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-kbdsmeu1

0,1
Comm: tcp://128.117.208.88:36839,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/43747/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:35583,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-sh0vz8jt,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-sh0vz8jt

0,1
Comm: tcp://128.117.208.100:44681,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37123/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:34539,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-z190b9bn,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-z190b9bn

0,1
Comm: tcp://128.117.208.88:37905,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/46399/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:37085,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-w6txr1wu,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-w6txr1wu

0,1
Comm: tcp://128.117.208.112:42235,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/35903/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.112:42141,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-pmn6nwbf,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-pmn6nwbf

0,1
Comm: tcp://128.117.208.88:46143,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/42585/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:33143,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-lxqd8tvb,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-lxqd8tvb

0,1
Comm: tcp://128.117.208.88:37851,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/34459/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:33813,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-q_ft9ifn,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-q_ft9ifn

0,1
Comm: tcp://128.117.208.88:42839,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/37687/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:39311,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-zxuclq_w,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-zxuclq_w

0,1
Comm: tcp://128.117.208.88:44369,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/34993/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:33469,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ba_ywsz2,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ba_ywsz2

0,1
Comm: tcp://128.117.208.112:35711,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/44267/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.112:39087,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-zfm48bbb,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-zfm48bbb

0,1
Comm: tcp://128.117.208.88:46751,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/38275/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:34673,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ennsxse5,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-ennsxse5

0,1
Comm: tcp://128.117.208.100:46769,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/33991/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.100:39723,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-a3jc9__3,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-a3jc9__3

0,1
Comm: tcp://128.117.208.88:35211,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/46631/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:37603,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-y5eqkl78,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-y5eqkl78

0,1
Comm: tcp://128.117.208.88:33969,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/singersoll/proxy/42037/status,Memory: 9.31 GiB
Nanny: tcp://128.117.208.88:40095,
Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-bd867d5m,Local directory: /glade/derecho/scratch/singersoll/tmp/dask-scratch-space/worker-bd867d5m


Prepare the data for GPR, including selecting the target variable and the parameters as features.

Split the data into training and testing sets.

Define a grid of hyperparameters for the GPR model.

Perform grid search cross-validation to find the best hyperparameters.

Fit the GPR model using the best hyperparameters on the training data.

Generate predictions and confidence intervals for each variable using the trained model.

Plot the results, including the observed data, predicted values, and confidence intervals.

### 1st Attempt

Testing new utils funct

In [11]:
param_avg = params

In [13]:
param_avg

In [7]:
da = subset_var_cluster('LNC')

In [8]:
# only runs once, then outputs an error -- add a cute message to shorten warning
var_avg = wrangle_var_cluster(da)

In [44]:
param_vals = param_avg.values
param_da = np.array(param_vals)

In [45]:
param_da

array(<bound method Mapping.values of <xarray.Dataset>
Dimensions:              (member: 500)
Coordinates:
  * member               (member) object 'LHC0001' 'LHC0002' ... 'LHC0500'
Data variables: (12/32)
    FUN_fracfixers       (member) float64 0.7783 0.3193 0.8768 ... 0.2621 0.4221
    KCN                  (member) float64 0.358 0.5488 0.2108 ... 0.9123 0.8392
    a_fix                (member) float64 0.255 0.6302 0.6621 ... 0.5339 0.1443
    crit_dayl            (member) float64 0.4012 0.27 0.3534 ... 0.9974 0.6571
    d_max                (member) float64 0.7978 0.5361 0.1409 ... 0.9076 0.4009
    fff                  (member) float64 0.1301 0.6792 ... 0.9918 0.02798
    ...                   ...
    stem_leaf            (member) float64 0.1138 0.1273 0.25 ... 0.3328 0.347
    sucsat_sf            (member) float64 0.7982 0.3947 0.1681 ... 0.3526 0.6515
    theta_cj             (member) float64 0.03646 0.09374 ... 0.8429 0.5055
    tpu25ratio           (member) float64 0.3936 0.04

In [46]:
param_reshaped = param_vals.reshape(500, 32)

AttributeError: 'function' object has no attribute 'reshape'

In [50]:
param_reshaped = param_avg.stack(features=("member", np.arange(33)))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [26]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Split Data 90/10        ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Reshaping the data for splitting
# Reshape into (500, 32) where each row corresponds to a member and each column corresponds to a feature
#param_reshaped = param_avg.stack(features=("member", list(np.arange(3))))
#param_reshaped = param_avg.stack(features=("member", tuple(np.arange(33))))  # Convert np.arange(32) to a tuple
#param_reshaped = param_avg.stack(features=("member", list(np.arange(3))))



x_train, x_test, y_train, y_test = train_test_split(param_reshaped,
                                                    var_avg.values.reshape(-1, 1),
                                                    test_size=0.1,
                                                    # setting a seed
                                                    random_state=0)

AttributeError: 'function' object has no attribute 'reshape'

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Set up Tune Grid       ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define a grid of hyperparameters for the GPR model
tune_grid = {
    "kernel": [ConstantKernel(constant_value=3, constant_value_bounds=(1e-4, 1e4) * RBF(length_scale=1.0) + WhiteKernel(noise_level=1e-4)],
    # Add more kernel configurations as needed
}

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Cross Validation        ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Perform grid search cross-validation to find the best hyperparameters
gpr = GaussianProcessRegressor(n_restarts_optimizer=20)

grid_search = GridSearchCV(gpr, tune_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)


In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Collect Metrics         ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Get the best hyperparameters
best_kernel = grid_search.best_params_['kernel']

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Finalize Workflow       ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Fit the GPR model using the best hyperparameters on the training data
gpr_best = GaussianProcessRegressor(kernel=best_kernel)
gpr_best.fit(x_train, y_train)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----         Fit Model            ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Generate predictions and confidence intervals for each variable using the trained model
y_pred, sigma = gpr_best.predict(x_pred, return_std=True)
# standard 0:1, set constant
x_pred = np.linspace(0,1,20)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----        Evaluate Model         ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# get accuracy of testing prediction, rmse
# get accuracy of testing prediction, rmse
y_pred_test = gpr_best.predict(x_test)
rmse = mean_squared_error(y_test, y_pred_test, squared=False)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----  User Selected ML Plotting Funct   ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# function to plot a cluster for to build on for ml
def cluster_ml_plot(param_avg, var_avg):


    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ----         Fit Model            ----
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Generate predictions and confidence intervals for each variable using the trained model
   # y_pred, sigma = gpr_best.predict(x_pred, return_std=True)     # to be included once upper portion of workflow optimized
   
    # standard 0:1, set constant
    x_pred = np.linspace(0,1,20)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ----         Plot Model           ----
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    plt.figure(figsize=(10, 6))
    plt.scatter(x_test, y_test, color='#62c900ff', label='Observed data')
    plt.plot(x_pred, y_pred, color='#134611', label='GPR Prediction')
    plt.fill_between(x_test.flatten(),
                     y_pred - 1.96 * sigma, y_pred + 1.96 * sigma,
                     alpha=0.5,
                     color='#9d6b53',
                     label = '95% Confidence Interval')
    # Set plot labels and title
    plt.xlabel(f"Perturbed Parameter: ({param_avg:})")  # ideas for later: param_name = getattr(param_avg)
    plt.ylabel(f"Variable: ({var_avg:})")
    plt.title('Parameter Value Uncertainty Estimation')
    plt.legend()
    # Show the plot
    plt.show()

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(param_avg.values,
                                                    var_avg.values, 
                                                    test_size=0.1, 
                                                    random_state=0)

# Instantiate and fit the Gaussian Process Regression model
gpr = GaussianProcessRegressor()
gpr.fit(x_train, y_train)

# Make predictions on the testing data
y_pred = gpr.predict(x_test)

# Evaluate the model using root mean squared error
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Print the RMSE
print("Root Mean Squared Error:", rmse)

## Custom Kernel: too big for realistic dashboard deployment
all psuedocode for now

In [None]:
from sklearn.gaussian_process.kernels import Kernel, ConstantKernel, RBF, Matern, WhiteKernel, DotProduct, RationalQuadratic, ExpSineSquared

class CustomKernel(Kernel):
    def __init__(self, constant_value=1.0, constant_value_bounds=(1e-3, 1e3),
                 length_scale=1.0, length_scale_bounds=(1e-3, 1e3),
                 nu=1.5, nu_bounds=(1.5, 2.5),
                 alpha=1e-10, alpha_bounds=(1e-12, 1e-2),
                 noise_level=1e-5, noise_level_bounds=(1e-8, 1e-3),
                 periodicity=1.0, periodicity_bounds=(1e-3, 1e3)):
        self.constant_value = constant_value
        self.constant_value_bounds = constant_value_bounds
        self.length_scale = length_scale
        self.length_scale_bounds = length_scale_bounds
        self.nu = nu
        self.nu_bounds = nu_bounds
        self.alpha = alpha
        self.alpha_bounds = alpha_bounds
        self.noise_level = noise_level
        self.noise_level_bounds = noise_level_bounds
        self.periodicity = periodicity
        self.periodicity_bounds = periodicity_bounds

    def __call__(self, X, Y=None, eval_gradient=False):
        # Constant Kernel
        constant_kernel = ConstantKernel(
            constant_value=self.constant_value,
            constant_value_bounds=self.constant_value_bounds
        )

        # Radial Basis Function (RBF) Kernel
        rbf_kernel = RBF(
            length_scale=self.length_scale,
            length_scale_bounds=self.length_scale_bounds
        )

        # Matern Kernel
        matern_kernel = Matern(
            length_scale=self.length_scale,
            length_scale_bounds=self.length_scale_bounds,
            nu=self.nu,
            nu_bounds=self.nu_bounds
        )

        # Automatic Relevance Determination (ARD) Kernel
        ard_kernel = RBF(
            length_scale=self.length_scale,
            length_scale_bounds=self.length_scale_bounds
        ) * DotProduct(sigma_0=self.alpha, sigma_0_bounds=self.alpha_bounds)

        # Noise Kernel
        noise_kernel = WhiteKernel(
            noise_level=self.noise_level,
            noise_level_bounds=self.noise_level_bounds
        )

        # Linear Kernel
        linear_kernel = DotProduct(sigma_0=0.0)

        # Periodic Kernel
        periodic_kernel = ExpSineSquared(
            length_scale=self.periodicity,
            periodicity=self.periodicity,
            length_scale_bounds=self.periodicity_bounds,
            periodicity_bounds=self.periodicity_bounds
        )

        kernels = [constant_kernel, rbf_kernel, matern_kernel, ard_kernel, noise_kernel, linear_kernel, periodic_kernel]

        kernel_expression = sum(kernel(X, Y, eval_gradient) for kernel in kernels)

        if eval_gradient:
            K, K_gradient = kernel_expression
            return K, K_gradient
        else:
            return kernel_expression


### This is an area that needs to be addressed this week. However, my focus is primarily on GPR to prepare for our meeting with Linnia the Post Doc

In [None]:
var_avg = wrangle_var_cluster(da)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Split Data 90/10        ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
x_train, x_test, y_train, y_test = train_test_split(param_avg.values.reshape(500, 32),
                                                    var_avg.values.reshape(-1, 1),
                                                    test_size=0.1,
                                                    # setting a seed
                                                    random_state=0)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----  User Selected ML Plotting Funct   ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# function to plot a cluster for to build on for ml
def cluster_ml_plot(param_avg, var_avg):


    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ----         Fit Model            ----
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Generate predictions and confidence intervals for each variable using the trained model
   # y_pred, sigma = gpr_best.predict(x_pred, return_std=True)     # to be included once upper portion of workflow optimized
   
    # standard 0:1, set constant
    x_pred = np.linspace(0,1,20)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ----         Plot Model           ----
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    plt.figure(figsize=(10, 6))
    plt.scatter(x_test, y_test, color='#62c900ff', label='Observed data')
    plt.plot(x_pred, y_pred, color='#134611', label='GPR Prediction')
    plt.fill_between(x_test.flatten(),
                     y_pred - 1.96 * sigma, y_pred + 1.96 * sigma,
                     alpha=0.5,
                     color='#9d6b53',
                     label = '95% Confidence Interval')
    # Set plot labels and title
    plt.xlabel(f"Perturbed Parameter: ({param_avg:})")  # ideas for later: param_name = getattr(param_avg)
    plt.ylabel(f"Variable: ({var_avg:})")
    plt.title('Parameter Value Uncertainty Estimation')
    plt.legend()
    # Show the plot
    plt.show()

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----     load data stored in casper     ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#------- Parameter Data---------
# x variable data for plotting
df = pd.read_csv('/glade/campaign/asp/djk2120/PPEn11/csvs/lhc220926.txt',index_col=0)
# the only dimension here is the 'member' aka file index id [LCH0001-500]
# convert to data set
params = xr.Dataset(df)
# subset in parameter file
leafcn = params['leafcn']

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----    variable wrangling     ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# apply the bug fixing & dim. reduction functions 
# read in variable cluster
da = read_all_simulations('LNC')

# feb. ncar time bug
da = fix_time(da)

# convert xr.ds to xr.da
da = da['LNC']

# weight gridcell dim by global land area
da_global = weight_landarea_gridcells(da,landarea)

# weight time dim by days in month
da_global_ann = yearly_weighted_average(da_global)

# take global avg for param over year dimension
lnc = da_global_ann.mean(dim='year')

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Split Data 90/10        ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
x_train, x_test, y_train, y_test = train_test_split(leafcn.values.reshape(-1, 1), lnc.values.reshape(-1, 1), test_size=0.1, random_state=0)

In [None]:
for p in params.data_vars:
    print(p)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Set up Tune Grid       ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define a grid of hyperparameters for the GPR model
tune_grid = {
    "kernel": [1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=1e-5)],
    # Add more kernel configurations as needed
}

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Cross Validation        ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Perform grid search cross-validation to find the best hyperparameters
gpr = GaussianProcessRegressor(n_restarts_optimizer=20)

grid_search = GridSearchCV(gpr, tune_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)


In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Collect Metrics         ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Get the best hyperparameters
best_kernel = grid_search.best_params_['kernel']

In [None]:
best_kernel

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Finalize Workflow       ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Fit the GPR model using the best hyperparameters on the training data
gpr_best = GaussianProcessRegressor(kernel=best_kernel)
gpr_best.fit(x_train, y_train)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----         Fit Model            ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Generate predictions and confidence intervals for each variable using the trained model
y_pred, sigma = gpr_best.predict(x_pred, return_std=True)
# standard 0:1, set constant
x_pred = np.linspace(0,1,20)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----        Evaluate Model         ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# get accuracy of testing prediction, rmse

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----        Visual Model Fit      ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot the results

plt.figure(figsize=(10, 6))
plt.scatter(x_test, y_test, color='#62c900ff', label='Observed data')
plt.plot(x_pred, y_pred, color='#134611', label='GPR Prediction')
plt.fill_between(x_test.flatten(), y_pred - 1.96 * sigma, y_pred + 1.96 * sigma, alpha=0.5, color='#9d6b53', label = '95% Confidence Interval')
plt.xlabel('Perturbed Parameter: Leaf Carbon to Nitrogen Ratio')
plt.ylabel('Variable: Leaf Nitrogen Concentration')
plt.title('Gaussian Process Regression Uncertainty Estimation')
plt.legend()
plt.show()

### 2nd attempt

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Set up Tune Grid         ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define a grid of hyperparameters for the GPR model
tune_grid = {
    "kernel": [1.0 * RBF(length_scale=1.0)],
    # Add more kernel configurations as needed
}

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Cross Validation        ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Perform grid search cross-validation to find the best hyperparameters
gpr = GaussianProcessRegressor(n_restarts_optimizer=20)

grid_search = GridSearchCV(gpr, tune_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----      Collect Metrics         ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Get the best hyperparameters
best_kernel = grid_search.best_params_['kernel']

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----         Fit Model            ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Generate predictions and confidence intervals for each variable using the trained model
y_pred, sigma = gpr_best.predict(x_test, return_std=True)

In [None]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----        Visual Model Fit      ----
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot the results
plt.figure(figsize=(10, 6))
plt.scatter(x_test, y_test, color='#62c900ff', label='Observed data')
plt.plot(x_test, y_pred, color='#134611', label='GPR Prediction')
# applying z-score for 95% CI
plt.fill_between(x_test.flatten(), y_pred - 1.96 * sigma, y_pred + 1.96 * sigma, alpha=0.5, color='#9d6b53', label = '95% Confidence Interval')
plt.xlabel('Perturbed Parameter: Leaf Carbon to Nitrogen Ratio')
plt.ylabel('Variable: Leaf Nitrogen Concentration')
plt.title('Gaussian Process Regression Uncertainty Estimation')
plt.legend()
plt.show()

### 3rd attempt

In [None]:
# for simplicity's sake, we're going to use the preloaded data rn
leafcn = params['leafcn']

def plot_gpr(param, var):
    '''describe the relationship between the selected variable 
    and parameter(s) over the selected period of time. output a
    best fit line of regression.'''
    # Plotting
    plt.scatter(x=param, y=var, color = '#62c900ff', alpha = 0.8)
    # Set plot labels and title
    plt.xlabel(param)
    plt.ylabel(var)
    plt.title('2005-2010 Global Average')

    # Set boundaries

    # Gaussian Process Regression
    kernel = C(1.0, (1e-2, 1e2)) * RBF(1.0, (1e-2, 1e2))
    gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=20)
    x = param.reshape(-1, 1)
    y = var
    gp.fit(x, y)
    x_pred = np.linspace(0, 1, 100).reshape(-1, 1)
    y_pred, sigma = gp.predict(x_pred, return_std=True)
    plt.plot(x_pred, y_pred, '#134611', label='Gaussian Process Regression')
    plt.fill_between(x_pred.flatten(), y_pred - sigma, y_pred + sigma, alpha=0.2, color='#9d6b53')

    # Show legend with confidence interval
    plt.legend(loc='upper right',  bbox_to_anchor=(1, 1), labels=['Data', 'Gaussian Process Regression', '95% Confidence Interval'])


    # Show the plot
    plt.show()

In [None]:
plot_gpr(leafcn, lnc)

In [None]:
# Example usage:
plot_gpr('leafcn', lnc)