## What is this?

A pared-down version of the decision-tree (XGBoost) model-training notebook, so that you can train a model locally.

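You will need to download the input data file from Google Drive (Sam will provide a link) and save it as `data/MLinput_ACCESS-ESM1-5_r10i1p1f1_mon_1x1_200401_202312.pkl`, which is the path the training cell reads.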


In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gcsfs
import datetime
import xarray as xr
import random
import xgboost as xgb     
from xgboost import XGBRegressor
import os
from pathlib import Path
from sklearn.model_selection import train_test_split

RSEED = 42

In [3]:
def apply_splits(X, y, train_val_idx, train_idx, val_idx, test_idx):
    """
    Apply the index sets found by 'train_val_test_split' to the data.

    Parameters
    ----------
    X : numpy.ndarray
        Feature array with one row per sample. Rows are selected
        positionally, so pass an array (e.g. ``df.to_numpy()``), not a
        pandas DataFrame.

    y : numpy.ndarray
        Target values, indexed along the first axis in step with ``X``.

    train_val_idx : array-like of int
        Indices for combined training and validation dataset

    train_idx : array-like of int
        Indices for training dataset

    val_idx : array-like of int
        Indices for validation dataset

    test_idx : array-like of int or None
        Indices for testing dataset. ``None`` means "no test set" (this is
        what 'train_val_test_split' returns when ``test_prop`` is 0); the
        test outputs are then empty (zero-row) arrays.

    Returns
    ----------
    X_train_val : numpy.ndarray
        Combined train and validation set
    X_train : numpy.ndarray
        Training set
    X_val : numpy.ndarray
        Validation set
    X_test : numpy.ndarray
        Test set (zero rows when ``test_idx`` is None)
    y_train_val : numpy.ndarray
        Target values for train and validation set
    y_train : numpy.ndarray
        Target values for training set
    y_val : numpy.ndarray
        Target values for validation set
    y_test : numpy.ndarray
        Target values for test set (zero rows when ``test_idx`` is None)
    """

    def _take(arr, idx):
        # Indexing with None does NOT mean "select nothing" in NumPy:
        # ``arr[None]`` inserts a new leading axis and returns every row.
        # Map a missing split to an explicit zero-row selection instead.
        if idx is None:
            return arr[:0]
        return arr[idx]

    split_indices = (train_val_idx, train_idx, val_idx, test_idx)

    X_train_val, X_train, X_val, X_test = (_take(X, idx) for idx in split_indices)
    y_train_val, y_train, y_val, y_test = (_take(y, idx) for idx in split_indices)

    return X_train_val, X_train, X_val, X_test, y_train_val, y_train, y_val, y_test

def train_val_test_split(N, test_prop, val_prop):
    """
    Get indices for splitting N samples into train / validation / test sets.

    Both splits are drawn with scikit-learn's 'train_test_split' using the
    module-level seed RSEED, so repeated calls with the same arguments give
    the same partition.

    Parameters
    ----------
    N : int
        Number of samples to partition (indices 0 .. N-1).

    test_prop : float
        Fraction of the N samples to use for testing. If it is not greater
        than 0, no test split is made.

    val_prop : float
        Fraction of the N samples to use for validation. When a test split
        is made it is rescaled by 1 / (1 - test_prop) before being applied
        to the remaining samples.

    Returns
    ----------
    intermediate_idx : list or numpy.ndarray
        Indices for combined training and validation dataset

    train_idx : list or numpy.ndarray
        Indices for training dataset

    val_idx : list or numpy.ndarray
        Indices for validation dataset

    test_idx : list or None
        Indices for testing dataset; None when no test split is made

    """
    make_test_split = test_prop > 0

    if make_test_split:
        # Carve off the test set first ...
        intermediate_idx, test_idx = train_test_split(
            range(N), test_size=test_prop, random_state=RSEED
        )
        # ... and rescale val_prop, because it is now applied to the
        # (1 - test_prop) share of the data that is left.
        val_fraction = val_prop / (1 - test_prop)
    else:
        # Everything is available for train/val.
        intermediate_idx = np.arange(N)
        test_idx = None
        val_fraction = val_prop

    train_idx, val_idx = train_test_split(
        intermediate_idx, test_size=val_fraction, random_state=RSEED
    )

    return intermediate_idx, train_idx, val_idx, test_idx



In [4]:
# Define date range
date_range_start = '2004-01-01T00:00:00.000000000'
date_range_end = '2023-12-31T00:00:00.000000000'

# One timestamp per month (freq='MS' = month start) across the range above.
dates = pd.date_range(start=date_range_start, end=date_range_end, freq='MS')

# First and last month as compact 'YYYYMM' strings.
init_date = f"{dates[0].year}{dates[0].month:02d}"
fin_date = f"{dates[-1].year}{dates[-1].month:02d}"

### train-validate-test split by month ###
# Every 5th month (positions 0, 5, 10, ...) is held out for testing (20%);
# the other months (80%) are used for training/validation.
test_dates = list(dates[::5])
select_dates = [stamp for pos, stamp in enumerate(dates) if pos % 5 != 0]

# Month keys in un-padded 'YYYY-M' form (e.g. '2004-1'). This has to match the
# format of the 'year_month' column that is built when the data are loaded.
year_mon = [f"{stamp.year}-{stamp.month}" for stamp in select_dates]
test_year_mon = [f"{stamp.year}-{stamp.month}" for stamp in test_dates]


# Hyper-parameters passed straight to XGBRegressor(**params).
params = {
    # --- objective and evaluation ---
    'objective': 'reg:squarederror',  # squared-error regression loss
    'eval_metric': 'rmse',  # metric reported on eval_set (and monitored for early stopping)

    # --- ensemble size and step size ---
    'n_estimators': 500,  # upper bound on the number of boosting rounds
    'learning_rate': 0.05,  # shrinkage applied to each new tree's contribution
    'early_stopping_rounds': 50,  # stop once the eval metric has not improved for 50 rounds

    # --- tree shape ---
    'max_depth': 6,  # depth limit per tree
    'gamma': 0.1,  # minimum loss reduction needed to make a further split
    'min_child_weight': 5,  # minimum sum of instance weight required in a child

    # --- row / column sampling ---
    'subsample': 0.8,  # fraction of rows sampled for each tree
    'colsample_bytree': 0.8,  # fraction of feature columns sampled for each tree

    # --- weight regularization ---
    'reg_alpha': 0.1,  # L1 penalty
    'reg_lambda': 1.0,  # L2 penalty

    # --- runtime ---
    'n_jobs': 30,  # number of parallel threads used for training
}

In [5]:
features_sel = ['sst','sst_anom','sss','sss_anom','mld_clim_log','chl_log','chl_log_anom','xco2','A', 'B', 'C', 'T0', 'T1']
val_prop = .2  # 20% of the training-month samples are held out for validation
test_prop = .0  # No random test split here: the held-out months (X_test below) are the test set.

# the target variable we reconstruct:
target_sel = ['pco2_residual']  # this represents pCO2 - pCO2-T (calculated in notebook 00)

file_path = 'data/MLinput_ACCESS-ESM1-5_r10i1p1f1_mon_1x1_200401_202312.pkl'
random_seeds = [42]  # not used below; kept in case other cells still reference it

# SECURITY NOTE: unpickling can execute arbitrary code. Only load a file
# obtained from a source you trust.
# Reading by path lets pandas open and close the file itself, so no handle is
# held open while the model trains.
df = pd.read_pickle(file_path)

# Month key per row, in the same un-padded 'YYYY-M' form as year_mon / test_year_mon.
time_values = df.index.get_level_values('time')
df['year'] = time_values.year
df['mon'] = time_values.month
df['year_month'] = df['year'].astype(str) + "-" + df['mon'].astype(str)

# Rows usable for reconstruction: no missing feature/target/net_mask value and
# a target strictly inside (-250, 250).
target_values = df[target_sel[0]]
has_all_inputs = ~df[features_sel + target_sel + ['net_mask']].isna().any(axis=1)
recon_sel = has_all_inputs & (target_values > -250) & (target_values < 250)

# Usable rows with socat_mask == 1 are used for fitting/testing; usable rows
# with socat_mask == 0 are the "unseen" ones.
sel = recon_sel & (df['socat_mask'] == 1)
unseen_sel = recon_sel & (df['socat_mask'] == 0)

# Month-wise split: rows from the training months vs. the held-out test months.
train_sel = (sel & df['year_month'].isin(year_mon)).to_numpy()
test_sel = (sel & df['year_month'].isin(test_year_mon)).to_numpy()

X = df.loc[sel, features_sel].to_numpy()
y = df.loc[sel, target_sel].to_numpy().ravel()

Xtrain = df.loc[train_sel, features_sel].to_numpy()
ytrain = df.loc[train_sel, target_sel].to_numpy().ravel()

X_test = df.loc[test_sel, features_sel].to_numpy()
y_test = df.loc[test_sel, target_sel].to_numpy().ravel()
N = Xtrain.shape[0]

# Random train/validation split of the training-month rows. test_prop is 0, so
# test_idx comes back as None and the *_test_tmp outputs carry no test data.
train_val_idx, train_idx, val_idx, test_idx = train_val_test_split(
    N, test_prop, val_prop
)
X_train_val, X_train, X_val, X_test_tmp, y_train_val, y_train, y_val, y_test_tmp = \
    apply_splits(Xtrain, ytrain, train_val_idx, train_idx, val_idx, test_idx)

model = XGBRegressor(
    random_state=RSEED,
    **params,
)
eval_set = [(X_val, y_val)]
# Fit on the training split only. X_train_val contains the validation rows, so
# fitting on it would let the model train on the very data that early stopping
# (params['early_stopping_rounds']) monitors, and the validation score would no
# longer be a held-out estimate.
model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=False
)

# Predictions for the held-out test months.
y_pred_test = model.predict(X_test)

