<div class="alert alert-info" role="alert" 
     style="font-size: 1.6em; font-weight: bold; padding: 10px; margin: 10px 0; text-align: center;">
    
    XGBoost model training -- Round 2
</div>

In [None]:
# Data Wrangling
import glob
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import shap

# Data Plotting
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

<div class="alert alert-info" role="alert" 
     style="font-size: 1.8em; font-weight: bold; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    Data Preprocessing
</div>

### Load CSV files containing variables -- pandas' `read_csv` function

In [None]:
# Read the five exported variables into one DataFrame each.
# comment='#' makes pandas skip every line that starts with '#'.
# NOTE(review): the salinity filename ends in "SSs" while its siblings are fully
# upper-case -- confirm this matches the file on disk (matters on case-sensitive filesystems).
SSH, SST, SSS, VEL, MLD = (
    pd.read_csv(csv_path, comment='#')
    for csv_path in (
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_SSH.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_SST.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_SSs.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_VEL.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_MLD.csv",
    )
)

<div class="alert alert-info" role="alert" 
     style="font-size: 1.1em; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">

    Conspicuously missing from our training thus far has been TIME. 
    We're now going to include Month and Year to see if this improves the accuracy of our SST predictions.    
</div>

In [None]:
# Parse the ISO-8601 UTC strings held in the 'time' column (e.g. 2020-01-01T00:00:00.000Z).
parsed_utc = pd.to_datetime(SSH['time'], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)

# Round-trip through a 'YYYY-MM-DD HH:MM' string: this drops the timezone marker,
# the seconds and the fractional seconds, leaving naive minute-resolution datetimes.
naive_dates = pd.to_datetime(parsed_utc.dt.strftime('%Y-%m-%d %H:%M'))

# Swap the raw 'time' column for the cleaned dates plus integer month / year
# features, and keep the columns in the order: dates, mo, yr, zos (SSH values).
# Note: this cell consumes the 'time' column, so it can only be run once per load.
SSH = (
    SSH.drop(columns=['time'])
       .assign(dates=naive_dates,
               mo=naive_dates.dt.month,
               yr=naive_dates.dt.year)
       .loc[:, ['dates', 'mo', 'yr', 'zos']]
)

# Show the first 2 rows as a quick sanity check
print(SSH.head(2))

In [None]:
# Gather the date/time features and every variable into a single DataFrame.
# The columns are aligned on the row index of the individual frames.
df = pd.DataFrame({
    'Date':  SSH['dates'],
    'Month': SSH['mo'],
    'Year':  SSH['yr'],
    'SSH':   SSH['zos'],
    'SST':   SST['thetao'],
    'SSS':   SSS['so'],
    'Vuo':   VEL['uo'],
    'Vvo':   VEL['vo'],
    'MLD':   MLD['mlotst'],
})

# First 3 rows, a ':' separator line, then the total number of rows
print(df.head(3), ':', len(df), sep='\n')

### Set the `predictor` and `target` variables (X, y)

In [None]:
# Columns used as model inputs (predictor variables)
predictors = ['Month', 'Year', 'SSH', 'SSS', 'Vuo', 'Vvo', 'MLD']

# Pull the predictors (X) and the SST target (y) out as NumPy arrays
X = df[predictors].to_numpy()
y = df['SST'].to_numpy()

# Expected shapes: X is (n_samples, n_predictors), y is (n_samples,)
print(X.shape, y.shape)

### Split the data into two sets: `training` (80%) and `test` (20%)

In [None]:
# Hold out 20% of the rows for testing (test_size=0.2).
# The fixed random_state makes your train-test split identical on every run (reproducibility).
# NOTE(review): the rows are daily time-series values and the split is a random shuffle,
# so neighbouring days can end up on both sides of the split -- consider whether a
# chronological hold-out would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the 80% training : 20% testing split
print("Training set size:", X_train.shape[0])
print("Testing set size:",  X_test.shape[0])

<div class="alert alert-info" role="alert" 
     style="font-size: 1.5em; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    
    Hyperparameter Tuning
</div>

In [None]:
# Candidate values for each hyperparameter; the randomized search samples
# combinations from these lists.
param_grid = dict(
    # Step size shrinkage used to prevent overfitting
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    # Maximum depth of a tree, controlling model complexity
    max_depth=[3, 5, 7],
    # Minimum sum of instance weight (hessian) needed in a child
    min_child_weight=[1, 3, 5],
    # Fraction of samples to use for each tree
    subsample=[0.7, 0.8, 0.9, 1.0],
    # Fraction of features to consider for each tree
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    # Number of trees (boosting rounds) to build
    n_estimators=[100, 200, 300],
    # Minimum loss reduction required to make a split
    gamma=[0, 0.1, 0.5],
)

# Base model: an XGBoost regressor minimising squared error, seeded for reproducibility.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized search configuration:
#   - 100 sampled parameter combinations (n_iter), each scored with 5-fold CV (cv)
#   - R² as the selection metric (scoring)
#   - progress messages on (verbose), all CPU cores used (n_jobs=-1)
#   - fixed seed so the same combinations are sampled on every run (random_state)
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search using the training split only
random_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated R²
print("Best parameters:", random_search.best_params_)
print("Best CV R²: {:.2f}".format(random_search.best_score_))

### Yes! From `R²`= 0.71 to `R²`= 0.96. We can now save this 'even better' model using `save_model()`.

In [None]:
# Write the best estimator found by the randomized search to a JSON file,
# so that it can be reloaded later with load_model() without re-running the search.
random_search.best_estimator_.save_model("xgb_sst_model2.json")

In [None]:
# Explain the tuned model with SHAP: build a tree explainer around the search's
# best estimator and compute SHAP values for the held-out test rows.
explainer = shap.TreeExplainer(random_search.best_estimator_)
shap_vals = explainer.shap_values(X_test)

# Per-feature importance = mean of the absolute SHAP values over all test rows
shap_importance = np.mean(np.abs(shap_vals), axis=0)

# Table of features and their SHAP importance, most important feature first
shap_df = pd.DataFrame(
    {"Variable": predictors, "Mean Absolute SHAP": shap_importance}
).sort_values(by="Mean Absolute SHAP", ascending=False)

print(shap_df)

### Clearly, `Month` is an incredibly important feature, and `Year` is moderately important for predicting SST.

<div class="alert alert-info" role="alert" 
     style="font-size: 1.6em; font-weight: bold; padding: 10px; margin: 10px 0; text-align: center;">

    XGBoost model deployment -- Round 2

</div>

In [None]:
# Read the 'new' (deployment) variables into one DataFrame each.
# comment='#' makes pandas skip every line that starts with '#'.
# Note: these assignments reuse (and therefore replace) the SSH/SST/SSS/VEL/MLD
# frames loaded for training earlier in the notebook.
SSH, SST, SSS, VEL, MLD = (
    pd.read_csv(csv_path, comment='#')
    for csv_path in (
        "cmems_mod_glo_phy_zos_0.083deg_P1D-m.csv",
        "cmems_mod_glo_phy-sst_0.083deg_P1D-m.csv",
        "cmems_mod_glo_phy-sss_0.083deg_P1D-m.csv",
        "cmems_mod_glo_phy-vel_0.083deg_P1D-m.csv",
        "cmems_mod_glo_phy_mld_0.083deg_P1D-m.csv",
    )
)

In [None]:
# Parse the ISO-8601 UTC strings held in the 'time' column (e.g. 2020-01-01T00:00:00.000Z).
parsed_utc = pd.to_datetime(SSH['time'], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)

# Round-trip through a 'YYYY-MM-DD HH:MM' string: this drops the timezone marker,
# the seconds and the fractional seconds, leaving naive minute-resolution datetimes.
naive_dates = pd.to_datetime(parsed_utc.dt.strftime('%Y-%m-%d %H:%M'))

# Swap the raw 'time' column for the cleaned dates plus integer month / year
# features, and keep the columns in the order: dates, mo, yr, zos (SSH values).
# Note: this cell consumes the 'time' column, so it can only be run once per load.
SSH = (
    SSH.drop(columns=['time'])
       .assign(dates=naive_dates,
               mo=naive_dates.dt.month,
               yr=naive_dates.dt.year)
       .loc[:, ['dates', 'mo', 'yr', 'zos']]
)

# Gather the date/time features and every variable into a single DataFrame
new_df = pd.DataFrame({
    'Date':  SSH['dates'],
    'Month': SSH['mo'],
    'Year':  SSH['yr'],
    'SSH':   SSH['zos'],
    'SST':   SST['thetao'],
    'SSS':   SSS['so'],
    'Vuo':   VEL['uo'],
    'Vvo':   VEL['vo'],
    'MLD':   MLD['mlotst'],
})

In [None]:
# Columns used as model inputs -- same names and order as used for training
predictors = ['Month', 'Year', 'SSH', 'SSS', 'Vuo', 'Vvo', 'MLD']

# Features and true SST values of the new data, as NumPy arrays
X_new = new_df[predictors].to_numpy()
y_new = new_df['SST'].to_numpy()

# Expected shapes: X_new is (n_samples, n_predictors), y_new is (n_samples,)
print(X_new.shape, y_new.shape)

<div class="alert alert-info" role="alert" 
     style="font-size: 1.5em; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    
    Even-Better Model
</div>

In [None]:
# Restore the tuned model from disk: create an empty XGBoost regressor, then
# load into it the JSON model file written earlier by save_model().
sst_model = xgb.XGBRegressor()               # Create new XGBoost regressor
sst_model.load_model("xgb_sst_model2.json")  # Load the model saved to disk

In [None]:
# Predict SST for the new data with the reloaded model
sst_pred = sst_model.predict(X_new)

# Score the predictions against the true SSTs: R² and root-mean-squared error
new_r2 = r2_score(y_new, sst_pred)
new_mse = mean_squared_error(y_new, sst_pred)
new_rmse = np.sqrt(new_mse)

# Report both metrics to two decimal places
for template, score in (("New R²:   {:.2f}", new_r2),
                        ("New RMSE: {:.2f}", new_rmse)):
    print(template.format(score))

In [None]:
# One wide axes for the SST time series
fig1, ax = plt.subplots(1, 1, figsize=(15, 5))

# True SSTs (solid blue) against the model's predictions (dash-dot orange-red)
dates = new_df['Date']
ax.plot(dates, new_df['SST'], linestyle='-', c='blue', linewidth=1.0, label='True SSTs')
ax.plot(dates, sst_pred, linestyle='-.', c='orangered', linewidth=1.5, label='Predicted')

# Formatting: light dotted grid, x-range clipped to the data, y label, legend
ax.grid(True, color='silver', linestyle=':', linewidth=0.5)
first_date = np.nanmin(dates)
last_date = np.nanmax(dates)
ax.set_xlim([first_date, last_date])
ax.set_ylabel('SST °C', fontsize=10, weight='bold')
ax.legend()

# X-axis ticks: one major tick per month (interval = 1), labelled as 'mm-yy'
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%y'))

# Rotate the tick labels for better readability
plt.setp(ax.get_xticklabels(), rotation=40, ha='center')

plt.show()