# Shallow Machine Learning Models: Multivariate Regression

<div class="alert alert-info" role="alert" 
     style="font-size: 1.1em; padding: 10px; margin: 10px 0; text-align: center;">
    
    Multivariate regression models the relationship between one dependent variable and two or more independent variables using a linear equation.
</div>

### Import Libraries including from `sklearn` for shallow ML

In [None]:
# Data Wrangling
import glob
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Data Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

<div class="alert alert-info" role="alert" 
     style="font-size: 1.8em; font-weight: bold; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    Data Preprocessing
</div>

### Option 1 to Load CSV files containing variables -- pandas' `read_csv` function

In [None]:
# Load each variable's CSV export into its own DataFrame.
# comment='#' makes pandas skip every line that starts with '#'.
# NOTE(review): the salinity file name ends in 'SSs.csv' (lower-case 's') while
# the others use upper-case suffixes -- confirm this matches the file on disk,
# since it will fail on case-sensitive file systems if it does not.
SSH, SST, SSS, VEL, MLD = (
    pd.read_csv(csv_path, comment='#')
    for csv_path in (
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_SSH.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_SST.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_SSs.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_VEL.csv",
        "cmems_mod_glo_phy_my_0.083deg_P1D-m_MLD.csv",
    )
)

In [None]:
# Turn the raw 'time' strings of SSH into a tidy 'dates' column.

# Step 1: parse the ISO-8601 UTC timestamps (e.g. 2020-01-01T00:00:00.000Z)
utc_stamps = pd.to_datetime(SSH['time'], format='%Y-%m-%dT%H:%M:%S.%fZ', utc=True)

# Step 2: render them as 'YYYY-MM-DD HH:MM' text -- this keeps hours and
# minutes but drops the seconds and the timezone marker
minute_labels = utc_stamps.dt.strftime('%Y-%m-%d %H:%M')

# Step 3: parse the simplified text back into datetimes and store it as 'dates'
SSH = SSH.assign(dates=pd.to_datetime(minute_labels))

# Step 4: drop the now unnecessary 'time' column
SSH = SSH.drop(columns=['time'])

# Keep the date first, followed by the sea-surface-height values
SSH = SSH[['dates', 'zos']]

# Display the first two rows as a sanity check
print(SSH.head(2))

In [None]:
# Gather the individual variables into one DataFrame, one column per variable.
# NOTE(review): the columns are matched on row index, not on timestamp, so this
# assumes every CSV holds the same rows in the same order -- confirm.
df = pd.DataFrame(dict(
    Date=SSH['dates'],
    SSH=SSH['zos'],
    SST=SST['thetao'],
    SSS=SSS['so'],
    Vuo=VEL['uo'],
    Vvo=VEL['vo'],
    MLD=MLD['mlotst'],
))
print(df.head(5))

### Option 2 to Load CSV files containing variables -- `glob` and pandas' `read_csv` function

### Set the `predictor` and `target` variables (X, y)

In [None]:
# Columns used as model inputs (SST itself is the quantity being predicted)
predictors = ['SSH', 'SSS', 'Vuo', 'Vvo', 'MLD']

# Feature matrix: one row per sample, one column per predictor
X = df.loc[:, predictors].to_numpy()
# Target vector: one SST value per sample
y = df.loc[:, 'SST'].to_numpy()

# Expected shapes: X is (n_samples, n_predictors), y is (n_samples,)
print(X.shape, y.shape)

### Split the data into two sets: `training` (80%) and `test` (20%)

In [None]:
# Hold back 20% of the samples (test_size=0.2) for testing; train on the rest.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Confirm the 80% / 20% split by counting the rows in each subset
print("Trainin set size:", len(X_train))
print("Testing set size:", len(X_test))

<div class="alert alert-info" role="alert" 
     style="font-size: 1.8em; font-weight: bold; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    
    Elastic Net Regularisation
</div>

In [None]:
# Configure (but do not yet fit) an Elastic Net whose hyper-parameters are
# chosen by 5-fold cross-validation.
# NOTE(review): the predictors are passed in unscaled; Elastic Net penalties
# depend on feature scale, so consider standardising X first.
elastic_net = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.9],             # candidate Lasso (L1) / Ridge (L2) mixes
    alphas=np.logspace(-3, 1, num=100),   # 100 penalty strengths from 1e-3 to 1e1
    cv=5,
    random_state=42,
)

In [None]:
# Train on the training subset, then predict SST for the held-out test subset.
# fit() returns the fitted estimator itself, so the two steps can be chained.
y_pred = elastic_net.fit(X_train, y_train).predict(X_test)

<div class="alert alert-info" role="alert" 
     style="font-size: 1.8em; font-weight: bold; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    Evaluating Model Performance
</div>

### Metrics for Multivariate Regression Model: `R2` and `RMSE`

In [None]:
# Evaluate the fitted model on the held-out test set.

# R² (coefficient of determination): for a regressor, score() returns R².
r_sqd = elastic_net.score(X_test, y_test)
# Alias kept so any later cell that still refers to `split_score` keeps working;
# it is the same quantity, so there is no need to call score() a second time.
split_score = r_sqd

# RMSE: take the square root of the MSE explicitly. The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# so relying on it raises a TypeError on current releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print both metrics with two decimal places
print("Multivar Elastic Net R²:   {:.2f}".format(r_sqd))
print("Multivar Elastic Net RMSE: {:.2f}".format(rmse))

### Those are better scores! But which of the additional variable/s (feature/s) have contributed to this improvement?

<div class="alert alert-info" role="alert" 
     style="font-size: 1.8em; font-weight: bold; padding: 15px; margin: 10px 0; text-align: center; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">
    Feature Importance
</div>

In [None]:
# Pair every fitted coefficient with the name of its predictor
feature_importance = pd.Series(data=elastic_net.coef_, index=predictors)

# Order by signed coefficient value, largest positive first.
# NOTE(review): this is a signed ordering, so a strongly negative coefficient
# lands at the bottom even though its magnitude may be large; the predictors
# are also fed to the model unscaled, so magnitudes depend on each variable's units.
regress_df = feature_importance.sort_values(ascending=False)
print('Feature Importance:\n', regress_df)

In [None]:
# Horizontal bar chart of the Elastic Net coefficients
fig2 = plt.figure(figsize=(6, 4))

# Seaborn's diverging "Spectral" palette, requested as a matplotlib colormap
cmap = sns.color_palette("Spectral", as_cmap=True)

# Percentile rank of each coefficient (a value in (0, 1]) picks its bar colour
bar_colours = cmap(regress_df.rank(pct=True))

# One bar per predictor, in the order stored in regress_df
axes = sns.barplot(
    x=regress_df.values,
    y=regress_df.index,
    palette=bar_colours,
)

# Title, axis labels and a dashed vertical grid
axes.set_title("Feature Importance (Elastic Net Coefficients)", fontweight='bold')
axes.set_xlabel("Coefficient Value", fontsize=10)
axes.set_ylabel("Features", fontsize=10)
axes.grid(axis='x', linestyle='--', alpha=0.7)

# Display
plt.show()

<div class="alert alert-info" role="alert" 
     style="font-size: 1em; padding: 15px; margin: 10px 0; text-align: left; background-color: #d9edf7; border-color: #bce8f1; color: #31708f; border-radius: 8px;">

    ⦾ SSH appears to be the dominant predictor with the highest Elastic Net coefficient.
    
    ⦾ However, we must remember that ElasticNet applies both L1 and L2 penalties, which shrink all coefficients. 
       - Good: this regularisation reduces overfitting.
       - Less good: other predictors can appear less important — even if they contribute, and especially when they're correlated.
        
    ⦾ Correlated predictors tend to share predictive power, so the penalty may disproportionately reduce their individual 
      coefficients, making it seem like they have little value in the ranking.
    
</div>

### Ultimately, we've likely reached the limit of what linear models can do for us. If we're dealing with `non-linearity`... we need to upgrade to a model that can account for this!

In [None]:
# Create a figure with 1 row and 3 columns of subplots: one scatterplot of SST
# against each of SSH, SSS and MLD.
fig3, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))

# One entry per panel: (axes, predictor column, marker colour, x-axis limits).
# None for the limits leaves matplotlib's automatic range in place.
panels = [
    (ax1, 'SSH', 'purple', None),
    (ax2, 'SSS', 'orange', [35.3, 36.7]),
    (ax3, 'MLD', 'teal', [10, 100]),
]

for panel_ax, predictor, colour, x_limits in panels:
    # Scatterplot: <predictor> vs. SST
    panel_ax.scatter(df[predictor], df['SST'], color=colour, alpha=0.7, marker='.', s=10)
    panel_ax.set_xlabel(predictor)                          # Label for the x-axis
    panel_ax.set_ylabel('SST')                              # Label for the y-axis
    if x_limits is not None:
        panel_ax.set_xlim(x_limits)                         # Restrict the x-axis range
    panel_ax.set_title(f'{predictor} vs. SST', weight='bold')  # Title for this subplot
    panel_ax.grid(True, linestyle=':', linewidth=0.5)       # Dotted grid for clarity

# Adjust layout to prevent overlapping elements
plt.tight_layout()
# Display
plt.show()