# Modelling Rogue Wave Data with Elastic Net Regression Model

In [None]:
import os
import sys
import pickle

sys.path.append('./')
import utils

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

import matplotlib.pyplot as plt

In [21]:
# Report how many logical CPU cores are visible on the current machine.
available_cores = os.cpu_count()  # can be None when the count is undetermined
print(available_cores)

# Use up to 10 parallel workers, but never more than the machine provides, so
# that smaller hosts are not oversubscribed.
n_jobs = min(10, available_cores or 1)
# Fixed seed shared by the steps that accept a random state.
seed = 42

12


## Loading Rogue Wave Data

Loading the data that was preprocessed in `data_preprocessing.ipynb`.

In [None]:
# Select which preprocessed dataset variant to model.
case = 1

# Read the pickled bundle for the selected case.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# open pickles that come from a trusted source.
data_path = f'./data_case{case}.pickle'
with open(data_path, 'rb') as handle:
    data = pickle.load(handle)

# The bundle is positional: train features, test features, then the
# train/test `*_cat` labels.
X_train, X_test = data[0], data[1]
y_train_cat, y_test_cat = data[2], data[3]

# The regression target is stored as the 'AI_10min' column of the features.
target_column = 'AI_10min'
y_train = X_train[target_column]
y_test = X_test[target_column]

# Remove the target from the features so it is not used as a predictor.
X_train = X_train.drop(columns=[target_column])
X_test = X_test.drop(columns=[target_column])

In [16]:
# Standardise the features. The scaling statistics are learned from the
# training set only and then re-used unchanged for the test set.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

### Train the Model

Fit an Elastic Net model with `l1_ratio` fixed at 0.5 (as suggested for this type of data) and tune only the regularization strength (`alpha`) with cross-validation.

In [None]:
# Fit an Elastic Net regressor, selecting the regularisation strength (alpha)
# by cross-validation.
num_cv = 5

# Folds are stratified on the `*_cat` labels. The splits are materialised into
# a list on purpose: the estimator keeps a reference to its `cv` argument, and
# a bare generator would be exhausted after the first fit and would make the
# fitted estimator impossible to pickle.
skf_gen = list(StratifiedKFold(num_cv).split(X_train_transformed, y_train_cat))

# l1_ratio is given as a single candidate (0.5), so only alpha is searched.
model = ElasticNetCV(cv=skf_gen, l1_ratio=[0.5], n_jobs=n_jobs, random_state=seed)
model.fit(X_train_transformed, y_train)

In [None]:
# Predict the regression target for the scaled test features
# (the model outputs continuous values, so no decision threshold is involved)
y_pred = model.predict(X_test_transformed)

### Evaluate the Model

In [None]:
# Compute the test-set metrics first, then report each rounded to 3 decimals.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
# Spearman's rank correlation between true and predicted values.
test_spearman = spearmanr(y_test, y_pred).correlation

print(f"MSE: {round(test_mse, 3)}")
print(f"R^2: {round(test_r2, 3)}")
print(f"Spearman R: {round(test_spearman, 3)}")

In [None]:
# Scatter the predictions against the true values on a square canvas.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, y_pred, alpha=0.7, color='b')

# Dashed red reference line y = x spanning the range of the true values;
# points lying on it are perfect predictions.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel("True Values")
ax.set_ylabel("Predicted Values")
ax.grid(True)
plt.show()

## Save the Model

In [None]:
# Persist the data splits together with the fitted model using pickle
data_and_model = [X_train, X_test, y_train, y_test, y_train_cat, y_test_cat, model]

# The path has no placeholders, so a plain string literal is used (no f-string).
with open('./model_elnet_regression.pickle', 'wb') as handle:
    pickle.dump(data_and_model, handle, protocol=pickle.HIGHEST_PROTOCOL)