# Modelling Rogue Wave Data with Random Forest Regression Model

In [None]:
import os
import sys
import pickle
import pandas as pd

sys.path.append('./')
import utils

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
from fgclustering import FgClustering

import matplotlib.pyplot as plt

In [None]:
# Report how many CPU cores the current machine exposes, as a reference for
# the degree of parallelism chosen below.
print(os.cpu_count())

# Shared settings: number of parallel workers and the random seed.
n_jobs, seed = 10, 42

## Loading Rogue Wave Data

Loading the data that was preprocessed in `data_preprocessing.ipynb`.

In [None]:
# Selects which preprocessed dataset file is loaded (part of the file name).
case = 1

# Read the pickled dataset.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(f'./data_case{case}.pickle', 'rb') as handle:
    data = pickle.load(handle)

# Unpack by position: train features, test features, and the categorical
# labels for the train and test splits.
X_train, X_test, y_train_cat, y_test_cat = data[0], data[1], data[2], data[3]

# The regression target is stored as a feature column: pull it out as y and
# remove it from the feature matrices.
target_column = 'AI_10min'
y_train = X_train[target_column]
y_test = X_test[target_column]
X_train = X_train.drop(columns=[target_column])
X_test = X_test.drop(columns=[target_column])

## Building a Random Forest Regression Model

### Setting Random Forest Hyperparameters

In [None]:
# Hyperparameter grid for the random forest: each key is an estimator
# argument, each value the list of candidate settings to try.
hyper_grid_classifier = dict(
    n_estimators=[250, 750],
    max_depth=[5, 10, 20],
    max_samples=[0.8],
    criterion=['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    max_features=['sqrt', 'log2'],
)

### Train the Model

In [None]:
# Base estimator for the grid search (the variable keeps the name "classifier"
# although it is a regressor). We set oob_score=True, as OOB is a good
# approximation of the validation set score.
classifier = RandomForestRegressor(
    n_jobs=n_jobs,
    random_state=seed,
    oob_score=True,
)

# Build the cross-validation folds stratified on the categorical labels
# (y_train_cat) rather than on the continuous regression target.
num_cv = 5
skf_gen = StratifiedKFold(n_splits=num_cv).split(X_train, y_train_cat)

# Exhaustive search over the hyperparameter grid, fitted on the continuous target.
gridsearch_classifier = GridSearchCV(
    estimator=classifier,
    param_grid=hyper_grid_classifier,
    cv=skf_gen,
)
gridsearch_classifier.fit(X_train, y_train)

### Evaluate the Model

In [None]:
# Check the results. No `scoring` was passed to GridSearchCV, so best_score_ is
# the regressor's default score (R^2), not an accuracy; report it as such.
best_cv_score = round(gridsearch_classifier.best_score_, 3)
print(f'The mean cross-validated R^2 score of the best model is {best_cv_score} and the parameters of best prediction model are:')
print(gridsearch_classifier.best_params_)

In [None]:
# Take the best estimator found by the grid search
model = gridsearch_classifier.best_estimator_

# Predict the regression target for the test features
y_pred = model.predict(X_test)

In [None]:
# Test-set metrics: mean squared error, coefficient of determination and
# Spearman rank correlation, each printed rounded to three decimals.
test_metrics = (
    ("MSE", mean_squared_error(y_test, y_pred)),
    ("R^2", r2_score(y_test, y_pred)),
    ("Spearman R", spearmanr(y_test, y_pred).correlation),
)
for metric_name, metric_value in test_metrics:
    print(f"{metric_name}: {round(metric_value, 3)}")

In [None]:
# Scatter predicted against true test values on a square canvas; points on the
# dashed red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, y_pred, alpha=0.7, color='b')

# Line y = x for reference, spanning the range of the true values
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel("True Values")
ax.set_ylabel("Predicted Values")
ax.grid(True)
plt.show()

### Save the Model

In [None]:
# Bundle the data splits with the fitted model and persist them with pickle.
# The list order is what the loading code relies on when unpacking by index.
data_and_model = [X_train, X_test, y_train, y_test, y_train_cat, y_test_cat, model]

model_path = './model_randomforest_regression.pickle'
with open(model_path, 'wb') as handle:
    pickle.dump(data_and_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Load the Model

In [None]:
# Load the pickled bundle and unpack it in the order it was saved:
# [X_train, X_test, y_train, y_test, y_train_cat, y_test_cat, model].
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./model_randomforest_regression.pickle', 'rb') as handle:
    data_and_model = pickle.load(handle)

X_train = data_and_model[0]
X_test = data_and_model[1]
y_train = data_and_model[2]
y_test = data_and_model[3]
# Indices 4 and 5 hold the categorical labels; they must not overwrite the
# continuous targets, otherwise the R^2 checks below compare against categories.
y_train_cat = data_and_model[4]
y_test_cat = data_and_model[5]
model = data_and_model[6]

# is the model performing reasonably on the training data?
print(f'Model Performance on training data: {round(r2_score(y_train, model.predict(X_train))*100,2)} R^2.')

# is the model performing reasonably on the test data?
print(f'Model Performance on test data: {round(r2_score(y_test, model.predict(X_test))*100,2)} R^2.')

## Explainability for Random Forest Model

### Random Forest Feature Importance

An alternative to Permutation Feature Importance is the Random Forest specific feature importance method based on the mean decrease in impurity. The mean decrease in impurity is defined as the total decrease in node impurity averaged over all trees of the ensemble. This feature importance is directly provided by the fitted attribute `feature_importances_`.

Let's plot the feature importance based on mean decrease in impurity:

In [None]:
# Plot the fitted forest's feature_importances_, labelled with the training
# feature names.
utils.plot_impurity_feature_importance(
    model.feature_importances_,
    names=X_train.columns,
    title="Random Forest Feature Importance",
)

### Interpretation with Forest-Guided Clustering

In [None]:
# FGC input table: a copy of the training features with the regression target
# appended as the "target" column (X_train itself is left unchanged).
data_fgc = X_train.assign(target=y_train)

Run FGC with subsampling

In [None]:
number_of_iterations = 25   # Number of times dataset will be subsampled
sample_size = 1000           # Number of samples in the subsampled dataset
max_K = 10                   # Maximum K for the FGC

# Tally of how often each k was selected across the subsampling runs
k_result = {candidate_k: 0 for candidate_k in range(1, max_K)}

for i in range(number_of_iterations):
    # Sample the dataset without replacement; the iteration index is the
    # random state, so every run draws a different but reproducible subset.
    data_sample = data_fgc.sample(n=sample_size, replace=False, random_state=i)

    # Instantiate FGC on the subsample with the trained forest. The original
    # referenced an undefined name `rf`; the fitted estimator is `model`.
    fgc = FgClustering(model=model, data=data_sample, target_column='target')
    fgc.run(
        method_clustering='pam',
        max_K=max_K,
        discart_value_JI=0.60,
        bootstraps_JI=100,
        n_jobs=n_jobs,
        verbose=0,
    )

    # Record the selected k; .get() avoids losing a long run to a KeyError
    # should FGC report a k outside the pre-initialised range.
    k_result[fgc.k] = k_result.get(fgc.k, 0) + 1

pd.DataFrame(k_result.items(), columns=['k','count']).sort_values(by='count', ascending=False).reset_index(drop=True)

In [None]:
# Number of clusters used for the final FGC run (passed as number_of_clusters).
# NOTE(review): presumably chosen from the subsampling counts — confirm.
k = 8

In [None]:
# create the fgc object
fgc = FgClustering(model=model, data=data_fgc, target_column="target")

fgc.run(
    number_of_clusters = k, method_clustering = 'pam', 
    bootstraps_JI = 100, bootstraps_p_value = 100, discart_value_JI = 0.6
    ,n_jobs = n_jobs, verbose = 2 
)

Interpreting RandomForestRegressor


In [None]:
# Add the model's predictions on the training features as an extra column,
# then recompute the FGC statistics on the extended table (the target column
# stays "target").
train_predictions = model.predict(X_train)
data_fgc["target_predicted"] = train_predictions
fgc.calculate_statistics(data=data_fgc, target_column='target')

In [None]:
# Show the FGC feature-importance and decision-path plots for the fitted clustering.
fgc.plot_feature_importance()
fgc.plot_decision_paths()