In [None]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import netCDF4 as nc
import pandas as pd
import os 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

%matplotlib inline

# Data Preparation

In [None]:
# The CSV has no header row, so the column names are supplied here in file order.
colnames = [
    'day', 'month', 'year', 'lon', 'lat',
    'Qnet', 'slp', 'sat', 'wind_speed', 'sst', 'sstRoC',
    'mhw_categories',
]
allvars = pd.read_csv('balance_noland_2lag.csv', header=None, names=colnames)

In [None]:
# Preview the first rows to check that the columns were parsed as expected
allvars.head()

In [None]:
# Prediction target: the 'mhw_categories' column, pulled out as a standalone NumPy array
labels = np.array(allvars.loc[:, 'mhw_categories'])

In [None]:
# Everything except the target column is a predictor
allvars = allvars.drop(columns='mhw_categories')
# Keep the predictor names; they are lost once the frame is converted to a NumPy array
allvars_list = allvars.columns.tolist()
allvars_list

In [None]:
# Convert to numpy array
# NOTE(review): this rebinds `allvars` from a DataFrame to an ndarray, so the earlier
# cells that index it by column name cannot be re-run without reloading the CSV first.
allvars = np.array(allvars)

In [None]:
# Hold out 40% of the rows for testing; the remaining 60% are used for training.
# A fixed random_state seeds the shuffle so the same rows land in each split on every run.
train_allvars, test_allvars, train_labels, test_labels = train_test_split(
    allvars,
    labels,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Sanity check: report the dimensions of each split
for _caption, _split in (
    ('Training allvars Shape:', train_allvars),
    ('Training Labels Shape:', train_labels),
    ('Testing allvars Shape:', test_allvars),
    ('Testing Labels Shape:', test_labels),
):
    print(_caption, _split.shape)

# Randomized Cross Validation

In [None]:
# Candidate hyperparameter values sampled by the randomized search.

# Number of trees in the forest: 200, 300, ..., 1000
n_estimators = list(range(200, 1001, 100))
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers it was
# only an alias of 'sqrt', so the old grid listed the same setting twice. None (consider
# every feature) is offered instead so that the two options genuinely differ
# ('log2' would resolve to the same count as 'sqrt' for the 11 predictors used here).
max_features = ['sqrt', None]
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None for unlimited depth
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or on the full training set (False)
bootstrap = [True, False]
# Assemble the search space; keys must match RandomForestClassifier parameter names
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune. The fixed random_state makes tree construction, and therefore the
# cross-validation scores and the selected parameters, repeatable between runs.
rf = RandomForestClassifier(random_state=42)
# Randomized search: 100 sampled parameter combinations, each scored with 3-fold
# cross-validation, run on 3 parallel workers (n_jobs=-1 would use every available core).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=3,
)
# Fit the random search model
rf_random.fit(train_allvars, train_labels)

In [None]:
# Show the winning hyperparameter combination. It is printed explicitly because a
# notebook cell only auto-displays its final expression.
pprint(rf_random.best_params_)

# Full cross-validation results as a table; preview the first rows
df_cv_results = pd.DataFrame(rf_random.cv_results_)
df_cv_results.head()

In [None]:
# Define the evaluation helper
def evaluate(model, test_allvars, test_labels):
    """Score a fitted model with a MAPE-style metric and print a short report.

    Predictions and labels are both shifted by +1 before the percentage error is
    computed, so that a label of 0 does not lead to a division by zero (this assumes
    labels are non-negative). The value returned as "accuracy" is ``100 - MAPE`` on
    the shifted labels; it is not the fraction of correctly classified samples.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns numeric labels.
    test_allvars : array-like
        Feature matrix handed to ``model.predict`` unchanged.
    test_labels : array-like
        True numeric labels, one per row of ``test_allvars``.

    Returns
    -------
    float
        ``100 - MAPE`` in percent.
    """
    # Coerce to arrays so plain lists work too (list + 1 would raise TypeError)
    predictions = np.asarray(model.predict(test_allvars))
    test_labels = np.asarray(test_labels)
    predictions1 = predictions + 1
    test_labels1 = test_labels + 1
    errors = np.abs(predictions1 - test_labels1)
    mape = 100 * np.mean(errors / test_labels1)
    accuracy = 100 - mape
    print('Model Performance')
    # The error is measured in label steps, not in a physical unit such as degrees
    print('Average Error: {:0.4f} class levels.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Baseline: a small 10-tree forest with otherwise default settings
base_model = RandomForestClassifier(random_state=42, n_estimators=10).fit(train_allvars, train_labels)
base_accuracy = evaluate(base_model, test_allvars, test_labels)

# Tuned model: the best estimator found by the randomized search
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_allvars, test_labels)

# Relative gain of the tuned model over the baseline, in percent
improvement_pct = 100 * (random_accuracy - base_accuracy) / base_accuracy
print('Improvement of {:0.2f}%.'.format(improvement_pct))

# Random Forest
# Run RF with best model identified by CV 

In [None]:
# RandomForestClassifier is already imported in the notebook's import cell, so it is
# not re-imported here.
# Final model: 800 trees with explicit split/leaf settings and a fixed seed.
# NOTE(review): these values are presumably copied from rf_random.best_params_ of an
# earlier run -- confirm they still match whenever the search is re-run.
rfc = RandomForestClassifier(n_estimators = 800, min_samples_split=2,min_samples_leaf=1, random_state = 42) # oob_score=True, bootstrap=True
# Train the model on training data (trailing ';' suppresses the estimator repr)
rfc.fit(train_allvars, train_labels);

In [None]:
# Mean accuracy on the data the forest was fitted on versus the held-out data
rfc_train_score = rfc.score(train_allvars, train_labels)
rfc_test_score = rfc.score(test_allvars, test_labels)
print('Score against train set: ', rfc_train_score)
# print('OOB Score: ', rfc.oob_score_) # only if oob_score=True (which requires bootstrap=True)
print('Score against test set: ', rfc_test_score)

In [None]:
# Predicted class for every test sample
yc_pred_test = rfc.predict(test_allvars)
# Shift predictions and true labels by one so the MAPE division further down does not
# hit a zero label (assumes the smallest label is 0)
yc_pred_test1 = 1 + yc_pred_test
test_labels1 = 1 + test_labels

In [None]:
# Per-sample absolute difference between the shifted predictions and shifted true labels
errorsc = np.abs(yc_pred_test1 - test_labels1)
# Report the mean absolute error (MAE), rounded to two decimals
print('Mean Absolute Error:', round(errorsc.mean(), 2))

In [None]:

# Per-sample absolute percentage error, relative to the shifted true label
mapec = (errorsc / test_labels1) * 100
# "Accuracy" here means 100 minus the mean absolute percentage error (MAPE)
accuracyc = 100 - mapec.mean()
print('Accuracy:', round(accuracyc, 2), '%.')

In [None]:
# Show the full hyperparameter set of the fitted forest as reported by get_params()
print('Parameters currently in use:\n')
pprint(rfc.get_params())

In [None]:
# View confusion matrix for test data and predictions
# (first argument = true labels -> rows, second argument = predictions -> columns)
confusion_matrix(test_labels, yc_pred_test)

In [None]:
# Confusion matrix of raw counts (rows: true label, columns: predicted label)
matrix = confusion_matrix(test_labels, yc_pred_test)
# Normalise each row so it shows the fraction of that true class assigned to each
# predicted class. A row with no samples is left at 0 instead of becoming NaN (0/0).
row_totals = matrix.sum(axis=1, keepdims=True)
matrix = np.divide(
    matrix,
    row_totals,
    out=np.zeros(matrix.shape, dtype=float),
    where=row_totals != 0,
)

# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# Add labels to the plot
# NOTE(review): assumes the matrix has exactly these five classes, in this order -- confirm
class_names = ['90th Percentile','Moderate', 'Strong', 'Severe', 
               'Extreme']
tick_marks = np.arange(len(class_names))
# Heatmap cell i spans [i, i + 1] on both axes, so a label belongs at i + 0.5.
# Both axes use the centred positions; integer positions would put the x labels on
# the cell borders.
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()

In [None]:
# Print scikit-learn's text classification report for the test-set predictions
print(classification_report(test_labels, yc_pred_test))