<a href="https://colab.research.google.com/github/MCRLdata-Sandbox/tutorials/blob/main/MCRLdata_ML_tutorial_Part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hands-on Random Forests

Welcome to the fun part of the tutorial! This notebook is a simple, interactive way to understand how the parameters you can change in your model influence the model's performance. This portion of the tutorial is written in Python, which makes it easier to interact with models in Jupyter Notebooks. When you run the code chunk below (press the play button), check boxes and sliders will appear for the model parameters you can manipulate. Adjust them, then click the **Run Model** button to fit the model and update the plots:

  1. check boxes - variables to include as predictors
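  2. sliders - `split_ratio` (fraction of the data used for training), `ntree` (number of trees in the forest), and `mtry` (maximum number of predictors considered at each split)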

Note that this code uses a different package (scikit-learn) to construct Random Forests. While the results are similar, they will not be identical to those in the previous tutorial. Happy sliding and checking!

*NOTE: If you are looking for the tutorial walking through the basics of Random Forests instead, please check out the first part of this tutorial [here](https://colab.research.google.com/github/MCRLdata-Sandbox/tutorials/blob/main/ML_tutorial.ipynb#scrollTo=-UaHjmisqQaT)*

In [39]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import PartialDependenceDisplay
from ipywidgets import widgets, VBox, HBox, Output

# Load the tutorial dataset straight from the GitHub repository
df1 = pd.read_csv(
    "https://raw.githubusercontent.com/MCRLdata-Sandbox/tutorials/"
    "refs/heads/main/data/df1.csv"
)

# Candidate predictor columns; one checkbox is created for each entry,
# in this order
all_predictors = [
    "water_level_m_navd88",
    "temp_deg_c",
    "month",
    "hour",
    "doy",
    "white_noise",
]

# Plotting function: Measured vs Predicted, Feature Importance, and PDP
def plot_rf_fit_and_feature_importance_and_partial_dependence(model, test_data, features, target):
    """Plot model fit, feature importance, and partial dependence side by side.

    Draws a single three-column figure:
      1. measured vs. predicted values of ``target`` with a 1:1 reference line,
         annotated with R² and the number of test rows;
      2. normalized feature importances (as percentages), largest first;
      3. partial dependence plots for the (up to) two most important predictors,
         stacked vertically.

    Args:
        model: Fitted estimator exposing ``predict`` and ``feature_importances_``.
        test_data: DataFrame containing the ``features`` columns and ``target``.
            It is not modified.
        features: List of predictor column names the model was trained on.
        target: Name of the response column in ``test_data``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Keep predictions in a local Series (aligned on the test index) instead of
    # adding a column to the caller's DataFrame.
    measured = test_data[target]
    predicted = pd.Series(
        model.predict(test_data[features]), index=test_data.index, name="predicted"
    )
    r2 = r2_score(measured, predicted)

    # Compute feature importance, normalized to sum to 1 and sorted descending
    importance = model.feature_importances_
    importance_df = pd.DataFrame({'predictor': features, 'raw_fi': importance})
    importance_df['fi'] = importance_df['raw_fi'] / importance_df['raw_fi'].sum()
    importance_df = importance_df.sort_values(by='fi', ascending=False)

    # Top predictors for the PDP column: two when available, otherwise just one
    # (a single checked predictor must not raise an IndexError).
    top_predictors = importance_df['predictor'].iloc[:2].tolist()

    # Create figure layout
    fig = plt.figure(figsize=(24, 8))
    gs = fig.add_gridspec(1, 3, width_ratios=[1, 1, 1])

    # Column 1: Measured vs Predicted
    ax1 = fig.add_subplot(gs[0, 0])
    sns.scatterplot(x=measured, y=predicted, alpha=0.75, ax=ax1)
    lo, hi = measured.min(), measured.max()
    ax1.plot([lo, hi], [lo, hi], linestyle="dashed", color="red")  # 1:1 line
    # Anchor the annotations in axes-fraction coordinates so they stay inside
    # the plot whatever the data range is; annotations positioned in data
    # coordinates are not drawn when their anchor falls outside the axes.
    ax1.annotate(f'R² = {round(r2, 2)}', (0.05, 0.90), xycoords="axes fraction")
    ax1.annotate(f'n = {test_data.shape[0]}', (0.05, 0.95), xycoords="axes fraction")
    ax1.set_xlabel("Measured pCO2")
    ax1.set_ylabel("Predicted pCO2")
    ax1.set_title("Measured vs Predicted pCO2")

    # Column 2: Feature Importance
    ax2 = fig.add_subplot(gs[0, 1])
    sns.barplot(x=importance_df['fi'] * 100, y=importance_df['predictor'], palette="viridis", ax=ax2)
    ax2.set_xlabel("Feature Importance (%)")
    ax2.set_title("Feature Importance of Random Forest Model")

    # Column 3: PDP (stacked vertically), one row per top predictor
    gs_pdp = gs[0, 2].subgridspec(len(top_predictors), 1)
    for row, predictor in enumerate(top_predictors):
        pdp_ax = fig.add_subplot(gs_pdp[row, 0])
        # Pass the axes as a list so scikit-learn draws directly into it. A bare
        # Axes is treated as a bounding box for a new grid of axes, which would
        # leave the label set below on a hidden axes.
        PartialDependenceDisplay.from_estimator(
            model, test_data[features], features=[predictor], kind="average", ax=[pdp_ax]
        )
        pdp_ax.set_ylabel("pCO2")  # Update y-axis label to "pCO2"
        pdp_ax.set_title(f"Partial Dependence - {predictor}")

    plt.tight_layout()
    plt.show()

# Fit a Random Forest from the current widget settings and plot the results
def make_model(predictor_states, split_ratio, ntree, mtry):
    """Train a Random Forest on the checked predictors and plot its diagnostics.

    Args:
        predictor_states: Booleans paired positionally with ``all_predictors``;
            a true value means that predictor is used.
        split_ratio: Fraction of ``df1`` used for training; the remaining
            ``1 - split_ratio`` is held out as the test set.
        ntree: Number of trees, passed as ``n_estimators``.
        mtry: Requested ``max_features``; capped at the number of selected
            predictors.

    Returns:
        None. Prints a message and returns early when no predictor is selected.
    """
    active = [name for name, checked in zip(all_predictors, predictor_states) if checked]

    # Nothing to fit without at least one predictor
    if not active:
        print("No predictors selected. Please select at least one predictor.")
        return

    target = "p_co2_in_water_ppm"

    # Fixed seed so the same slider settings always give the same split
    holdout_fraction = 1 - split_ratio
    train, test = train_test_split(df1, test_size=holdout_fraction, random_state=42)

    # mtry cannot exceed the number of predictors actually in use
    features_per_split = min(mtry, len(active))
    model = RandomForestRegressor(
        n_estimators=ntree,
        max_features=features_per_split,
        random_state=42,
    )
    model.fit(train[active], train[target])

    print(f"Using Predictors: {active}")
    # Hand the plotting function its own copy of the test rows
    plot_rf_fit_and_feature_importance_and_partial_dependence(model, test.copy(), active, target)

# One pre-checked checkbox per candidate predictor
predictor_checkboxes = [
    widgets.Checkbox(description=name, value=True) for name in all_predictors
]

# Lay the checkboxes out in two columns: first half, then the remainder
midpoint = len(predictor_checkboxes) // 2
checkbox_column1 = VBox(children=predictor_checkboxes[:midpoint])
checkbox_column2 = VBox(children=predictor_checkboxes[midpoint:])

# Sliders for the model settings read when "Run Model" is clicked
split_ratio_slider = widgets.FloatSlider(
    description="split_ratio", value=0.7, min=0.6, max=0.9, step=0.01
)
ntree_slider = widgets.IntSlider(
    description="ntree", value=500, min=100, max=1000, step=50
)
mtry_slider = widgets.IntSlider(
    description="mtry", value=2, min=1, max=5, step=1
)

# Button that triggers a model run
run_button = widgets.Button(button_style="success", description="Run Model")

# Output area whose contents are replaced on every run
output = Output()

# Click handler: re-run the model with the current widget values
def on_button_click(b):
    """Clear the output area and run the model with the current widget values.

    Args:
        b: The clicked button, as supplied by the widget callback; unused.
    """
    with output:  # Route prints and plots into the Output widget
        output.clear_output()  # Drop the results of the previous run
        make_model(
            [checkbox.value for checkbox in predictor_checkboxes],
            split_ratio_slider.value,
            ntree_slider.value,
            mtry_slider.value,
        )

# Run the model whenever the button is clicked
run_button.on_click(on_button_click)

# Layout: checkbox columns on the left, stacked sliders on the right,
# then the Run button and the output area underneath
checkbox_ui = HBox(children=[checkbox_column1, checkbox_column2])
slider_ui = VBox(children=[split_ratio_slider, ntree_slider, mtry_slider])
ui = HBox(children=[checkbox_ui, slider_ui])
main_ui = VBox(children=[ui, run_button, output])

# Render the assembled UI in the notebook
display(main_ui)

VBox(children=(HBox(children=(HBox(children=(VBox(children=(Checkbox(value=True, description='water_level_m_na…