<a href="https://colab.research.google.com/github/MScEcologyAndDataScienceUCL/BIOS0032_AI4Environment/blob/main/02_Intro_to_ML/02b_ML_model_sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 2: B. Model Sandbox

This is a little playground that allows you to _draw_ your own dataset and observe how different
supervised machine learning models behave!

Notebook © 2025 [Benjamin Kellenberger](https://bkellenb.github.io/), adapted from [original
source](https://github.com/probabl-ai/youtube-appendix/blob/main/04-drawing-data/notebook.ipynb).


## Instructions

1. Run all the code cells below and scroll to the bottom.
2. Start drawing in the widget in the bottom left (click and hold/drag the mouse over
   the grey area). Blue data points should appear.
3. Switch the "Class:" option to another one (_e.g._, "b") and draw another cluster of points.
4. As soon as points from at least two classes have been drawn, a new panel should appear on the
   right, showing you the **decision boundary** of the currently selected classifier.

Now, you can experiment! Watch what happens if you:
* Adjust the model's hyperparameters (below the model selection dropdown menu).
* Select a different model.
* Draw differently distributed points: try them clustered together, one class in a ring around the
  other, with noise/outliers, _etc._

We need to install the [drawdata](https://github.com/koaning/drawdata) library first:

In [None]:
%pip install drawdata

In [3]:
from collections import defaultdict
from drawdata import ScatterWidget
import matplotlib.pyplot as plt
from IPython.display import display
from IPython.core.display import HTML
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import matplotlib.pylab as plt 
import numpy as np
import ipywidgets
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Output area that captures whatever `on_change` displays (the decision-boundary plot).
output = ipywidgets.Output()
# Interactive drawing canvas; the drawn points are read back via `widget.data_as_pandas`.
widget = ScatterWidget()



# Registry of the classifiers offered in the "Model:" dropdown.
#
# Each entry maps a display name to:
#   'class'  - the estimator class that gets instantiated and fitted
#   'params' - hyperparameter name -> specification, where the specification is either
#                * [min, default, max] for numeric values (rendered as a slider; all-int
#                  triples give an integer slider, otherwise a float slider), or
#                * a list of string choices (rendered as a dropdown).
#
# Note: 'num_layers' of the MLP is not passed to the estimator as-is; it is folded into
# 'hidden_layer_sizes' right before the model is constructed.
MODELS = {
    'k-Nearest Neighbour': {
        'class': KNeighborsClassifier,
        'params': dict(
            n_neighbors=[1, 1, 20],
        ),
    },
    'Decision Tree': {
        'class': DecisionTreeClassifier,
        'params': dict(
            max_depth=[1, 2, 50],
        ),
    },
    'Random Forest': {
        'class': RandomForestClassifier,
        'params': dict(
            n_estimators=[1, 5, 200],
            max_depth=[1, 2, 50],
            min_samples_leaf=[1, 3, 50],
        ),
    },
    'Gaussian Naive Bayes': {
        'class': GaussianNB,
        # no adjustable hyperparameters exposed for this model
        'params': dict(),
    },
    'Logistic Regression': {
        'class': LogisticRegression,
        'params': dict(
            C=[0.01, 1.0, 10.0],
        ),
    },
    'Support Vector Machine (SVM)': {
        'class': SVC,
        'params': dict(
            C=[0.01, 1.0, 10.0],
            kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        ),
    },
    'Multi-Layer Perceptron (MLP)': {
        'class': MLPClassifier,
        'params': dict(
            num_layers=[1, 2, 20],
            hidden_layer_sizes=[1, 100, 1000],
            activation=['identity', 'logistic', 'tanh', 'relu'],
            solver=['lbfgs', 'sgd', 'adam'],
            alpha=[0.00001, 0.00001, 10.0],
            learning_rate=['constant', 'invscaling', 'adaptive'],
            learning_rate_init=[0.0001, 0.001, 10.0],
            max_iter=[1, 200, 1000],
        ),
    },
}


# Dropdown listing every model name registered in MODELS.
model_select = ipywidgets.Dropdown(
    description='Model:',
    options=tuple(MODELS),
    value='k-Nearest Neighbour',
)

# Checkbox that requests probability output instead of hard class predictions.
show_probs = ipywidgets.Checkbox(
    description='Show probabilities if available',
    value=False,
)

# Flat list of all hyperparameter controls, in creation order.
widgets = []
# Lookup: model name -> {hyperparameter name -> its control}.
widget_lut = defaultdict(dict)


@output.capture(clear_output=True)
def on_change(change):
    """Retrain the selected classifier on the drawn points and plot its decision boundary.

    Used as an observer callback for the drawing widget's ``data`` trait and called
    by ``set_model`` whenever the model or one of its controls changes. Everything
    displayed here is captured by ``output``, which is cleared on every call.

    Nothing is plotted until the drawn data contains at least two distinct classes.

    Args:
        change: The traitlets change notification (or ``None``); it is not inspected.
    """
    df = widget.data_as_pandas
    if len(df) == 0:
        return
    n_classes = df['color'].nunique()
    if n_classes < 2:
        # a classifier needs at least two classes to separate
        return

    X = df[['x', 'y']].values
    # the 'color' column serves both as the class label and as the marker colour below
    y = df['color']

    # gather the current hyperparameter values from the selected model's controls
    model_name = model_select.value
    kwargs = {hkey: hwdgt.value for hkey, hwdgt in widget_lut[model_name].items()}

    if 'MLP' in model_name:
        # 'num_layers' is a convenience control, not an estimator argument: expand it
        # into a list with one entry (the chosen layer width) per hidden layer
        num_layers = kwargs.pop('num_layers')
        kwargs['hidden_layer_sizes'] = num_layers * [kwargs['hidden_layer_sizes']]

    model = MODELS[model_name]['class'](**kwargs)
    model.fit(X, y)

    # probabilities are only shown for two classes and estimators exposing predict_proba
    response_method = 'predict'
    if show_probs.value and n_classes == 2 and hasattr(model, 'predict_proba'):
        response_method = 'predict_proba'

    display(HTML("<br><br><br>"))
    # Create the axes here and hand them to scikit-learn: without an explicit `ax`,
    # from_estimator opens its own default-sized figure, which would leave a separately
    # created 12x12 figure empty and ignore the requested size.
    _, ax = plt.subplots(figsize=(12, 12))
    DecisionBoundaryDisplay.from_estimator(
        model, X,
        response_method=response_method,
        xlabel="x", ylabel="y",
        alpha=0.5,
        ax=ax,
    )
    ax.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
    ax.set_title(f"{model.__class__.__name__}")
    plt.show()


def set_model(change):
    """Show only the selected model's hyperparameter controls, then refresh the plot.

    Acts when called directly with ``None`` or when notified of a change to a
    ``value`` trait; any other notification is ignored.

    Args:
        change: A traitlets change dictionary, or ``None`` to force an update.
    """
    if change is not None:
        if change['type'] != 'change' or change['name'] != 'value':
            return

    selected = model_select.value
    for name, controls in widget_lut.items():
        # controls of the active model are laid out as blocks, all others are hidden
        display_mode = 'block' if name == selected else 'none'
        for control in controls.values():
            control.layout.display = display_mode
    on_change(change)


# Pre-create one control per hyperparameter of every model. All controls start hidden;
# `set_model` reveals the ones belonging to the selected model.
#   * list of strings          -> dropdown
#   * [min, default, max] ints -> integer slider
#   * [min, default, max] floats -> logarithmic slider when min > 0, so that ranges
#     spanning several orders of magnitude (e.g. 0.00001 ... 10) remain selectable and
#     readable; a linear FloatSlider (default step 0.1, two-decimal readout) cannot reach
#     or display the small values. Ranges touching zero or below fall back to linear.
for model_name, model_meta in MODELS.items():
    for hkey, hvals in model_meta['params'].items():
        if isinstance(hvals[0], str):
            control = ipywidgets.Dropdown(description=f'{hkey}:',
                                          options=hvals)
        elif isinstance(hvals[0], int):
            control = ipywidgets.IntSlider(
                value=hvals[1],
                min=hvals[0],
                max=hvals[2],
                description=f'{hkey}:',
                continuous_update=False
            )
        elif hvals[0] > 0:
            # FloatLogSlider takes min/max as exponents of `base`, value as the actual value
            control = ipywidgets.FloatLogSlider(
                value=hvals[1],
                base=10,
                min=float(np.log10(hvals[0])),
                max=float(np.log10(hvals[2])),
                step=0.1,
                description=f'{hkey}:',
                continuous_update=False
            )
        else:
            control = ipywidgets.FloatSlider(
                value=hvals[1],
                min=hvals[0],
                max=hvals[2],
                description=f'{hkey}:',
                continuous_update=False
            )
        control.layout.min_height = '50px'
        control.observe(set_model)
        control.layout.display = 'none'
        widgets.append(control)
        widget_lut[model_name][hkey] = control


# Changing the model or the probability toggle re-evaluates visibility and the plot;
# drawing new points only needs to retrain and replot.
model_select.observe(set_model)
show_probs.observe(set_model)
widget.observe(on_change, names=["data"])

# Initial pass: reveal the controls of the model selected at start-up.
set_model(None)

# All hyperparameter controls stacked vertically (hidden ones take no space).
hyperparams = ipywidgets.VBox(widgets)

# Final layout, shown as the cell's result: model choice, its hyperparameters, the
# probability toggle, and the drawing canvas side by side with the plot output.
ipywidgets.VBox([
    model_select,
    hyperparams,
    show_probs,
    ipywidgets.HBox([widget, output]),
])

VBox(children=(Dropdown(description='Model:', options=('k-Nearest Neighbour', 'Decision Tree', 'Random Forest'…

VBox(children=(Dropdown(description='Model:', options=('k-Nearest Neighbour', 'Decision Tree', 'Random Forest'…