In [1]:
%load_ext autoreload
%autoreload 2

import os

# Walk up the directory tree until the working directory path no longer
# contains "notebooks", so that project-relative imports and file paths resolve.
while True:
    if "notebooks" not in os.getcwd():
        break
    os.chdir(os.pardir)

# Drawing ROC curves with a turtle

## Training a classifier on the Iris dataset

In [2]:
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import SVC

from lib.drawing import draw_roc_curve_with_a_turtle, draw_roc_curve_with_multiple_turtles

In [3]:
# Load the Iris bunch and keep the feature matrix as-is.
iris = load_iris()
target_names = iris.target_names
X = iris.data
# Index the name array with the integer class codes so that `y` holds the
# species names instead of the numeric labels.
y = target_names[iris.target]

In [4]:
# Seeded generator so the appended noise columns are identical on every run.
random_state = np.random.RandomState(0)

# Build the noisy matrix from the untouched `iris.data` rather than from `X`
# itself: `X` is rebound below, so reading its shape/contents here would make
# the cell stack another batch of noise columns each time it is re-executed.
n_samples, n_features = iris.data.shape
n_classes = len(np.unique(y))

# Append 200 standard-normal noise columns per original feature to make the
# classification problem harder.
noise = random_state.randn(n_samples, 200 * n_features)
X = np.concatenate([iris.data, noise], axis=1)

In [5]:
# Hold out half of the samples for evaluation; stratifying on `y` keeps the
# class proportions equal in both halves, and the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=0,
)

In [6]:
# Fit a default logistic regression on the training half, then score the
# held-out half with per-class probabilities.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)

In [None]:
# Learn the label set from the training labels, then encode the test labels
# as one indicator column per class.
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)
y_onehot_test.shape  # (n_samples, n_classes)

In [None]:
# Display the indicator row the fitted binarizer produces for the single
# label "virginica" (shows which column corresponds to that class).
label_binarizer.transform(["virginica"])

In [None]:
# Pick the positive class for the one-vs-rest analysis and look up its column
# index in the binarizer's class ordering.
class_of_interest = "virginica"
is_class_of_interest = label_binarizer.classes_ == class_of_interest
class_id = np.flatnonzero(is_class_of_interest)[0]
class_id

In [10]:
# Binary ground truth and predicted probability for the class of interest,
# both taken from the same column index.
y_true, y_pred = y_onehot_test[:, class_id], y_score[:, class_id]

## Draw with sklearn

In [None]:
# Axis labels and title for the figure, collected in one place.
roc_axes_text = {
    "xlabel": "False Positive Rate",
    "ylabel": "True Positive Rate",
    "title": "One-vs-Rest ROC curves:\nVirginica vs (Setosa & Versicolor)",
}

# Reference ROC curve drawn by scikit-learn for the class of interest,
# including the chance-level diagonal.
roc_display = RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_score[:, class_id],
    name=f"{class_of_interest} vs the rest",
    color="blue",
    plot_chance_level=True,
)
_ = roc_display.ax_.set(**roc_axes_text)

## Draw with a turtle

In [None]:
# Draw the ROC curve for the logistic-regression scores with the project's
# turtle helper and keep the returned object as `canvas` for the export cells
# that follow.
# NOTE(review): the third argument is presumably the pen colour — confirm
# against lib.drawing.
canvas = draw_roc_curve_with_a_turtle(y_true, y_pred, "blue")

In [13]:
# Turn on image-data syncing for the canvas, then pause for two seconds.
# NOTE(review): presumably the pause gives the front end time to send the
# rendered image back to the kernel before the next cell saves it — confirm
# that 2 s is enough and that `canvas` is an ipycanvas-style widget.
canvas.sync_image_data = True
sleep(2)

In [14]:
# Export the canvas to "just_logistic.png"; the relative path resolves against
# the current working directory.
# NOTE(review): presumably requires the image data to have been synced by the
# previous cell — confirm.
canvas.to_file("just_logistic.png")

## Compare with the random classifier

In [15]:
# Scores of a classifier that guesses at random: one uniform draw in [0, 1)
# per test sample. Drawn from the notebook's seeded `random_state` instead of
# NumPy's unseeded global generator, so the "random" curve (and the PNG saved
# from it) is the same on every Restart & Run All.
random_scores = random_state.random_sample(len(y_true))

In [None]:
# Draw the curve for the random scores in black, replacing the previous
# `canvas` reference.
# NOTE(review): `stop_at=0.8` presumably halts the drawing part-way along the
# curve — confirm its exact meaning against lib.drawing.
canvas = draw_roc_curve_with_a_turtle(y_true, random_scores, "black", stop_at=0.8)

In [17]:
# Turn on image-data syncing for the new canvas and wait two seconds.
# NOTE(review): presumably the wait lets the front end deliver the rendered
# image to the kernel before it is written to disk — confirm.
canvas.sync_image_data = True
sleep(2)

In [18]:
# Export the random-classifier drawing to "random.png" in the current working
# directory.
canvas.to_file("random.png")

## Race of the turtles

In [None]:
# Draw the logistic-regression and random score sets on a single canvas.
# NOTE(review): scores and colours are presumably paired by position, and
# `stop_at=0.8` presumably halts the drawing part-way — confirm against
# lib.drawing.
canvas = draw_roc_curve_with_multiple_turtles(
    y_true,
    [y_pred, random_scores],
    ["blue", "black"],
    stop_at=0.8,
)

In [20]:
# Turn on image-data syncing for the race canvas and wait two seconds.
# NOTE(review): presumably the wait lets the front end deliver the rendered
# image to the kernel before it is written to disk — confirm.
canvas.sync_image_data = True
sleep(2)

In [21]:
# Export the logistic-vs-random drawing to "logistic_vs_random.png" in the
# current working directory.
canvas.to_file("logistic_vs_random.png")

## Logistic regression vs SVM race

In [22]:
# Encode the training labels with the already-fitted binarizer: one indicator
# column per class, in the same column order as `y_onehot_test`.
y_onehot_train = label_binarizer.transform(y_train)

In [23]:
# Binary "class of interest vs the rest" labels for the *training* rows.
# These must come from `y_onehot_train` so they line up row-for-row with
# `X_train`; the test-set indicators have the same length here (50/50 split),
# so slicing them instead would fit the SVM on mismatched labels without
# raising any error.
train_labels = y_onehot_train[:, class_id]

In [24]:
# Fit a default SVC on the binary training labels and use its signed
# decision-function values on the test half as ranking scores for the ROC.
svm_model = SVC().fit(X_train, train_labels)
y_pred_svm = svm_model.decision_function(X_test)

In [None]:
# A fresh set of random scores for this race: one uniform draw in [0, 1) per
# test sample. Drawn from the notebook's seeded `random_state` instead of
# NumPy's unseeded global generator so the race is reproducible on
# Restart & Run All.
random_scores_1 = random_state.random_sample(len(y_true))

# Draw logistic regression, SVM and random scores on one canvas.
# NOTE(review): scores and colours are presumably paired by position —
# confirm against lib.drawing.
canvas = draw_roc_curve_with_multiple_turtles(
    y_true,
    [y_pred, y_pred_svm, random_scores_1],
    ["blue", "green", "black"],
)

In [26]:
# Turn on image-data syncing for the three-way race canvas and wait two
# seconds.
# NOTE(review): presumably the wait lets the front end deliver the rendered
# image to the kernel before it is written to disk — confirm.
canvas.sync_image_data = True
sleep(2)

In [27]:
# Export the three-way race drawing to "logistic_vs_svm_vs_random.png" in the
# current working directory.
canvas.to_file("logistic_vs_svm_vs_random.png")

In [None]:
# Reference ROC curve from scikit-learn for the SVM decision scores, scored
# against the test-set indicators of the class of interest.
roc_display = RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_pred_svm,
    name=f"{class_of_interest} vs the rest",
    color="blue",
    plot_chance_level=True,
)

# Label the axes and title the figure; binding the result to `_` keeps the
# return value of `set` out of the cell output.
roc_axes = roc_display.ax_
_ = roc_axes.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="One-vs-Rest ROC curves:\nVirginica vs (Setosa & Versicolor)",
)