# Evaluating models

We'll go through some problematic behaviours of the nearest neighbour algorithm.

## Problems with Nearest Neighbours

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython import display

In [None]:
# Load the butterfly data from the CSV file into a pandas DataFrame
butterflies = pd.read_csv("data/butterflies.csv")

In [None]:
# Euclidean distance between two points in the plane
def compute_distance(point1, point2):
    """Return the straight-line distance between two 2-D points.

    Only the first two coordinates (index 0 and index 1) of each point are
    used in the computation.
    """
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    return np.sqrt(delta_x ** 2 + delta_y ** 2)

In [None]:
def nearest_neighbour(test_point, train_dataset=butterflies):
    """Predict the species of `test_point` from its closest training example.

    Parameters
    ----------
    test_point : sequence
        The (width, height) coordinates of the point to classify.
    train_dataset : pandas.DataFrame
        Training examples with "Width", "Height" and "Species" columns.

    Returns
    -------
    tuple
        The predicted species and the row of `train_dataset` that is closest
        to `test_point`.
    """

    def distance_to_test_point(row):
        # distance from the test point to one training example
        return compute_distance(test_point, [row["Width"], row["Height"]])

    # one distance per training example, computed row by row
    distances = train_dataset.apply(distance_to_test_point, axis=1)

    # position of the smallest distance -> the closest training example
    nearest_position = distances.argmin()
    closest_point = train_dataset.iloc[nearest_position]

    # the test point is assumed to share the class of its nearest neighbour
    return closest_point["Species"], closest_point

### Problems with Nearest Neighbours #1: Overconfidence

In [None]:
# A test point that lies far away from every training example
test_point = [10, 8]

# Ask the nearest neighbour model for its prediction and the example it used
predicted_species, closest_point = nearest_neighbour(test_point)

plt.figure(figsize=(10, 6))

# Training data, coloured by species
ax = sns.scatterplot(data=butterflies, x="Width", y="Height", hue="Species")

# The test point, drawn as a black 'x'
sns.scatterplot(
    x=[test_point[0]],
    y=[test_point[1]],
    color="black",
    marker="x",
    label="test point",
)

# An empty black ring around the nearest training example
sns.scatterplot(
    x=[closest_point["Width"]],
    y=[closest_point["Height"]],
    facecolor="none",
    edgecolor="black",
    marker="o",
    label="nearest training point",
);

The closest point in the training dataset is a _pyronia tithonus_ point, and points of other species are considerably further away, hence the model confidently predicts _pyronia tithonus_ for the test point.

Is this a reasonable prediction?

### Problems with Nearest Neighbours #2: Memory & speed

In [None]:
# Let's fake a dataset with variable size.
def generate_synthetic_dataset(n_per_class=100, n_classes=3, seed=None):
    """Generate a random 2-D dataset made of `n_classes` normal clusters.

    Parameters
    ----------
    n_per_class : int
        Number of points sampled for each class.
    n_classes : int
        Number of clusters (classes) to generate.
    seed : int, numpy.random.Generator or None, optional
        Seed (or generator) that makes the dataset reproducible. With the
        default ``None`` the global ``np.random`` state is used, so existing
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        ``n_per_class * n_classes`` rows with the columns "Width", "Height"
        and "Species" (the class index as a string).
    """
    # Draw from a dedicated generator only when a seed is requested; otherwise
    # keep using the module-level functions of the global random state.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # There will be n_classes "normal" clusters with random means and spreads.
    # The spreads multiply a standard normal sample below, i.e. they act as
    # standard deviations.
    means = rng.uniform(size=(n_classes, 2)) * np.array([8, 2])
    spreads = rng.uniform(low=0.1, high=0.9, size=(n_classes, 2)) * np.array(
        [2, 0.5]
    )

    # For each class, we sample n_per_class random points around the class mean.
    points = [
        (
            means[i, 0] + spreads[i, 0] * rng.normal(),  # x coordinate
            means[i, 1] + spreads[i, 1] * rng.normal(),  # y coordinate
            str(i),  # species
        )
        for j in range(n_per_class)
        for i in range(n_classes)
    ]

    # Group into a pandas dataframe
    synthetic_dataset = pd.DataFrame(points, columns=["Width", "Height", "Species"])

    return synthetic_dataset

In [None]:
# Build a synthetic dataset: 3 classes with 100 points each
synthetic_dataset = generate_synthetic_dataset(n_classes=3, n_per_class=100)

# Scatter plot of the synthetic training data, coloured by class
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=synthetic_dataset, x="Width", y="Height", hue="Species")

The synthetic dataset has 300 points in total. Computing the nearest neighbor to a random point is relatively fast:

In [None]:
# Time one prediction for a random test point against `synthetic_dataset`
%timeit nearest_neighbour(np.random.uniform(low=[0, 0], high=[8, 3], size=2), train_dataset=synthetic_dataset)

The number of distance computations, and therefore the computation time, grows with the number of data points.

In [None]:
# 3 classes x 1000 points per class = 3,000 points in total
synthetic_dataset_2 = generate_synthetic_dataset(n_per_class=1000, n_classes=3)

In [None]:
# Time one prediction for a random test point against `synthetic_dataset_2`
%timeit nearest_neighbour(np.random.uniform(low=[0, 0], high=[8, 3], size=2), train_dataset=synthetic_dataset_2)

In [None]:
# 30 classes x 1000 points per class = 30,000 points in total
synthetic_dataset_3 = generate_synthetic_dataset(n_per_class=1000, n_classes=30)

In [None]:
# Time one prediction for a random test point against `synthetic_dataset_3`
%timeit nearest_neighbour(np.random.uniform(low=[0, 0], high=[8, 3], size=2), train_dataset=synthetic_dataset_3)

What happens for a dataset with 1500 species and 1000 observations per species (not uncommon)?

In [None]:
# 1500 classes x 1000 points per class = 1,500,000 points in total
synthetic_dataset_4 = generate_synthetic_dataset(n_per_class=1000, n_classes=1500)

In [None]:
# Measure how much memory the dataset occupies (the index is excluded).
# deep=True also counts the memory held by the string values of the "Species"
# column; without it pandas only reports the size of the object pointers and
# the dataset looks smaller than it really is.
dataset_4_size_in_bytes = synthetic_dataset_4.memory_usage(
    index=False, deep=True
).sum()
print(f"Dataset 4 Size: {dataset_4_size_in_bytes / 1e6} MB")

In [None]:
# Time one prediction against `synthetic_dataset_4`. Every call computes the
# distance to each row of the dataset, one row at a time (see
# `nearest_neighbour`), so expect this cell to take much longer than the
# previous timings.
%timeit nearest_neighbour(np.random.uniform(low=[0, 0], high=[8, 3], size=2), train_dataset=synthetic_dataset_4)

* It does not require a large dataset to make the algorithm slow.

* The whole training dataset needs to be "memorized" to make a single prediction.

### Problems with Nearest Neighbours #3: Noise

In [None]:
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from sklearn.preprocessing import LabelEncoder

In [None]:
def plot_nearest_neighbors_decision_boundary(dataset):
    """Draw the 1-nearest-neighbour decision regions together with the data.

    A `KNeighborsClassifier` using a single neighbour is fitted on the "Width"
    and "Height" columns of `dataset`, its predictions are evaluated on a
    regular mesh covering [0, 8) x [0, 3) and shown as coloured regions, and
    the data points are drawn on top, coloured by "Species".
    """
    features = dataset[["Width", "Height"]].values
    species = dataset["Species"].values

    # Species names are turned into whole numbers with a label encoder so the
    # predictions can be drawn as a colour plot.
    enc = LabelEncoder()
    encoded_species = enc.fit_transform(species)

    # Nearest neighbour classifier that looks at a single neighbour
    classifier = neighbors.KNeighborsClassifier(1)
    classifier.fit(features, encoded_species)

    # Light colours for the regions, bold colours for the data points
    region_colours = ListedColormap(["orange", "cyan", "cornflowerblue"])
    point_colours = ["darkorange", "c", "darkblue"]

    # Regular mesh over the feature space
    step = 0.1
    width_axis = np.arange(0, 8, step)
    height_axis = np.arange(0, 3, step)
    xx, yy = np.meshgrid(width_axis, height_axis)

    # Predict a class for every mesh point and restore the mesh shape
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = classifier.predict(mesh_points).reshape(xx.shape)

    # Coloured regions: one colour per predicted class
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z, cmap=region_colours)

    # Data points on top, in the same class order as the encoder
    sns.scatterplot(
        data=dataset,
        x="Width",
        y="Height",
        hue="Species",
        hue_order=enc.classes_,
        palette=point_colours,
        edgecolor="black",
        alpha=1.0,
    );

The decision boundary shows how different regions of the "feature" space would get classified by the nearest neighbour algorithm.

In [None]:
# Decision regions of the nearest neighbour model on the original butterfly data
plot_nearest_neighbors_decision_boundary(butterflies)

However, if a single mistake is introduced in the dataset, the "decision boundaries" change a lot.

In [None]:
# Make a copy of the dataset so the original `butterflies` frame stays untouched
butterflies_corrupted = butterflies.copy()

# And change the label of a single entry: the row with index label 90 is
# relabelled as "maniola jurtina"
butterflies_corrupted.loc[90, "Species"] = "maniola jurtina"

In [None]:
# Decision regions after one label has been changed
plot_nearest_neighbors_decision_boundary(butterflies_corrupted)

* Is this model a good choice when you expect your data to be noisy?

### Problems with Nearest Neighbours #4: Scaling

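Data points in the butterfly dataset use centimeter (cm) units for both width and height. Because the nearest neighbour algorithm relies on distances, changing the unit of a single feature (for example, height from cm to mm) can change which training point is closest.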

In [None]:
# Test point in the dataset's original units: (width in cm, height in cm)
test_point = [2.7, 0.7]

# Find the nearest point and predicted species
predicted_species, closest_point = nearest_neighbour(test_point)

plt.figure(figsize=(10, 6))

# plot training dataset
ax = sns.scatterplot(
    data=butterflies,
    x="Width",
    y="Height",
    hue="Species",
)

# plot the test point as an 'x'
sns.scatterplot(
    x=[test_point[0]],
    y=[test_point[1]],
    marker="x",
    label="test point",
    color="black",
)

# plot a ring around the nearest datapoint
sns.scatterplot(
    x=[closest_point["Width"]],
    y=[closest_point["Height"]],
    marker="o",
    label="nearest training point",
    edgecolor="black",
    facecolor="none",
)

# add arrow to nearest neighbor
# `closest_point` is a row of the dataframe, so its coordinates are read by
# column label; integer positions would depend on the column order and are
# deprecated for label-indexed Series.
ax.annotate(
    text="Nearest neighbor",
    xy=(closest_point["Width"] + 0.05, closest_point["Height"] - 0.02),
    xytext=(closest_point["Width"] + 0.3, closest_point["Height"] - 0.3),
    fontsize=12,
    arrowprops={
        "width": 0.5,
        "headwidth": 4,
        "headlength": 4,
    },
)

# add arrow to the test point, labelled with its coordinates
ax.annotate(
    text=f"Test data\nx = ({test_point[0]} cm, {test_point[1]} cm)",
    xy=(test_point[0] + 0.05, test_point[1] - 0.02),
    xytext=(test_point[0] + 0.3, test_point[1] - 0.3),
    fontsize=12,
    arrowprops={
        "width": 0.5,
        "headwidth": 4,
        "headlength": 4,
    },
);

In [None]:
# Rescale the height by this factor (the annotation below labels it as mm)
factor = 10

# Work on a copy so the original `butterflies` frame is left unchanged
scaled_dataset = butterflies.copy()
scaled_dataset["Height"] = scaled_dataset["Height"] * factor

# Same test point as before, with its height rescaled by the same factor
test_point = [2.7, factor * 0.7]

predicted_species, closest_point = nearest_neighbour(
    test_point, train_dataset=scaled_dataset
)

plt.figure(figsize=(10, 6))

# plot training dataset
ax = sns.scatterplot(
    data=scaled_dataset,
    x="Width",
    y="Height",
    hue="Species",
)

# plot the test point as an 'x'
sns.scatterplot(
    x=[test_point[0]],
    y=[test_point[1]],
    marker="x",
    label="test point",
    color="black",
)

# plot a ring around the nearest datapoint
sns.scatterplot(
    x=[closest_point["Width"]],
    y=[closest_point["Height"]],
    marker="o",
    label="nearest training point",
    edgecolor="black",
    facecolor="none",
)

# add arrow to nearest neighbor
# `closest_point` is a row of the dataframe, so its coordinates are read by
# column label; integer positions would depend on the column order and are
# deprecated for label-indexed Series.
ax.annotate(
    text="Nearest neighbor",
    xy=(closest_point["Width"] + 0.05, closest_point["Height"] - 0.02),
    xytext=(closest_point["Width"] + 0.3, closest_point["Height"] - 0.3),
    fontsize=12,
    arrowprops={
        "width": 0.5,
        "headwidth": 4,
        "headlength": 4,
    },
)

# add arrow to the test point, labelled with its coordinates
ax.annotate(
    text=f"Test data\nx = ({test_point[0]} cm, {test_point[1]} mm)",
    xy=(test_point[0] + 0.05, test_point[1] - 0.02),
    xytext=(test_point[0] + 0.3, test_point[1] - 0.3),
    fontsize=12,
    arrowprops={
        "width": 0.5,
        "headwidth": 4,
        "headlength": 4,
    },
);

## Model evaluation

How can we be confident about the predictions of a model, or evaluate its performance?

### Training / validation split

In [None]:
# Illustrate the split of the full dataset. The shares follow the order of the
# labels: 75% train data and 25% validation data, matching the titles of the
# train / validation scatter plots in this notebook.
plt.pie(
    [75, 25],
    labels=("train data", "validation data"),
    startangle=90,
)
plt.title("Full data split")
plt.legend();

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Use the train_test_split function to split the full dataset into disjoint subsets
# for training and validation.
# A float test_size is a fraction of the dataset (0.25 -> 25% validation data);
# an integer would instead be read as an absolute number of rows.
# random_state fixes the shuffle so the split is the same on every run.
train_data, validation_data = train_test_split(
    butterflies, test_size=0.25, random_state=0
)

In [None]:
# One figure with two panels side by side: train data left, validation data right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: the train data
sns.scatterplot(data=train_data, x="Width", y="Height", hue="Species", ax=ax1)
ax1.set_title("Train data (75%)")

# Right panel: the validation data
sns.scatterplot(data=validation_data, x="Width", y="Height", hue="Species", ax=ax2)
ax2.set_title("Validation data (25%)");

### How good is your model?

In [None]:
# The true species of every validation example is the ground truth. A good
# model is one whose predictions match the ground truth as often as possible.
ground_truth = validation_data["Species"]
ground_truth

In [None]:
from functools import partial

# Use train dataset to create a nearest neighbour model: `partial` fixes the
# `train_dataset` argument, so `model(test_point)` only needs the point to classify
model = partial(nearest_neighbour, train_dataset=train_data)

In [None]:
# Run the model on every validation example and store its prediction next to
# the true species
predictions = []
for butterfly in validation_data.itertuples():
    predicted_class, closest_point = model([butterfly.Width, butterfly.Height])
    predictions.append(
        dict(
            Width=butterfly.Width,
            Height=butterfly.Height,
            ground_truth=butterfly.Species,
            prediction=predicted_class,
        )
    )

# Collect the results in a dataframe, one row per validation example
prediction_dataframe = pd.DataFrame(predictions)

In [None]:
# Show the predictions dataframe: one row per validation example, with its
# width, height, true species and predicted species
prediction_dataframe

In [None]:
# Count how many validation butterflies were classified correctly: True where
# the prediction equals the ground truth, False otherwise
is_correct = prediction_dataframe["ground_truth"].eq(
    prediction_dataframe["prediction"]
)

is_correct.value_counts()

In [None]:
# Accuracy is the fraction of correct predictions (the mean of a boolean
# Series); the error is the remaining fraction
accuracy = is_correct.mean()
error = 1 - accuracy

# Report both values as percentages with one decimal place
print(f"Accuracy: {accuracy:.1%}")
print(f"Error: {error:.1%}")