# Classification

In [None]:
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import model_selection
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

In [None]:
def _check_boundary_response_method(estimator):
    """Pick the estimator method used to compute the plotted response.

    Estimators exposing more than two entries in ``classes_`` are only
    queried through ``predict``.  Otherwise the candidates are tried in the
    order ``decision_function``, ``predict_proba``, ``predict`` and the first
    available one wins.

    Parameters
    ----------
    estimator : object
        Object that may expose ``classes_`` and any of the candidate methods.

    Returns
    -------
    callable
        The selected attribute of ``estimator``.

    Raises
    ------
    ValueError
        If none of the candidate attributes is available.
    """
    n_classes = len(estimator.classes_) if hasattr(estimator, "classes_") else 0
    if n_classes > 2:
        candidate_names = ["predict"]
    else:
        candidate_names = ["decision_function", "predict_proba", "predict"]

    # Look every candidate up first; missing attributes become None.
    candidates = [getattr(estimator, name, None) for name in candidate_names]

    # Keep the first usable candidate; if there is none, the loop leaves the
    # last looked-up value in place.
    chosen = None
    for chosen in candidates:
        if chosen:
            break

    if chosen is None:
        raise ValueError(
            f"{estimator.__class__.__name__} has none of the following attributes: "
            f"{', '.join(candidate_names)}."
        )

    return chosen

In [None]:
def plot_decision_boundary(
    estimator,
    x,
    y,
    *,
    plot_method="contourf",
    grid_resolution=100,
    eps=1.0,
    ax=None,
    **kwargs,
):
    """Draw the response of a fitted two-feature classifier over a regular grid.

    A ``grid_resolution`` x ``grid_resolution`` mesh is built over the range of
    ``x`` and ``y`` (padded by ``eps`` on every side), the estimator is
    evaluated on every grid point and the result is drawn with the requested
    Matplotlib ``Axes`` method.

    Parameters
    ----------
    estimator : object
        Fitted classifier accepting an ``(n_points, 2)`` array; the method
        used is chosen by ``_check_boundary_response_method``.
    x, y : array-like exposing ``min()`` and ``max()``
        Values of the first and second feature; only their ranges are used.
    plot_method : str, default="contourf"
        Name of the ``Axes`` method used for drawing (e.g. ``"contour"``).
    grid_resolution : int, default=100
        Number of grid points along each axis.
    eps : float, default=1.0
        Margin added below the minimum and above the maximum of each feature.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure and axes are created when omitted.
    **kwargs
        Forwarded unchanged to the plotting method.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    x_min, x_max = x.min() - eps, x.max() + eps
    y_min, y_max = y.min() - eps, y.max() + eps
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_resolution),
        np.linspace(y_min, y_max, grid_resolution),
    )
    X_grid = np.c_[xx.ravel(), yy.ravel()]
    pred_func = _check_boundary_response_method(estimator)
    response = pred_func(X_grid)

    # convert classes predictions into integers
    if pred_func.__name__ == "predict" and hasattr(estimator, "classes_"):
        # Imported locally: the notebook's import cell does not provide
        # LabelEncoder, so this branch previously failed with a NameError.
        from sklearn.preprocessing import LabelEncoder

        encoder = LabelEncoder()
        encoder.classes_ = estimator.classes_
        response = encoder.transform(response)

    # Two-dimensional output (one column per class): keep the second column.
    if response.ndim != 1:
        response = response[:, 1]

    response = response.reshape(xx.shape)
    if ax is None:
        _, ax = plt.subplots()
    getattr(ax, plot_method)(xx, yy, response, **kwargs)
    return ax

## Example application - Species identification

Given some features derived from an image, an audio recording, a DNA sample, or some physical measurements, can we identify the species from which the data came?

![plant classification through leaf images](https://miro.medium.com/max/720/1*leLKD1K6sMtuqr9KK8RaJg.webp)

(image from [Building a Convolutional Neural Network to Classify Birds](https://blog.jovian.ai/building-a-convolutional-neural-network-to-classify-birds-528794240fa1))

Given our feature vector <b style="color: deepskyblue">x</b>, can we predict the correct species (i.e. class label) <span style="color: coral;">y</span>?

Given this digital image, can we predict which species is depicted?

![input image](https://miro.medium.com/max/514/1*60KGh4n-Rt3sjcWJVgRmlA.webp)

## Supervised classification

There exists a whole host of different classification algorithms, each with its own strengths and weaknesses.

Popular Algorithms:
* Nearest Neighbour
* Logistic Regression
* Support Vector Machines (SVM)
* Decision Trees
* Random Forests
* Neural Networks
* Gaussian Process Classification

For example, the nearest neighbour algorithm is flexible but requires a lot of computation.

What are some alternatives? Is there a way that involves less computation?

### Example: Iris dataset

In [None]:
# Load the Iris dataset from scikit-learn as pandas objects.
data = load_iris(as_frame=True)
iris = data.data
# Add a "species" column with the class name looked up from each target index.
iris["species"] = [data.target_names[i] for i in data.target]

# Column names of the two features used below: petal length and petal width.
x = "petal length (cm)"
y = "petal width (cm)"

In [None]:
# Scatter plot of petal length against petal width for the full dataset;
# the colour of each point indicates its species.
sns.scatterplot(data=iris, x=x, y=y, hue="species")

## Linear classifier

### Separable data

In [None]:
# Select only data of the setosa and versicolor species
separable_dataset = iris[iris.species.isin(["setosa", "versicolor"])]

In [None]:
# Plot the two selected species in the petal length / petal width plane
sns.scatterplot(
    data=separable_dataset,
    x="petal length (cm)",
    y="petal width (cm)",
    hue="species",
);

Data points from the different species are clearly separated. In fact, they can be separated by a line, and new points can be classified depending
on which side of the line they fall.

This is what a linear classifier does, in essence. Here we will use the linear support vector classifier (SVC). See [here](https://scikit-learn.org/stable/modules/svm.html#svc) for more details.

In [None]:
# Fit a linear support vector classifier (SVC) on the separable dataset
clf = LinearSVC().fit(
    separable_dataset[[x, y]].values,
    separable_dataset["species"],
)

# Plot the decision boundary of the linear classifier:
# a single contour line where the plotted response equals 0
ax = plot_decision_boundary(
    clf,
    x=separable_dataset[x],
    y=separable_dataset[y],
    grid_resolution=100,
    plot_method="contour",
    levels=[0],
)

# Plot the decision regions of the linear classifier on the same axes:
# filled bands for responses in [-100, 0] and [0, 100]
# (assumes the response stays within +/-100 on the grid)
plot_decision_boundary(
    clf,
    x=separable_dataset[x],
    y=separable_dataset[y],
    grid_resolution=100,
    levels=[-100, 0, 100],
    ax=ax,
    cmap=ListedColormap(["cornflowerblue", "cyan", "orange"]),
)

# Overlay the dataset points
sns.scatterplot(
    data=separable_dataset,
    x=x,
    y=y,
    hue="species",
    ax=ax,
);

Evaluating this model is computationally cheap and fast.

In [None]:
# Draw one random query point of shape (1, 2): the first coordinate is
# uniform in [0, 6), the second in [0, 2.5)
random_point = np.random.uniform(low=[0, 0], high=[6, 2.5], size=2).reshape(-1, 2)

In [None]:
# Time one evaluation of the linear classifier on the random point (IPython magic)
%timeit clf.decision_function(random_point)

Compared to the nearest neighbour classifier

In [None]:
# Fit a 1-nearest-neighbour classifier on the same separable dataset for comparison
nn_model = KNeighborsClassifier(n_neighbors=1).fit(
    separable_dataset[[x, y]].values,
    separable_dataset["species"],
)

In [None]:
# Time one nearest-neighbour prediction on the same random point (IPython magic)
%timeit nn_model.predict(random_point)

it is around 10 times faster, and does not get slower for larger datasets.

### Non-separable data

In [None]:
# Keep only the rows belonging to the versicolor and virginica species
non_separable_species = ["virginica", "versicolor"]
non_separable_dataset = iris[iris.species.isin(non_separable_species)]

In [None]:
# Plot data points of the two overlapping species
# (x and y are re-assigned to the same petal feature names used above)
x = "petal length (cm)"
y = "petal width (cm)"

sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
);

Points of different species are not as well separated as before. There is no straight line that cleanly separates the two species.

In [None]:
# Fit a linear support vector machine on the non-linearly-separable dataset
clf = LinearSVC().fit(
    non_separable_dataset[[x, y]].values,
    non_separable_dataset["species"],
)

# Plot the decision boundary of the linear classifier:
# a single contour line where the plotted response equals 0
ax = plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    plot_method="contour",
    levels=[0],
)

# Plot the decision regions on the same axes:
# filled bands for responses in [-100, 0] and [0, 100]
plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    levels=[-100, 0, 100],
    ax=ax,
    cmap=ListedColormap(["cornflowerblue", "cyan", "orange"]),
)

# Overlay the dataset points
sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
    ax=ax,
);

## Non-linear classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# Fit a non-linear support vector machine (SVC with C=100, gamma=10)
# on the non-linearly-separable dataset
clf = SVC(C=100, gamma=10).fit(
    non_separable_dataset[[x, y]].values,
    non_separable_dataset["species"],
)

# Plot the decision boundary of the non-linear classifier:
# a single contour line where the plotted response equals 0
ax = plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    plot_method="contour",
    levels=[0],
)

# Plot the decision regions on the same axes:
# filled bands for responses in [-100, 0] and [0, 100]
plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    levels=[-100, 0, 100],
    ax=ax,
    cmap=ListedColormap(["cornflowerblue", "cyan", "orange"]),
)

# Overlay the dataset points
sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
    ax=ax,
);

### Decision tree

In [None]:
from sklearn import tree

A decision tree partitions the feature space using very simple decision rules. For example: is the length $\leq$ 5.1, or is the width $\leq$ 1.75?

In [None]:
# Plot the dataset points
ax = sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
)

# Draw horizontal line at y = 1.75 (a threshold on petal width)
ax.axhline(1.75, color="gray", linewidth=3, alpha=0.2)

# Draw vertical line at x = 4.95 (a threshold on petal length);
# ymax=0.5 limits it to the lower half of the axes
ax.axvline(4.95, color="blue", linewidth=3, alpha=0.2, ymax=0.5)

In [None]:
# Fit a shallow decision tree (max_depth=2) on the two petal features.
# min_impurity_decrease=0.01 suppresses splits that barely reduce impurity.
tree_model = tree.DecisionTreeClassifier(max_depth=2, min_impurity_decrease=0.01).fit(
    non_separable_dataset[[x, y]],
    non_separable_dataset["species"],
)

# NOTE: `tree_model.classes_` must keep the order produced by `fit`.
# Reversing it swaps the labels returned by `predict` and makes
# `plot_tree(class_names=tree_model.classes_)` attach the wrong species name
# to every node, because class names are matched to classes by position.

In [None]:
# Draw the fitted depth-2 tree on a wide figure
_, ax = plt.subplots(figsize=(10, 6))

tree.plot_tree(
    tree_model,
    feature_names=[x, y],
    class_names=tree_model.classes_,
    filled=True,
    impurity=False,
    label="root",
    rounded=True,
    ax=ax,
)

# Annotate the two branches leaving the root with the answer to its test
# (positions are hand-tuned for this figure)
ax.text(0.43, 0.66, "yes")
ax.text(0.73, 0.66, "no")

In [None]:
# Fit a decision tree (max_depth=2) on the non-linearly-separable dataset
clf = tree.DecisionTreeClassifier(max_depth=2, min_impurity_decrease=0.01).fit(
    non_separable_dataset[[x, y]].values,
    non_separable_dataset["species"],
)

# Plot the decision boundary of the decision tree: the 0.5 contour of the
# plotted response (presumably the second-class probability from predict_proba)
ax = plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    plot_method="contour",
    levels=[0.5],
)

# Plot the decision regions on the same axes:
# filled bands for responses in [0, 0.5] and [0.5, 1]
plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    levels=[0, 0.5, 1],
    ax=ax,
    cmap=ListedColormap(["cornflowerblue", "cyan", "orange"]),
)

# Overlay the dataset points
sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
    ax=ax,
);

In [None]:
# Fit a deeper decision tree (max_depth=4) on the non-linearly-separable dataset
clf = tree.DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.01).fit(
    non_separable_dataset[[x, y]].values,
    non_separable_dataset["species"],
)

# Plot the decision boundary of the decision tree: the 0.5 contour of the
# plotted response (presumably the second-class probability from predict_proba)
ax = plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    plot_method="contour",
    levels=[0.5],
)

# Plot the decision regions on the same axes:
# filled bands for responses in [0, 0.5] and [0.5, 1]
plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    levels=[0, 0.5, 1],
    ax=ax,
    cmap=ListedColormap(["cornflowerblue", "cyan", "orange"]),
)

# Overlay the dataset points
sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
    ax=ax,
);

In [None]:
# Draw the structure of the max_depth=4 tree fitted in the previous cell
_, ax = plt.subplots(figsize=(10, 6))

tree.plot_tree(
    clf,
    feature_names=[x, y],
    class_names=clf.classes_,
    filled=True,
    impurity=False,
    label="root",
    rounded=True,
    ax=ax,
);

### Random Forest

A Random Forest is a collection (or ensemble) of decision trees, where each tree is trained on a different random subset of the data.

![Random forest](https://upload.wikimedia.org/wikipedia/commons/7/76/Random_forest_diagram_complete.png)

(image taken from the [wikipedia article](https://en.wikipedia.org/wiki/Random_forest) on random forests)

Wisdom of the crowd!

Some benefits of Random Forests (RF) are that they:
    
* Are fast to train and test.
* Can deal with noisy features.
* Handle features of different units.
* Can cope with large datasets.

## Random Forest on Butterfly dataset

Here we will split our data into training and validation sets and compute the classification accuracy on the validation set.

In [None]:
import pandas as pd
import seaborn as sns

# load the scikit learn package
import sklearn
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

In [None]:
# load data from csv file
# the file path is relative to the notebook's working directory
butterflies = pd.read_csv("data/butterflies.csv")

In [None]:
# plot the dataset: width against height, coloured by species
sns.scatterplot(data=butterflies, x="Width", y="Height", hue="Species")

In [None]:
# split the data into training and validation sets
# randomly choose 50 datapoints for validation; the rest is used for training
# (no random_state is given, so the split differs from run to run)
train_data, validation_data = model_selection.train_test_split(
    butterflies, test_size=50
)

In [None]:
# report the size of each split
print("Number of training samples: ", len(train_data))
print("Number of validation samples: ", len(validation_data))

In [None]:
# draw the whole dataset, coloured by species
sns.scatterplot(data=butterflies, x="Width", y="Height", hue="Species")

# with hollow black circles around the points of the training set
sns.scatterplot(
    data=train_data,
    x="Width",
    y="Height",
    marker="o",
    edgecolor="black",
    facecolor="none",
    label="train set",
)

In [None]:
# train a random forest (default settings) on the training set,
# using width and height as features and the species as label
rf = RandomForestClassifier()
rf.fit(train_data[["Width", "Height"]], train_data["Species"])

In [None]:
# predict the species of every validation sample from its width and height
species_prediction = rf.predict(validation_data[["Width", "Height"]])

In [None]:
# compute the classification accuracy on the validation set
# element-wise comparison: True where the prediction matches the true species
correct_classifications = validation_data["Species"] == species_prediction
# the mean of a boolean series is the fraction of True values
percent_correct_predictions = correct_classifications.mean() * 100
# error rate in percent (computed here but not printed)
percent_incorrect_predictions = (1 - correct_classifications).mean() * 100
print("Classification accuracy (%):", percent_correct_predictions)

In [None]:
# extract the misclassified validation samples so we can plot them
incorrect_predictions = validation_data[~correct_classifications]

In [None]:
# draw the whole dataset, coloured by species
sns.scatterplot(data=butterflies, x="Width", y="Height", hue="Species")

# with hollow black circles around the points of the training set
sns.scatterplot(
    data=train_data,
    x="Width",
    y="Height",
    marker="o",
    edgecolor="black",
    facecolor="none",
    label="train data",
)

# draw an 'x' on the validation samples where we predict the wrong answer
sns.scatterplot(
    data=incorrect_predictions,
    x="Width",
    y="Height",
    marker="x",
    color="black",
    label="incorrect predictions",
);

In [None]:
# Fit a random forest (default settings) on the non-linearly-separable Iris subset
clf = RandomForestClassifier().fit(
    non_separable_dataset[[x, y]].values,
    non_separable_dataset["species"],
)

# Plot the decision boundary of the random forest: the 0.5 contour of the
# plotted response (presumably the second-class probability from predict_proba)
ax = plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    plot_method="contour",
    levels=[0.5],
)

# Plot the decision regions on the same axes:
# filled bands for responses in [0, 0.5] and [0.5, 1]
plot_decision_boundary(
    clf,
    x=non_separable_dataset[x],
    y=non_separable_dataset[y],
    grid_resolution=100,
    levels=[0, 0.5, 1],
    ax=ax,
    cmap=ListedColormap(["cornflowerblue", "cyan", "orange"]),
)

# Overlay the dataset points
sns.scatterplot(
    data=non_separable_dataset,
    x=x,
    y=y,
    hue="species",
    ax=ax,
);

In [None]:
from IPython.display import IFrame

In [None]:
# Embed an external web page (random forest demo, judging by the URL) in the notebook
IFrame("http://cs.stanford.edu/people/karpathy/svmjs/demo/demoforest.html", width=1000, height=600)

## Which algorithm to choose?

Short answer: It depends!

No silver bullet, but often it is sensible to first try a Support Vector Machine or Random Forest.

This will give you an idea of how separable your data is. The next step is to try different features, and perhaps even collect more training data.

## How much data do I need?

Short answer: It depends!

It depends on how easy it is for your classifier to separate your data. Some problems are relatively easy and don’t require lots of data; others, such as species identification in images, can require tens of thousands of examples.