In [447]:
%matplotlib widget

from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np

In [448]:
# load the full wine dataset and keep its feature names, feature matrix and class labels
dataset = load_wine()
names, data, target = dataset.feature_names, dataset.data, dataset.target

In [449]:
# feature scale the data first: fit the scaler on the data and transform it in one call
scaler = StandardScaler()
data = scaler.fit_transform(data)

In [450]:
# summarize what was loaded: feature names, array shapes/dtypes, then the values themselves
# (the former `target = target` self-assignment was a no-op and has been dropped)
print(names)
print("Data: ", data.shape, data.dtype)
print("Target:", target.shape, target.dtype)

print(data)
print(target)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Data:  (178, 13) float64
Target: (178,) int32
[[ 1.51861254 -0.5622498   0.23205254 ...  0.36217728  1.84791957
   1.01300893]
 [ 0.24628963 -0.49941338 -0.82799632 ...  0.40605066  1.1134493
   0.96524152]
 [ 0.19687903  0.02123125  1.10933436 ...  0.31830389  0.78858745
   1.39514818]
 ...
 [ 0.33275817  1.74474449 -0.38935541 ... -1.61212515 -1.48544548
   0.28057537]
 [ 0.20923168  0.22769377  0.01273209 ... -1.56825176 -1.40069891
   0.29649784]
 [ 1.39508604  1.58316512  1.36520822 ... -1.52437837 -1.42894777
  -0.59516041]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Now, instead of using one vs rest, where we rank one category against all of the others, we will rank the categories in pairs. This can provide better accuracy, and each model is less prone to bias because it only has to separate two categories. This technique is known as multiclass one vs one.

In [451]:
examples = data.shape[0]
features = data.shape[1]
category_labels = np.unique(target)  # sorted distinct class labels
categories = category_labels.shape[0]

# split the categories of data
# A boolean mask picks out each category's rows directly, so the split stays
# correct even when `target` is not sorted or the labels are not exactly
# 0..C-1 (np.searchsorted only returns meaningful split points on sorted input).
categorysplits = []
print("Splits")
for label in category_labels:
    categorysplits.append(data[target == label])
    print(categorysplits[-1].shape)

# create a model for each pair of category
# models[i][j] separates category i (binary label 0) from category j (binary label 1)
models = []
for i in range(categories):
    models.append([])
    for j in range(categories):
        models[-1].append(LogisticRegression(random_state=0))

# transform each pair into a binary classification problem
data_pairs = []
target_pairs = []
for i in range(categories):
    isize = categorysplits[i].shape[0]
    data_pairs.append([])
    target_pairs.append([])
    for j in range(categories):
        jsize = categorysplits[j].shape[0]
        # rows of category i come first and get label 0, rows of category j get label 1
        joined_categories = np.vstack([categorysplits[i], categorysplits[j]])
        joined_targets = np.append(np.zeros(isize), np.ones(jsize)).astype(np.int32)

        data_pairs[-1].append(joined_categories)
        target_pairs[-1].append(joined_targets)

# fit each category pair
for i in range(categories):
    for j in range(categories):
        models[i][j].fit(data_pairs[i][j], target_pairs[i][j])

Splits
(59, 13)
(71, 13)
(48, 13)


Since we're working with many more models here, it's a bit trickier to organize the data. The idea is to split the input data by the output categories, then throw away the outputs, since the splits themselves already tell us which category each row belongs to. Then we create a model for each pair of categories, build the inputs and binary targets for each pair, and finally fit each model on its specific pair. Of course, there are a lot of problems with this approach. Firstly, each pair is fit twice, once as (A, B) and once as (B, A), so we're doing the same work twice. We could optimize this to use only half of the pair fits, but it's more annoying to work with. Secondly, we're also fitting category A against itself, which is useless. It turns out you can ignore this: such a model sees identical data under both labels, so it outputs a probability of about 0.5 for every sample, which scales every category's likelihood by roughly the same constant and doesn't change which one is largest. This can easily be optimized away, but I wanted to keep the code clean and clear to understand.

In [452]:
def predict(data, models):
    """Classify each sample using a square grid of pairwise (one-vs-one) models.

    Parameters
    ----------
    data : array-like
        Samples to classify; passed unchanged to every model's ``predict_proba``.
    models : nested list of fitted binary classifiers
        ``models[i][j]`` must report, in column 0 of ``predict_proba``, the
        probability of category ``i`` when compared against category ``j``.
        Only the first ``len(models)`` entries of each row are used.

    Returns
    -------
    numpy.ndarray
        One entry per sample: the index ``i`` whose product of pairwise
        probabilities is the largest.
    """
    categories = len(models)

    # compute the likelihood of the ith category being correct:
    # the product over j of P(category i | model i-vs-j), one value per sample
    combined_predictions = []
    for i in range(categories):
        # column 0 is the class associated with the ith category
        ipredictions = np.array(
            [models[i][j].predict_proba(data)[:, 0] for j in range(categories)]
        )
        combined_predictions.append(np.prod(ipredictions, axis=0))

    # reshape to (samples, categories) with a vectorized transpose instead of
    # zipping through Python tuples; this also keeps the shape well-defined
    # when there are zero samples
    arr = np.array(combined_predictions).T
    indices = np.argmax(arr, axis=1)  # finally take the biggest likelihood index
    return indices

The prediction function is very similar to the one vs rest approach. Since we have multiple models, we first predict the probabilities of all of the models on all of the data. Now there are multiple probabilities for each category. To get the joint probability, we multiply the probabilities together. This is also known as the likelihood of the category being correct. Then we regroup the likelihoods per sample and pick the category with the highest likelihood for each sample.

In [453]:
# classify every sample with the pairwise models and show the predicted category indices
predictions = predict(data=data, models=models)
print("Predictions:", predictions)

Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [454]:
# count the samples whose predicted category differs from the true one
# (a direct comparison states the intent and, unlike XOR, is not limited to integer labels)
errors = np.sum(target != predictions)
print("Misclassifications:", errors)

Misclassifications: 0


Wow, look at that. We have no errors. Remember that for one vs rest, we actually got one error. This suggests that this technique can be more precise than one vs rest (keep in mind that we are measuring errors on the same data we trained on). Of course, with extra precision comes extra cost as well. In one vs rest, we only needed $C$ models for $C$ categories. Here, we needed $C^2$ models since we have pairs of categories being classified together. So if you have many categories, like ~$10^3$, then one vs one is much worse to work with. Of course, in real life we might need to classify a lot more categories, on the order of millions. In this situation, one vs one or even one vs rest wouldn't work, as the complexity is too high. To deal with these issues, we need to introduce a better technique known as the softmax.