## Homework 7

In [1]:
import os
import time

import numpy as np
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

### Preprocess

Данные были загружены и ошкалированны с помощью StandardScaler.

In [2]:
def read_data():
    mnist = sklearn.datasets.fetch_openml('mnist_784', data_home=os.getcwd(), cache=True)
    X = mnist['data']
    y = mnist['target']

    return X, y

In [3]:
def transform_data(X):
    scaler = StandardScaler()
    X_transformed = scaler.fit_transform(X)
    return X_transformed

### One VS Rest

In [15]:
class OneVSRest(BaseEstimator, ClassifierMixin):
    def __init__(self, labels):
        self.labels = labels

        self.n_cls = len(labels)
        self.clfs = list()

        self.time = None
        self.probs = None

    def fit(self, X, y):
        start_time = time.time()
        for label in self.labels:
            print(f'Fitting {label} vs rest.')
            mask = (y.astype(int) == label)
            y_two_classes = mask.astype(int)

            clf = LogisticRegression()
            clf.fit(X, y_two_classes)

            self.clfs.append(clf)
        self.time = time.time() - start_time

    def predict_proba(self, X):
        self.probs = np.zeros((len(X), self.n_cls))

        for ind, clf in enumerate(self.clfs):
            y_prob = clf.predict_proba(X)
            self.probs[:, ind] = y_prob[:, 1]

        return self.probs

    def predict(self, X):
        self.predict_proba(X)
        return self.probs.argmax(axis=1)

In [5]:
X, y = read_data()
X_transformed = transform_data(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
one_vs_rest = OneVSRest(list(range(10)))

In [8]:
one_vs_rest.fit(X_train, y_train)
y_pred = one_vs_rest.predict(X_test)

Fitting 0 vs rest.




Fitting 1 vs rest.




Fitting 2 vs rest.




Fitting 3 vs rest.




Fitting 4 vs rest.
Fitting 5 vs rest.




Fitting 6 vs rest.




Fitting 7 vs rest.
Fitting 8 vs rest.




Fitting 9 vs rest.




In [12]:
y_test = y_test.astype(int)

In [46]:
print(f'Accuracy train: {accuracy_score(y_train.astype(int), one_vs_rest.predict(X_train))}')
print(f'Accuracy: {accuracy_score(y_test.astype(int), y_pred)}')
print(f'Fitting time: {one_vs_rest.time}')
print(f'Confusion matrix')
print(confusion_matrix(y_test, y_pred))

Accuracy train: 0.9332952380952381
Accuracy: 0.9142857142857143
Fitting time: 2445.3168938159943
Confusion matrix
[[1688    0    5    1    0    4   10    1   12    5]
 [   0 1899    8    9    1   13    4    9   21    5]
 [  16   18 1565   27   17    9   24   26   41    5]
 [  13    8   48 1580    5   46    6   16   42   21]
 [   5   11   12    7 1554    2   12    8   24   71]
 [  16    5    8   62   13 1358   29   13   47   27]
 [  12    6   10    0    7   23 1644    1   15    1]
 [   7    9   26    7   16    5    1 1695    6   51]
 [  18   32   21   44   10   45   10    9 1488   29]
 [   9   15    4   32   59   18    1   52   21 1529]]


In [42]:
class OneVSOne(BaseEstimator, ClassifierMixin):
    def __init__(self, labels):
        self.labels = labels

        self.n_cls = len(labels)
        self.clfs = list()

        self.time = None
        self.probs = None

    def fit(self, X, y):
        start_time = time.time()
        self.clfs = []

        for i in range(self.n_cls):
            i_clf = []
            for j in range(i + 1, self.n_cls):
                print(f'Fitting {i} vs {j}.')
                mask = np.array(y.astype(int) == i) | np.array(y.astype(int) == j)

                X_two_classes = X[mask]
                y_two_classes = y[mask].astype(int)

                y_two_classes[y_two_classes == j] = -1
                y_two_classes[y_two_classes == i] = 1


                clf = LogisticRegression()
                clf.fit(X_two_classes, y_two_classes)

                i_clf.append(clf)
            self.clfs.append(i_clf)

        self.time = time.time() - start_time

    def predict(self, X):
        self.probs = np.zeros((len(X), self.n_cls, self.n_cls))

        for i in range(self.n_cls):
            for j in range(i + 1, self.n_cls):
                clf = self.clfs[i][j - i - 1]
                self.probs[:, i, j] = clf.predict(X)
                self.probs[:, j, i] = -self.probs[:, i, j]

        return self.probs.sum(axis=2).argmax(axis=1)

In [43]:
one_vs_one = OneVSOne(list(range(10)))

In [44]:
one_vs_one.fit(X_train, y_train)
y_pred_OvsO = one_vs_one.predict(X_test)

Fitting 0 vs 1.




Fitting 0 vs 2.




Fitting 0 vs 3.




Fitting 0 vs 4.




Fitting 0 vs 5.




Fitting 0 vs 6.




Fitting 0 vs 7.




Fitting 0 vs 8.




Fitting 0 vs 9.




Fitting 1 vs 2.




Fitting 1 vs 3.




Fitting 1 vs 4.




Fitting 1 vs 5.




Fitting 1 vs 6.




Fitting 1 vs 7.




Fitting 1 vs 8.




Fitting 1 vs 9.




Fitting 2 vs 3.




Fitting 2 vs 4.




Fitting 2 vs 5.




Fitting 2 vs 6.




Fitting 2 vs 7.




Fitting 2 vs 8.




Fitting 2 vs 9.




Fitting 3 vs 4.




Fitting 3 vs 5.




Fitting 3 vs 6.




Fitting 3 vs 7.




Fitting 3 vs 8.




Fitting 3 vs 9.




Fitting 4 vs 5.




Fitting 4 vs 6.




Fitting 4 vs 7.




Fitting 4 vs 8.




Fitting 4 vs 9.




Fitting 5 vs 6.




Fitting 5 vs 7.




Fitting 5 vs 8.




Fitting 5 vs 9.
Fitting 6 vs 7.




Fitting 6 vs 8.




Fitting 6 vs 9.




Fitting 7 vs 8.




Fitting 7 vs 9.




Fitting 8 vs 9.




In [47]:
print(f'Accuracy train: {accuracy_score(y_train.astype(int), one_vs_one.predict(X_train))}')
print(f'Accuracy test: {accuracy_score(y_test.astype(int), y_pred_OvsO)}')
print(f'Fitting time: {one_vs_one.time}')
print(f'Confusion matrix')
print(confusion_matrix(y_test, y_pred_OvsO))

Accuracy train: 0.9838476190476191
Accuracy test: 0.9189142857142857
Fitting time: 387.65949273109436
Confusion matrix
[[1684    0    6    3    3   17    6    0    5    2]
 [   0 1921   13    9    1    5    1    6   11    2]
 [  15   20 1593   29   17    6   13   15   36    4]
 [  13    7   45 1620    2   53    2   11   24    8]
 [   4    8   28    3 1571    0   15   14    3   60]
 [  21    7   12   72   12 1386   20    2   29   17]
 [  19    3   29    0   15   26 1622    0    5    0]
 [   7    8   22   14   24    4    0 1700    9   35]
 [  19   33   32   67    8   38   13    6 1482    8]
 [  12   13    5   27   77   15    1   72   16 1502]]


### Resume

- Данные были разбиты на тренировачную и тестовую выборки в пропорции 3 к 1. Также выборки формировались с учетом процентного содержания классов в изначальной выборке.
- Качество оценивалось с помощью точности.
- Можно видеть, что оба подхода выдали высокое качество: 0.91 на тестовой выборке. При этом, подход 1 vs 1 дает качество 0.98 на тренировочной выборке.
- Для подхода One vs Rest - O(n), для подхода One vs One - O(n^2). 
- В реальности, как видно из результатов, второй подход сработал быстрее. Это вызвано тем, что в первом подходе для обучения каждого классификатора мы используем всю выборку, а во втором - только 2/10 (у нас классы примерно сбалансированы).