In [1]:
# Both `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Inject CSS that sets the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Datenimport

In [3]:
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset from OpenML; as_frame=False requests NumPy
# arrays instead of pandas objects.
mnist = fetch_openml('mnist_784', as_frame=False)

In [4]:
# Unpack the fetched dataset into the feature matrix X and the label vector y.
X, y = mnist.data, mnist.target

In [5]:
# X

In [6]:
# X.shape

In [7]:
# y

In [9]:
# y.shape

## Bild darstellen

In [8]:
def plot_digit(image_data):
    """Draw one flattened digit image on the current matplotlib axes.

    Parameters
    ----------
    image_data : array-like with a ``reshape`` method
        Flat pixel vector that can be reshaped to 28 x 28.

    The image is rendered with the 'binary' colormap and the axes are hidden.
    Nothing is returned; the caller decides when to show the figure.
    """
    pixels = image_data.reshape(28, 28)
    plt.imshow(pixels, cmap='binary')
    plt.axis('off')

In [10]:
# Keep the first sample as the running example for the prediction cells below.
some_digit = X[0]
# plot_digit(some_digit)

In [10]:
# Label of the first sample; note that the labels are strings, not integers.
y[0]

'5'

# Test- und Trainingsdaten erzeugen

In [11]:
# The first 60000 rows form the training set, the remaining rows the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Binärer Klassifikator

## Model Training

In [12]:
# Boolean targets for the binary "5 vs. not-5" task (labels are strings, hence '5').
y_train_5 = (y_train == '5')
y_test_5 = (y_test == '5')

In [13]:
from sklearn.linear_model import SGDClassifier

# Train a linear SGD classifier on the boolean 5-detector targets;
# random_state is fixed so that reruns produce the same model.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [14]:
# predict() expects a batch of samples, so the single digit is wrapped in a list.
sgd_clf.predict([some_digit])

array([ True])

## Qualitätsmaße

### Kreuzvalidierung (S. 139 ff)

In [15]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the 5-detector on the training set.
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

array([0.95035, 0.96035, 0.9604 ])

### Konfusionsmatrix (S. 140 ff)

In [16]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3) # returns the predictions computed for each test fold

In [17]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels vs. cross-validated predictions:
# rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_train_5, y_train_pred)
cm

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

### Relevanz und Sensitivität (S. 142 ff)

* Relevanz (Precision) = $\frac{RP}{RP + FP} \to$ Genauigkeit
* Sensitivität (Recall) = $\frac{RP}{RP + FN} \to$ Trefferquote
* Ein Erhöhen der Relevanz senkt die Sensitivität und umgekehrt

In [19]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the cross-validated predictions, rounded to two decimals for display.
print(f'Precision: {round(precision_score(y_train_5, y_train_pred), 2)}')
print(f'Recall: {round(recall_score(y_train_5, y_train_pred), 2)}')

Precision: 0.84
Recall: 0.65


### $F_{1}$-Score

Harmonischer Mittelwert von Relevanz und Sensitivität

In [20]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated 5-detector predictions.
f1_score(y_train_5, y_train_pred)

0.7325171197343846

### Wechselbeziehung zwischen Relevanz und Sensitivität

In [21]:
# Raw decision score of the example digit, i.e. the value that predict() thresholds.
y_scores = sgd_clf.decision_function([some_digit])
y_scores

array([2164.22030239])

### Die Receiver Operating Characteristics (ROC) Kurve

* Zeigt Richtig-positiv-Rate (TPR, anderer Name für Sensitivität) gegen Falsch-positiv-Rate (FPR, Ausfallrate)
* FPR: Anteil negativer Datenpunkte, die fälschlicherweise als positiv eingestuft worden sind

In [22]:
from sklearn.metrics import roc_curve

# roc_curve sweeps a decision threshold over continuous scores. Feeding it the
# hard True/False predictions in y_train_pred collapses the curve to a single
# operating point, so collect out-of-fold decision scores for every training
# sample instead and build the curve from those.
y_train_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                                   method="decision_function")
fpr, tpr, thresholds = roc_curve(y_train_5, y_train_scores)

# Klassifikatoren mit mehreren Kategorien (S. 151 ff)

## SVC

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier on the original string labels (all digit classes),
# using only the first 2000 training samples (presumably to keep training time short).
svm_clf = SVC(random_state=42)
svm_clf.fit(X_train[:2000], y_train[:2000])

In [None]:
# Predicted class label for the example digit.
svm_clf.predict([some_digit])

In [None]:
# Decision scores of the example digit (the argmax is used as the class index below),
# rounded to two decimals for readability.
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores.round(2)

In [None]:
# Position of the highest score; used to look up the matching label in classes_.
class_id = some_digit_scores.argmax()
class_id

In [None]:
# Class labels known to the fitted classifier.
svm_clf.classes_

In [None]:
svm_clf.classes_[class_id] # determine the label of the class

## OvR

### SVM

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Wrap SVC in an explicit one-vs-rest strategy and fit it on the same
# 2000-sample subset used for svm_clf.
ovr_clf = OneVsRestClassifier(SVC(random_state=42))
ovr_clf.fit(X_train[:2000], y_train[:2000])

In [None]:
# Predicted class label of the example digit under the one-vs-rest wrapper.
ovr_clf.predict([some_digit])

### SGDClassifier

In [26]:
# NOTE: this rebinds sgd_clf to a multiclass model trained on the full string
# labels; the binary 5-detector fitted earlier is no longer reachable under
# this name, so cells above must not be re-run after this one without refitting.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

array(['3'], dtype='<U1')

In [27]:
# Decision scores of the multiclass model for the example digit, rounded to whole numbers.
sgd_clf.decision_function([some_digit]).round()

NameError: name 'sgd_clf_clf' is not defined

In [None]:
# cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [25]:
# Improvement through scaling
from sklearn.preprocessing import StandardScaler

# Cast the features to float64, then fit the scaler and standardize them in one step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype('float64'))
# cross_val_score(sgd_clf, X_train_scaled, y_train,cv=3, scoring="accuracy")

array([0.8983, 0.891 , 0.9018])

## Fehleranalyse (S. 154 ff)

### Konfusionsmatrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE: this rebinds y_train_pred (previously the binary 5-detector predictions)
# to the multiclass cross-validated predictions on the scaled features.
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred)
plt.show()

In [None]:
# Normalized over the true labels (rows), shown as percentages
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, normalize="true", values_format=".0%")
plt.show()

In [None]:
# Weight of zero for correct predictions, so only misclassified samples contribute
sample_weight = (y_train_pred != y_train)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, sample_weight=sample_weight, normalize="true", values_format=".0%")
plt.show()

In [None]:
# Normalization by column (over the predicted labels)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, normalize="pred", values_format=".0%")
plt.show()

# Klassifikation mit mehreren Labels (S. 158 ff)

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# Convert the string labels to integers once so that both targets are built
# from numeric comparisons. Comparing the raw strings (y_train >= '7') is a
# lexicographic comparison that only gives the right answer because every
# label is a single character.
y_train_digits = y_train.astype('int8')
y_train_large = (y_train_digits >= 7)    # first label: digit is 7, 8 or 9
y_train_odd = (y_train_digits % 2 == 1)  # second label: digit is odd
# Stack both boolean vectors column-wise into one two-column target array.
y_multilabel = np.c_[y_train_large, y_train_odd]

# Fit k-nearest-neighbours on the two-column (multilabel) target.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [20]:
# One row per sample, with one column per label in the order (large, odd).
knn_clf.predict([some_digit])

array([[False,  True]])

## $F_1$-Score

In [None]:
# y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
# f1_score(y_multilabel, y_train_knn_pred, average="macro")

## Chain Classifier

In [24]:
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC

# Chain of SVC models over the two labels, fitted on the first 2000 samples only.
chain_clf = ClassifierChain(SVC(), cv=3, random_state=42)
chain_clf.fit(X_train[:2000], y_multilabel[:2000])

In [25]:
# Chain prediction for the example digit; columns follow the order (large, odd).
chain_clf.predict([some_digit])

array([[0., 1.]])