In [1]:
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Datenimport

In [3]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', as_frame=False)

In [4]:
X, y = mnist.data, mnist.target

In [5]:
# X

In [6]:
# X.shape

In [7]:
# y

In [9]:
# y.shape

## Bild darstellen

In [8]:
def plot_digit(image_data):
    image = image_data.reshape(28,28)
    plt.imshow(image, cmap='binary')
    plt.axis('off')

In [10]:
some_digit = X[0]
# plot_digit(some_digit)

In [10]:
y[0]

'5'

# Test- und Trainingsdaten erzeugen

In [11]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Binären Klassifikator

## Model Training

In [12]:
y_train_5 = (y_train == '5')
y_test_5 = (y_test == '5')

In [13]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [14]:
sgd_clf.predict([some_digit])

array([ True])

## Qualitätsmaße

### Kreuzvalidierung (S. 139 ff)

In [15]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

array([0.95035, 0.96035, 0.9604 ])

### Konfusionsmatrix (S. 140 ff)

In [16]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3) # liefert für jeden Test-Fold die berechneten Vorhersagen

In [17]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train_5, y_train_pred)
cm

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

### Relevanz und Sensivität (S. 142 ff)

* Relevanz (Precision) = $\frac{RP}{RP + FP} \to$ Genauigkeit
* Sensitivität (Recall) = $\frac{RP}{RP + FN} \to$ Trefferquote
* Ein Erhöhen der Relevanz senkt die Sensitivität und umgekehrt

In [19]:
from sklearn.metrics import precision_score, recall_score

print(f'Precision: {round(precision_score(y_train_5, y_train_pred), 2)}')
print(f'Recall: {round(recall_score(y_train_5, y_train_pred), 2)}')

Precision: 0.84
Recall: 0.65


### $F_{1}$-Score

Harmonischer Mittelwert von Relevanz und Sensitivität

In [20]:
from sklearn.metrics import f1_score

f1_score(y_train_5, y_train_pred)

0.7325171197343846

### Wechselbeziehung zwischen Relevanz und Sensitivtät

In [21]:
y_scores = sgd_clf.decision_function([some_digit])
y_scores

array([2164.22030239])

### Die Receiver Operating Characteristics (ROC) Kurve

* Zeigt Richtig-positiv-Rate (TPR, anderer Name für Sensitivität) gegen Falsch-positiv-Rate (FPR, Ausfallrate)
* FPR: Anteil negativer Datenpunkte, die fälschlicherweise als positiv eingestuft worden sind

In [22]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_train_pred)

# Klassifikatoren mit mehreren Kategorien (S. 151 ff)

## SVC

In [None]:
from sklearn.svm import SVC

svm_clf = SVC(random_state=42)
svm_clf.fit(X_train[:2000], y_train[:2000])

In [None]:
svm_clf.predict([some_digit])

In [None]:
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores.round(2)

In [None]:
class_id = some_digit_scores.argmax()
class_id

In [None]:
svm_clf.classes_

In [None]:
svm_clf.classes_[class_id] # Label der Kategorie bestimmen

## OvR

### SVM

In [None]:
from sklearn.multiclass import OneVsRestClassifier

ovr_clf = OneVsRestClassifier(SVC(random_state=42))
ovr_clf.fit(X_train[:2000], y_train[:2000])

In [None]:
ovr_clf.predict([some_digit])

### SGDClassifier

In [26]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

array(['3'], dtype='<U1')

In [27]:
sgd_clf.decision_function([some_digit]).round()

NameError: name 'sgd_clf_clf' is not defined

In [None]:
# cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [25]:
# Verbesserung durch Skalieren
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype('float64'))
# cross_val_score(sgd_clf, X_train_scaled, y_train,cv=3, scoring="accuracy")

array([0.8983, 0.891 , 0.9018])

## Fehleranalyse (S. 154 ff)

### Confusionmatrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred)
plt.show()

In [None]:
# Normalisiert
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, normalize="true", values_format=".0%")
plt.show()

In [None]:
# Gewichtung von Null für korrekte Vorhersagen
sample_weight = (y_train_pred != y_train)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, sample_weight=sample_weight, normalize="true", values_format=".0%")
plt.show()

In [None]:
# Normalisierung nach Spalten
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred, normalize="pred", values_format=".0%")
plt.show()

# Klassifikation mit mehreren Labels (S. 158 ff)

In [16]:
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= '7')
y_train_odd = (y_train.astype('int8')% 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [20]:
knn_clf.predict([some_digit])

array([[False,  True]])

## $F_1$-Score

In [None]:
# y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
# f1_score(y_multilabel, y_train_knn_pred, average="macro")

## Chain Classifier

In [24]:
from sklearn.multioutput import ClassifierChain
from sklearn.svm import SVC

chain_clf = ClassifierChain(SVC(), cv=3, random_state=42)
chain_clf.fit(X_train[:2000], y_multilabel[:2000])

In [25]:
chain_clf.predict([some_digit])

array([[0., 1.]])

# Übungen
Lösungen : https://github.com/ageron/handson-ml2/blob/master/03_classification.ipynb

## MNIST-Klassifikator

In [4]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', as_frame=False)

  warn(


In [6]:
X, y = mnist.data, mnist.target
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = [
    {'n_neighbors': [3, 4, 5]},
    {'weights': ['uniform', 'distance']},
]

grid_search = GridSearchCV(
    KNeighborsClassifier(), 
    param_grid,
    cv=5,
    verbose=3
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .....................n_neighbors=3;, score=0.972 total time=   8.6s
[CV 2/5] END .....................n_neighbors=3;, score=0.971 total time=   8.8s
[CV 3/5] END .....................n_neighbors=3;, score=0.969 total time=   8.6s
[CV 4/5] END .....................n_neighbors=3;, score=0.969 total time=  10.3s
[CV 5/5] END .....................n_neighbors=3;, score=0.970 total time=   9.9s
[CV 1/5] END .....................n_neighbors=4;, score=0.969 total time=   9.6s
[CV 2/5] END .....................n_neighbors=4;, score=0.968 total time=  10.3s
[CV 3/5] END .....................n_neighbors=4;, score=0.968 total time=  10.4s
[CV 4/5] END .....................n_neighbors=4;, score=0.967 total time=   9.8s
[CV 5/5] END .....................n_neighbors=4;, score=0.970 total time=  10.5s
[CV 1/5] END .....................n_neighbors=5;, score=0.970 total time=  10.3s
[CV 2/5] END .....................n_neighbors=5;,

In [35]:
grid_search.best_params_

{'weights': 'distance'}

In [36]:
grid_search.best_score_

0.9704166666666667

## Titanic

### Data Import

In [38]:

import os
import urllib.request

TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()    

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

### Trainings- und Testdaten

In [59]:
train_data = load_titanic_data("train.csv").set_index("PassengerId")
test_data = load_titanic_data("test.csv").set_index("PassengerId")

In [63]:
train_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen 'Carrie'",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [62]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [84]:
test_data

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### Pipeline

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [71]:
num_pipline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(sparse=False))
])

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer([
    ('num', num_pipline, num_attribs),
    ('cat', cat_pipeline, cat_attribs)
])

In [73]:
X_train = preprocess_pipeline.fit_transform(train_data[num_attribs + cat_attribs])
X_train



array([[-0.56573582,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.6638609 ,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.10463705,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833664, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276213, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [74]:
y_train = train_data["Survived"]

### Model

#### RandomForestClassifier

In [75]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

In [86]:
X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
y_pred = forest_clf.predict(X_test)
# y_pred

In [78]:
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8137578027465668

#### SVC

In [83]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.8249313358302123