# In [1]:
import sys
import warnings
# Silence every warning unless warning options were passed to the
# interpreter (sys.warnoptions is then non-empty and is respected).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

#from sklearn.datasets import fetch_mldata
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# fetch MNIST data: Doesn't work!
#mnist = fetch_mldata("MNIST original")

# Load the MNIST train/test CSV files. The first column of each file is
# used as the label vector; every remaining column is a feature.
df_train = pd.read_csv("../datasets/mnist_train.csv")
df_test = pd.read_csv("../datasets/mnist_test.csv")

train_label_column = df_train.columns[[0]]
X_train = df_train.drop(columns=train_label_column).values
y_train = df_train[train_label_column].values.ravel()

test_label_column = df_test.columns[[0]]
X_test = df_test.drop(columns=test_label_column).values
y_test = df_test[test_label_column].values.ravel()

# Render one training example as a 28x28 image and save it to disk.
# The example is picked here, before the training set is shuffled below.
some_index = 36000
some_digit = X_train[some_index]
some_digit_image = np.reshape(some_digit, (28, 28))
plt.figure()
plt.imshow(
    some_digit_image,
    interpolation="nearest",
    cmap=matplotlib.cm.binary,
)
plt.axis("off")  # no ticks or frame in the saved image
plt.savefig("../plots/ex_3_01.pdf")
plt.axis("on")  # switch the axes back on once the file is written

# Permute the training set: one random index order is applied to both
# the features and the labels so they stay aligned.
shuffle_index = np.random.permutation(len(y_train))
X_train = X_train[shuffle_index]
y_train = y_train[shuffle_index]

# Binary task "is the digit a 5?": boolean targets for train and test,
# then an SGD classifier fitted on the full training set.
y_train_5 = np.equal(y_train, 5)
y_test_5 = np.equal(y_test, 5)
sgd_clf = SGDClassifier(random_state=42).fit(X_train, y_train_5)

# 3-fold cross-validated accuracy of the binary SGD classifier
sgd_accuracy = cross_val_score(
    estimator=sgd_clf, X=X_train, y=y_train_5, scoring="accuracy", cv=3
)

# 3-fold cross-validated predictions, summarised as a confusion matrix
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train_5, cv=3)
sgd_conMat = confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

# Decision score of the fitted binary SGD classifier for the example digit.
# Stored under its own name so it is not clobbered by the cross-validated
# scores assigned to y_scores right below.
some_digit_score = sgd_clf.decision_function([some_digit])

# Decision scores for all training instances, obtained with 3-fold CV
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")

# Random forest on the same binary task: cross-validated class
# probabilities, of which the last column is used as the score.
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(
    estimator=forest_clf,
    X=X_train,
    y=y_train_5,
    method="predict_proba",
    cv=3,
)
y_scores_forest = y_probas_forest[:, -1]

# Precision and recall as a function of the decision threshold,
# once for the SGD scores and once for the random forest scores.
sgd_pr_curve = precision_recall_curve(y_train_5, y_scores)
precisions, recalls, thresholds = sgd_pr_curve
forest_pr_curve = precision_recall_curve(y_train_5, y_scores_forest)
precisions_forest, recalls_forest, thresholds_forest = forest_pr_curve
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the threshold on the current axes.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so each curve is drawn over ``thresholds`` with one point
    fewer than the input series.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=name)
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.ylim([0, 1])
# Figure 2: precision and recall versus threshold for the SGD classifier
plt.figure()
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.savefig("../plots/ex_3_02.pdf")

# Figure 3: precision versus recall, SGD against random forest
plt.figure()
plt.plot(recalls, precisions, "b:", label="SGD", linewidth=2)
plt.plot(recalls_forest, precisions_forest, label="Random Forest", linewidth=2)
plt.ylabel("Precision")
plt.xlabel("Recall")
plt.ylim([0, 1])
plt.xlim([0, 1])
plt.legend(loc="lower left")
plt.savefig("../plots/ex_3_03.pdf")

# ROC curves for both classifiers. Note that `thresholds` and
# `thresholds_forest` are rebound here to the ROC thresholds.
sgd_roc = roc_curve(y_train_5, y_scores)
fpr, tpr, thresholds = sgd_roc
forest_roc = roc_curve(y_train_5, y_scores_forest)
fpr_forest, tpr_forest, thresholds_forest = forest_roc
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus the dashed diagonal on the current axes.

    Both axes are fixed to the range [0, 1] and labelled with the false
    and true positive rate. ``label`` is passed on as the legend label of
    the curve.
    """
    unit_interval = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(unit_interval, unit_interval, "k--")  # dashed diagonal
    plt.axis(unit_interval + unit_interval)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
# Figure 4: ROC curves of the SGD classifier and the random forest
plt.figure()
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")
plt.savefig("../plots/ex_3_04.pdf")

# Area under the ROC curve for both classifiers. The values are bound to
# names so the computed results are not thrown away.
sgd_roc_auc = roc_auc_score(y_train_5, y_scores)
forest_roc_auc = roc_auc_score(y_train_5, y_scores_forest)

# SGD OvA: refit the SGD classifier on all ten digit labels and get the
# decision scores for the example digit
sgd_clf.fit(X_train, y_train)
some_digit_scores = sgd_clf.decision_function(some_digit.reshape(1, -1))

# SGD forced OvO: wrap a fresh SGD classifier in a one-vs-one strategy
ovo_base_clf = SGDClassifier(random_state=42)
ovo_clf = OneVsOneClassifier(ovo_base_clf)
ovo_clf.fit(X_train, y_train)

# SGD OvA CV: 3-fold accuracy on the raw features. Kept under its own
# name so it can be compared with the scaled result below instead of
# being overwritten by it.
cv_scores_unscaled = cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

# SGD OvA scaling: standardise the features (cast to float64 first) and
# repeat the same cross-validation on the scaled data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cv_scores = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

# Error analysis: 3-fold cross-validated multiclass predictions of the
# random forest on the scaled features, and their confusion matrix
y_train_pred = cross_val_predict(
    estimator=forest_clf, X=X_train_scaled, y=y_train, cv=3
)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)

# Plot the confusion matrix. plt.matshow creates its own figure when no
# fignum is given, so no plt.figure() call precedes it (that would only
# leave an empty, never-saved figure open).
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.savefig("../plots/ex_3_05.pdf")

# Normalize the confusion matrix: divide each row by its row total, then
# zero the diagonal so only the off-diagonal entries (the errors) remain.
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
# plt.matshow creates its own figure when no fignum is given, so no
# plt.figure() call precedes it (that would only leave an empty figure open).
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.savefig("../plots/ex_3_06.pdf")

# Look into "3" and "5": split the training images by true class (first
# letter) and predicted class (second letter), for the two classes a and b.
cl_a, cl_b = 3, 5
is_true_a, is_true_b = (y_train == cl_a), (y_train == cl_b)
is_pred_a, is_pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)
X_aa = X_train[is_true_a & is_pred_a]
X_ab = X_train[is_true_a & is_pred_b]
X_ba = X_train[is_true_b & is_pred_a]
X_bb = X_train[is_true_b & is_pred_b]
def plot_digits(instances, images_per_row=10, **options):
    """Tile flattened 28x28 images into a grid and draw it on the current axes.

    Args:
        instances: Sequence of flattened images; each one is reshaped to
            28x28.
        images_per_row: Maximum number of images per grid row; clamped to
            the number of instances.
        **options: Extra keyword arguments forwarded to ``plt.imshow``.

    An empty ``instances`` leaves the axes blank (with the axis hidden)
    instead of raising ``ZeroDivisionError``.
    """
    size = 28
    n_images = len(instances)
    if n_images == 0:
        # With no instances the clamped row width would be 0 and the row
        # count below would divide by zero; there is nothing to tile.
        plt.axis("off")
        return
    images_per_row = min(n_images, images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (n_images - 1) // images_per_row + 1
    # Pad the last row with a single blank block so all rows share a width.
    n_empty = n_rows * images_per_row - n_images
    images.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(images[row * images_per_row:(row + 1) * images_per_row], axis=1)
        for row in range(n_rows)
    ]
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")
# 2x2 figure with up to 25 examples from each true/predicted combination
plt.figure(figsize=(8, 8))
digit_panels = (
    (221, X_aa),
    (222, X_ab),
    (223, X_ba),
    (224, X_bb),
)
for subplot_position, panel_digits in digit_panels:
    plt.subplot(subplot_position)
    plot_digits(panel_digits[:25], images_per_row=5)
plt.savefig("../plots/ex_3_07.pdf")

# Multilabel classification: two boolean targets per digit
# (label >= 7, label is odd), learned jointly by one KNN classifier
y_train_large = np.greater_equal(y_train, 7)
y_train_odd = np.equal(y_train % 2, 1)
y_multilabel = np.column_stack((y_train_large, y_train_odd))
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

# Multioutput-multiclass classification: add random integer noise in
# [0, 100) to every feature; the original features are the targets.
# knn_clf is refitted here, replacing the multilabel fit.
noise = np.random.randint(0, 100, size=(len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, size=(len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod, y_test_mod = X_train, X_test
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict(X_test_mod[3].reshape(1, -1))
# Side by side: noisy test digit, original test digit, KNN prediction
plt.figure()
denoise_panels = (
    (131, X_test_mod[3]),
    (132, X_test[3]),
    (133, clean_digit),
)
for subplot_position, panel_pixels in denoise_panels:
    plt.subplot(subplot_position)
    plt.axis("off")
    plt.imshow(
        panel_pixels.reshape(28, 28),
        interpolation="nearest",
        cmap=matplotlib.cm.binary,
    )
plt.savefig("../plots/ex_3_08.pdf")
plt.axis("on")

# Out[1]: (-0.5, 27.5, 27.5, -0.5)