In [49]:
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

#from sklearn.datasets import fetch_mldata
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# fetch MNIST data: Dosen't work!
#mnist = fetch_mldata("MNIST original")

# load csv
df_train = pd.read_csv("../datasets/mnist_train.csv")
df_test = pd.read_csv("../datasets/mnist_test.csv")
X_train = df_train.drop(df_train.columns[[0]], axis=1).values
y_train = df_train[df_train.columns[[0]]].values
X_test = df_test.drop(df_train.columns[[0]], axis=1).values
y_test = df_test[df_train.columns[[0]]].values

# plot one example as image
some_digit = X_train[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.figure()
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis("off")
plt.savefig("../plots/ex_3_01.pdf")
plt.axis("on")

# permuatate train set
shuffle_index = np.random.permutation(len(y_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# SGD for "5"
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

# CV for accuracy
sgd_accuracy = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

# CV for confusion matrix
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
sgd_conMat = confusion_matrix(y_train_5, y_train_pred)

# score for each instance
y_scores = sgd_clf.decision_function([some_digit])

# scores for all
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")

# random forest classifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, 
                                    cv=3, method="predict_proba")
y_scores_forest = y_probas_forest[:, -1]

# calculate precision and recall as a function of thresholds
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
precisions_forest, recalls_forest, thresholds_forest = precision_recall_curve(y_train_5, 
                                                                              y_scores_forest)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.ylim([0,1])
plt.figure()
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.savefig("../plots/ex_3_02.pdf")
plt.figure()
plt.plot(recalls, precisions, "b:", linewidth=2, label="SGD")
plt.plot(recalls_forest, precisions_forest, linewidth=2, label="Random Forest")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim([0,1])
plt.ylim([0,1])
plt.legend(loc="lower left")
plt.savefig("../plots/ex_3_03.pdf")

# ROC
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], "k--")
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
plt.figure()
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")
plt.savefig("../plots/ex_3_04.pdf")
roc_auc_score(y_train_5, y_scores)
roc_auc_score(y_train_5, y_scores_forest)


0.99293870816188523