In [None]:
# Use scikit-learn's helper to fetch the MNIST dataset
from sklearn.datasets import fetch_openml
import numpy as np
import os

In [None]:
# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# Needed to display Chinese text: make SimHei the sans-serif font
mpl.rcParams['font.sans-serif'] = [u'SimHei']
# Turn off the Unicode minus sign for axis ticks -- presumably because
# SimHei cannot render that glyph (TODO confirm)
mpl.rcParams['axes.unicode_minus'] = False

In [None]:
# Very time-consuming cell
def sort_by_target(mnist, n_train=60000):
    """Sort the train and test partitions of ``mnist`` by label, in place.

    The first ``n_train`` rows are treated as the training partition and the
    remaining rows as the test partition.  Each partition is reordered
    independently so that its labels are ascending; rows that share a label
    keep their original relative order.

    Parameters
    ----------
    mnist : object with ``data`` and ``target`` attributes
        Both attributes are overwritten in place.  They must support
        NumPy-style integer-array indexing and slice assignment (i.e. be
        NumPy arrays, not pandas objects).
    n_train : int, default 60000
        Number of leading rows that form the training partition.

    Returns
    -------
    None
    """
    # A stable argsort orders by (label, original position), without building
    # and sorting a Python list of tuples.
    train_order = np.argsort(mnist.target[:n_train], kind="stable")
    # Positions inside the test partition are relative to its start, so shift
    # them back to absolute row numbers.
    test_order = np.argsort(mnist.target[n_train:], kind="stable") + n_train
    # Integer-array indexing on the right-hand side yields a copy, so the
    # in-place slice assignment never reads rows it has already overwritten.
    mnist.data[:n_train] = mnist.data[train_order]
    mnist.target[:n_train] = mnist.target[train_order]
    mnist.data[n_train:] = mnist.data[test_order]
    mnist.target[n_train:] = mnist.target[test_order]
# as_frame=False asks for plain NumPy arrays.  Since scikit-learn 0.24 the
# default (as_frame='auto') returns pandas objects for this dataset, which
# breaks the integer-array indexing in sort_by_target and lookups such as
# X[30000] later in the notebook.
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
# Store the labels as small integers so they sort and compare numerically.
mnist.target = mnist.target.astype(np.int8)
sort_by_target(mnist)
X, y = mnist["data"], mnist["target"]

In [None]:
# Display a single image
def plot_digit(data):
    """Draw one digit with pyplot using the ``binary`` colormap.

    Parameters
    ----------
    data : array-like with a ``reshape`` method
        Must hold exactly 28 * 28 = 784 values; it is reshaped to 28x28
        before being passed to ``plt.imshow``.
    """
    pixels = data.reshape(28, 28)
    plt.imshow(pixels, interpolation="nearest", cmap=mpl.cm.binary)
    # Leave the axis frame and ticks switched on.
    plt.axis("on")
# Pick one sample row and display it; plot_digit reshapes it to 28x28 itself.
some_digit = X[30000]
plot_digit(some_digit)

In [None]:
# Nicer-looking display: tile many digits into a single image
def plot_digits(instances, images_per_row=10, **options):
    """Draw a batch of flattened 28x28 digits as one tiled image.

    Parameters
    ----------
    instances : sequence of array-like
        Each element must support ``.reshape(28, 28)`` (i.e. hold 784 values).
    images_per_row : int, default 10
        Maximum number of digits per grid row; fewer are used when
        ``len(instances)`` is smaller.
    **options
        Forwarded unchanged to ``plt.imshow``.

    Raises
    ------
    ValueError
        If ``instances`` is empty or ``images_per_row`` is less than 1
        (either would otherwise end in a division by zero).
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        raise ValueError("plot_digits needs at least one instance")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")
    # Number of tiles actually placed on each row.
    per_row = min(n_instances, images_per_row)
    tiles = [instance.reshape(size, size) for instance in instances]
    # Number of rows: ceiling of n_instances / per_row.
    n_rows = (n_instances - 1) // per_row + 1
    n_empty = n_rows * per_row - n_instances
    # One blank block pads the last row to full width.  When n_empty is 0 the
    # block has zero width and lies past the last row's slice, so it is unused.
    tiles.append(np.zeros((size, size * n_empty)))
    # Join the tiles of each row side by side ...
    row_images = [
        np.concatenate(tiles[row * per_row:(row + 1) * per_row], axis=1)
        for row in range(n_rows)
    ]
    # ... then stack the rows on top of each other.
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=mpl.cm.binary, **options)
    plt.axis("off")
# Draw a 9x9-inch grid of sample digits taken at regular strides from three
# ranges of X (20 + 30 + 50 = 100 rows, assuming X has at least 60000 rows).
plt.figure(figsize=(9, 9))
example_images = np.r_[
    X[:12000:600],
    X[13000:30600:600],
    X[30600:60000:590]
]
plot_digits(example_images, images_per_row=10)
plt.show()

In [None]:
# First 60000 rows are the training set, everything after is the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
import numpy as np
# Shuffle the training set with one shared permutation so that images and
# labels stay aligned.
shuffer_index = np.random.permutation(60000)
X_train = X_train[shuffer_index]
y_train = y_train[shuffer_index]

In [None]:
# Train a support vector machine on the digits
from sklearn import svm
# Build the SVM model (RBF kernel, one-vs-rest decision function shape)
predictor = svm.SVC(gamma='scale', C=1.0, decision_function_shape='ovr', kernel='rbf')
# Fit on the shuffled training set
predictor.fit(X_train, y_train)
# Predict labels for the test set
result = predictor.predict(X_test)
# Accuracy: correct predictions divided by the actual number of test samples
accurancy = np.sum(np.equal(result, y_test)) / len(y_test)
print(accurancy)
# Decision-function scores for every test sample (displayed as the cell output)
some_digit_scores = predictor.decision_function(X_test)
some_digit_scores

In [None]:
import pandas as pd
# Cross-tabulate true labels (rows) against the labels currently held in
# `result` (columns).
pd.crosstab(
    index=y_test.reshape(-1),
    columns=result,
    rownames=['label'],
    colnames=['predict'],
)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score,roc_curve
# Binary targets for the "is this digit a 5?" task
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
# How random forests work is covered in chapter 7
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
forest_clf.fit(X_train, y_train)
# NOTE: `result` and `accurancy` are rebound here, replacing the SVM values
# computed in the previous cells.
result = forest_clf.predict(X_test)
# Accuracy: correct predictions divided by the actual number of test samples
accurancy = np.sum(np.equal(result, y_test)) / len(y_test)
print(accurancy)

In [None]:
import pandas as pd
# Cross-tabulate true labels (rows) against the labels currently held in
# `result` (columns).
pd.crosstab(
    index=y_test.reshape(-1),
    columns=result,
    rownames=['label'],
    colnames=['predict'],
)

In [None]:
# Cross-validated (cv=3) class probabilities from the random forest on the
# binary "is 5" task.
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve plus the dashed diagonal with pyplot.

    Parameters
    ----------
    fpr, tpr : sequences of float
        False-positive and true-positive rates, plotted as x and y.
    label : str, optional
        Legend label attached to the curve.
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    # Dashed black diagonal from (0, 0) to (1, 1) as a reference line.
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')
    # Fix both axes to the [0, 1] range.
    plt.axis([0, 1, 0, 1])
    for set_axis_label, text in (
        (plt.xlabel, 'False Positive Rate'),
        (plt.ylabel, 'True Positive Rate'),
    ):
        set_axis_label(text, fontsize=16)

# Cross-validated SVM scores for the same binary "is 5" task.  The SVC in
# `predictor` was built without probability=True, so it has no predict_proba;
# its decision_function output is used as the ROC score instead.
# cross_val_predict fits fresh clones, so the earlier multiclass fit of
# `predictor` is not reused.
y_scores_svm = cross_val_predict(predictor, X_train, y_train_5, cv=3,
                                 method="decision_function")
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_train_5, y_scores_svm)

# Overlay both ROC curves on one figure.
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_svm, tpr_svm, "SVM")
plt.plot(fpr_forest, tpr_forest, "b:", linewidth=2, label="RF")
plt.title("SVM与随机森林的ROC曲线")
plt.legend(loc="lower right", fontsize=16)
plt.show()