In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from sklearn.metrics import accuracy_score

def evaluate_classifier(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report train/test accuracy.

    Parameters
    ----------
    name : str
        Label printed in front of the two accuracies.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test
        Feature matrices of the training and held-out splits.
    y_train, y_test
        Label vectors matching ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as returned by ``accuracy_score``.
    """
    model.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    print("%s train/test accuracies %.3f/%.3f" % (name, train_acc, test_acc))
    return train_acc, test_acc


# Load the breast cancer dataset.
data = load_breast_cancer()
x = pd.DataFrame(data.data)
y = data.target
# Split the data, holding out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.3)

# Baseline: accuracy of a single decision tree (no depth limit).
# random_state is pinned so that re-running the cell gives the same tree;
# the other seeded models in this cell already use random_state=0.
tree = DecisionTreeClassifier(criterion="entropy", max_depth=None, random_state=0)
tree_train, tree_test = evaluate_classifier(
    "Decision tree", tree, X_train, X_test, y_train, y_test
)

# Accuracy of the boosting classifiers (same n_estimators / learning_rate for all three).
ada = AdaBoostClassifier(n_estimators=1000, learning_rate=0.1, random_state=0)
ada_train, ada_test = evaluate_classifier(
    "AdaBoost", ada, X_train, X_test, y_train, y_test
)

gbdt = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, random_state=0)
gbdt_train, gbdt_test = evaluate_classifier(
    "GBDT", gbdt, X_train, X_test, y_train, y_test
)

# Seed made explicit for consistency with the other models.
xgb = xgboost.XGBClassifier(n_estimators=1000, learning_rate=0.1, random_state=0)
xgb_train, xgb_test = evaluate_classifier(
    "XGBoost", xgb, X_train, X_test, y_train, y_test
)
# NOTE: exact accuracies depend on the installed library versions, so they
# are intentionally not hard-coded in comments here.

Decision tree train/test accuracies 1.000/0.947
AdaBoost train/test accuracies 1.000/0.977
GBDT train/test accuracies 1.000/0.988
XGBoost train/test accuracies 1.000/0.965


In [7]:
from __future__ import division
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    # Hand-rolled stacking: out-of-fold class-1 probabilities of several base
    # classifiers become the input features of a logistic-regression
    # meta-model, which is then compared with each base model on its own.
    np.random.seed(0)  # seed NumPy's global RNG (used by the optional shuffle below)
    n_folds = 10
    verbose = True  # when True, print per-fold progress while building the features
    shuffle = False
    data = load_breast_cancer()
    X = data.data
    y = data.target
    # Split the data, holding out 20% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2
    )

    if shuffle:
        idx = np.random.permutation(y_train.size)
        X_train = X_train[idx]
        y_train = y_train[idx]

    skf = StratifiedKFold(n_folds)

    # Base (level-0) classifiers.
    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="gini"),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion="entropy"),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion="gini"),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion="entropy"),
        GradientBoostingClassifier(
            learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100
        ),
    ]

    print("Creating train and test sets for stacking.")

    # One column per base classifier.
    dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        # Test-set probabilities from each fold's model; averaged after the loop.
        dataset_blend_test_j = np.zeros((X_test.shape[0], n_folds))
        for i, (train, test) in enumerate(skf.split(X_train, y_train)):
            if verbose:
                print("Fold", i)
            clf.fit(X_train[train], y_train[train])
            # Out-of-fold predictions: each training row is scored by a model
            # that was not fitted on it.
            dataset_blend_train[test, j] = clf.predict_proba(X_train[test])[:, 1]
            dataset_blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print("Stacking.")
    # Meta (level-1) model trained on the base-model probabilities.
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y_train)
    print(
        "Stacking Accuracy %0.6f:"
        % accuracy_score(y_test, clf.predict(dataset_blend_test))
    )
    # Reference point: each base model refitted on the full training split.
    for n, model in enumerate(clfs, start=1):
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        print("模型%d,Accuracy %0.6f:" % (n, accuracy_score(y_test, y_test_pred)))

Creating train and test sets for stacking.
0 RandomForestClassifier(n_jobs=-1)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
1 RandomForestClassifier(criterion='entropy', n_jobs=-1)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
2 ExtraTreesClassifier(n_jobs=-1)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
3 ExtraTreesClassifier(criterion='entropy', n_jobs=-1)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
4 GradientBoostingClassifier(learning_rate=0.05, max_depth=6, subsample=0.5)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Stacking.
Stacking Accuracy 0.964912:
模型1,Accuracy 0.964912:
模型2,Accuracy 0.964912:
模型3,Accuracy 0.973684:
模型4,Accuracy 0.956140:
模型5,Accuracy 0.947368:
