In [2]:
# -*- coding: gbk -*-
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
import copy
'''
代码的博客已经放在收藏夹
'''

class stacking():
    def __init__(self, n_folds=5, meta_model_name='svc'):
        self.n_folds = n_folds
        self.meta_model_name = meta_model_name

        try:
            if self.meta_model_name == 'svc':
                self.meta_model = SVC(gamma=1, C=1)
            elif self.meta_model_name == 'gbc':
                self.meta_model = GradientBoostingClassifier()
            elif self.meta_model_name == 'etc':
                self.meta_model = ExtraTreesClassifier()
            elif self.meta_model_name == 'dtc':
                self.meta_model = DecisionTreeClassifier()
            elif self.meta_model_name == 'rfc':
                self.meta_model = RandomForestClassifier(n_estimators=50)
        except ValueError:
            print('error arg')

        self.gbc_model = GradientBoostingClassifier()
        self.svc_model = SVC(gamma=0.5, C=50)
        self.etc_model = ExtraTreesClassifier()
        self.dtc_model = DecisionTreeClassifier()
        self.rfc_model = RandomForestClassifier(n_estimators=50)

        self.gbc_models = []
        self.svc_models = []
        self.etc_models = []
        self.dtc_models = []
        self.rfc_models = []

        self.base_models = [self.gbc_models, self.svc_models,
                            self.etc_models, self.dtc_models, self.rfc_models]

    def get_n_folds(self, X, y):
        n_X = []
        n_y = []
        for i in range(self.n_folds):
            n_X.append(X[i::self.n_folds])
            n_y.append(y[i::self.n_folds])
        return n_X, n_y

    def multi_base_models_train(self, X, y):
        n_X, n_y = self.get_n_folds(X, y)

        meta_data = np.array([])
        meta_label = np.array([])

        for i in range(self.n_folds):

            index = np.arange(self.n_folds)
            index = np.delete(index, i)

            # 获取n_fold中除去某一折以后的训练集
            n_fold_X = np.array([])
            n_fold_y = np.array([])
            for k in index:
                n_fold_X = np.append(n_fold_X, n_X[k])
                n_fold_y = np.append(n_fold_y, n_y[k])

            n_fold_X = n_fold_X.reshape(-1, X.shape[1])

            part_meta_data = np.array([])

            for model_, j in [(self.gbc_model, 0), (self.svc_model, 1), (self.etc_model, 2), (self.dtc_model, 3), (self.rfc_model, 4)]:

                # 训练初级分类器
                model_.fit(n_fold_X, n_fold_y)

                new_model = copy.deepcopy(model_)
                self.base_models[j].append(new_model)

                predict_y = self.base_models[j][i].predict(n_X[i])
                predict_y = predict_y.reshape(-1, 1)

                # 将一折的预测label数据作为次级训练集
                if j == 0:
                    part_meta_data = predict_y
                else:
                    part_meta_data = np.append(
                        part_meta_data, predict_y, axis=1)
            if i == 0:
                meta_data = part_meta_data
            else:
                meta_data = np.append(meta_data, part_meta_data, axis=0)

            meta_label = np.append(meta_label, n_y[i])

        return meta_data, meta_label

    def meta_model_train(self, X, y):

        self.meta_model.fit(X, y)

    def fit(self, X, y):
        meta_data, meta_label = self.multi_base_models_train(X, y)

        self.meta_model_train(meta_data, meta_label)

    def predict(self, XX):

        for i in range(self.n_folds):  # 选择交叉验证得到的分类器组
            for j in range(5):  # 选择分类器类型
                predict_y = self.base_models[j][i].predict(XX)
                predict_y = predict_y.reshape(-1, 1)

                if j == 0:
                    meta_data = predict_y
                else:
                    meta_data = np.append(meta_data, predict_y, axis=1)
            if i == 0:
                ave_meta_data = meta_data
            else:
                ave_meta_data = ave_meta_data + meta_data

        ave_meta_data = ave_meta_data / self.n_folds
        pre = self.meta_model.predict(ave_meta_data)
        return pre


def count_true(pre_p, p):#计算正确率
    p1 = pre_p - p
    count = 0
    for i in p1:
        if i == 0:
            count += 1
    print(count / p.shape[0])


if __name__ == '__main__':

    label_in = {'A': 0, 'F': 1, 'H': 2, 'E': 3, 'I': 4, 'Y': 5,
                'D': 6, 'X': 7, 'G': 8, 'W': 9, 'C': 10, 'B': 11}  # 将标签从字母转换为数字，便于使用

    dir = 'D:\\2345Downloads\\10.28以后下载的项目\\集成学习\\avila\\'
    train_name = 'avila-tr.txt'
    test_name = 'avila-ts.txt'

    obj = pd.read_csv(dir + train_name, header=None)
    data = np.array(obj[:][:])
    train_label = data[:, -1]

    for i in range(train_label.shape[0]):
        train_label[i] = label_in[train_label[i]]
    train_label = train_label.astype(int)

    train_data = data[:, 0:data.shape[1] - 1]

    obj = pd.read_csv(dir + test_name, header=None)
    data = np.array(obj[:][:])
    test_label = data[:, -1]
    for i in range(test_label.shape[0]):
        test_label[i] = label_in[test_label[i]]
    test_label = test_label.astype(int)

    test_data = data[:, 0:data.shape[1] - 1]

    stk = stacking(n_folds=5, meta_model_name='dtc')
    stk.fit(train_data, train_label)

    for j in range(5):  # 显示出每个基分类器对测试集的预测正确率
        for i in range(stk.n_folds):
            a = stk.base_models[j][i].predict(test_data)
            count_true(a, test_label)
        print('-------')

    a = stk.predict(test_data)
    count_true(a, test_label)


0.9451949794002108
0.9467279869694356
0.9459614831848232
0.9428954680463735
0.9485484334578902
-------
0.8215004311583789
0.8243748203506754
0.8232250646737568
0.8215962441314554
0.8267701446775894
-------
0.9771006994347035
0.9752802529462489
0.9732681805116413
0.9803583405193063
0.9746095621347131
-------
0.9678068410462777
0.9699147264539618
0.9662738334770528
0.9626329405001437
0.9370508766887037
-------
0.9761425697039379
0.9676152151001246
0.9721184248347227
0.9732681805116413
0.9755676918654785
-------
0.9644533869885983
