# Main

## Import Libraries

In [1]:
# import library
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange

# sklearn
import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, r_regression, f_regression
from sklearn.gaussian_process.kernels import Matern, RBF, CompoundKernel, Product, Sum, ExpSineSquared, RationalQuadratic
from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier
from sklearn.metrics import r2_score, make_scorer, f1_score, confusion_matrix
from sklearn.decomposition import PCA

# boost algorithm
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

# torch
import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid, binary_cross_entropy, nll_loss
from torch.optim import Adam, SGD

# bio library
import biosppy
from biosppy import storage
from biosppy.signals import ecg

# import ResNet
from ResNet import ResNet

DATA_DIR = "Data"
RESULT_DIR = "Result"

## Load Features Data

In [2]:
# Load Data
X_train_df = pd.read_csv(os.path.join(DATA_DIR, "X_train.csv"), header=0, index_col=0)
X_test_df = pd.read_csv(os.path.join(DATA_DIR, "X_test.csv"), header=0, index_col=0)
y_train_df = pd.read_csv(os.path.join(DATA_DIR, "y_train.csv"), header=0, index_col=0)

X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values.ravel()

In [2]:
X_train_expert = np.load("./Data/X_train_features.npy")
X_train_DNN = np.load("./Data/X_train_DNN_features.npy")
y_train = np.load("./Data/y_train.npy")

X_test_expert = np.load("./Data/X_test_features.npy")
X_test_DNN = np.load("./Data/X_test_DNN_features.npy")

y_train_DNN = np.load("./Data/DNN_train_result.npy")
y_test_DNN = np.load("./Data/DNN_test_result.npy")

In [3]:
X_train_features = np.concatenate([X_train_expert, X_train_DNN], axis=1)
X_test_features = np.concatenate([X_test_expert, X_test_DNN], axis=1)

In [4]:
X_train_expert_extra = np.load("./Data/X_train_features_extra.npy")
y_train_extra = np.load("./Data/y_train_extra.npy")
pid_extra = np.load("./Data/pid_train_extra.npy")

## Data Preprocessing

In [4]:
# feature_select = SelectKBest(k=100)
# feature_select.fit(X_train_features, y_train)
# X_train_features = feature_select.transform(X_train_features)
# X_test_features = feature_select.transform(X_test_features)

  f = msb / msw


In [4]:
# 计算一些不同类的y和属性
y_onevsall = (y_train == 3).astype(np.int32)
without_class3_ids = np.where(y_train != 3)
X_train_features_without_class3 = X_train_features[without_class3_ids]
y_train_without_class3 = y_train[without_class3_ids]

## Select Models for class 3

In [None]:
# Logistic Classier 0.75
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_expert)):
    # split validation data
    fold_X_train = X_train_expert[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_expert[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = LogisticRegression()
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

## Select Models for class 0 1 2

In [None]:
# Logistic Classier 0.75
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_expert)):
    # split validation data
    fold_X_train = X_train_expert[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_expert[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = LogisticRegression()
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

In [21]:
# Ridge Classifier 0.72
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_expert)):
    # split validation data
    fold_X_train = X_train_expert[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_expert[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = RidgeClassifier(alpha=1)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

0.724642786229228


In [None]:
# Gaussian Classifier
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_features)):
    # split validation data
    fold_X_train = X_train_features[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_features[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = GaussianProcessClassifier(kernel=Matern(nu=1.5))
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

In [22]:
# Random Forest 0.63
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_expert)):
    # split validation data
    fold_X_train = X_train_expert[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_expert[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = RandomForestClassifier(n_estimators=500, max_depth=10, criterion="entropy", max_leaf_nodes=4)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

0.6339633736559139


In [None]:
# GradientBoosting
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_features)):
    # split validation data
    fold_X_train = X_train_features[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_features[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = GradientBoostingClassifier(n_estimators=100, max_depth=5)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

In [None]:
# Adaboost
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_features)):
    # split validation data
    fold_X_train = X_train_features[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_features[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = AdaBoostClassifier(n_estimators=100)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

In [27]:
# Extra Trees
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_features)):
    # split validation data
    fold_X_train = X_train_features[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_features[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = ExtraTreesClassifier(n_estimators=100, max_depth=5, criterion="entropy")
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

0.9994136806573802


In [4]:
# XGB
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_features)):
    # split validation data
    fold_X_train = X_train_features[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_features[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, objective="multi:softmax", num_class=4, n_jobs=-1)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

0.9988271703934506


In [23]:
# CatBoost 0.81
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
cf_mat = pd.DataFrame(np.zeros((3, 3)))
y_onevsall = (y_train == 3).astype(int)
X_train_features_without_class3 = X_train_features[without_class3_ids]
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_expert[without_class3_ids])):
    # split validation data
    fold_X_train = X_train_expert[without_class3_ids][train_ids]
    fold_y_train = y_train_without_class3[train_ids]
    fold_X_valid = X_train_expert[without_class3_ids][valid_ids]
    fold_y_valid = y_train_without_class3[valid_ids]

    # train model
    model = cat.CatBoostClassifier(iterations=2000, learning_rate=0.01, max_depth=5)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
    cf_mat += confusion_matrix(fold_y_valid, fold_y_pred)
fold_score = np.average(fold_scores)
print(fold_score)
cf_mat

0:	learn: 1.0908490	total: 11.8ms	remaining: 23.6s
1:	learn: 1.0836935	total: 17.2ms	remaining: 17.2s
2:	learn: 1.0766253	total: 23.4ms	remaining: 15.6s
3:	learn: 1.0698884	total: 28.9ms	remaining: 14.4s
4:	learn: 1.0623913	total: 34.3ms	remaining: 13.7s
5:	learn: 1.0553225	total: 39.8ms	remaining: 13.2s
6:	learn: 1.0495192	total: 45.1ms	remaining: 12.9s
7:	learn: 1.0429884	total: 50.5ms	remaining: 12.6s
8:	learn: 1.0366043	total: 56.1ms	remaining: 12.4s
9:	learn: 1.0305566	total: 61.7ms	remaining: 12.3s
10:	learn: 1.0238339	total: 67.2ms	remaining: 12.1s
11:	learn: 1.0174769	total: 72.9ms	remaining: 12.1s
12:	learn: 1.0114289	total: 78.9ms	remaining: 12.1s
13:	learn: 1.0056757	total: 85.2ms	remaining: 12.1s
14:	learn: 0.9997502	total: 91.1ms	remaining: 12.1s
15:	learn: 0.9939694	total: 96.7ms	remaining: 12s
16:	learn: 0.9885255	total: 103ms	remaining: 12s
17:	learn: 0.9829438	total: 108ms	remaining: 11.9s
18:	learn: 0.9780331	total: 114ms	remaining: 11.9s
19:	learn: 0.9726972	total: 1

Unnamed: 0,0,1,2
0,2828.0,6.0,196.0
1,42.0,249.0,152.0
2,458.0,54.0,962.0


In [9]:
# LGBM
# 模型：换loss function
fold_num = 5

kf = KFold(n_splits=fold_num)
fold_scores = []
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_features)):
    # split validation data
    fold_X_train = X_train_features[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_features[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = lgb.LGBMClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, objective="multiclass", num_class=4, n_jobs=-1)
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
fold_score = np.average(fold_scores)
print(fold_score)

0.9990226738147605


In [None]:
# stacking Classification
model1 = RandomForestClassifier(n_estimators=500, max_depth=5, criterion="entropy", max_leaf_nodes=4)
model2 = LogisticRegression()
model3 = xgb.XGBClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, num_class=4, n_jobs=-1)
model4 = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, max_depth=5)
model5 = lgb.LGBMClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, num_class=4, n_jobs=-1)
model6 = RidgeClassifier(alpha=1)
model7 = ExtraTreesClassifier(n_estimators=100, max_depth=5, criterion="entropy")
sub_models = [
    ("random forest", model1), 
    ("logistic regression", model2), 
    ("xgboost", model3), 
    ("catboost", model4), 
    ("lgb", model5), 
    ("ridge", model6), 
    ("extra tress", model7), 
]

final_classifier = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, max_depth=5)
stack_model = StackingClassifier(sub_models, final_classifier, stack_method="predict", n_jobs=-1)

kf = KFold(n_splits=5)
fold_scores = []
cf_mat = pd.DataFrame(np.zeros((4, 4)))
y_onevsall = (y_train == 3).astype(int)
X_train_features_without_class3 = X_train_features[without_class3_ids]
for i, (train_ids, valid_ids) in enumerate(kf.split(X_train_expert)):
    # split validation data
    fold_X_train = X_train_expert[train_ids]
    fold_y_train = y_train[train_ids]
    fold_X_valid = X_train_expert[valid_ids]
    fold_y_valid = y_train[valid_ids]

    # train model
    model = stack_model
    model.fit(fold_X_train, fold_y_train)
    fold_y_pred = model.predict(fold_X_valid)

    # calculate score
    fold_scores.append(f1_score(fold_y_valid, fold_y_pred, average="micro"))
    cf_mat += confusion_matrix(fold_y_valid, fold_y_pred)
fold_score = np.average(fold_scores)
print(fold_score)
cf_mat

In [17]:
# DNN Model
class ResNet_sklearn(object):
    def __init__(self):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.cpu_device = torch.device("cpu")
        self.model = ResNet(input_channels=1, output_features=32, output_dim=4).to(self.device)
        self.optimizer = Adam(self.model.parameters(), lr=0.001)
        self.criterion = nn.CrossEntropyLoss()
    
    def fit(self, x: np.ndarray, y: np.ndarray, epochs: int = 50):
        x = np.nan_to_num(x[:, :6000], nan=0) # 只保留前6000
        x = torch.from_numpy(x).float().to(self.device)
        y = torch.from_numpy(y).long().to(self.device)

        dataset = torch.utils.data.TensorDataset(x, y)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

        self.model.train()
        with trange(epochs, desc="Training") as pbar:
            pbar_dict = {}
            pbar_dict["loss"] = round(0, 3)
            for epoch in pbar:
                for X_batch, y_batch in dataloader:
                    self.optimizer.zero_grad()
                    # 处理X_batch
                    X_batch = X_batch.reshape([X_batch.shape[0], 1, X_batch.shape[1]]) # 输入需要是(sequence_len, 1, batch_size)

                    y_pred = self.model(X_batch)
                    loss = self.criterion(y_pred, y_batch)
                    loss.backward()
                    
                    pbar_dict["loss"] = round(loss.detach().item(), 3)
                    self.optimizer.step()

                    pbar.set_postfix(pbar_dict)
    
    def predict(self, x: np.ndarray) -> np.ndarray:
        """
        预测
        :param model: 模型
        :param x: 数据
        :return: 预测结果
        """
        self.model.eval()
        x = np.nan_to_num(x[:, :6000], nan=0)
        x = torch.from_numpy(x).float().to(self.device)
        with torch.no_grad():
            y_preds = []
            for i in range(0, x.shape[0], 64):
                x_batch = x[i:i + 64, :6000]
                x_batch = x_batch.reshape([x_batch.shape[0], 1, x_batch.shape[1]])
                y_pred = self.model(x_batch)
                y_pred = torch.argmax(torch.softmax(y_pred, dim=-1), dim=-1).to(self.cpu_device).detach().numpy()
                y_preds.append(y_pred)
            y_preds = np.concatenate(y_preds, axis=0)
        return y_preds

## Train and Test

In [18]:
# 最终使用该模型
class MixModelCL(object):
    def __init__(self, num_class=4):
        model3 = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, num_class=num_class, n_jobs=-1)
        model4 = cat.CatBoostClassifier(iterations=2000, learning_rate=0.01, max_depth=5, verbose=False)
        model5 = lgb.LGBMClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, num_class=num_class, n_jobs=-1)
        self.dnn_model = ResNet_sklearn()
        self.mix_model = cat.CatBoostClassifier(iterations=1500, learning_rate=0.05, max_depth=5, verbose=False)
        self.basic_models = [model3, model4, model5]

    def fit(self, X_expert: np.array, X: np.ndarray, y: np.array) -> None:
        # 训练mix model
        temp_x_train_expert, temp_x_valid_expert, temp_y_train, temp_y_valid, temp_x_train, temp_x_valid = train_test_split(X_expert, y, X, test_size=0.3, random_state=10)
        intermediate_prediction = []
        for model in self.basic_models:
            model.fit(temp_x_train_expert, temp_y_train)
            intermediate_prediction.append(model.predict(temp_x_valid_expert).reshape((-1, 1)))
        self.dnn_model.fit(temp_x_train, temp_y_train, epochs=50)
        dnn_result = self.dnn_model.predict(temp_x_valid).reshape((-1, 1))
        intermediate_prediction.append(dnn_result)
        intermediate_prediction = np.concatenate(intermediate_prediction, axis=1)
        self.mix_model.fit(intermediate_prediction, temp_y_valid)

        # 训练模型
        for model in self.basic_models:
            model.fit(X_expert, y)
        self.dnn_model.fit(X, y)

    def predict(self, X_expert: np.array, X: np.ndarray) -> np.array:
        intermediate_prediction = []
        for model in self.basic_models:
            intermediate_prediction.append(model.predict(X_expert).reshape((-1, 1)))
        
        dnn_result = self.dnn_model.predict(X).reshape((-1, 1))
        intermediate_prediction.append(dnn_result)

        intermediate_prediction = np.concatenate(intermediate_prediction, axis=1)

        y_pred = self.mix_model.predict(intermediate_prediction)

        return y_pred

In [20]:
model = MixModelCL()
model.fit(X_train_expert, X_train, y_train)
result = model.predict(X_test_expert, X_test)

Training: 100%|██████████| 50/50 [01:44<00:00,  2.09s/it, loss=0.359]
Training: 100%|██████████| 50/50 [02:36<00:00,  3.13s/it, loss=0.298]


In [20]:
temp_x_train_expert, temp_x_valid_expert, temp_y_train, temp_y_valid, temp_x_train, temp_x_valid = train_test_split(X_train_expert, y_train, X_train, test_size=0.3, random_state=10)
stack_model.fit(temp_x_train_expert, temp_y_train)
stack_result = stack_model.predict(temp_x_valid_expert)
dnn_model = ResNet_sklearn()
dnn_model.fit(temp_x_train, temp_y_train)
dnn_result = dnn_model.predict(temp_x_valid)
final_classifier = cat.CatBoostClassifier(iterations=100, learning_rate=0.05, max_depth=5)
final_classifier.fit(np.concatenate([stack_result.reshape((-1, 1)), dnn_result.reshape((-1, 1))], axis=1), temp_y_valid)

In [6]:
# stacking Classification
model3 = xgb.XGBClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, num_class=4, n_jobs=-1, )
model4 = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, max_depth=5)
model5 = lgb.LGBMClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, num_class=4, n_jobs=-1)
results = []
for model in [model3, model4, model5]:
    model.fit(X_train_expert, y_train)
    results.append(model.predict(X_test_expert).ravel())
results.append(y_test_DNN)
results = np.array(results)
results = np.round(results.mean(axis=0))

0:	learn: 1.3255767	total: 153ms	remaining: 5m 5s
1:	learn: 1.2757483	total: 162ms	remaining: 2m 41s
2:	learn: 1.2275800	total: 172ms	remaining: 1m 54s
3:	learn: 1.1866023	total: 181ms	remaining: 1m 30s
4:	learn: 1.1498226	total: 190ms	remaining: 1m 15s
5:	learn: 1.1167309	total: 200ms	remaining: 1m 6s
6:	learn: 1.0886221	total: 210ms	remaining: 59.7s
7:	learn: 1.0586738	total: 219ms	remaining: 54.6s
8:	learn: 1.0318985	total: 228ms	remaining: 50.4s
9:	learn: 1.0091678	total: 238ms	remaining: 47.3s
10:	learn: 0.9866077	total: 247ms	remaining: 44.7s
11:	learn: 0.9650194	total: 256ms	remaining: 42.5s
12:	learn: 0.9473933	total: 266ms	remaining: 40.7s
13:	learn: 0.9295611	total: 275ms	remaining: 39.1s
14:	learn: 0.9136032	total: 285ms	remaining: 37.7s
15:	learn: 0.8989601	total: 293ms	remaining: 36.4s
16:	learn: 0.8842034	total: 303ms	remaining: 35.3s
17:	learn: 0.8694130	total: 312ms	remaining: 34.4s
18:	learn: 0.8565192	total: 321ms	remaining: 33.5s
19:	learn: 0.8453752	total: 331ms	rem

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (4, 3411) + inhomogeneous part.

In [6]:
from sklearn.ensemble import VotingClassifier
model3 = xgb.XGBClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, num_class=4, n_jobs=-1, )
model4 = cat.CatBoostClassifier(iterations=2000, learning_rate=0.05, max_depth=5)
model5 = lgb.LGBMClassifier(n_estimators=1000, max_depth=5, learning_rate=0.01, num_class=4, n_jobs=-1)
# model6 = ResNet_sklearn()
sub_models = [
    ("xgboost", model3), 
    ("catboost", model4), 
    ("lgb", model5), 
]

stack_model = VotingClassifier(sub_models, n_jobs=-1)
stack_model.fit(X_train_expert, y_train)

In [None]:
stack_model.predict(X_train_expert)

In [89]:
# 预测数据
y_onevsall = (y_train == 3).astype(np.int32)
without_class3_ids = np.where(y_train != 3)
X_train_features_without_class3 = X_train_features[without_class3_ids]
y_train_without_class3 = y_train[without_class3_ids]

model_onevsall = MixModelCL(use_dnn_results=True, num_class=1)
model = MixModelCL(use_dnn_results=True, num_class=3)
model_onevsall.fit(X_train_features, y_onevsall, y_train_DNN)
model.fit(X_train_features_without_class3, y_train_without_class3, y_train_DNN[without_class3_ids])
result_onevsall = model_onevsall.predict(X_test_features, y_test_DNN)
result_without_class3 = model.predict(X_test_features, y_test_DNN)

# result = result_onevsall * 3 + result_without_class3.ravel() * (- (result_onevsall - 1))

0:	learn: 0.6660470	total: 5.06ms	remaining: 10.1s
1:	learn: 0.6356021	total: 9.23ms	remaining: 9.22s
2:	learn: 0.6045634	total: 13.3ms	remaining: 8.86s
3:	learn: 0.5740726	total: 17.4ms	remaining: 8.69s
4:	learn: 0.5442417	total: 21.2ms	remaining: 8.47s
5:	learn: 0.5179215	total: 25ms	remaining: 8.31s
6:	learn: 0.4939993	total: 29.1ms	remaining: 8.29s
7:	learn: 0.4726758	total: 32.8ms	remaining: 8.16s
8:	learn: 0.4532264	total: 36.5ms	remaining: 8.07s
9:	learn: 0.4335665	total: 40.6ms	remaining: 8.08s
10:	learn: 0.4122629	total: 44.6ms	remaining: 8.07s
11:	learn: 0.3924831	total: 48.4ms	remaining: 8.02s
12:	learn: 0.3802764	total: 52ms	remaining: 7.95s
13:	learn: 0.3606120	total: 56ms	remaining: 7.95s
14:	learn: 0.3434722	total: 60.2ms	remaining: 7.97s
15:	learn: 0.3276678	total: 64ms	remaining: 7.93s
16:	learn: 0.3113730	total: 67.8ms	remaining: 7.9s
17:	learn: 0.2963996	total: 72ms	remaining: 7.93s
18:	learn: 0.2822347	total: 76.1ms	remaining: 7.94s
19:	learn: 0.2697068	total: 80ms	

In [90]:
result = result_onevsall.ravel() * 3 + result_without_class3.ravel() * (- (result_onevsall - 1))

In [17]:
result_df = pd.DataFrame(results.reshape((-1, 1)), columns=["y"]).reset_index()
result_df.columns = ["id", "y"]
result_df.to_csv("./Result/result_stack_model.csv", index=False)