In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings("ignore")

# === 加载数据 ===
dataTrain = pd.read_csv("allAtt_onehot_large_train_new8.csv")
dataTest = pd.read_csv("allAtt_onehot_large_test_new8.csv")

x_train, y_train = dataTrain.iloc[:, 6:38].values, dataTrain.iloc[:, 38:].values
x_test, y_test = dataTest.iloc[:, 6:38].values, dataTest.iloc[:, 38:].values

y_train_int = np.argmax(y_train, axis=1)
y_test_int = np.argmax(y_test, axis=1)

# === 定义模型 ===
xgb = XGBClassifier(objective="multi:softprob", num_class=2, eval_metric="mlogloss", use_label_encoder=False)
lgb = LGBMClassifier(objective='multiclass', num_class=2)
cat = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, loss_function='MultiClass', verbose=0)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# === 模型训练 ===
xgb.fit(x_train, y_train_int)
lgb.fit(x_train, y_train_int)
cat.fit(x_train, y_train_int)
rf.fit(x_train, y_train_int)

# === 获取每个模型的预测概率（作为时间步）===
def get_stacked_proba(model, X):
    return model.predict_proba(X)

train_probs = np.stack([
    get_stacked_proba(xgb, x_train),
    get_stacked_proba(lgb, x_train),
    get_stacked_proba(cat, x_train),
    get_stacked_proba(rf, x_train)
], axis=1)  # shape: [N, 4, 3]

test_probs = np.stack([
    get_stacked_proba(xgb, x_test),
    get_stacked_proba(lgb, x_test),
    get_stacked_proba(cat, x_test),
    get_stacked_proba(rf, x_test)
], axis=1)  # shape: [N, 4, 3]

# === 构建 LSTM 模型 ===
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(32, input_shape=input_shape))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

early_stop = EarlyStopping(patience=5, restore_best_weights=True)

# === 训练 LSTM ===
lstm_model = build_lstm_model((train_probs.shape[1], train_probs.shape[2]))
lstm_model.fit(
    train_probs, 
    to_categorical(y_train_int),
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop]
)

# === 预测 + 评估 ===
y_pred_probs = lstm_model.predict(test_probs)
y_pred = np.argmax(y_pred_probs, axis=1)

acc = accuracy_score(y_test_int, y_pred)
print(f"\n✅ Final LSTM on model outputs Accuracy: {acc:.4f}")
print("📊 Classification Report:")
print(classification_report(y_test_int, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1967
[LightGBM] [Info] Number of data points in the train set: 4940, number of used features: 32
[LightGBM] [Info] Start training from score -0.792230
[LightGBM] [Info] Start training from score -0.603003
Epoch 1/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6038 - loss: 0.6554 - val_accuracy: 0.9555 - val_loss: 0.3657
Epoch 2/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9773 - loss: 0.2436 - val_accuracy: 0.9787 - val_loss: 0.0671
Epoch 3/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9795 - loss: 0.0577 - val_accuracy: 0.9787 - val_loss: 0.0566
Epoch 4/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9778 - loss: 0.0614 - val_

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings("ignore")

# === 加载数据 ===
dataTrain = pd.read_csv("allAtt_onehot_large_train_new4.csv")
dataTest = pd.read_csv("allAtt_onehot_large_test_new4.csv")

x_train, y_train = dataTrain.iloc[:, 1:30].values, dataTrain.iloc[:, 30:].values
x_test, y_test = dataTest.iloc[:, 1:30].values, dataTest.iloc[:, 30:].values

y_train_int = np.argmax(y_train, axis=1)
y_test_int = np.argmax(y_test, axis=1)

# === 定义基模型 ===
xgb = XGBClassifier(objective="multi:softprob", num_class=3, eval_metric="mlogloss", use_label_encoder=False)
lgb = LGBMClassifier(objective='multiclass', num_class=3)
cat = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, loss_function='MultiClass', verbose=0)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# === 模型训练 ===
xgb.fit(x_train, y_train_int)
lgb.fit(x_train, y_train_int)
cat.fit(x_train, y_train_int)
rf.fit(x_train, y_train_int)

# === 获取各模型预测概率 ===
def get_stacked_proba(model, X):
    return model.predict_proba(X)

train_probs = np.concatenate([
    get_stacked_proba(xgb, x_train),
    get_stacked_proba(lgb, x_train),
    get_stacked_proba(cat, x_train),
    get_stacked_proba(rf, x_train)
], axis=1)  # shape: [N, 12]

test_probs = np.concatenate([
    get_stacked_proba(xgb, x_test),
    get_stacked_proba(lgb, x_test),
    get_stacked_proba(cat, x_test),
    get_stacked_proba(rf, x_test)
], axis=1)  # shape: [N, 12]

# === 构建 MLP 模型 ===
def build_mlp_model(input_dim):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(input_dim,)))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

early_stop = EarlyStopping(patience=5, restore_best_weights=True)

mlp_model = build_mlp_model(train_probs.shape[1])
mlp_model.fit(
    train_probs,
    to_categorical(y_train_int),
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop]
)

# === 预测 + 评估 ===
y_pred_probs = mlp_model.predict(test_probs)
y_pred = np.argmax(y_pred_probs, axis=1)

acc = accuracy_score(y_test_int, y_pred)
print(f"\n✅ Final MLP Stacking Accuracy: {acc:.4f}")
print("📊 Classification Report:")
print(classification_report(y_test_int, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2173
[LightGBM] [Info] Number of data points in the train set: 4940, number of used features: 29
[LightGBM] [Info] Start training from score -1.186581
[LightGBM] [Info] Start training from score -1.419219
[LightGBM] [Info] Start training from score -0.792230
Epoch 1/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6345 - loss: 0.9076 - val_accuracy: 0.9899 - val_loss: 0.2945
Epoch 2/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9948 - loss: 0.2156 - val_accuracy: 0.9970 - val_loss: 0.0395
Epoch 3/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9965 - loss: 0.0428 - val_accuracy: 0.9990 - val_loss: 0.0117
Epoch 4/30
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [15]:
import catboost
from catboost import CatBoostClassifier
import numpy as np
import tensorflow as tf
from tensorflow import keras

dataTrain=pd.read_csv("allAtt_onehot_large_train_new7.csv")
dataTrain = dataTrain.drop(['Hist_H_Prob','Hist_D_Prob','Hist_A_Prob'],axis=1)
dataTest=pd.read_csv("allAtt_onehot_large_test_new7.csv")
dataTest = dataTest.drop(['Hist_H_Prob','Hist_D_Prob','Hist_A_Prob'],axis=1)
# 假设x_train和x_test已经定义并预处理为适合输入LSTM的格式
x_train, y_train = dataTrain.iloc[:, 4:38].values, dataTrain.iloc[:, 38:].values
x_train = np.reshape(x_train, (4180, 34, 1))
x_test, y_test = dataTest.iloc[:, 4:38].values, dataTest.iloc[:, 38:].values
x_test = np.reshape(x_test, (380, 34, 1))

# 构建LSTM模型
def build_model(allow_cudnn_kernel=True):
    if allow_cudnn_kernel:
        lstm_layer = keras.layers.LSTM(64, input_shape=(34, 1))
    else:
        lstm_layer = keras.layers.RNN(keras.layers.LSTMCell(64), input_shape=(34, 1))
    
    model = keras.models.Sequential([
        lstm_layer,
        keras.layers.BatchNormalization(),
        keras.layers.Dense(3, activation="softmax")
    ])
    return model

model = build_model(allow_cudnn_kernel=True)

model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=False),
    optimizer="Adam",
    metrics=["categorical_accuracy"]
)

# 训练LSTM模型
model.fit(x_train, y_train, validation_split=0.1, epochs=20, batch_size=64)

# 获取LSTM模型的输出特征
lstm_train_predictions = model.predict(x_train)
lstm_test_predictions = model.predict(x_test)

# 将LSTM输出作为CatBoost的输入特征
# LSTM模型输出的是预测的概率（或类别），你可以使用它们作为训练CatBoost的输入
lstm_train_predictions = np.array(lstm_train_predictions)
lstm_test_predictions = np.array(lstm_test_predictions)

# 初始化CatBoost分类器
catboost_model = CatBoostClassifier(
    iterations=1000,
    depth=10,
    learning_rate=0.05,
    loss_function='MultiClass'
)

# 使用LSTM预测的结果训练CatBoost模型
catboost_model.fit(lstm_train_predictions, np.argmax(y_train, axis=1), eval_set=(lstm_test_predictions, np.argmax(y_test, axis=1)))

# 在测试集上进行预测
catboost_pred = catboost_model.predict(lstm_test_predictions)

# 打印准确率
acc = accuracy_score(np.argmax(y_test, axis=1), catboost_pred)
print(f"Test Accuracy: {acc * 100:.2f}%")

# 打印分类报告
print("\nClassification Report:")
print(classification_report(np.argmax(y_test, axis=1), catboost_pred))

# 混淆矩阵
conf_matrix = confusion_matrix(np.argmax(y_test, axis=1), catboost_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# 可视化混淆矩阵
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


Epoch 1/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.4747 - loss: 1.1344 - val_categorical_accuracy: 0.4809 - val_loss: 1.0421
Epoch 2/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5219 - loss: 0.9920 - val_categorical_accuracy: 0.4856 - val_loss: 1.0336
Epoch 3/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5312 - loss: 0.9912 - val_categorical_accuracy: 0.4809 - val_loss: 1.0372
Epoch 4/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5222 - loss: 0.9947 - val_categorical_accuracy: 0.5072 - val_loss: 1.0249
Epoch 5/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5301 - loss: 0.9811 - val_categorical_accuracy: 0.5383 - val_loss: 1.0218
Epoch 6/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/

NameError: name 'confusion_matrix' is not defined

In [16]:
import numpy as np
import pandas as pd
from tensorflow import keras
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 假设 x_train 和 y_train 已经是适当的训练数据
dataTrain=pd.read_csv("allAtt_onehot_large_train_new7.csv")
dataTrain = dataTrain.drop(['Hist_H_Prob','Hist_D_Prob','Hist_A_Prob'],axis=1)
dataTest=pd.read_csv("allAtt_onehot_large_test_new7.csv")
dataTest = dataTest.drop(['Hist_H_Prob','Hist_D_Prob','Hist_A_Prob'],axis=1)
x_train, y_train = dataTrain.iloc[:, 4:38].values, dataTrain.iloc[:, 38:].values
x_train = np.reshape(x_train, (4180, 34, 1))
x_test, y_test = dataTest.iloc[:, 4:38].values, dataTest.iloc[:, 38:].values
x_test = np.reshape(x_test, (380, 34, 1))

# LSTM Model
def build_model(input_dim=34, units=64, output_size=3):
    lstm_layer = keras.layers.LSTM(units, input_shape=(input_dim, 1))
    model = keras.models.Sequential([
        lstm_layer,
        keras.layers.BatchNormalization(),
        keras.layers.Dense(output_size, activation="softmax")
    ])
    return model

lstm_model = build_model(input_dim=34, units=64, output_size=3)
lstm_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=False),
    optimizer="Adam",
    metrics=["categorical_accuracy"]
)

# 训练LSTM模型
lstm_model.fit(x_train, y_train, validation_split=0.1, epochs=20, batch_size=64)

# 获取LSTM模型的预测结果
lstm_train_predictions = lstm_model.predict(x_train)
lstm_test_predictions = lstm_model.predict(x_test)

# 将LSTM预测结果（概率）转换为类别
lstm_train_predictions = np.argmax(lstm_train_predictions, axis=1)
lstm_test_predictions = np.argmax(lstm_test_predictions, axis=1)

# CatBoost Model
catboost_model = CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.05, loss_function='MultiClass', verbose=0)

# 训练CatBoost模型
catboost_model.fit(x_train, np.argmax(y_train, axis=1))

# 获取CatBoost的预测结果
catboost_train_predictions = catboost_model.predict(x_train)
catboost_test_predictions = catboost_model.predict(x_test)

# 将CatBoost的预测结果和LSTM的预测结果结合作为堆叠特征
stacked_train_features = np.vstack([lstm_train_predictions, catboost_train_predictions]).T
stacked_test_features = np.vstack([lstm_test_predictions, catboost_test_predictions]).T

# 训练堆叠分类器
stacked_model = CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.05, loss_function='MultiClass', verbose=0)
stacked_model.fit(stacked_train_features, np.argmax(y_train, axis=1))

# 预测堆叠模型
stacked_pred = stacked_model.predict(stacked_test_features)

# 打印准确率
acc = accuracy_score(np.argmax(y_test, axis=1), stacked_pred)
print(f"Test Accuracy: {acc * 100:.2f}%")

# 打印分类报告
print("\nClassification Report:")
print(classification_report(np.argmax(y_test, axis=1), stacked_pred))

# 混淆矩阵
conf_matrix = confusion_matrix(np.argmax(y_test, axis=1), stacked_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# 可视化混淆矩阵
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1, 2], yticklabels=[0, 1, 2])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


Epoch 1/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - categorical_accuracy: 0.4875 - loss: 1.0520 - val_categorical_accuracy: 0.4809 - val_loss: 1.0452
Epoch 2/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5380 - loss: 0.9861 - val_categorical_accuracy: 0.4809 - val_loss: 1.0390
Epoch 3/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5387 - loss: 0.9767 - val_categorical_accuracy: 0.4809 - val_loss: 1.0365
Epoch 4/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5409 - loss: 0.9779 - val_categorical_accuracy: 0.4809 - val_loss: 1.0380
Epoch 5/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - categorical_accuracy: 0.5266 - loss: 0.9869 - val_categorical_accuracy: 0.4809 - val_loss: 1.0308
Epoch 6/20
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/

CatBoostError: Input data has invalid shape: (4180, 34, 1). Must be 2 dimensional