# 第二模型训练（回归/分类）

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

模型路径：
1. 通过简单的决策树分类出确定的0
2. 然后通过mlp对较为均衡的数据集再进行学习

In [14]:
# 构建 MLP 模型
def build_mlp_ecg(input_length, num_classes):
    model = models.Sequential([
        layers.Input(shape=(input_length,)),

        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),

        layers.Dense(num_classes, activation='softmax')
    ])
    return model

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
# read data frame
# === 加载 MIT-BIH (5类) ===
mitbih_test = pd.read_csv("../data/ecg_category/mitbih_test.csv")
mitbih_train = pd.read_csv("../data/ecg_category/mitbih_train.csv")

X_mitbih_train, y_mitbih_train = mitbih_train.iloc[:, :-1].values, mitbih_train.iloc[:, -1].values
X_mitbih_test, y_mitbih_test = mitbih_test.iloc[:, :-1].values, mitbih_test.iloc[:, -1].values

In [17]:
# === 检查标签分布 ===
print("训练集标签分布：")
print(pd.Series(y_mitbih_train).value_counts())

print("\n测试集标签分布：")
print(pd.Series(y_mitbih_test).value_counts())

# === 检查标签类型 ===
print("\ny_mitbih_train dtype:", y_mitbih_train.dtype)
print("y_mitbih_train unique values:", np.unique(y_mitbih_train)[:10])
print("y_mitbih_test unique values:", np.unique(y_mitbih_test)[:10])

# === 强制转 int，避免 float/NaN ===
y_mitbih_train = y_mitbih_train.astype(int)
y_mitbih_test = y_mitbih_test.astype(int)

训练集标签分布：
0.0    72470
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: count, dtype: int64

测试集标签分布：
0.0    18117
4.0     1608
2.0     1448
1.0      556
3.0      162
Name: count, dtype: int64

y_mitbih_train dtype: float64
y_mitbih_train unique values: [0. 1. 2. 3. 4.]
y_mitbih_test unique values: [0. 1. 2. 3. 4.]


In [18]:
# 1) 训练集上用决策树筛“易例 0”
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# 假设已有 X_train, y_train (标签为 0..4 的整数)
tree = DecisionTreeClassifier(
    max_depth=21,            # 浅一点，避免过拟合
    min_samples_leaf=20,    # 提高叶子样本量，稳一些
    class_weight='balanced',
    random_state=42
)
tree.fit(X_mitbih_train, y_mitbih_train)

# 预测 P(y=0|x)
p0 = tree.predict_proba(X_mitbih_train)[:, 0]

tau = 0.95  # 训练阶段阈值（可用验证集调参）
is_easy_zero = (y_mitbih_train == 0) & (p0 >= tau)

# 可选：限制最多剔除的数量，避免过头
easy_zero_idx = np.where(is_easy_zero)[0]

# 构建“难例训练集”：保留所有少数类 + 难例0
keep_mask = np.ones_like(y_mitbih_train, dtype=bool)
keep_mask[easy_zero_idx] = False
X_train_hard = X_mitbih_train[keep_mask]
y_train_hard = y_mitbih_train[keep_mask]

print("原训练集大小:", len(y_mitbih_train), "筛后训练集大小:", len(y_train_hard))
unique, counts = np.unique(y_train_hard, return_counts=True)
print("筛后类别分布:", dict(zip(unique, counts)))


原训练集大小: 87553 筛后训练集大小: 24479
筛后类别分布: {0: 9396, 1: 2223, 2: 5788, 3: 641, 4: 6431}


In [19]:
# 1) 确认 predict_proba 列顺序
print("classes_ =", tree.classes_)   # 应为 [0,1,2,3,4]
p = tree.predict_proba(X_mitbih_train)      # shape: (N, 5)
p0 = p[:, list(tree.classes_).index(0)]

# 2) 看一下 p0 的分布
import numpy as np
q = np.quantile(p0, [0, .5, .9, .95, .98, .99, .995, 1.0])
print("p0 quantiles:", q)

# 3) 看一下在真0类里的 p0
q0 = np.quantile(p0[y_mitbih_train==0], [0, .5, .9, .95, .98, .99, .995, 1.0])
print("p0 | y=0 quantiles:", q0)

# 4) 也看一下非0类里被“高p0”覆盖多少
for t in [0.95, 0.98, 0.99]:
    tp = np.mean((p0>=t) & (y_mitbih_train==0))
    fp = np.mean((p0>=t) & (y_mitbih_train!=0))
    print(f"tau={t}: 训练集里 高p0比例，总={np.mean(p0>=t):.4f}, 真0占比={tp:.4f}, 误杀非0占比={fp:.4f}")


classes_ = [0 1 2 3 4]
p0 quantiles: [0. 1. 1. 1. 1. 1. 1. 1.]
p0 | y=0 quantiles: [1.13737725e-04 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
tau=0.95: 训练集里 高p0比例，总=0.7205, 真0占比=0.7204, 误杀非0占比=0.0001
tau=0.98: 训练集里 高p0比例，总=0.6881, 真0占比=0.6881, 误杀非0占比=0.0000
tau=0.99: 训练集里 高p0比例，总=0.6881, 真0占比=0.6881, 误杀非0占比=0.0000


In [20]:
# class_weight（基于筛后训练集）
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
import numpy as np


cls_w = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.arange(5),
    y=y_train_hard
)
cls_w = {i: w for i, w in enumerate(cls_w)}

# 一个稳健的 MLP
import tensorflow as tf
inputs = tf.keras.Input(shape=(X_train_hard.shape[1],))
x = tf.keras.layers.Dense(256, activation='relu')(inputs)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.3)(x)
outputs = tf.keras.layers.Dense(5, activation='softmax')(x)
mlp = tf.keras.Model(inputs, outputs)

mlp.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

cb = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]

# 假设你已经得到筛后的：
# X_train_hard, y_train_hard
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_hard, y_train_hard,
    test_size=0.15,           # 15% 做验证
    random_state=42,
    stratify=y_train_hard     # 保持类别比例
)

# 然后把 fit 里的变量换成：
mlp.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=10, batch_size=256,
    class_weight=cls_w,
    callbacks=cb,
    verbose=1
)


Epoch 1/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.5242 - loss: 1.1204 - val_accuracy: 0.6473 - val_loss: 1.0389 - learning_rate: 0.0010
Epoch 2/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5835 - loss: 0.9466 - val_accuracy: 0.6800 - val_loss: 0.9272 - learning_rate: 0.0010
Epoch 3/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5867 - loss: 0.9131 - val_accuracy: 0.6593 - val_loss: 0.8837 - learning_rate: 0.0010
Epoch 4/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5983 - loss: 0.8776 - val_accuracy: 0.6408 - val_loss: 0.9118 - learning_rate: 0.0010
Epoch 5/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6068 - loss: 0.8643 - val_accuracy: 0.6035 - val_loss: 0.9612 - learning_rate: 0.0010
Epoch 6/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 

<keras.src.callbacks.history.History at 0x358538e50>

In [21]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1) Keras 自带 evaluate（只看准确率）
val_loss, val_acc = mlp.evaluate(X_val, y_val, verbose=0)
print(f"[MLP] Val accuracy: {val_acc:.4f}")

# 2) 更详细的指标
y_val_pred = np.argmax(mlp.predict(X_val, verbose=0), axis=1)
print(f"[MLP] Val accuracy(sklearn): {accuracy_score(y_val, y_val_pred):.4f}")
print("[MLP] Val classification report:\n", classification_report(y_val, y_val_pred, digits=4))
print("[MLP] Val confusion matrix:\n", confusion_matrix(y_val, y_val_pred))


[MLP] Val accuracy: 0.6525
[MLP] Val accuracy(sklearn): 0.6525
[MLP] Val classification report:
               precision    recall  f1-score   support

           0     0.7189    0.4393    0.5454      1409
           1     0.4246    0.7246    0.5354       334
           2     0.6806    0.6947    0.6876       868
           3     0.1894    0.8229    0.3080        96
           4     0.9094    0.8839    0.8965       965

    accuracy                         0.6525      3672
   macro avg     0.5846    0.7131    0.5946      3672
weighted avg     0.7193    0.6525    0.6641      3672

[MLP] Val confusion matrix:
 [[619 296 211 236  47]
 [ 63 242  15  12   2]
 [118  26 603  86  35]
 [ 11   2   3  79   1]
 [ 50   4  54   4 853]]


In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# 若你用的是多分类树（classes_=[0 1 2 3 4]），取 p0 的函数
idx0 = list(tree.classes_).index(0)
def p0_from_tree(tree, X):
    return tree.predict_proba(X)[:, idx0]

def predict_cascade_on(X, tau=0.98):
    p0 = p0_from_tree(tree, X)
    gate = (p0 >= tau)               # 高置信 0 → 直接判 0
    y_pred = np.empty(X.shape[0], dtype=int)
    idx = np.where(~gate)[0]         # 其余交给 MLP 五分类
    if len(idx) > 0:
        y_pred[idx] = np.argmax(mlp.predict(X[idx], verbose=0), axis=1)
    y_pred[gate] = 0
    return y_pred, gate

taus = [0.95, 0.97, 0.98, 0.99]
p0_val = p0_from_tree(tree, X_val)
non0_val = np.sum(y_val != 0)

best = None
for tau in taus:
    y_pred_c, gate = predict_cascade_on(X_val, tau=tau)
    acc = accuracy_score(y_val, y_pred_c)
    macro_f1 = f1_score(y_val, y_pred_c, average='macro')
    gate_fp = np.sum(gate & (y_val != 0))  # 被树误杀的非0
    print(f"[Cascade@tau={tau}] acc={acc:.4f}  macroF1={macro_f1:.4f}  "
          f"gated={gate.sum()}/{len(y_val)} ({gate.mean():.1%})  "
          f"gate_fp={gate_fp} ({gate_fp/non0_val:.2%} of non-0)")
    score = (macro_f1, acc)  # 先看宏F1，再看acc
    if (best is None) or (score > best[0]):
        best = (score, tau, y_pred_c)

best_tau = best[1]
print("\nBest tau on val:", best_tau)
print(classification_report(y_val, best[2], digits=4))
print(confusion_matrix(y_val, best[2]))


[Cascade@tau=0.95] acc=0.6520  macroF1=0.5942  gated=3/3672 (0.1%)  gate_fp=3 (0.13% of non-0)
[Cascade@tau=0.97] acc=0.6520  macroF1=0.5942  gated=3/3672 (0.1%)  gate_fp=3 (0.13% of non-0)
[Cascade@tau=0.98] acc=0.6525  macroF1=0.5946  gated=0/3672 (0.0%)  gate_fp=0 (0.00% of non-0)
[Cascade@tau=0.99] acc=0.6525  macroF1=0.5946  gated=0/3672 (0.0%)  gate_fp=0 (0.00% of non-0)

Best tau on val: 0.98
              precision    recall  f1-score   support

           0     0.7189    0.4393    0.5454      1409
           1     0.4246    0.7246    0.5354       334
           2     0.6806    0.6947    0.6876       868
           3     0.1894    0.8229    0.3080        96
           4     0.9094    0.8839    0.8965       965

    accuracy                         0.6525      3672
   macro avg     0.5846    0.7131    0.5946      3672
weighted avg     0.7193    0.6525    0.6641      3672

[[619 296 211 236  47]
 [ 63 242  15  12   2]
 [118  26 603  86  35]
 [ 11   2   3  79   1]
 [ 50   4  54   