In [1]:
# ============================================================================
# Two-Stream V4.1 - Cross-Attention Fusion
# Core idea: use the global eGeMAPS functionals as a context vector that modulates the Log-Mel sequence features through cross-attention
# ============================================================================

import os
import math
import numpy as np
import pandas as pd
import librosa
import opensmile
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# 1. Basic configuration
# Seed the NumPy and TensorFlow global random generators with the same value
# to make runs more repeatable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 自动生成目录
def create_experiment_dir(base_dir="Output"):
    """Create a timestamped experiment folder with its standard sub-folders.

    The experiment folder is ``<base_dir>/twostream_v4_1_crossattn_<timestamp>``
    and contains ``models``, ``plots`` and ``logs``. Existing folders are
    reused rather than treated as an error.

    Args:
        base_dir: Parent directory under which the experiment folder is made.

    Returns:
        A 4-tuple ``(exp_dir, models_dir, plots_dir, logs_dir)`` of paths.
    """
    script_name = "twostream_v4_1_crossattn"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    exp_dir = os.path.join(base_dir, f"{script_name}_{timestamp}")

    # Build the three sub-folder paths once and reuse them for both the
    # directory creation and the return value.
    sub_dirs = tuple(os.path.join(exp_dir, leaf) for leaf in ("models", "plots", "logs"))
    for sub_dir in sub_dirs:
        os.makedirs(sub_dir, exist_ok=True)

    return (exp_dir, *sub_dirs)

# Create the output folders for this run (module-level side effect) and
# point at the folder that is scanned for *.wav files.
EXP_DIR, MODEL_DIR, PLOT_DIR, LOG_DIR = create_experiment_dir()
AUDIO_DIR = Path("AudioWAV") 

# =====================
# 2. Feature extraction (Log-Mel + eGeMAPS)
# =====================
SR = 16000            # sample rate passed to librosa.load
N_MELS = 64           # number of mel bands
FIXED_SECONDS = 3.0   # every clip is padded/cropped to this duration
# Number of spectrogram frames kept per clip; 160 is the hop length (in
# samples) used for the mel spectrogram, so this evaluates to 300.
MAX_FRAMES = int(math.ceil(FIXED_SECONDS * SR / 160))

# Initialise the openSMILE extractor: eGeMAPSv02 feature set at the
# "Functionals" level (one feature vector per file).
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

def load_data_and_extract(audio_dir):
    """Scan ``audio_dir`` for wav files and extract both feature streams.

    Only files whose stem splits into exactly four ``_``-separated fields are
    used; field 0 is taken as the speaker id and field 2 as the emotion label.

    For every file two features are computed:

    * Log-Mel: the waveform is zero-padded / cropped to ``FIXED_SECONDS``,
      converted to a mel power spectrogram (hop 160 samples), mapped to dB
      relative to the clip maximum and laid out as
      ``(MAX_FRAMES, N_MELS, 1)``.
    * eGeMAPS functionals from the module-level ``smile`` extractor
      (88 values). If extraction fails, a zero vector is used and a warning
      is printed.

    Args:
        audio_dir: ``pathlib.Path`` of the directory containing ``*.wav``.

    Returns:
        Tuple ``(X_mel, X_ege, emotions, speakers)`` of NumPy arrays, all
        aligned on the first axis.

    Raises:
        ValueError: If no file with a matching name is found.
    """
    print("Scanning files...")
    paths, emotions, speakers = [], [], []
    # Path.glob yields files in a filesystem-dependent order; sort so the
    # sample order is reproducible across machines and runs.
    for f in sorted(audio_dir.glob("*.wav")):
        parts = f.stem.split("_")
        if len(parts) == 4:
            paths.append(f)
            emotions.append(parts[2])
            speakers.append(parts[0])

    if not paths:
        # Fail early with a readable message instead of the opaque error
        # np.stack raises on an empty list.
        raise ValueError(f"No usable .wav files found in {audio_dir}")

    print(f"Extracting features for {len(paths)} files...")
    ege_dim = 88  # length of the fallback vector used when openSMILE fails
    tgt = int(FIXED_SECONDS * SR)  # target clip length in samples
    X_mel, X_ege = [], []

    for i, p in enumerate(paths):
        if i % 500 == 0:
            print(f"Processing {i}...")

        # A. Log-Mel
        y, _ = librosa.load(p, sr=SR, mono=True)
        if len(y) < tgt:
            y = np.pad(y, (0, tgt - len(y)))
        else:
            y = y[:tgt]
        S = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=N_MELS, hop_length=160)
        S_db = librosa.power_to_db(S, ref=np.max).astype(np.float32)
        # (n_mels, frames) -> (frames, n_mels, 1)
        feat = np.transpose(S_db[..., None], (1, 0, 2))
        if feat.shape[0] < MAX_FRAMES:
            # With ref=np.max, 0 dB is the loudest value, so pad missing
            # frames with the clip's quietest value instead of 0.
            feat = np.pad(
                feat,
                ((0, MAX_FRAMES - feat.shape[0]), (0, 0), (0, 0)),
                constant_values=feat.min(),
            )
        else:
            feat = feat[:MAX_FRAMES]
        X_mel.append(feat)

        # B. eGeMAPS (functionals)
        try:
            ege = smile.process_file(p).values.flatten().astype(np.float32)
        except Exception as exc:
            # Best-effort fallback, but report it and do not swallow
            # KeyboardInterrupt / SystemExit the way a bare except would.
            print(f"[WARN] eGeMAPS extraction failed for {p}: {exc!r}; using zeros")
            ege = np.zeros(ege_dim, dtype=np.float32)
        else:
            # Simple NaN handling.
            if np.isnan(ege).any():
                ege = np.nan_to_num(ege)
        X_ege.append(ege)

    return np.stack(X_mel), np.stack(X_ege), np.array(emotions), np.array(speakers)

# Run the extraction. In a notebook the guard keeps the expensive extraction
# from being repeated when the cell is re-executed in the same kernel.
if 'X_mel_all' not in globals():
    X_mel_all, X_ege_all, y_all, spk_all = load_data_and_extract(AUDIO_DIR)

# =====================
# 3. Data preparation
# =====================
# Map the emotion strings to integer class ids.
le = LabelEncoder()
y_enc = le.fit_transform(y_all)
n_classes = len(le.classes_)

# Split: a single 80/20 split grouped by speaker id, so that no speaker
# appears in both the training and the test part.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
tr_idx, te_idx = next(gss.split(X_mel_all, y_enc, groups=spk_all))

X_mel_tr, X_mel_te = X_mel_all[tr_idx], X_mel_all[te_idx]
X_ege_tr, X_ege_te = X_ege_all[tr_idx], X_ege_all[te_idx]
y_tr, y_te = y_enc[tr_idx], y_enc[te_idx]

# Normalisation
# Log-Mel: global z-score. Mean/std are reduced over axes 0-2 of the training
# part only and then applied to both parts; 1e-6 guards against division by 0.
mean_mel = X_mel_tr.mean(axis=(0,1,2), keepdims=True)
std_mel = X_mel_tr.std(axis=(0,1,2), keepdims=True) + 1e-6
X_mel_tr = (X_mel_tr - mean_mel) / std_mel
X_mel_te = (X_mel_te - mean_mel) / std_mel

# eGeMAPS: StandardScaler fitted on the training part, applied to both parts.
scaler = StandardScaler()
X_ege_tr = scaler.fit_transform(X_ege_tr)
X_ege_te = scaler.transform(X_ege_te)

print(f"Train shape: Mel {X_mel_tr.shape}, eGe {X_ege_tr.shape}")

# =====================
# 4. 核心组件：Conformer & Cross-Attention
# =====================
def glu(x):
    """Gated linear unit over the last axis.

    The last axis is split at its midpoint; the first part is multiplied
    element-wise by the sigmoid of the second part.
    """
    half = x.shape[-1] // 2
    values = x[..., :half]
    gates = x[..., half:]
    return values * tf.sigmoid(gates)

def conformer_block(x, d_model=128, dropout=0.15):
    """Apply one Conformer-style block to a Keras sequence tensor.

    The block chains four residual sub-modules, each followed by a
    LayerNormalization: half-step feed-forward, multi-head self-attention
    (4 heads), a gated depthwise convolution module (kernel 15) and a second
    half-step feed-forward.

    Args:
        x: Input tensor whose last dimension must equal ``d_model`` so the
            residual additions are valid.
        d_model: Feature width of the block.
        dropout: Dropout rate used in every sub-module.

    Returns:
        Tensor with the same shape as ``x``.
    """

    def half_step_ffn(inp):
        # Feed-forward with 4x expansion; its output is added at weight 0.5.
        hidden = layers.Dense(d_model*4, activation="swish")(inp)
        hidden = layers.Dropout(dropout)(hidden)
        hidden = layers.Dense(d_model)(hidden)
        return layers.LayerNormalization()(inp + 0.5*hidden)

    # Feed-forward 1
    x = half_step_ffn(x)

    # Multi-head self-attention (query and value are the same tensor)
    attended = layers.MultiHeadAttention(4, d_model//4, dropout=dropout)(x, x)
    x = layers.LayerNormalization()(x + attended)

    # Convolution module: pointwise expand -> GLU -> depthwise -> pointwise
    conv = layers.Conv1D(d_model*2, 1)(x)
    conv = layers.Lambda(glu)(conv)
    conv = layers.DepthwiseConv1D(15, padding='same')(conv)
    conv = layers.BatchNormalization()(conv)
    conv = layers.Activation('swish')(conv)
    conv = layers.Conv1D(d_model, 1)(conv)
    conv = layers.Dropout(dropout)(conv)
    x = layers.LayerNormalization()(x + conv)

    # Feed-forward 2
    return half_step_ffn(x)

# === 关键修改：互注意力融合层 ===
class CrossAttentionFusion(layers.Layer):
    """Fuse a sequence with a global context vector via cross-attention.

    The sequence provides the queries; the context vector is expanded to a
    one-step sequence and provides the single key/value. The attention output
    is added back to the sequence and layer-normalised.

    NOTE: because there is only one key/value position, the softmax over keys
    is trivially 1, so every time step receives the same projected context
    vector. The layer acts as a learned, context-dependent bias on the
    sequence rather than selecting individual time steps.

    Args:
        d_model: Feature width; the per-head key size is ``d_model // 4``.
        dropout: Dropout rate applied inside the attention layer.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, d_model, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        # Kept so get_config() can reproduce the constructor arguments.
        self.dropout_rate = dropout
        self.mha = layers.MultiHeadAttention(num_heads=4, key_dim=d_model//4, dropout=dropout)
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()

    def call(self, sequence, context):
        """Return the context-modulated sequence.

        Args:
            sequence: Query tensor; assumed to be (batch, steps, features).
            context: Context tensor without a step axis; assumed to be
                (batch, features).

        Returns:
            Tensor shaped like ``sequence``.
        """
        # 1. Give the context a step axis of length 1 so it can serve as
        #    key/value for the attention layer.
        context = tf.expand_dims(context, axis=1)

        # 2. Cross-attention: query = sequence, key/value = context.
        attn_out = self.mha(query=sequence, value=context, key=context)

        # 3. Residual connection + normalisation.
        x = self.add([sequence, attn_out])
        return self.layernorm(x)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved/reloaded."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "dropout": self.dropout_rate})
        return config

# =====================
# 5. 模型构建 (修复版)
# =====================
def build_cross_fusion_model(mel_shape, ege_shape, n_classes, d_model=128):
    """Build the two-input (Log-Mel + eGeMAPS) cross-attention classifier.

    Stream 1 runs the Log-Mel input through three Conv2D/BatchNorm/MaxPool
    stages, projects to ``d_model`` and applies two Conformer blocks.
    Stream 2 embeds the eGeMAPS vector to ``d_model``. The streams are fused
    with ``CrossAttentionFusion``, pooled over the sequence axis with a
    learned soft attention, concatenated with the eGeMAPS embedding and
    classified with a small dense head.

    Args:
        mel_shape: Shape of one Log-Mel sample (without the batch axis).
        ege_shape: Shape of one eGeMAPS sample (without the batch axis).
        n_classes: Number of output classes (softmax width).
        d_model: Feature width shared by both streams.

    Returns:
        An uncompiled ``keras.Model`` taking ``[mel, ege]`` and returning
        class probabilities.
    """
    # --- Stream 1: CNN front-end + Conformer ---
    in_mel = layers.Input(shape=mel_shape)
    x = layers.GaussianNoise(0.02)(in_mel)

    # Three identical stages with growing channel count; every layer
    # constructor is followed by a call on the running tensor ``x``.
    for n_filters in (32, 64, 128):
        x = layers.Conv2D(
            n_filters, 3, padding='same', activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(5e-5),
        )(x)
        x = layers.BatchNormalization()(x)
        x = layers.MaxPooling2D((2,2))(x)

    # NOTE(review): this keeps only the channel axis as features, so the two
    # pooled spatial axes are both folded into the sequence axis - confirm
    # that a (time x frequency)-long sequence is intended.
    n_channels = x.shape[-1]
    x = layers.Reshape((-1, n_channels))(x)
    x = layers.Dense(d_model)(x)
    x = layers.Dropout(0.15)(x)

    # Conformer encoders
    for _ in range(2):
        x = conformer_block(x, d_model, dropout=0.15)

    # --- Stream 2: eGeMAPS context embedding ---
    in_ege = layers.Input(shape=ege_shape)
    c = layers.GaussianNoise(0.05)(in_ege)
    c = layers.Dense(d_model, activation='relu')(c)
    c = layers.BatchNormalization()(c)
    c = layers.Dropout(0.3)(c)

    # --- Fusion: cross-attention ---
    fused_seq = CrossAttentionFusion(d_model, dropout=0.2)(sequence=x, context=c)

    # --- Attention pooling over the sequence axis ---
    scores = layers.Dense(1, activation="tanh")(fused_seq)
    weights = layers.Softmax(axis=1)(scores)
    pooled = tf.reduce_sum(fused_seq * weights, axis=1)

    # --- Classification head ---
    final_vec = layers.Concatenate()([pooled, c])
    z = layers.Dense(
        64, activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(5e-5),
    )(final_vec)
    z = layers.Dropout(0.3)(z)
    out = layers.Dense(n_classes, activation="softmax", name="emotion")(z)

    return models.Model(inputs=[in_mel, in_ege], outputs=out)

# =====================
# 6. 训练配置
# =====================
class LabelSmoothingFocalLoss(tf.keras.losses.Loss):
    """Focal cross-entropy on label-smoothed one-hot targets.

    Per sample the loss is
    ``sum_c( -y_c * log(p_c) * (1 - p_c) ** gamma )`` where ``y`` is the
    smoothed target ``y * (1 - smoothing) + smoothing / n_classes`` and ``p``
    is the predicted probability clipped to ``[1e-7, 1 - 1e-7]``.

    Args:
        smoothing: Label-smoothing factor.
        gamma: Focusing exponent of the focal term.
        **kwargs: Forwarded to ``keras.losses.Loss``.
    """

    def __init__(self, smoothing=0.05, gamma=2.0, **kwargs):
        super().__init__(**kwargs)
        self.smoothing = smoothing
        self.gamma = gamma

    def call(self, y_true, y_pred):
        """Return the per-sample loss for one-hot ``y_true`` and probabilities ``y_pred``."""
        y_pred = tf.convert_to_tensor(y_pred)
        # Integer or float64 targets would otherwise fail to multiply with
        # the prediction tensor when the loss is called directly.
        y_true = tf.cast(y_true, y_pred.dtype)
        n = tf.cast(tf.shape(y_pred)[-1], y_pred.dtype)
        y_true = y_true * (1 - self.smoothing) + self.smoothing / n
        # Clip to keep log() finite.
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
        return tf.reduce_sum(-y_true * tf.math.log(y_pred) * tf.pow(1 - y_pred, self.gamma), axis=-1)

    def get_config(self):
        """Include the hyper-parameters so a saved model can rebuild the loss."""
        config = super().get_config()
        config.update({"smoothing": self.smoothing, "gamma": self.gamma})
        return config

# Generator (Mixup)
class DualMixupGen(tf.keras.utils.Sequence):
    """Batch generator for two aligned inputs with optional in-batch mixup.

    Each batch is, with probability 0.5, replaced by a convex combination of
    itself and a random permutation of itself, using one coefficient drawn
    from ``Beta(alpha, alpha)`` for both inputs and the targets. Sample order
    is reshuffled at construction and at the end of every epoch. Randomness
    comes from NumPy's global generator.

    Args:
        x1: First input array (indexed on axis 0).
        x2: Second input array, aligned with ``x1``.
        y: Target array (e.g. one-hot), aligned with ``x1``.
        batch: Batch size.
        alpha: Beta-distribution parameter for the mixup coefficient.

    Raises:
        ValueError: If the three arrays differ in length.
    """

    def __init__(self, x1, x2, y, batch=64, alpha=0.2):
        # The base class may set up internal state in its own constructor.
        super().__init__()
        if not (len(x1) == len(x2) == len(y)):
            raise ValueError(
                f"x1, x2 and y must have the same length, got "
                f"{len(x1)}, {len(x2)} and {len(y)}"
            )
        self.x1, self.x2, self.y = x1, x2, y
        self.batch, self.alpha = batch, alpha
        self.idx = np.arange(len(x1))
        np.random.shuffle(self.idx)

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.x1) / self.batch))

    def __getitem__(self, i):
        """Return batch ``i`` as ``([x1_batch, x2_batch], y_batch)``."""
        inds = self.idx[i * self.batch:(i + 1) * self.batch]
        bx1, bx2, by = self.x1[inds], self.x2[inds], self.y[inds]
        if np.random.random() < 0.5:  # apply mixup to this batch
            lam = np.random.beta(self.alpha, self.alpha)
            perm = np.random.permutation(len(bx1))
            bx1 = lam * bx1 + (1 - lam) * bx1[perm]
            bx2 = lam * bx2 + (1 - lam) * bx2[perm]
            by = lam * by + (1 - lam) * by[perm]
        return [bx1, bx2], by

    def on_epoch_end(self):
        """Reshuffle the sample order for the next epoch."""
        np.random.shuffle(self.idx)

# Build and compile. The eGeMAPS input shape is taken from the data instead
# of being hard-coded, so it stays correct if the feature set changes.
model = build_cross_fusion_model(X_mel_tr.shape[1:], X_ege_tr.shape[1:], n_classes)
model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=8e-4, weight_decay=2e-4),
    loss=LabelSmoothingFocalLoss(smoothing=0.05),
    metrics=['accuracy']
)
model.summary()

# Training targets. num_classes is passed explicitly: without it,
# to_categorical infers the width from the largest label present, so a split
# that lacks the highest class id would get a narrower one-hot matrix than
# the model output.
y_tr_oh = tf.keras.utils.to_categorical(y_tr, num_classes=n_classes)
y_te_oh = tf.keras.utils.to_categorical(y_te, num_classes=n_classes)
gen = DualMixupGen(X_mel_tr, X_ege_tr, y_tr_oh, batch=64)

# Callbacks
class F1Cb(callbacks.Callback):
    """Compute macro-F1 on the held-out split after every epoch.

    The score is printed and stored in ``logs['val_macro_f1']`` so that
    callbacks monitoring that key can read it. Uses the module-level
    ``X_mel_te``, ``X_ege_te`` and ``y_te``.
    """

    def on_epoch_end(self, e, logs=None):
        """Evaluate the current model and record the macro-F1 in ``logs``."""
        p = np.argmax(self.model.predict([X_mel_te, X_ege_te], verbose=0), axis=1)
        f1 = f1_score(y_te, p, average='macro')
        print(f" — val_f1: {f1:.4f}")
        # ``logs`` defaults to None in the base-class signature; only write
        # the metric when a dict was actually supplied.
        if logs is not None:
            logs['val_macro_f1'] = f1

# Both callbacks monitor 'val_macro_f1' (higher is better). That key is added
# to the epoch logs by F1Cb, so F1Cb has to run before them - presumably
# callbacks run in list order; F1Cb is listed first in the fit() call.
ckpt = callbacks.ModelCheckpoint(os.path.join(MODEL_DIR, "best.h5"), monitor='val_macro_f1', mode='max', save_best_only=True, verbose=1)
early = callbacks.EarlyStopping(monitor='val_macro_f1', mode='max', patience=25, restore_best_weights=True)
def sched(e):
    """Return the learning rate for epoch index ``e``.

    Epochs 0-4 ramp linearly up to the peak rate 8e-4; from epoch 5 on the
    rate follows a half-cosine decay with a period of 95 epochs.
    """
    peak_lr = 8e-4
    if e < 5:
        # Linear warm-up: 1/5, 2/5, ... 5/5 of the peak rate.
        return peak_lr * (e+1)/5
    # Cosine decay, starting at the peak rate for e == 5.
    return peak_lr * 0.5 * (1+np.cos(np.pi*(e-5)/95))

print("Starting Cross-Attention Fusion Training...")
# Train from the mixup generator for up to 100 epochs. F1Cb is listed first
# because the checkpoint and early-stopping callbacks monitor the key it adds.
# NOTE(review): the held-out split passed as validation_data is the same one
# used by F1Cb for checkpointing/early stopping and by the final evaluation,
# so the reported scores are selected on that split.
history = model.fit(
    gen, 
    validation_data=([X_mel_te, X_ege_te], y_te_oh),
    epochs=100,
    callbacks=[F1Cb(), ckpt, early, callbacks.LearningRateScheduler(sched)],
    verbose=1
)

# =====================
# 7. 评估
# =====================
print("Evaluating...")
# Predicted class = arg-max over the softmax output for the held-out split.
p = np.argmax(model.predict([X_mel_te, X_ege_te], verbose=0), axis=1)
# Per-class precision/recall/F1, labelled with the original emotion strings.
print(classification_report(y_te, p, target_names=le.classes_, digits=4))

# Confusion matrix heat-map; saved into this run's plot folder, then shown.
cm = confusion_matrix(y_te, p)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Confusion Matrix (Cross-Attn Fusion)")
plt.savefig(os.path.join(PLOT_DIR, "cm.png"))
plt.show()

2025-12-04 05:26:45.274683: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-04 05:26:45.343803: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Scanning files...
Extracting features for 7442 files...
Processing 0...
Processing 500...
Processing 1000...
Processing 2000...
Processing 2500...
Processing 3000...
Processing 3500...
Processing 4000...
Processing 4500...
Processing 5000...
Processing 5500...
Processing 6000...
Processing 6500...
Processing 7000...
Train shape: Mel (5890, 300, 64, 1), eGe (5890, 88)


2025-12-04 05:41:03.749161: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31132 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:65:03.0, compute capability: 7.0


TypeError: Inputs to a layer should be tensors. Got '<keras.src.layers.normalization.batch_normalization.BatchNormalization object at 0x7fb589f45fa0>' (of type <class 'keras.src.layers.normalization.batch_normalization.BatchNormalization'>) as input for layer 'max_pooling2d'.