In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import numpy as np
import glob
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import re
from tqdm.notebook import tqdm
import ipywidgets as widgets
widgets.IntProgress(value=50, min=0, max=100)
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

Load Dataset

In [3]:
class BinauralCueDataset(Dataset):
    """Dataset of binaural cues (ITD, ILD, IC) stored one sample per ``.npz`` file.

    Only files named ``main_audio_<id>_azi<deg>.npz`` whose ``<id>`` is contained
    in ``audio_ids`` are kept. The kept file names are sorted, so indexing is
    deterministic for a given directory content.

    Args:
        npz_dir: Directory scanned (non-recursively) for ``.npz`` files.
        audio_ids: Container of accepted audio ids; defaults to ``range(1, 701)``.
    """

    def __init__(self, npz_dir, audio_ids=range(1, 701)):
        self.dir = npz_dir
        pattern = re.compile(r'main_audio_(\d+)_azi(\d+)\.npz')
        # Keep only well-formed file names whose audio id was requested.
        self.files = sorted(
            name
            for name in os.listdir(npz_dir)
            if name.endswith('.npz')
            and (found := pattern.match(name))
            and int(found.group(1)) in audio_ids
        )

        print(f"📁 已加载 {len(self.files)} 个 .npz 文件，共 {len(self)} 个样本。")

    def __len__(self):
        """Return the number of retained ``.npz`` files (one sample per file)."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(cue, label)`` where ``cue`` is a float32 array with the ``itd``,
            ``ild`` and ``ic`` arrays stacked on a new leading axis, and
            ``label`` is the azimuth parsed from the file name, floor-divided by 5.
        """
        path = os.path.join(self.dir, self.files[idx])
        # np.load on an .npz returns a lazily-read archive that holds the file
        # open; the context manager closes it once the arrays are materialised.
        with np.load(path) as data:
            itd = data["itd"].astype(np.float32)
            ild = data["ild"].astype(np.float32)
            ic = data["ic"].astype(np.float32)

        # Presumably [3, filters, frames] — the per-cue layout is not visible here.
        cue = np.stack([itd, ild, ic], axis=0)

        # Azimuth in degrees is encoded in the file name; 5-degree bins give the class id.
        azimuth = int(re.search(r'azi(\d+)', self.files[idx]).group(1))
        label = azimuth // 5

        return cue, label

Model 1: ResNet-style 2-D CNN (AzimuthResNetCNN)

In [7]:
class ResBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages plus a skip connection.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the block.
        downsample: When true, the first convolution and the skip projection
            both use stride 2.
    """

    def __init__(self, in_channels, out_channels, downsample=False):
        super().__init__()
        step = 1
        if downsample:
            step = 2

        # Main path.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=step)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.downsample = downsample

        # The skip path needs a 1x1 projection whenever the main path changes
        # the spatial size or the channel count; otherwise it is a pass-through.
        needs_projection = downsample or in_channels != out_channels
        if not needs_projection:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=step),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        residual = self.shortcut(x)

        main = self.conv1(x)
        main = self.bn1(main)
        main = self.relu(main)
        main = self.conv2(main)
        main = self.bn2(main)

        main += residual
        return self.relu(main)

class AzimuthResNetCNN(nn.Module):
    """Small ResNet-style classifier over 3-channel 2-D inputs.

    A conv/BN/ReLU stem feeds three residual blocks (the first two downsample),
    followed by global average pooling, dropout and a linear classifier.

    Args:
        num_classes: Size of the output logit vector (default 72).
    """

    def __init__(self, num_classes=72):
        super().__init__()
        stem_width = 32
        self.layer0 = nn.Sequential(
            nn.Conv2d(3, stem_width, kernel_size=3, padding=1),
            nn.BatchNorm2d(stem_width),
            nn.ReLU(),
        )
        self.layer1 = ResBlock(stem_width, 64, downsample=True)
        self.layer2 = ResBlock(64, 128, downsample=True)
        self.layer3 = ResBlock(128, 128)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a batch ``[B, 3, H, W]`` to logits ``[B, num_classes]``."""
        for stage in (self.layer0, self.layer1, self.layer2, self.layer3):
            x = stage(x)
        pooled = self.pool(x)           # [B, 128, 1, 1]
        features = pooled.flatten(1)    # [B, 128]
        return self.fc(self.dropout(features))

Model 2: per-cue Conv1d branches with attention (SoundLocalizationModel)

In [4]:
import torch
import torch.nn as nn

# 1-D convolutional branch: a stack of Conv1d -> (BatchNorm1d) -> ReLU groups.
class ConvBranch(nn.Module):
    """Stack of ``num_layers`` Conv1d groups applied along the last axis.

    Every convolution uses ``padding=kernel_size // 2``; each is optionally
    followed by BatchNorm1d and always by an in-place ReLU.
    """

    def __init__(self, input_channels=32, conv_channels=64, kernel_size=5, stride=1, num_layers=2, use_batchnorm=True):
        super().__init__()
        # Channel widths seen by consecutive convolutions: the first one maps
        # input_channels -> conv_channels, the rest keep conv_channels.
        widths = [input_channels] + [conv_channels] * num_layers
        stack = []
        for src, dst in zip(widths[:-1], widths[1:]):
            stack.append(nn.Conv1d(src, dst, kernel_size, stride, padding=kernel_size // 2))
            if use_batchnorm:
                stack.append(nn.BatchNorm1d(dst))
            stack.append(nn.ReLU(inplace=True))
        self.conv = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the conv stack to ``x`` of shape ``[B, input_channels, L]``.

        With the defaults (stride 1, odd kernel) the length ``L`` is preserved,
        giving ``[B, conv_channels, L]``.
        """
        return self.conv(x)

# Self-attention pooling: collapse a variable-length sequence of feature
# vectors into one fixed-size vector using learned per-step weights.
class SelfAttentionPooling(nn.Module):
    """Weighted sum over the time axis with softmax-normalised learned scores."""

    def __init__(self, embed_dim):
        super().__init__()
        # Linear scorer producing one scalar attention score per time step.
        self.attn_score = nn.Linear(embed_dim, 1)

    def forward(self, x):
        """Pool ``x`` of shape ``[B, L, embed_dim]`` into ``[B, embed_dim]``."""
        # [B, L, 1] scores, normalised across the time axis (dim=1).
        weights = self.attn_score(x).softmax(dim=1)
        # Broadcast the weights over the feature axis and sum out time.
        return torch.sum(weights * x, dim=1)

# Single-cue branch: Conv1d feature extraction followed by attention pooling,
# yielding one representation vector per cue.
class CueBranch(nn.Module):
    """Encode one cue ``[B, input_channels, L]`` into a vector ``[B, embed_dim]``."""

    def __init__(self, input_channels=32, conv_channels=64, kernel_size=5, stride=1, num_layers=2, embed_dim=None, use_batchnorm=True):
        super().__init__()
        self.conv_net = ConvBranch(input_channels, conv_channels, kernel_size, stride, num_layers, use_batchnorm)

        # Embedding width falls back to the conv output width when not given.
        if embed_dim is None:
            self.embed_dim = conv_channels
        else:
            self.embed_dim = embed_dim

        # A linear projection is only needed when the requested embedding
        # width differs from the conv output width.
        wants_projection = embed_dim is not None and embed_dim != conv_channels
        self.proj = nn.Linear(conv_channels, embed_dim) if wants_projection else None

        self.attn_pool = SelfAttentionPooling(self.embed_dim)

    def forward(self, x):
        """Return the pooled cue vector for ``x`` of shape ``[B, input_channels, L]``."""
        # Conv features come out channels-first; attention expects [B, L, C].
        tokens = self.conv_net(x).transpose(1, 2)
        if self.proj is not None:
            tokens = self.proj(tokens)   # [B, L, embed_dim]
        return self.attn_pool(tokens)    # [B, embed_dim]

# Main model: three per-cue branches, a cross-cue self-attention layer and a
# final linear classifier.
class SoundLocalizationModel(nn.Module):
    """Classify azimuth from three stacked cues (ITD, ILD, IC).

    Each cue is encoded by its own ``CueBranch``; the three resulting vectors
    form a length-3 sequence that passes through multi-head self-attention and
    layer normalisation before being flattened and classified.
    """

    def __init__(self, num_classes=72, input_channels_per_cue=32, conv_channels=64, kernel_size=5, stride=1,
                 num_layers=2, embed_dim=64, num_heads=4, use_batchnorm=True):
        super().__init__()
        # Three independent branches, one per cue.
        branches = []
        for _ in range(3):
            branches.append(
                CueBranch(input_channels_per_cue, conv_channels, kernel_size, stride, num_layers, embed_dim, use_batchnorm)
            )
        self.cue_branches = nn.ModuleList(branches)
        # Multi-head self-attention over the sequence of three cue vectors.
        self.cross_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        # Layer norm applied to the attention output.
        self.norm = nn.LayerNorm(embed_dim)
        # Classifier over the concatenation of the three attended cue vectors.
        self.fc = nn.Linear(embed_dim * 3, num_classes)

    def forward(self, x):
        """Return unnormalised class scores ``[B, num_classes]``.

        Args:
            x: Tensor of shape ``[B, 3, C, L]``; axis 1 indexes the cues and
                ``C`` must match ``input_channels_per_cue``.
        """
        batch, cue_count, _, _ = x.shape
        assert cue_count == 3, "模型期望输入包含3个线索通道"

        # Encode cue i with branch i, then stack into a sequence [B, 3, embed_dim].
        seq = torch.stack(
            [branch(x[:, i]) for i, branch in enumerate(self.cue_branches)],
            dim=1,
        )

        # Let every cue vector attend to the others.
        attended, _ = self.cross_attn(seq, seq, seq)   # [B, 3, embed_dim]
        attended = self.norm(attended)

        # Flatten the three vectors into [B, 3 * embed_dim] and classify.
        # Scores are raw logits; apply torch.softmax(logits, dim=1) for probabilities.
        return self.fc(attended.reshape(batch, -1))

# Usage example: build the model and run a single forward pass as a shape check.
model = SoundLocalizationModel(
    num_classes=72,
    input_channels_per_cue=32,
    conv_channels=64,
    kernel_size=5,
    stride=1,
    num_layers=2,
    embed_dim=64,
    num_heads=4,
    use_batchnorm=True,
)
# Random batch: 8 samples, 3 cues, 32 channels per cue, 44100 time steps.
dummy_input = torch.randn(8, 3, 32, 44100)
output = model(dummy_input)
print("输出维度:", output.shape)  # expected: torch.Size([8, 72])

输出维度: torch.Size([8, 72])


In [7]:
# Index every cue file in the feature directory, then carve out a reproducible
# 80/20 train/validation split (fixed generator seed 42).
full_dataset = BinauralCueDataset(
    r"C:\Users\TIANY1\OneDrive - Trinity College Dublin\Documents\SoundSourceLocalization\features"
)
train_dataset, val_dataset = random_split(
    full_dataset,
    lengths=[0.8, 0.2],
    generator=torch.Generator().manual_seed(42),
)

📁 已加载 50400 个 .npz 文件，共 50400 个样本。


In [12]:
batch_size = 32

# Only the training loader is shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("The device type is: ", device)

# Alternative architecture kept for reference:
# model = AzimuthResNetCNN(num_classes=72)
model = SoundLocalizationModel(
    num_classes=72,
    input_channels_per_cue=32,
    conv_channels=64,
    kernel_size=5,
    stride=1,
    num_layers=2,
    embed_dim=64,
    num_heads=4,
    use_batchnorm=True,
)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

The device type is:  cuda


Training

In [13]:
# ---- Training run: bookkeeping, live plot, train/validate loop, checkpoints ----
num_epochs = 20
best_acc1 = 0.0
best_epoch = 0
best_model_wts = None

train_loss_history, val_loss_history = [], []
train_acc1_history, val_acc1_history = [], []
train_acc5_history, val_acc5_history = [], []

# Single definition of the checkpoint directory, reused for every save below.
checkpoint_dir = r"C:\Users\TIANY1\OneDrive - Trinity College Dublin\Documents\SoundSourceLocalization\checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Live figure: loss on the left, accuracies on the right. The Line2D handles
# are kept so each epoch can update the data in place.
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].set_title("Loss Curve")
axs[0].set_xlabel("Epoch")
axs[0].set_ylabel("Loss")
train_loss_line, = axs[0].plot([], [], label='Train Loss')
val_loss_line, = axs[0].plot([], [], label='Val Loss')
axs[0].legend()

axs[1].set_title("Accuracy Curve")
axs[1].set_xlabel("Epoch")
axs[1].set_ylabel("Accuracy")
train_acc1_line, = axs[1].plot([], [], 'b-', label='Top-1 Train')
val_acc1_line, = axs[1].plot([], [], 'b--', label='Top-1 Val')
train_acc5_line, = axs[1].plot([], [], 'r-', label='Top-5 Train')
val_acc5_line, = axs[1].plot([], [], 'r--', label='Top-5 Val')
axs[1].legend()

plt.tight_layout()

for epoch in range(1, num_epochs + 1):
    # ---------------- Training ----------------
    model.train()
    train_loss_sum = 0.0
    train_correct_top1 = 0
    train_correct_top5 = 0
    train_total = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}", leave=False)
    for cue, azi in pbar:
        cue, azi = cue.to(device), azi.to(device)
        optimizer.zero_grad()
        outputs = model(cue)
        loss = criterion(outputs, azi)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_n = cue.size(0)
        # The criterion returns a batch mean, so weight it by the batch size
        # to get a correct per-sample average at the end of the epoch.
        train_loss_sum += batch_loss * batch_n
        train_total += batch_n

        # Top-1: exact class match.
        _, pred_top1 = outputs.max(dim=1)
        top1_hits = pred_top1 == azi
        train_correct_top1 += top1_hits.sum().item()

        # "Top-5": a hit when any of the 5 highest-scoring classes lies within
        # 2 classes of the target, measured circularly over the 72 classes.
        _, top5_idx = outputs.topk(5, dim=1)
        diff = (top5_idx - azi.view(-1, 1)).abs()
        diff = torch.minimum(diff, 72 - diff)
        top5_hits = (diff <= 2).any(dim=1)
        train_correct_top5 += top5_hits.sum().item()

        batch_acc1 = top1_hits.float().mean().item()
        batch_acc5 = top5_hits.float().mean().item()
        pbar.set_postfix(loss=batch_loss, top1=f"{batch_acc1*100:.1f}%", top5=f"{batch_acc5*100:.1f}%")

    train_loss_avg = train_loss_sum / train_total
    train_acc1 = train_correct_top1 / train_total
    train_acc5 = train_correct_top5 / train_total

    # ---------------- Evaluation ----------------
    model.eval()
    val_loss_sum = 0.0
    val_correct_top1 = 0
    val_correct_top5 = 0
    val_total = 0

    with torch.no_grad():
        for cue, azi in tqdm(val_loader, desc="Evaluating...", leave=False):
            cue, azi = cue.to(device), azi.to(device)
            outputs = model(cue)
            loss = criterion(outputs, azi)

            val_loss_sum += loss.item() * cue.size(0)
            val_total += cue.size(0)
            _, pred_top1 = outputs.max(dim=1)
            val_correct_top1 += (pred_top1 == azi).sum().item()
            _, top5_idx = outputs.topk(5, dim=1)
            diff = (top5_idx - azi.view(-1, 1)).abs()
            diff = torch.minimum(diff, 72 - diff)
            val_correct_top5 += (diff <= 2).any(dim=1).sum().item()

    val_loss_avg = val_loss_sum / val_total
    val_acc1 = val_correct_top1 / val_total
    val_acc5 = val_correct_top5 / val_total

    # Record results
    train_loss_history.append(train_loss_avg)
    val_loss_history.append(val_loss_avg)
    train_acc1_history.append(train_acc1)
    val_acc1_history.append(val_acc1)
    train_acc5_history.append(train_acc5)
    val_acc5_history.append(val_acc5)

    # Refresh the live plot with the extended histories.
    epochs_range = range(1, epoch + 1)
    train_loss_line.set_data(epochs_range, train_loss_history)
    val_loss_line.set_data(epochs_range, val_loss_history)
    train_acc1_line.set_data(epochs_range, train_acc1_history)
    val_acc1_line.set_data(epochs_range, val_acc1_history)
    train_acc5_line.set_data(epochs_range, train_acc5_history)
    val_acc5_line.set_data(epochs_range, val_acc5_history)

    # Re-fit the axis limits to the new data.
    axs[0].relim(); axs[0].autoscale_view()
    axs[1].relim(); axs[1].autoscale_view()

    plt.pause(0.01)  # give the figure a chance to redraw

    print(f"Epoch {epoch}/{num_epochs}: Train Loss={train_loss_avg:.4f}, Top-1={train_acc1*100:.2f}%, Top-5={train_acc5*100:.2f}% | "
          f"Evaluation Loss={val_loss_avg:.4f}, Top-1={val_acc1*100:.2f}%, Top-5={val_acc5*100:.2f}%")

    # Per-epoch checkpoint (model + optimizer) so training can be resumed.
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict()
    }
    checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch}.pth")
    torch.save(checkpoint, checkpoint_path)

    if val_acc1 > best_acc1:
        best_acc1 = val_acc1
        best_epoch = epoch
        # state_dict() hands back references to the live tensors, which later
        # optimizer steps overwrite in place. Clone them so the snapshot really
        # holds this epoch's weights rather than the final epoch's.
        best_model_wts = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

if best_model_wts is not None:
    torch.save(best_model_wts, os.path.join(checkpoint_dir, "best_model.pth"))
    print(f"The best model appears in epoch {best_epoch}, and the Validation Top-1 Accuracy is {best_acc1*100:.2f}%, save as best_model.pth")

<IPython.core.display.Javascript object>

Epoch 1/20:   0%|          | 0/1260 [00:00<?, ?it/s]

Load checkpoint and resume training

In [7]:
# ---- Resume training from a saved per-epoch checkpoint ----
# Single definition of the checkpoint directory, reused for every load/save below.
checkpoint_dir = r"C:\Users\TIANY1\OneDrive - Trinity College Dublin\Documents\SoundSourceLocalization\checkpoints"

# The checkpoint only holds an int and two state dicts, so the restricted
# weights_only loader is sufficient and avoids unpickling arbitrary objects.
checkpoint = torch.load(
    os.path.join(checkpoint_dir, "epoch_6.pth"),
    map_location=device,
    weights_only=True,
)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
start_epoch = checkpoint["epoch"] + 1

num_epochs = 20
# NOTE: the best-accuracy tracker restarts here, so "best" below refers to the
# resumed epochs only, not to epochs trained before the checkpoint.
best_acc1 = 0.0
best_epoch = 0
best_model_wts = None

train_loss_history, val_loss_history = [], []
train_acc1_history, val_acc1_history = [], []
train_acc5_history, val_acc5_history = [], []

os.makedirs(checkpoint_dir, exist_ok=True)

# Live figure: loss on the left, accuracies on the right.
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].set_title("Loss Curve")
axs[0].set_xlabel("Epoch")
axs[0].set_ylabel("Loss")
train_loss_line, = axs[0].plot([], [], label='Train Loss')
val_loss_line, = axs[0].plot([], [], label='Val Loss')
axs[0].legend()

axs[1].set_title("Accuracy Curve")
axs[1].set_xlabel("Epoch")
axs[1].set_ylabel("Accuracy")
train_acc1_line, = axs[1].plot([], [], 'b-', label='Top-1 Train')
val_acc1_line, = axs[1].plot([], [], 'b--', label='Top-1 Val')
train_acc5_line, = axs[1].plot([], [], 'r-', label='Top-5 Train')
val_acc5_line, = axs[1].plot([], [], 'r--', label='Top-5 Val')
axs[1].legend()

plt.tight_layout()

for epoch in range(start_epoch, num_epochs + 1):
    # ---------------- Training ----------------
    model.train()
    train_loss_sum = 0.0
    train_correct_top1 = 0
    train_correct_top5 = 0
    train_total = 0

    for cue, azi in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}", leave=False):
        cue, azi = cue.to(device), azi.to(device)
        optimizer.zero_grad()
        outputs = model(cue)
        loss = criterion(outputs, azi)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean, so weight it by the batch size.
        train_loss_sum += loss.item() * cue.size(0)
        train_total += cue.size(0)

        # Top-1: exact class match.
        _, pred_top1 = outputs.max(dim=1)
        train_correct_top1 += (pred_top1 == azi).sum().item()

        # "Top-5": a hit when any of the 5 highest-scoring classes lies within
        # 2 classes of the target, measured circularly over the 72 classes.
        _, top5_idx = outputs.topk(5, dim=1)
        diff = (top5_idx - azi.view(-1, 1)).abs()
        diff = torch.minimum(diff, 72 - diff)
        train_correct_top5 += (diff <= 2).any(dim=1).sum().item()

    train_loss_avg = train_loss_sum / train_total
    train_acc1 = train_correct_top1 / train_total
    train_acc5 = train_correct_top5 / train_total

    # ---------------- Evaluation ----------------
    model.eval()
    val_loss_sum = 0.0
    val_correct_top1 = 0
    val_correct_top5 = 0
    val_total = 0

    with torch.no_grad():
        for cue, azi in tqdm(val_loader, desc="Evaluating...", leave=False):
            cue, azi = cue.to(device), azi.to(device)
            outputs = model(cue)
            loss = criterion(outputs, azi)

            val_loss_sum += loss.item() * cue.size(0)
            val_total += cue.size(0)
            _, pred_top1 = outputs.max(dim=1)
            val_correct_top1 += (pred_top1 == azi).sum().item()
            _, top5_idx = outputs.topk(5, dim=1)
            diff = (top5_idx - azi.view(-1, 1)).abs()
            diff = torch.minimum(diff, 72 - diff)
            val_correct_top5 += (diff <= 2).any(dim=1).sum().item()

    val_loss_avg = val_loss_sum / val_total
    val_acc1 = val_correct_top1 / val_total
    val_acc5 = val_correct_top5 / val_total

    # Record results
    train_loss_history.append(train_loss_avg)
    val_loss_history.append(val_loss_avg)
    train_acc1_history.append(train_acc1)
    val_acc1_history.append(val_acc1)
    train_acc5_history.append(train_acc5)
    val_acc5_history.append(val_acc5)

    # Refresh the live plot. The histories only contain the resumed epochs, so
    # the x-axis must start at start_epoch to keep x and y the same length.
    epochs_range = range(start_epoch, epoch + 1)
    train_loss_line.set_data(epochs_range, train_loss_history)
    val_loss_line.set_data(epochs_range, val_loss_history)
    train_acc1_line.set_data(epochs_range, train_acc1_history)
    val_acc1_line.set_data(epochs_range, val_acc1_history)
    train_acc5_line.set_data(epochs_range, train_acc5_history)
    val_acc5_line.set_data(epochs_range, val_acc5_history)

    # Re-fit the axis limits to the new data.
    axs[0].relim(); axs[0].autoscale_view()
    axs[1].relim(); axs[1].autoscale_view()

    plt.pause(0.01)  # give the figure a chance to redraw

    print(f"Epoch {epoch}/{num_epochs}: Train Loss={train_loss_avg:.4f}, Top-1={train_acc1*100:.2f}%, Top-5={train_acc5*100:.2f}% | "
          f"Evaluation Loss={val_loss_avg:.4f}, Top-1={val_acc1*100:.2f}%, Top-5={val_acc5*100:.2f}%")

    # Per-epoch checkpoint (model + optimizer) so training can be resumed again.
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict()
    }
    checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch}.pth")
    torch.save(checkpoint, checkpoint_path)

    if val_acc1 > best_acc1:
        best_acc1 = val_acc1
        best_epoch = epoch
        # state_dict() hands back references to the live tensors, which later
        # optimizer steps overwrite in place. Clone them so the snapshot really
        # holds this epoch's weights rather than the final epoch's.
        best_model_wts = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

if best_model_wts is not None:
    torch.save(best_model_wts, os.path.join(checkpoint_dir, "best_model.pth"))
    print(f"The best model appears in epoch {best_epoch}, and the Validation Top-1 Accuracy is {best_acc1*100:.2f}%, save as best_model.pth")

  checkpoint = torch.load(r"C:\Users\TIANY1\OneDrive - Trinity College Dublin\Documents\SoundSourceLocalization\checkpoints\epoch_6.pth", map_location=device)


<IPython.core.display.Javascript object>

Epoch 7/20:   0%|          | 0/360 [00:00<?, ?it/s]

Run Evaluation

In [12]:
# Load the model weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AzimuthResNetCNN(num_classes=72).to(device)
model_path = r"C:\Users\TIANY1\OneDrive - Trinity College Dublin\Documents\SoundSourceLocalization\checkpoints\epoch_6.pth"

# NOTE(review): the training cells in this notebook save SoundLocalizationModel
# checkpoints into this same directory; confirm that epoch_6.pth was produced
# by an AzimuthResNetCNN run, otherwise load_state_dict will reject it.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all this checkpoint holds.
checkpoint = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

# Build the evaluation set.
# NOTE(review): audio ids 80-100 lie inside the default id range used for the
# random train/validation split earlier in the notebook, so some of these files
# were presumably seen during training — confirm before reporting as held-out.
val_path = r"C:\Users\TIANY1\OneDrive - Trinity College Dublin\Documents\SoundSourceLocalization\features"
val_dataset = BinauralCueDataset(val_path, audio_ids=range(80, 101))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

# Run inference and collect predictions alongside the true labels.
all_preds, all_labels = [], []

with torch.no_grad():
    for X, y in val_loader:
        X, y = X.to(device), y.to(device)
        outputs = model(X)
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

accuracy = np.mean(all_preds == all_labels)
print(f"🎯 Top-1 Accuracy: {accuracy * 100:.2f}%")

def mean_class_accuracy(y_true, y_pred, num_classes=72):
    """Average the per-class accuracy over the classes present in ``y_true``.

    Args:
        y_true: 1-D NumPy array of ground-truth class ids.
        y_pred: 1-D NumPy array of predicted class ids, aligned with ``y_true``.
        num_classes: Class ids ``0 .. num_classes - 1`` are considered.

    Returns:
        Unweighted mean of the per-class accuracies; classes with no samples
        in ``y_true`` are left out of the average.
    """
    hits = (y_pred == y_true)
    per_class = []
    for label in range(num_classes):
        members = (y_true == label)
        support = members.sum()
        if support != 0:
            per_class.append(hits[members].sum() / support)
    return np.mean(per_class)

# Class-balanced accuracy: every class counts equally regardless of its support.
mean_acc = mean_class_accuracy(all_labels, all_preds)
print(f"📊 Mean Accuracy per Class: {mean_acc * 100:.2f}%")

# classification_report is already imported at the top of the notebook.
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(all_labels, all_preds, digits=3, zero_division=0))

# Pin the label set so the matrix is always 72 x 72 and row/column index equals
# the class id, even if some class is absent from both labels and predictions.
cm = confusion_matrix(all_labels, all_preds, labels=np.arange(72))
plt.figure(figsize=(10, 8))
sns.heatmap(cm, cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()



  checkpoint = torch.load(model_path, map_location=device)


📁 已加载 1512 个 .npz 文件，共 1512 个样本。
🎯 Top-1 Accuracy: 16.07%
📊 Mean Accuracy per Class: 16.07%
              precision    recall  f1-score   support

           0      0.667     0.095     0.167        21
           1      0.155     0.429     0.228        21
           2      0.500     0.048     0.087        21
           3      0.000     0.000     0.000        21
           4      0.500     0.048     0.087        21
           5      0.385     0.238     0.294        21
           6      0.000     0.000     0.000        21
           7      0.000     0.000     0.000        21
           8      0.154     0.095     0.118        21
           9      0.267     0.190     0.222        21
          10      0.000     0.000     0.000        21
          11      0.000     0.000     0.000        21
          12      0.000     0.000     0.000        21
          13      0.000     0.000     0.000        21
          14      0.000     0.000     0.000        21
          15      0.143     0.048     0.071

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


<IPython.core.display.Javascript object>