In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from core.config import CONFIG

if __name__ == "__main__":
    CONFIG.load_config("config.yaml")

    print("Training Dataset Path:", CONFIG.dataset_path("training"))
    print("Evaluation Dataset Path:", CONFIG.dataset_path("evaluation"))
    print("IEMOCAP Preprocessed Dir:", CONFIG.dataset_preprocessed_dir_path(CONFIG.training_dataset_name()))
    print("CREMA-D Preprocessed Dir:", CONFIG.dataset_preprocessed_dir_path(CONFIG.evaluation_dataset_name()))
    print("IEMOCAP Emotions:", CONFIG.dataset_emotions(CONFIG.training_dataset_name()))
    print("CREMA-D Emotions:", CONFIG.dataset_emotions(CONFIG.evaluation_dataset_name()))

Training Dataset Path: E:/Unitec/SER/audio\dataset\IEMOCAP
Evaluation Dataset Path: E:/Unitec/SER/audio\dataset\CREMA-D
IEMOCAP Preprocessed Dir: E:/Unitec/SER/audio\dataset\IEMOCAP\Preprocessed
CREMA-D Preprocessed Dir: E:/Unitec/SER/audio\dataset\CREMA-D\Preprocessed
IEMOCAP Emotions: ['ang', 'neu', 'sad', 'hap']
CREMA-D Emotions: ['ang', 'neu', 'sad', 'hap']


## test processing

In [3]:
import warnings
warnings.filterwarnings('ignore')
from core.config import CONFIG
from preprocessing.iemocap import IemocapPreprocessor

# 确保 CONFIG 已经加载了配置文件
CONFIG.load_config("config.yaml")

# 初始化 IemocapPreprocessor，使用配置中训练数据集的路径
# 假设您想验证训练数据的加载
iemocap_dataset_path = CONFIG.dataset_path("training")
iemocap_preprocessor = IemocapPreprocessor(iemocap_dataset_path)

# 生成 DataFrame
iemocap_df = iemocap_preprocessor.generate_dataframe()

# 显示 DataFrame 的前几行
print("DataFrame head:")
display(iemocap_df.head())

# 您还可以打印一些关于 DataFrame 的信息来进一步验证
print("\nDataFrame Info:")
iemocap_df.info()

print("\nEmotion Distribution:")
display(iemocap_df['emotion'].value_counts())

print("\nData per Session:")
display(iemocap_df['session'].value_counts())

[INFO] Target emotions being extracted: ['ang', 'neu', 'sad', 'hap']

[INFO] Preprocessing complete. Total entries extracted: 4490
[INFO] Emotion distribution:
emotion
neu    1708
ang    1103
sad    1084
hap     595
Name: count, dtype: int64

[INFO] Data per session:
session
Session3    1000
Session1     942
Session5     942
Session2     813
Session4     793
Name: count, dtype: int64
DataFrame head:


Unnamed: 0,audio_path,audio_filename,text,emotion,session
0,E:/Unitec/SER/audio\dataset\IEMOCAP\Session1\s...,Ses01F_impro01_F000.wav,Excuse me.,neu,Session1
1,E:/Unitec/SER/audio\dataset\IEMOCAP\Session1\s...,Ses01F_impro01_F001.wav,Yeah.,neu,Session1
2,E:/Unitec/SER/audio\dataset\IEMOCAP\Session1\s...,Ses01F_impro01_F002.wav,Is there a problem?,neu,Session1
3,E:/Unitec/SER/audio\dataset\IEMOCAP\Session1\s...,Ses01F_impro01_F005.wav,Well what's the problem? Let me change it.,neu,Session1
4,E:/Unitec/SER/audio\dataset\IEMOCAP\Session1\s...,Ses01F_impro01_F012.wav,That's out of control.,ang,Session1



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4490 entries, 0 to 4489
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   audio_path      4490 non-null   object
 1   audio_filename  4490 non-null   object
 2   text            4490 non-null   object
 3   emotion         4490 non-null   object
 4   session         4490 non-null   object
dtypes: object(5)
memory usage: 175.5+ KB

Emotion Distribution:


emotion
neu    1708
ang    1103
sad    1084
hap     595
Name: count, dtype: int64


Data per Session:


session
Session3    1000
Session1     942
Session5     942
Session2     813
Session4     793
Name: count, dtype: int64

In [4]:

from preprocessing.cremad import CremaDPreprocessor

# 假设您想验证训练数据的加载
cremad_dataset_path = CONFIG.dataset_path("evaluation")
cremad_preprocessor = CremaDPreprocessor(cremad_dataset_path)

# 生成 DataFrame
cremad_df = cremad_preprocessor.generate_dataframe()

# 显示 DataFrame 的前几行
print("DataFrame head:")
display(cremad_df.head())

# 您还可以打印一些关于 DataFrame 的信息来进一步验证
print("\nDataFrame Info:")
cremad_df.info()

print("\nEmotion Distribution:")
display(cremad_df['emotion'].value_counts())

print("\nData per speaker:")
display(cremad_df['speaker'].value_counts())

[INFO] Target emotions being extracted for CREMA-D: ['ang', 'neu', 'sad', 'hap']

[INFO] CREMA-D Preprocessing complete. Total entries extracted: 4900
[INFO] Emotion distribution for CREMA-D:
emotion
ang    1271
hap    1271
sad    1271
neu    1087
Name: count, dtype: int64

[INFO] Data per Speaker:
speaker
1001    54
1047    54
1067    54
1066    54
1065    54
        ..
1076    53
1002    53
1009    50
1008    50
1019    50
Name: count, Length: 91, dtype: int64
DataFrame head:


Unnamed: 0,audio_path,audio_filename,text,emotion,speaker
0,E:/Unitec/SER/audio\dataset\CREMA-D\AudioWAV\1...,1001_DFA_ANG_XX.wav,Don't forget a jacket,ang,1001
1,E:/Unitec/SER/audio\dataset\CREMA-D\AudioWAV\1...,1001_DFA_HAP_XX.wav,Don't forget a jacket,hap,1001
2,E:/Unitec/SER/audio\dataset\CREMA-D\AudioWAV\1...,1001_DFA_NEU_XX.wav,Don't forget a jacket,neu,1001
3,E:/Unitec/SER/audio\dataset\CREMA-D\AudioWAV\1...,1001_DFA_SAD_XX.wav,Don't forget a jacket,sad,1001
4,E:/Unitec/SER/audio\dataset\CREMA-D\AudioWAV\1...,1001_IEO_ANG_HI.wav,It's eleven o'clock,ang,1001



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4900 entries, 0 to 4899
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   audio_path      4900 non-null   object
 1   audio_filename  4900 non-null   object
 2   text            4900 non-null   object
 3   emotion         4900 non-null   object
 4   speaker         4900 non-null   object
dtypes: object(5)
memory usage: 191.5+ KB

Emotion Distribution:


emotion
ang    1271
hap    1271
sad    1271
neu    1087
Name: count, dtype: int64


Data per speaker:


speaker
1001    54
1047    54
1067    54
1066    54
1065    54
        ..
1076    53
1002    53
1009    50
1008    50
1019    50
Name: count, Length: 91, dtype: int64

In [6]:
import os
from transformers import AutoTokenizer
from audio.extractor import WavLMEmotionExtractor
from core.config import CONFIG
from scripts.preprocess_data import process_raw_data_to_pickle


def run_preprocessing_pipeline(dataset_name: str):
    """
    为一个指定的数据集完整地执行数据预处理的三个步骤。
    
    Args:
        dataset_name (str): 数据集的名称 (从 CONFIG 中获取)。
        audio_extractor: 初始化后的音频特征提取器。
        text_tokenizer: 初始化后的文本分词器。
    """
    print(f"\n{'='*20}\n[START] Processing dataset: {dataset_name}\n{'='*20}")

    # 1. 动态生成文件名，避免硬编码
    # 例如从 "IEMOCAP_full_release" 生成 "iemocap" 作为文件名前缀
    base_name = dataset_name.split('_')[0].lower() 
    raw_file = f"{base_name}_raw.pkl"
    audio_file = f"{base_name}_audio_features.pkl"
    text_file = f"{base_name}_text_tokens.pkl"

    # 2. 按顺序执行数据处理流程
    print(f"--- Step 1: Processing raw data to '{raw_file}' ---")
    process_raw_data_to_pickle(dataset_name, raw_file)


    print(f"\n[SUCCESS] Finished processing for {dataset_name}.")


# --- 主执行脚本 ---
if __name__ == "__main__":
    # 1. 集中进行初始化
    print("--- Initializing models and config ---")
    CONFIG.load_config("config.yaml")

    # 2. 定义需要处理的数据集列表
    datasets_to_process = [
        CONFIG.training_dataset_name(),
        CONFIG.evaluation_dataset_name()
    ]

    # 3. 循环调用处理流程
    for name in datasets_to_process:
        run_preprocessing_pipeline(name)

    print(f"\n{'='*20}\n--- All processing complete! ---\n{'='*20}")

--- Initializing models and config ---

[START] Processing dataset: IEMOCAP
--- Step 1: Processing raw data to 'iemocap_raw.pkl' ---
[INFO] Using IemocapPreprocessor for dataset: IEMOCAP
[INFO] Target emotions being extracted: ['ang', 'neu', 'sad', 'hap']

[INFO] Preprocessing complete. Total entries extracted: 4490
[INFO] Emotion distribution:
emotion
neu    1708
ang    1103
sad    1084
hap     595
Name: count, dtype: int64

[INFO] Data per session:
session
Session3    1000
Session1     942
Session5     942
Session2     813
Session4     793
Name: count, dtype: int64
[INFO] Raw data DataFrame saved to: E:/Unitec/SER/audio\dataset\IEMOCAP\Preprocessed\iemocap_raw.pkl

[SUCCESS] Finished processing for IEMOCAP.

[START] Processing dataset: CREMA-D
--- Step 1: Processing raw data to 'crema-d_raw.pkl' ---
[INFO] Using CremaDPreprocessor for dataset: CREMA-D
[INFO] Target emotions being extracted for CREMA-D: ['ang', 'neu', 'sad', 'hap']

[INFO] CREMA-D Preprocessing complete. Total entries

## 实例化数据集


In [1]:
from core.config import CONFIG
from scripts.get_dataloaders import get_dataloaders
CONFIG.load_config("config.yaml")

# --- 训练和验证流程 ---
print("加载 IEMOCAP 数据集用于训练...")
# 只需一行代码，即可获取训练和验证所需的所有 dataloader
iemocap_loaders = get_dataloaders(CONFIG.training_dataset_name())
train_loader = iemocap_loaders['train']
validation_loader = iemocap_loaders['validation']

加载 IEMOCAP 数据集用于训练...
--- 正在为数据集 'IEMOCAP' 准备Dataloaders ---
[INFO] 已从以下路径加载音频特征: E:/Iris_project/SER\dataset\IEMOCAP\Preprocessed\iemocap_audio_features.pkl
[INFO] 已从以下路径加载文本Tokens: E:/Iris_project/SER\dataset\IEMOCAP\Preprocessed\iemocap_text_tokens.pkl


### train baseline model（New）



In [None]:
# Rerun the training cell
import os
import torch
import gc
from core.config import CONFIG, device
from scripts.get_dataloaders import get_dataloaders
from audio.baseline_model import AudioBaselineModel
from audio.trainer import MemoryOptimizedAudioBaselineTrainer

# 设置CUDA内存优化
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()
gc.collect()

CONFIG.load_config("config.yaml")

# --- 训练和验证流程 ---
training_dataset_name = CONFIG.training_dataset_name()
print(f"\n--- 正在加载 '{training_dataset_name}' 数据集用于训练和验证 ---")

# 使用内存优化版本的dataloaders（这是主要的改动）
try:
    # 使用内存优化版本
    iemocap_loaders = get_dataloaders(training_dataset_name, use_memory_optimization=True)
    train_loader = iemocap_loaders['train']
    validation_loader = iemocap_loaders['validation']
    
    # --- 零样本评估流程 (在 CREMA-D 上) ---
    evaluation_dataset_name = CONFIG.evaluation_dataset_name()
    print(f"\n--- 正在加载 '{evaluation_dataset_name}' 数据集用于零样本评估 ---")
    
    cremad_loaders = get_dataloaders(evaluation_dataset_name, use_memory_optimization=True)
    evaluation_loader = cremad_loaders['evaluation']
    
    # --- 实例化模型和训练器 ---
    print("\n--- 初始化基线模型和训练器 ---")
    
    # 清理内存后创建模型
    torch.cuda.empty_cache()
    gc.collect()
    
    # 获取情感标签
    iemocap_emotions = CONFIG.dataset_emotions(training_dataset_name)
    num_labels = len(iemocap_emotions)
    
    # 创建模型（考虑使用更小的batch_size)
    baseline_model = AudioBaselineModel(num_labels=num_labels).to(device)
    
    # 使用内存优化版本的训练器
    baseline_trainer = MemoryOptimizedAudioBaselineTrainer(  # 使用新的训练器
        model=baseline_model,
        num_epochs=CONFIG.training_epochs(),
        learning_rate=CONFIG.learning_rate() * 4,  # 由于梯度累积，需要调整学习率
        optimizer_type=CONFIG.optimizer_type(),
        gradient_accumulation_steps=8  # 梯度累积步数，可以根据需要调整
    )
    
    # --- 步骤 3: 训练模型 ---
    print("\n--- 开始在 IEMOCAP 上训练基线模型 ---")
    baseline_trainer.train(train_loader)
    
    # --- 步骤 4: 在 IEMOCAP 验证集上评估 ---
    print("\n--- 在 IEMOCAP 验证集上评估模型性能 ---")
    baseline_trainer.eval(validation_loader, labels=iemocap_emotions)
    
    # --- 步骤 5: 在 CREMA-D 测试集上进行零样本评估 ---
    print("\n--- 在 CREMA-D 测试集上进行零样本评估 ---")
    cremad_emotions = CONFIG.dataset_emotions(evaluation_dataset_name)
    baseline_trainer.eval(evaluation_loader, labels=cremad_emotions)
    
    print("\n--- 基线模型训练和评估完成！ ---")

except torch.cuda.OutOfMemoryError as e:
    print(f"\n[ERROR] CUDA内存不足: {e}")
    print("建议:")
    print("1. 进一步减小batch_size到1")
    print("2. 减少音频最大长度")
    print("3. 使用更小的模型variant")
    print("4. 重启运行时清理内存")
    
    # 清理内存
    torch.cuda.empty_cache()
    gc.collect()
    
except Exception as e:
    print(f"\n[ERROR] 训练过程中出现错误: {e}")
    torch.cuda.empty_cache()
    gc.collect()
    raise e

  from .autonotebook import tqdm as notebook_tqdm



--- 正在加载 'IEMOCAP' 数据集用于训练和验证 ---
--- 正在为数据集 'IEMOCAP' 准备Dataloaders ---
[INFO] 使用内存优化模式
[INFO] 已从以下路径加载音频特征: E:/Iris_project/SER\dataset\IEMOCAP\Preprocessed\iemocap_audio_features.pkl
[INFO] 已从以下路径加载文本Tokens: E:/Iris_project/SER\dataset\IEMOCAP\Preprocessed\iemocap_text_tokens.pkl
[INFO] 使用内存优化的数据整理器
[INFO] 已清理加载过程中的临时内存

--- 正在加载 'CREMA-D' 数据集用于零样本评估 ---
--- 正在为数据集 'CREMA-D' 准备Dataloaders ---
[INFO] 使用内存优化模式
[INFO] 已从以下路径加载音频特征: E:/Iris_project/SER\dataset\CREMA-D\Preprocessed\crema-d_audio_features.pkl
[INFO] 已从以下路径加载文本Tokens: E:/Iris_project/SER\dataset\CREMA-D\Preprocessed\crema-d_text_tokens.pkl
[INFO] 使用内存优化的数据整理器
[INFO] 已清理加载过程中的临时内存

--- 初始化基线模型和训练器 ---


Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at microsoft/wavlm-base and are newly initialized: ['projector.bias', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)


[INFO] 特征提取层已冻结，使用梯度累积步数: 8

--- 开始在 IEMOCAP 上训练基线模型 ---


Epoch 1:   0%|          | 0/449 [00:00<?, ?it/s]

In [2]:
import torch
import sys

print("--- 1. Check PyTorch and CUDA ---")
print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")

# Check if CUDA is available
is_cuda_available = torch.cuda.is_available()
print(f"Is CUDA available: {is_cuda_available}")

if not is_cuda_available:
    print("\n[ERROR] PyTorch could not detect CUDA. Please check your NVIDIA driver and PyTorch installation.")
    # If CUDA is not available, exit the script
    sys.exit()

print("\n--- 2. Get GPU Device Information ---")
# Get the default CUDA device (usually GPU 0)
device = torch.device("cuda:0")
print(f"Default CUDA device: {device}")

# Print the name of the GPU
gpu_name = torch.cuda.get_device_name(0)
print(f"GPU Name: {gpu_name}")

# Print the CUDA version PyTorch was compiled with
torch_cuda_version = torch.version.cuda
print(f"PyTorch compiled with CUDA version: {torch_cuda_version}")


print("\n--- 3. Test Data Transfer Between CPU and GPU ---")
# a. Create a tensor on the CPU
cpu_tensor = torch.tensor([1, 2, 3], device='cpu')
print(f"a. Tensor created on the CPU: {cpu_tensor}")
print(f"   - Device: {cpu_tensor.device}")

# b. Try to move the tensor to the GPU
try:
    gpu_tensor = cpu_tensor.to(device)
    print(f"\nb. Successfully moved tensor to GPU: {gpu_tensor}")
    print(f"   - Device: {gpu_tensor.device}")
except Exception as e:
    print(f"\n[ERROR] Failed to move data to GPU: {e}")
    sys.exit()


print("\n--- 4. Test Computation on GPU ---")
# a. Create two tensors on the GPU for computation
try:
    a = torch.randn(3, 3).to(device)
    b = torch.randn(3, 3).to(device)
    print(f"a. Created two 3x3 random tensors on the GPU.")
    print(f"   - Tensor a device: {a.device}")
    print(f"   - Tensor b device: {b.device}")

    # b. Perform matrix multiplication on the GPU
    print("\nb. Performing matrix multiplication on GPU (c = a * b)...")
    c = torch.matmul(a, b)
    print(f"   - Result c device: {c.device}")
    print(f"   - Computation successful!")

except Exception as e:
    print(f"\n[ERROR] Computation on GPU failed: {e}")
    sys.exit()

print("\n--- 5. Test Moving Result Back to CPU ---")
# a. Move the computation result from GPU back to CPU
try:
    result_cpu_tensor = c.cpu()
    print("a. Successfully moved the computation result back to the CPU.")
    print(f"   - Device: {result_cpu_tensor.device}")
    print("\nComputation result:")
    print(result_cpu_tensor)

except Exception as e:
    print(f"\n[ERROR] Failed to move result back to CPU: {e}")
    sys.exit()


print("\n--- All tests completed ---")
print("[SUCCESS] Your PyTorch and CUDA environment is configured correctly, and they can communicate and perform computations normally!")

--- 1. Check PyTorch and CUDA ---
Python Version: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
PyTorch Version: 2.1.2+cu121
Is CUDA available: True

--- 2. Get GPU Device Information ---
Default CUDA device: cuda:0
GPU Name: NVIDIA L40S-6Q
PyTorch compiled with CUDA version: 12.1

--- 3. Test Data Transfer Between CPU and GPU ---
a. Tensor created on the CPU: tensor([1, 2, 3])
   - Device: cpu

[ERROR] Failed to move data to GPU: CUDA error: operation not supported
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



SystemExit: 

In [4]:
import torch
import sys
import subprocess
import re
import platform
import os
from datetime import datetime

# --- 辅助函数 ---
def run_command(command, shell=False):
    """执行一个 shell 命令并返回其输出。"""
    try:
        # 在 Windows 上隐藏命令行窗口
        startupinfo = None
        if platform.system() == "Windows":
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='ignore',
            check=True,
            shell=shell,
            startupinfo=startupinfo
        )
        return result.stdout.strip()
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        return f"命令执行失败: {e}"

def check_windows_service(service_name):
    """在 Windows 上检查指定服务的状态。"""
    output = run_command(['sc', 'query', service_name])
    if "STATE" in output and "RUNNING" in output:
        return "正在运行 (Running)"
    elif "FAILED" in output or "1060" in output: # 1060: The specified service does not exist
        return "未找到或未安装 (Not Found/Installed)"
    else:
        return "已停止 (Stopped)"

def check_linux_service(service_name):
    """在 Linux 上检查指定服务的状态。"""
    output = run_command(f"systemctl is-active {service_name}")
    if "active" in output:
        return "正在运行 (Active)"
    elif "inactive" in output:
        return "已停止 (Inactive)"
    else:
        return "未找到或状态未知 (Not Found or Unknown)"

def analyze_vgpu_logs():
    """查找并分析 NVIDIA vGPU 许可证日志文件。"""
    log_path = ""
    system = platform.system()
    if system == "Windows":
        log_path = "C:\\Users\\Public\\Documents\\NvidiaLogging\\Log.NVDisplay.Container.exe.log"
    elif system == "Linux":
        log_path = "/var/log/nvidia/gridd.log"

    print(f"📄 正在检查 vGPU 日志文件: {log_path}")

    if not os.path.exists(log_path):
        print("   - [警告] 未找到日志文件。可能服务从未运行过，或日志在其他位置。")
        return

    try:
        with open(log_path, 'r', encoding='utf-8', errors='ignore') as f:
            # 只读取最后 100 行以提高效率
            log_content = f.readlines()[-100:]
            log_content = "".join(log_content)

        # 常见的许可证错误关键词
        error_keywords = {
            "Failed to acquire license": "获取许可证失败",
            "Failed to connect to license server": "连接到许可证服务器失败",
            "could not connect to": "无法连接到",
            "Connection refused": "连接被拒绝",
            "License request failed": "许可证请求失败",
            "unlicensed": "未授权状态",
            "terminated": "已终止"
        }

        found_errors = []
        for key, value in error_keywords.items():
            if re.search(key, log_content, re.IGNORECASE):
                found_errors.append(value)
        
        if found_errors:
            print(f"   - [严重] 在日志中发现可能的许可证错误: {', '.join(found_errors)}")
            print("   - 诊断: GPU 可能因无法获取有效许可证而被限制了计算功能。请检查您的许可证服务器地址、网络连接和客户端配置。")
        else:
            print("   - [信息] 在最近的日志中未发现明显的许可证错误。")

    except Exception as e:
        print(f"   - [错误] 读取日志文件失败: {e}")

# --- 主诊断流程 ---
print("--- 增强型 PyTorch & NVIDIA vGPU 环境诊断工具 ---")
print(f"诊断开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"操作系统: {platform.system()} {platform.release()}")

print("\n--- 1. 系统和驱动信息 ---")
print(f"🐍 Python 版本: {sys.version.splitlines()[0]}")
print(f"🔥 PyTorch 版本: {torch.__version__}")

print("\n... 正在运行 `nvidia-smi` 获取驱动信息...")
nvidia_smi_output = run_command(['nvidia-smi'])
if "命令执行失败" not in nvidia_smi_output:
    print("✅ `nvidia-smi` 命令成功执行。")
    driver_match = re.search(r"Driver Version: ([\d\.]+)", nvidia_smi_output)
    cuda_match = re.search(r"CUDA Version: ([\d\.]+)", nvidia_smi_output)
    gpu_name_match = re.search(r"\d+\s+(NVIDIA\s[\w\s-]+)\s+", nvidia_smi_output)
    
    driver_version = driver_match.group(1) if driver_match else "未检测到"
    driver_cuda_version = cuda_match.group(1) if cuda_match else "未检测到"
    gpu_name = gpu_name_match.group(1).strip() if gpu_name_match else "未检测到"
    
    print(f"   - GPU 型号: {gpu_name}")
    print(f"   - 驱动版本: {driver_version}")
    print(f"   - 驱动支持的最高 CUDA 版本: {driver_cuda_version}")

    # 检查是否为 vGPU 环境
    if "vGPU" in nvidia_smi_output or re.search(r"\w+-\d+Q", gpu_name):
        print("\n   - [关键信息] 检测到 vGPU 环境！将执行 vGPU 许可证诊断。")
        is_vgpu = True
    else:
        is_vgpu = False
else:
    print("\n[CRITICAL ERROR] `nvidia-smi` 命令执行失败。")
    print("无法验证 NVIDIA 驱动安装。请确保驱动已正确安装且 `nvidia-smi` 在系统 PATH 中。")
    sys.exit()

# --- vGPU 许可证诊断部分 ---
if is_vgpu:
    print("\n--- 2. vGPU 许可证服务诊断 ---")
    system = platform.system()
    service_status = ""
    if system == "Windows":
        service_name = "NVIDIA Display Container LS"
        print(f"   - 正在检查 Windows 服务: '{service_name}'...")
        service_status = check_windows_service("nvdisplay.container.service")
        print(f"   - 服务状态: {service_status}")
    elif system == "Linux":
        service_name = "nvidia-gridd"
        print(f"   - 正在检查 Linux 服务: '{service_name}'...")
        service_status = check_linux_service(service_name)
        print(f"   - 服务状态: {service_status}")
    
    if "正在运行" not in service_status and "Active" not in service_status:
        print("   - [严重] NVIDIA 许可证服务未运行！这是导致功能受限的直接原因。")
        print("   - 解决方案: 请启动该服务。Windows: `net start nvdisplay.container.service`, Linux: `sudo systemctl start nvidia-gridd`")
    else:
        print("   - ✅ 服务正在运行。")

    analyze_vgpu_logs()


print("\n--- 3. PyTorch CUDA 验证 ---")
is_cuda_available = torch.cuda.is_available()
print(f"🔍 PyTorch 是否能找到 CUDA: {is_cuda_available}")

if not is_cuda_available:
    print("\n[CRITICAL ERROR] PyTorch 报告 CUDA 不可用。")
    print("常见原因:")
    print("  1. 您可能安装了仅 CPU 版本的 PyTorch。请访问官网 (pytorch.org) 获取正确的 CUDA 版本安装命令。")
    print("  2. NVIDIA 驱动与 PyTorch 的 CUDA 工具包版本不兼容。")
    if is_vgpu:
        print("  3. (vGPU 环境) 许可证获取失败导致 GPU 计算功能被禁用，PyTorch 无法访问。")
    sys.exit()

print(f"✅ PyTorch 可以访问 CUDA。")
print(f"   - PyTorch 编译所用 CUDA 版本: {torch.version.cuda}")
device_count = torch.cuda.device_count()
print(f"   - 可用 GPU 数量: {device_count}")


print("\n--- 4. CUDA 核心操作测试 ---")
if device_count > 0:
    try:
        device = torch.device("cuda:0")
        print(f"a. 尝试在 GPU (cuda:0) 上创建张量...")
        a = torch.randn(3, 3, device=device)
        print(f"   - ✅ 成功在 {a.device} ({gpu_name}) 上创建张量。")

        print(f"b. 尝试在 GPU 上执行矩阵乘法...")
        b = torch.randn(3, 3, device=device)
        c = torch.matmul(a, b)
        print(f"   - ✅ 计算成功，结果位于 {c.device}。")
        
        print(f"c. 尝试将结果移回 CPU...")
        result_cpu = c.cpu()
        print(f"   - ✅ 成功将结果移回 {result_cpu.device}。")

    except Exception as e:
        print(f"\n[CRITICAL ERROR] 在执行 CUDA 操作时发生错误: {e}")
        if is_vgpu:
            print("\n[vGPU 诊断]")
            print("在 vGPU 环境下，此错误（特别是 'operation not supported' 或 'initialization error'）极有可能是由 **许可证问题** 造成的。")
            print("即使服务正在运行，许可证服务器也可能无法访问或没有可用的许可证。")
            print("请重点检查第 2 部分的日志分析结果，并联系您的系统管理员确认许可证配置。")
        else:
            print("\n[通用诊断]")
            print("此错误可能由驱动不稳定、硬件问题或 PyTorch 与驱动的深层不兼容导致。建议首先尝试重启系统和更新驱动程序。")
        sys.exit()
else:
    print("[警告] 没有可用的 GPU 设备进行操作测试。")


print("\n-------------------------------------------------")
print("✅ [诊断完成] 核心 CUDA 操作测试通过！")
if is_vgpu:
    print("您的 vGPU 环境似乎已为 PyTorch 准备就绪。如果遇到问题，请首先关注许可证服务的状态和日志。")
else:
    print("您的 PyTorch 环境已正确配置，可以使用 NVIDIA GPU。")
print("-------------------------------------------------")

--- 增强型 PyTorch & NVIDIA vGPU 环境诊断工具 ---
诊断开始时间: 2025-08-19 17:11:45
操作系统: Windows 10

--- 1. 系统和驱动信息 ---
🐍 Python 版本: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
🔥 PyTorch 版本: 2.1.2+cu121

... 正在运行 `nvidia-smi` 获取驱动信息...
✅ `nvidia-smi` 命令成功执行。
   - GPU 型号: NVIDIA L40S-6Q               WDDM
   - 驱动版本: 538.95
   - 驱动支持的最高 CUDA 版本: 12.2

   - [关键信息] 检测到 vGPU 环境！将执行 vGPU 许可证诊断。

--- 2. vGPU 许可证服务诊断 ---
   - 正在检查 Windows 服务: 'NVIDIA Display Container LS'...
   - 服务状态: 未找到或未安装 (Not Found/Installed)
   - [严重] NVIDIA 许可证服务未运行！这是导致功能受限的直接原因。
   - 解决方案: 请启动该服务。Windows: `net start nvdisplay.container.service`, Linux: `sudo systemctl start nvidia-gridd`
📄 正在检查 vGPU 日志文件: C:\Users\Public\Documents\NvidiaLogging\Log.NVDisplay.Container.exe.log
   - [严重] 在日志中发现可能的许可证错误: 获取许可证失败
   - 诊断: GPU 可能因无法获取有效许可证而被限制了计算功能。请检查您的许可证服务器地址、网络连接和客户端配置。

--- 3. PyTorch CUDA 验证 ---
🔍 PyTorch 是否能找到 CUDA: True
✅ PyTorch 可以访问 CUDA。
   - PyTorch 编译所用 CUDA 版本: 12.1
   - 可用 GPU 数量: 1



SystemExit: 

### （pass）

In [1]:
import torch
from core.config import CONFIG, device
from scripts.get_dataloaders import get_dataloaders
from audio.baseline_model import AudioBaselineModel
from audio.trainer import AudioBaselineTrainer

def main():
    """
    主函数，用于执行声学基线模型的完整训练和评估流程。
    """
    # 1. 加载配置文件
    CONFIG.load_config("config.yaml")
    print(f"--- 实验配置已加载 ---")
    print(f"使用的设备: {device}")

    # --- 训练和验证流程 ---
    training_dataset_name = CONFIG.training_dataset_name()
    print(f"\n--- 正在加载 '{training_dataset_name}' 数据集用于训练和验证 ---")
    
    # 使用高级函数获取训练和验证所需的所有 dataloader
    iemocap_loaders = get_dataloaders(training_dataset_name)
    train_loader = iemocap_loaders['train']
    validation_loader = iemocap_loaders['validation']

    # 3. 初始化模型
    # 从配置中获取情感标签列表，以确定模型的输出维度
    num_labels = len(CONFIG.dataset_emotions(training_dataset_name))
    print(f"\n--- 正在初始化 AudioBaselineModel (类别数: {num_labels}) ---")
    model = AudioBaselineModel(num_labels=num_labels).to(device)

    # 4. 初始化训练器
    print(f"--- 正在初始化 AudioBaselineTrainer ---")
    trainer = AudioBaselineTrainer(
        model=model,
        num_epochs=CONFIG.training_epochs(),
        learning_rate=CONFIG.learning_rate(),
        optimizer_type=CONFIG.optimizer_type()
    )

    # 5. 开始训练
    trainer.train(train_loader)

    # 6. 在验证集上评估
    print(f"\n--- 正在 '{training_dataset_name}' 的验证集上进行评估 ---")
    trainer.eval(validation_loader, labels=CONFIG.dataset_emotions(training_dataset_name))

    # --- 零样本评估流程 (在 CREMA-D 上) ---
    evaluation_dataset_name = CONFIG.evaluation_dataset_name()
    print(f"\n--- 正在加载 '{evaluation_dataset_name}' 数据集用于零样本评估 ---")
    
    cremad_loaders = get_dataloaders(evaluation_dataset_name)
    evaluation_loader = cremad_loaders['evaluation']
    
    print(f"\n--- 正在 '{evaluation_dataset_name}' 上进行零样本评估 ---")
    trainer.eval(evaluation_loader, labels=CONFIG.dataset_emotions(evaluation_dataset_name))
    
    print("\n--- 训练和评估流程全部完成 ---")


if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: CUDA error: operation not supported
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [6]:

# --- 第 1 步: 加载必要的库和你的自定义类 ---
import torch
from torch.utils.data import DataLoader
from transformers import Wav2Vec2FeatureExtractor, DebertaV2Tokenizer
import os

from dataloaders.dataset import CustomSERDataset
from scripts.get_dataloaders import CustomDataCollator


# --- 确定计算设备 ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- 第 2 步: 初始化处理器和分词器 ---
# 这些是collator需要的 "工具"
audio_processor = Wav2Vec2FeatureExtractor.from_pretrained(CONFIG.audio_encoder_name())
text_tokenizer = DebertaV2Tokenizer.from_pretrained(CONFIG.text_encoder_name())

# --- 第 3 步: 实例化训练数据集 (IEMOCAP) ---
# 注意：你需要先运行你的预处理脚本，生成统一的元数据文件


# 定义最大长度 (例如10秒)
MAX_LEN_IN_SECONDS = 10
max_audio_len = 16000 * MAX_LEN_IN_SECONDS

iemocap_emotions = CONFIG.dataset_emotions(CONFIG.training_dataset_name())
train_dataset = CustomSERDataset(
    metadata_file_path=os.path.join(CONFIG.dataset_preprocessed_dir_path(CONFIG.training_dataset_name()),iemocap_metadata_filename), # 使用配置和生成的文件名
    emotions=iemocap_emotions,
    target_sample_rate=audio_processor.sampling_rate,
    split='train',
    max_audio_length=max_audio_len # 传入参数
)

# 创建IEMOCAP验证数据集
val_dataset = CustomSERDataset(
    metadata_file_path=os.path.join(CONFIG.dataset_preprocessed_dir_path(CONFIG.training_dataset_name()),iemocap_metadata_filename),
    emotions=iemocap_emotions,
    target_sample_rate=audio_processor.sampling_rate,
    split='val',
    max_audio_length=max_audio_len # 传入参数
)

# --- 第 4 步: 实例化评估数据集 (CREMA-D) ---
cremad_emotions = CONFIG.dataset_emotions(CONFIG.evaluation_dataset_name())
eval_dataset = CustomSERDataset(
    metadata_file_path=os.path.join(CONFIG.dataset_preprocessed_dir_path(CONFIG.evaluation_dataset_name()),cremad_metadata_filename), # 使用配置和生成的文件名
    emotions=cremad_emotions,
    target_sample_rate=audio_processor.sampling_rate,
    max_audio_length=max_audio_len # 传入参数
)


# --- 第 5 步: 实例化你的数据整理器 ---
# 数据整理器对于训练集和评估集是通用的
data_collator = CustomDataCollator(
    audio_processor=audio_processor,
    text_tokenizer=text_tokenizer,
    # device=device # device 来自你的CONFIG或Notebook顶部定义
)

# --- 第 6 步: 创建训练集的 DataLoader ---
train_dataloader = DataLoader(
    train_dataset,
    batch_size=CONFIG.dataloader_dict()['batch_size'],
    shuffle=True, # 训练集通常需要打乱
    collate_fn=data_collator, # 关键！在这里传入你的自定义整理器
    # num_workers=CONFIG.dataloader_dict()['num_workers'],
    # pin_memory=True
)
val_dataloader = DataLoader(
    val_dataset, 
    batch_size=CONFIG.dataloader_dict()['batch_size'], 
    shuffle=False, 
    collate_fn=data_collator, 
    # num_workers=CONFIG.dataloader_dict()['num_workers'], 
    # pin_memory=True
)

# --- 验证batch size ---
print(train_dataloader.batch_size)

# --- 第 7 步: 创建评估集的 DataLoader ---
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=CONFIG.dataloader_dict()['batch_size'],
    shuffle=False, # 评估集通常不需要打乱
    collate_fn=data_collator, # 关键！在这里传入你的自定义整理器
    # num_workers=CONFIG.dataloader_dict()['num_workers'],
    # pin_memory=True
)





Using device: cuda
已创建训练集，包含 3592 个样本。
已创建验证集，包含 898 个样本。
已创建训练集，包含 3920 个样本。
32


In [7]:
# --- 第 8 步 (验证): 精确定位问题的详细调试 ---
print("--- 验证 DataLoader 和 Collator (详细调试模式) ---")
print("\n验证训练集 DataLoader:")

# 初始化一个空的 gpu_batch 字典
gpu_batch = {}
try:
    # 1. 获取 CPU 批次 (这一步已经成功)
    cpu_batch = next(iter(train_dataloader))
    print("成功从 DataLoader 获取 CPU 批次。")

    # 2. 逐个检查并移动张量
    print("\n开始逐个将张量移动到 GPU...")

    # 检查 'audio_input_values'
    key = 'audio_input_values'
    tensor = cpu_batch[key]
    print(f"准备移动 '{key}'... | 类型: {tensor.dtype} | 形状: {tensor.shape}")
    gpu_batch[key] = tensor.to(device)
    print(f"'{key}' 移动成功！")

    # 检查 'text_input_ids'
    key = 'text_input_ids'
    tensor = cpu_batch[key]
    print(f"准备移动 '{key}'... | 类型: {tensor.dtype} | 形状: {tensor.shape}")
    gpu_batch[key] = tensor.to(device)
    print(f"'{key}' 移动成功！")

    # 检查 'text_attention_mask'
    key = 'text_attention_mask'
    tensor = cpu_batch[key]
    print(f"准备移动 '{key}'... | 类型: {tensor.dtype} | 形状: {tensor.shape}")
    gpu_batch[key] = tensor.to(device)
    print(f"'{key}' 移动成功！")

    # 检查 'labels'
    key = 'labels'
    tensor = cpu_batch[key]
    print(f"准备移动 '{key}'... | 类型: {tensor.dtype} | 形状: {tensor.shape}")
    gpu_batch[key] = tensor.to(device)
    print(f"'{key}' 移动成功！")

    print("\n所有张量均已成功移动到 GPU！")

except Exception as e:
    # 如果出错，我们会明确知道是在处理哪个 key 时发生的
    print(f"\n在尝试移动 '{key}' 张量时出错: {e}")

# --- 第 8 步 (验证): 从DataLoader中取出一个批次，检查其内容 ---
print("--- 验证 DataLoader 和 Collator ---")
print("\n验证训练集 DataLoader:")
try:
    first_train_batch = next(iter(train_dataloader))
    print("成功从训练集 DataLoader 获取一个批次！")
    print("批次包含的键:", first_train_batch.keys())
    print("音频输入形状:", first_train_batch['audio_input_values'].shape)
    print("文本输入形状:", first_train_batch['text_input_ids'].shape)
    print("标签形状:", first_train_batch['labels'].shape)
except Exception as e:
    print(f"获取训练集批次时出错: {e}")

print("\n验证评估集 DataLoader:")
try:
    first_eval_batch = next(iter(eval_dataloader))
    print("成功从评估集 DataLoader 获取一个批次！")
    print("批次包含的键:", first_eval_batch.keys())
    print("音频输入形状:", first_eval_batch['audio_input_values'].shape)
    print("文本输入形状:", first_eval_batch['text_input_ids'].shape)
    print("标签形状:", first_eval_batch['labels'].shape)
except Exception as e:
    print(f"获取评估集批次时出错: {e}")

--- 验证 DataLoader 和 Collator (详细调试模式) ---

验证训练集 DataLoader:
成功从 DataLoader 获取 CPU 批次。

开始逐个将张量移动到 GPU...
准备移动 'audio_input_values'... | 类型: torch.float32 | 形状: torch.Size([1, 32, 160000])
'audio_input_values' 移动成功！
准备移动 'text_input_ids'... | 类型: torch.int64 | 形状: torch.Size([32, 54])
'text_input_ids' 移动成功！
准备移动 'text_attention_mask'... | 类型: torch.int64 | 形状: torch.Size([32, 54])
'text_attention_mask' 移动成功！
准备移动 'labels'... | 类型: torch.int64 | 形状: torch.Size([32])
'labels' 移动成功！

所有张量均已成功移动到 GPU！
--- 验证 DataLoader 和 Collator ---

验证训练集 DataLoader:
成功从训练集 DataLoader 获取一个批次！
批次包含的键: dict_keys(['audio_input_values', 'text_input_ids', 'text_attention_mask', 'labels'])
音频输入形状: torch.Size([1, 32, 160000])
文本输入形状: torch.Size([32, 53])
标签形状: torch.Size([32])

验证评估集 DataLoader:
成功从评估集 DataLoader 获取一个批次！
批次包含的键: dict_keys(['audio_input_values', 'text_input_ids', 'text_attention_mask', 'labels'])
音频输入形状: torch.Size([1, 32, 160000])
文本输入形状: torch.Size([32, 11])
标签形状: torch.Size([32])


In [8]:
from audio.baseline_model import AudioBaselineModel
from audio.trainer import AudioBaselineTrainer
# 实例化模型和训练器 ---
print("\n--- 初始化基线模型和训练器 ---")
num_labels = len(iemocap_emotions)
baseline_model = AudioBaselineModel(num_labels=num_labels).to(device)

baseline_trainer = AudioBaselineTrainer(
    model=baseline_model,
    num_epochs=CONFIG.training_epochs(),
    learning_rate=CONFIG.learning_rate(),
    optimizer_type=CONFIG.optimizer_type()
)

# --- 步骤 3: 训练模型 ---
print("\n--- 开始在 IEMOCAP 上训练基线模型 ---")
baseline_trainer.train(train_dataloader)

# --- 步骤 4: 在 IEMOCAP 验证集上评估 ---
print("\n--- 在 IEMOCAP 验证集上评估模型性能 ---")
baseline_trainer.eval(val_dataloader, labels=iemocap_emotions)

# --- 步骤 5: 在 CREMA-D 测试集上进行零样本评估 ---
print("\n--- 在 CREMA-D 测试集上进行零样本评估 ---")
baseline_trainer.eval(eval_dataloader, labels=cremad_emotions)

print("\n--- 基线模型训练和评估完成！ ---")



--- 初始化基线模型和训练器 ---


Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at microsoft/wavlm-base-plus and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.weight', 'projector.bias', 'projector.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[INFO] 正在冻结WavLM的特征提取层...
[INFO] 特征提取层已冻结。

--- 开始在 IEMOCAP 上训练基线模型 ---


Epoch 1:  34%|███▎      | 38/113 [2:23:33<4:43:21, 226.68s/it, accuracy=0.406, loss=1.3]  


OutOfMemoryError: CUDA out of memory. Tried to allocate 94.00 MiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 17.57 GiB is allocated by PyTorch, and 968.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
import torch
torch.cuda.empty_cache()  # 训练前先清理一次显存
