## 测试环境代理是否正常

In [9]:
# 测试代理
import os
import requests

# 设置代理环境变量
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
os.environ['ALL_PROXY'] = 'socks5://127.0.0.1:7891'
#os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# 测试代理连接
try:
    response = requests.get('https://huggingface.co', timeout=10)
    print("✅ HuggingFace 连接成功，状态码:", response.status_code)
    print("Test server is ubuntu22.04 GPU 2080Ti 22G")
except Exception as e:
    print("连接失败:", e)

# 设置 HuggingFace 缓存路径
os.environ['HF_HOME'] = '/home/KevinLiangX/Codes/LLM-quickstart-main/hf'
os.environ['HF_HUB_CACHE'] = '/home/KevinLiangX/Codes/LLM-quickstart-main/hf_hu'


# 服务器环境 ubuntu22.04 GPU 2080Ti 22G

✅ HuggingFace 连接成功，状态码: 200
Test server is ubuntu22.04 GPU 2080Ti 22G


## 设置全局参数

In [6]:
model_name_or_path = "openai/whisper-large-v2"
model_dir = "models/whisper-large-v2-asr-int8"

language = "Chinese (China)"
language_abbr = "zh-CN"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_11_0"

batch_size=8

## 准备数据集

In [7]:
# ## 优化的数据集加载方案

import os
from datasets import load_dataset, DatasetDict

# 设置缓存目录，避免重复下载
cache_dir = "/home/KevinLiangX/Codes/LLM-quickstart-main/hf_cache"
os.makedirs(cache_dir, exist_ok=True)

def load_dataset_with_cache(dataset_name, language_abbr, split, cache_dir):
    """带缓存的数据集加载，避免重复下载"""
    try:
        return load_dataset(
            dataset_name, 
            language_abbr, 
            split=split, 
            trust_remote_code=True,
            cache_dir=cache_dir
        )
    except Exception as e:
        print(f"加载{split}失败: {e}")
        return None

print("📥 加载数据集...")

common_voice = DatasetDict()

# 分别加载，失败时可以单独重试
common_voice["train"] = load_dataset_with_cache(dataset_name, language_abbr, "train", cache_dir)
common_voice["validation"] = load_dataset_with_cache(dataset_name, language_abbr, "validation", cache_dir)

# 检查加载结果
if all(v is not None for v in common_voice.values()):
    print(f"✅ 全部加载完成: 训练{len(common_voice['train'])} 验证{len(common_voice['validation'])}")
else:
    print("❌ 部分数据集加载失败，请检查网络连接")

📥 加载数据集...
✅ 全部加载完成: 训练29056 验证10581


## 预处理训练数据集

In [8]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

processor = AutoProcessor.from_pretrained(model_name_or_path, language=language, task=task)

## 移除数据集中不必要的字段
common_voice = common_voice.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)

## 降低音频数据的采样16kHz
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))


def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 使用全量数据进行训练

In [8]:
tokenized_common_voice = common_voice.map(prepare_dataset, num_proc=4)

Map (num_proc=4):   0%|          | 0/10581 [00:00<?, ? examples/s]

ImportError: To support decoding audio data, please install 'torchcodec'.

## 自定义语音数据整理器

In [11]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

# 定义一个针对语音到文本任务的数据整理器类
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any  # 处理器结合了特征提取器和分词器

    # 整理器函数，将特征列表处理成一个批次
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 从特征列表中提取输入特征，并填充以使它们具有相同的形状
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # 从特征列表中提取标签特征（文本令牌），并进行填充
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 使用-100替换标签中的填充区域，-100通常用于在损失计算中忽略填充令牌
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 如果批次中的所有序列都以句子开始令牌开头，则移除它
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        # 将处理过的标签添加到批次中
        batch["labels"] = labels

        return batch  # 返回最终的批次，准备好进行训练或评估

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## 模型准备

In [12]:
## 加载预训练模型(int8精度)
from transformers import AutoModelForSpeechSeq2Seq

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")

model.config.forced_decoder_ids = None 

model.config.suppress_tokens = []

# 转换特定参数int8为32
from peft import prepare_model_for_int8_training

model = prepare_model_for_int8_training(model)



##  LoRA Adapter配置

In [13]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# 创建一个LoraConfig对象，用于设置LoRA（Low-Rank Adaptation）的配置参数
config = LoraConfig(
    r=4,  # LoRA的秩，影响LoRA矩阵的大小
    lora_alpha=64,  # LoRA适应的比例因子
    # 指定将LoRA应用到的模型模块，通常是attention和全连接层的投影。
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,  # 在LoRA模块中使用的dropout率
    bias="none",  # 设置bias的使用方式，这里没有使用bias
)

# 获取PEFT模型
peft_model = get_peft_model(model, config)
# 打印 LoRA 微调训练的模型参数，查看参与微调的参数量
peft_model.print_trainable_parameters()

trainable params: 1,966,080 || all params: 1,545,271,040 || trainable%: 0.12723204856023188


## 模型训练

In [18]:
## 设置超参
from transformers import Seq2SeqTrainingArguments

# 设置序列到序列模型训练的参数
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,  
    per_device_train_batch_size=48, 
    learning_rate=1e-3,  
    num_train_epochs=3,  
    
    
    evaluation_strategy="epoch", 
    save_strategy="epoch",
    warmup_steps=200,  # 在训练初期增加学习率的步数，有助于稳定训练
    fp16=True,  # 启用混合精度训练，可以提高训练速度，同时减少内存使用
    gradient_accumulation_steps=1,   # 梯度累积，模拟更大batch
    per_device_eval_batch_size=64,  # 每个设备上的评估批量大小
    dataloader_num_workers=8,       # 多进程数据加载
    generation_max_length=256,  # 生成任务的最大长度
    logging_steps=20,  # 指定日志记录的步骤，用于跟踪训练进度
    remove_unused_columns=False,  # 是否删除不使用的列，以减少数据处理开销
    label_names=["labels"],  # 指定标签列的名称，用于训练过程中

    save_total_limit=3,                   # 只保存最新checkpoint
    load_best_model_at_end=True,          # 加载最佳模型
    metric_for_best_model="eval_loss",    # 基于validation loss选择

)

# 实例化 Seq2SeqTraniner 训练器
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_common_voice["train"],
    eval_dataset=tokenized_common_voice["validation"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)
peft_model.config.use_cache = False

## 启动训练 并保存

In [19]:
# 启动训练
trainer.train()

  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename


Epoch,Training Loss,Validation Loss
1,0.371,0.417392
2,0.3228,0.389227
3,0.2413,0.388295


  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resource_filename
  from pkg_resources import resour

TrainOutput(global_step=1818, training_loss=0.31751476210204943, metrics={'train_runtime': 22241.1447, 'train_samples_per_second': 3.919, 'train_steps_per_second': 0.082, 'total_flos': 1.85319357677568e+20, 'train_loss': 0.31751476210204943, 'epoch': 3.0})

In [20]:
# 保存LoRA模型
trainer.save_model(model_dir)


In [None]:
# 查看Lora模型信息
peft_model.eval()

## 加载模型进行推理

In [8]:
model_dir = "models/whisper-large-v2-asr-int8"

language = "Chinese (China)"
language_abbr = "zh-CN"
language_decode = "chinese"
task = "transcribe"

from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel
import torch

print("修复模型加载问题...")

# 1. 从本地加载PEFT配置
peft_config = PeftConfig.from_pretrained(model_dir)
print(f"PEFT配置: {peft_config.base_model_name_or_path}")

# 2. 修复base model加载 - 处理pytorch_model.bin缺失
try:
    base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        peft_config.base_model_name_or_path, 
        load_in_8bit=True, 
        device_map="auto",
        torch_dtype=torch.float16,  # 指定数据类型
        use_safetensors=True,       # 优先使用safetensors格式
    )
    print("使用safetensors格式加载成功")
except Exception as e:
    print(f"safetensors加载失败: {e}")
    print("尝试重新下载模型...")
    
    # 重新下载完整模型
    base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        peft_config.base_model_name_or_path, 
        load_in_8bit=True, 
        device_map="auto",
        force_download=True,        # 强制重新下载
        resume_download=True        # 支持断点续传
    )
    print("重新下载完成")

修复模型加载问题...
PEFT配置: openai/whisper-large-v2


model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

{"timestamp":"2025-07-26T04:58:18.936583Z","level":"WARN","fields":{"message":"Reqwest(reqwest::Error { kind: Request, source: hyper_util::client::legacy::Error(Connect, Os { code: 32, kind: BrokenPipe, message: \"Broken pipe\" }) }). Retrying..."},"filename":"/home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":242}
{"timestamp":"2025-07-26T04:58:18.936641Z","level":"WARN","fields":{"message":"Retry attempt #0. Sleeping 2.038532554s before the next attempt"},"filename":"/root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}




generation_config.json: 0.00B [00:00, ?B/s]

使用safetensors格式加载成功


In [9]:
# 继续加载LoRA适配器和其他组件

# 3. 加载本地LoRA适配器
peft_model = PeftModel.from_pretrained(base_model, model_dir)
print("LoRA适配器加载成功")

# 4. 加载tokenizer和processor
tokenizer = AutoTokenizer.from_pretrained(
    peft_config.base_model_name_or_path, 
    language=language, 
    task=task
)
processor = AutoProcessor.from_pretrained(
    peft_config.base_model_name_or_path, 
    language=language, 
    task=task
)
feature_extractor = processor.feature_extractor

print("所有组件加载完成")

# 设置模型为评估模式
peft_model.eval()

LoRA适配器加载成功


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json: 0.00B [00:00, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


所有组件加载完成


PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear8bitLt(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
            

In [13]:
from transformers import AutomaticSpeechRecognitionPipeline

print("创建ASR pipeline（修复device冲突）...")

# 不指定device参数，让accelerate自动管理
pipeline = AutomaticSpeechRecognitionPipeline(
    model=peft_model, 
    tokenizer=tokenizer, 
    feature_extractor=feature_extractor
    # 移除 device=0 参数
)

# 获取强制解码ID
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language_decode, task=task)

print("Pipeline创建成功")

创建ASR pipeline（修复device冲突）...
Pipeline创建成功


In [15]:
test_audio = "data/audio/test_zh.flac"

print("测试微调模型...")

with torch.cuda.amp.autocast():
    result = pipeline(
        test_audio, 
        max_new_tokens=255,
        generate_kwargs={
            "language": "chinese", 
            "task": "transcribe"   
        }
    )
    text = result["text"]

print(f"微调模型转录结果:")
print(f"   {text}")

🎤 测试微调模型...




微调模型转录结果:
   这是一段测试用于WhisperLarge V2模型的自动语音识别测试。


## OpenAI Whisper LoRA 模型评估

In [4]:
model_name_or_path = "openai/whisper-large-v2"
model_dir = "models/whisper-large-v2-asr-int8"

language = "Chinese (China)"
language_abbr = "zh-CN"
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_11_0"

batch_size=16

from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel

peft_config = PeftConfig.from_pretrained(model_dir)

base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)
base_model.requires_grad_(False)

peft_model = PeftModel.from_pretrained(base_model, model_dir)
peft_model.eval()

tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor
print("微调模型加载完成")

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


微调模型加载完成


In [5]:
# 优化数据集加载 - 使用缓存，避免重新下载

import os
from datasets import load_dataset, DatasetDict, Audio

# 设置缓存目录，使用之前训练时的缓存
cache_dir = "/home/KevinLiangX/Codes/LLM-quickstart-main/hf_cache"  # 使用之前的缓存目录

print("加载测试数据集（使用缓存）...")

def load_dataset_with_cache(dataset_name, language_abbr, split, cache_dir):
    """使用缓存加载数据集，避免重新下载"""
    try:
        return load_dataset(
            dataset_name, 
            language_abbr, 
            split=split, 
            trust_remote_code=True,
            cache_dir=cache_dir  # 使用已有缓存
        )
    except Exception as e:
        print(f"加载{split}失败: {e}")
        return None

# 只加载测试集，使用缓存
common_voice = DatasetDict()
common_voice["test"] = load_dataset_with_cache(dataset_name, language_abbr, "test", cache_dir)

if common_voice["test"] is not None:
    print(f"测试集加载成功（使用缓存）: {len(common_voice['test'])} 样本")
else:
    print("测试集加载失败")
    exit()

加载测试数据集（使用缓存）...


Using the latest cached version of the dataset since mozilla-foundation/common_voice_11_0 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'zh-CN' at /home/KevinLiangX/Codes/LLM-quickstart-main/hf_cache/mozilla-foundation___common_voice_11_0/zh-CN/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631 (last modified on Sat Jul 26 15:05:05 2025).


测试集加载成功（使用缓存）: 10581 样本


In [11]:
common_voice = common_voice.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

tokenized_common_voice = common_voice.map(prepare_dataset)
print("数据已预处理完毕")

数据已预处理完毕


In [6]:
# 数据整理器 

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

print("数据整理器准备完成")

数据整理器准备完成


In [7]:
# 解决WER模块版本问题

import subprocess
import sys
import os

print("解决WER模块问题...")

# 4. 重新导入并测试
print("\n重新测试...")
try:
    # 重新导入evaluate
    import importlib
    import evaluate
    importlib.reload(evaluate)
    
    print(f"新的evaluate版本: {evaluate.__version__}")
    
    # 检查WER是否现在可用
    available_metrics = evaluate.list_evaluation_modules(module_type="metric")
    print(f" 可用指标数量: {len(available_metrics)}")
    
    if "wer" in available_metrics:
        print(" WER现在可用了！")
        
        # 测试加载
        metric = evaluate.load("wer")
        print("WER加载成功")
        
        # 测试计算
        wer_score = metric.compute(predictions=["hello"], references=["hello"])
        print(f" WER计算成功: {wer_score}")
        
    else:
        print(" WER仍然不可用")
        print(" 搜索包含'wer'的指标...")
        wer_like = [m for m in available_metrics if 'wer' in m.lower()]
        print(f"包含'wer'的指标: {wer_like}")
        
except Exception as e:
    print(f" 测试失败: {e}")

print("\n 版本修复完成")

解决WER模块问题...

重新测试...
新的evaluate版本: 0.4.1
 测试失败: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /api/spaces?filter=metric (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f9139b5cdc0>: Failed to establish a new connection: [Errno 101] Network is unreachable'))

 版本修复完成


In [59]:
# 使用可用的detailed-wer模块

import evaluate

print(" 使用可用的WER模块...")

try:
    # 使用找到的detailed-wer模块
    print(" 加载 argmaxinc/detailed-wer...")
    metric = evaluate.load("argmaxinc/detailed-wer")
    print(" detailed-wer 加载成功！")
    
    # 测试功能
    test_predictions = ["这是一个测试", "你好世界"]
    test_references = ["这是一个测试", "你好世界"]
    
    result = metric.compute(predictions=test_predictions, references=test_references)
    print(f" WER计算成功: {result}")
    
    # 检查返回的结果格式
    print(f" 结果类型: {type(result)}")
    print(f" 结果内容: {result}")
    
    # 如果是字典，查看包含的键
    if isinstance(result, dict):
        print(f" 可用键: {list(result.keys())}")
        
        # 通常WER在'wer'键中
        if 'wer' in result:
            wer_score = result['wer']
            print(f" WER分数: {wer_score}")
        elif 'word_error_rate' in result:
            wer_score = result['word_error_rate']
            print(f" WER分数: {wer_score}")
    
except Exception as e:
    print(f" detailed-wer 加载失败: {e}")

print("\n WER模块测试完成")

 使用可用的WER模块...
 加载 argmaxinc/detailed-wer...
 detailed-wer 加载失败: Couldn't find a module script at /home/KevinLiangX/Codes/LLM-quickstart-main/peft/argmaxinc/detailed-wer/detailed-wer.py. Module 'argmaxinc/detailed-wer' doesn't exist on the Hugging Face Hub either.

 WER模块测试完成


In [60]:
# 修复evaluate路径问题

import os
import evaluate

print(" 修复evaluate路径问题...")

# 1. 检查当前工作目录
current_dir = os.getcwd()
print(f" 当前工作目录: {current_dir}")

# 2. 检查是否有本地的evaluate模块干扰
local_evaluate_dirs = []
for item in os.listdir('.'):
    if 'evaluate' in item.lower() or 'wer' in item.lower():
        local_evaluate_dirs.append(item)

if local_evaluate_dirs:
    print(f" 发现可能干扰的本地目录: {local_evaluate_dirs}")
else:
    print(" 没有发现干扰的本地目录")

# 3. 临时切换到系统临时目录
import tempfile
temp_dir = tempfile.mkdtemp()
print(f" 切换到临时目录: {temp_dir}")

original_dir = os.getcwd()
os.chdir(temp_dir)

try:
    # 4. 在干净环境中重新导入evaluate
    import importlib
    importlib.reload(evaluate)
    
    print(" 在干净环境中重新加载evaluate...")
    
    # 5. 强制从Hub下载
    print(" 尝试从HuggingFace Hub加载WER...")
    
    # 设置环境变量确保在线模式
    os.environ['HF_DATASETS_OFFLINE'] = '0'
    os.environ['HF_EVALUATE_OFFLINE'] = '0'
    
    # 尝试不同的WER模块
    wer_modules = [
        "wer",
        "evaluate-metric/wer", 
        "huggingface/evaluate-metric-wer"
    ]
    
    metric = None
    for module_name in wer_modules:
        try:
            print(f" 尝试加载: {module_name}")
            metric = evaluate.load(module_name, trust_remote_code=True)
            print(f" 成功加载: {module_name}")
            break
        except Exception as e:
            print(f" 失败: {e}")
    
    if metric:
        # 测试WER计算
        test_preds = ["hello world"]
        test_refs = ["hello world"]
        wer_score = metric.compute(predictions=test_preds, references=test_refs)
        print(f" WER测试成功: {wer_score}")
    else:
        print(" 所有WER模块都加载失败")

except Exception as e:
    print(f" 在临时目录中也失败: {e}")

finally:
    # 6. 恢复原目录
    os.chdir(original_dir)
    print(f" 恢复到原目录: {original_dir}")

print(" 路径修复测试完成")

 修复evaluate路径问题...
 当前工作目录: /home/KevinLiangX/Codes/LLM-quickstart-main/peft
 没有发现干扰的本地目录
 切换到临时目录: /tmp/tmp_sk0wbe5
 在干净环境中重新加载evaluate...
 尝试从HuggingFace Hub加载WER...
 尝试加载: wer
 失败: Couldn't find a module script at /tmp/tmp_sk0wbe5/wer/wer.py. Module 'wer' doesn't exist on the Hugging Face Hub either.
 尝试加载: evaluate-metric/wer
 失败: Couldn't find a module script at /tmp/tmp_sk0wbe5/evaluate-metric/wer/wer.py. Module 'evaluate-metric/wer' doesn't exist on the Hugging Face Hub either.
 尝试加载: huggingface/evaluate-metric-wer
 失败: Couldn't find a module script at /tmp/tmp_sk0wbe5/huggingface/evaluate-metric-wer/evaluate-metric-wer.py. Module 'huggingface/evaluate-metric-wer' doesn't exist on the Hugging Face Hub either.
 所有WER模块都加载失败
 恢复到原目录: /home/KevinLiangX/Codes/LLM-quickstart-main/peft
 路径修复测试完成


In [57]:
## 放弃 evaluate.wer评估方式

In [8]:

print("\n 测试jiwer...")

# 测试数据
references = ["这是一个测试句子", "你好世界"]
predictions = ["这是一个测试句子", "你好世界"]

# 计算WER
wer_score = jiwer.wer(references, predictions)
print(f" WER计算成功: {wer_score}")
print(f" WER百分比: {wer_score * 100:.2f}%")

# 3. 创建兼容evaluate接口的包装器
class JiwerMetric:
    """jiwer包装器，兼容evaluate接口"""
    
    def __init__(self):
        self.predictions = []
        self.references = []
    
    def add_batch(self, predictions, references):
        """添加批次数据"""
        self.predictions.extend(predictions)
        self.references.extend(references)
    
    def compute(self, predictions=None, references=None):
        """计算WER"""
        if predictions is not None and references is not None:
            return jiwer.wer(references, predictions)
        else:
            if not self.predictions or not self.references:
                return 0.0
            
            wer_result = jiwer.wer(self.references, self.predictions)
            
            # 清空数据
            self.predictions = []
            self.references = []
            
            return wer_result

# 4. 测试包装器
print("\n 测试包装器...")
metric = JiwerMetric()

# 测试批次添加
metric.add_batch(predictions=["测试1"], references=["测试1"])
metric.add_batch(predictions=["测试2"], references=["测试2"])
wer_batch = metric.compute()
print(f" 批次测试: {wer_batch}")

# 测试直接计算
wer_direct = metric.compute(predictions=["直接测试"], references=["直接测试"])
print(f" 直接测试: {wer_direct}")

print("\n jiwer解决方案准备完成！")


 测试jiwer...
 WER计算成功: 0.0
 WER百分比: 0.00%

 测试包装器...
 批次测试: 0.0
 直接测试: 0.0

 jiwer解决方案准备完成！


In [12]:
# 完测试集评估

from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc
import jiwer

# 创建评估数据加载器 - 使用完整测试集
eval_dataloader = DataLoader(
    tokenized_common_voice["test"], 
    batch_size=batch_size, 
    collate_fn=data_collator
)

print(f" 完整测试集评估准备")
print(f"测试集大小: {len(tokenized_common_voice['test'])}")
print(f" 批次大小: {batch_size}")
print(f"总批次数: {len(eval_dataloader)}")

# 创建评估器
class FullDatasetWEREvaluator:
    def __init__(self):
        self.predictions = []
        self.references = []
    
    def add_batch(self, predictions, references):
        self.predictions.extend(predictions)
        self.references.extend(references)
    
    def compute(self):
        if not self.predictions:
            return 0.0
        return jiwer.wer(self.references, self.predictions)
    
    def get_stats(self):
        return {
            'total_samples': len(self.predictions),
            'total_references': len(self.references)
        }

# 初始化评估器
metric = FullDatasetWEREvaluator()

print("开始完整测试集评估...")
print("这可能需要较长时间，请耐心等待...")

# 评估循环
processed_samples = 0
for step, batch in enumerate(tqdm(eval_dataloader, desc="完整评估进度")):
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            # 生成预测
            generated_tokens = peft_model.generate(
                input_features=batch["input_features"].to("cuda"),
                max_new_tokens=255,
            ).cpu().numpy()
            
            # 处理标签
            labels = batch["labels"].cpu().numpy()
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            
            # 解码
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            # 添加到评估器
            metric.add_batch(predictions=decoded_preds, references=decoded_labels)
            processed_samples += len(decoded_preds)
    
    # 内存清理
    del generated_tokens, labels, batch
    if step % 20 == 0:  # 更频繁的内存清理
        gc.collect()
        torch.cuda.empty_cache()
        print(f"已处理: {processed_samples}/{len(tokenized_common_voice['test'])} 样本")

print("完整测试集评估数据收集完成")

# 计算最终WER
print("计算完整测试集WER...")
wer_score = metric.compute()
wer_percent = wer_score * 100
stats = metric.get_stats()

# 显示结果
print("\n" + "="*70)
print("Whisper LoRA 中文ASR模型 - 完整测试集评估结果")
print("="*70)
print(f"测试样本总数: {stats['total_samples']}")
print(f"词错误率 (WER): {wer_percent:.2f}%")
print(f"词准确率: {100-wer_percent:.2f}%")
print("="*70)
print("完整测试集评估完成！")

# 保存结果
results = {
    'wer': wer_score,
    'wer_percent': wer_percent,
    'accuracy': 100 - wer_percent,
    'total_samples': stats['total_samples'],
    'dataset': 'full_test_set'
}

print(f"最终结果: {results}")

 完整测试集评估准备
测试集大小: 10581
 批次大小: 16
总批次数: 662
开始完整测试集评估...
这可能需要较长时间，请耐心等待...


  from pkg_resources import resource_filename
完整评估进度:   0%|▏                                                                                                                                          | 1/662 [00:17<3:18:00, 17.97s/it]

已处理: 16/10581 样本


完整评估进度:   3%|████▍                                                                                                                                     | 21/662 [06:04<3:16:04, 18.35s/it]

已处理: 336/10581 样本


完整评估进度:   6%|████████▌                                                                                                                                 | 41/662 [11:02<2:25:51, 14.09s/it]

已处理: 656/10581 样本


完整评估进度:   9%|████████████▋                                                                                                                             | 61/662 [16:11<2:42:36, 16.23s/it]

已处理: 976/10581 样本


完整评估进度:  12%|████████████████▉                                                                                                                         | 81/662 [21:15<2:22:16, 14.69s/it]

已处理: 1296/10581 样本


完整评估进度:  15%|████████████████████▉                                                                                                                    | 101/662 [26:43<2:40:25, 17.16s/it]

已处理: 1616/10581 样本


完整评估进度:  18%|█████████████████████████                                                                                                                | 121/662 [31:51<2:26:57, 16.30s/it]

已处理: 1936/10581 样本


完整评估进度:  21%|█████████████████████████████▏                                                                                                           | 141/662 [37:22<2:23:24, 16.52s/it]

已处理: 2256/10581 样本


完整评估进度:  24%|█████████████████████████████████▎                                                                                                       | 161/662 [42:40<2:16:41, 16.37s/it]

已处理: 2576/10581 样本


完整评估进度:  27%|█████████████████████████████████████▍                                                                                                   | 181/662 [47:58<1:58:46, 14.82s/it]

已处理: 2896/10581 样本


完整评估进度:  30%|█████████████████████████████████████████▌                                                                                               | 201/662 [53:28<2:07:33, 16.60s/it]

已处理: 3216/10581 样本


完整评估进度:  33%|█████████████████████████████████████████████▋                                                                                           | 221/662 [58:26<1:56:22, 15.83s/it]

已处理: 3536/10581 样本


完整评估进度:  36%|█████████████████████████████████████████████████▏                                                                                     | 241/662 [1:03:36<1:51:18, 15.86s/it]

已处理: 3856/10581 样本


完整评估进度:  39%|█████████████████████████████████████████████████████▏                                                                                 | 261/662 [1:08:47<1:41:55, 15.25s/it]

已处理: 4176/10581 样本


完整评估进度:  42%|█████████████████████████████████████████████████████████▎                                                                             | 281/662 [1:14:04<1:35:38, 15.06s/it]

已处理: 4496/10581 样本


完整评估进度:  45%|█████████████████████████████████████████████████████████████▍                                                                         | 301/662 [1:19:27<1:32:29, 15.37s/it]

已处理: 4816/10581 样本


完整评估进度:  48%|█████████████████████████████████████████████████████████████████▍                                                                     | 321/662 [1:25:13<1:32:10, 16.22s/it]

已处理: 5136/10581 样本


完整评估进度:  52%|█████████████████████████████████████████████████████████████████████▌                                                                 | 341/662 [1:30:40<1:29:02, 16.64s/it]

已处理: 5456/10581 样本


完整评估进度:  55%|█████████████████████████████████████████████████████████████████████████▌                                                             | 361/662 [1:35:59<1:23:17, 16.60s/it]

已处理: 5776/10581 样本


完整评估进度:  58%|█████████████████████████████████████████████████████████████████████████████▋                                                         | 381/662 [1:41:08<1:12:52, 15.56s/it]

已处理: 6096/10581 样本


完整评估进度:  61%|█████████████████████████████████████████████████████████████████████████████████▊                                                     | 401/662 [1:46:29<1:05:49, 15.13s/it]

已处理: 6416/10581 样本


完整评估进度:  64%|█████████████████████████████████████████████████████████████████████████████████████▊                                                 | 421/662 [1:51:35<1:06:02, 16.44s/it]

已处理: 6736/10581 样本


完整评估进度:  67%|█████████████████████████████████████████████████████████████████████████████████████████▉                                             | 441/662 [1:56:58<1:03:59, 17.37s/it]

已处理: 7056/10581 样本


完整评估进度:  70%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 461/662 [2:02:41<57:36, 17.20s/it]

已处理: 7376/10581 样本


完整评估进度:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 481/662 [2:07:53<47:45, 15.83s/it]

已处理: 7696/10581 样本


完整评估进度:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 501/662 [2:13:05<44:05, 16.43s/it]

已处理: 8016/10581 样本


完整评估进度:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 521/662 [2:18:28<39:16, 16.72s/it]

已处理: 8336/10581 样本


完整评估进度:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 541/662 [2:23:40<30:27, 15.10s/it]

已处理: 8656/10581 样本


完整评估进度:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                     | 561/662 [2:28:55<26:13, 15.58s/it]

已处理: 8976/10581 样本


完整评估进度:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 581/662 [2:34:33<23:57, 17.74s/it]

已处理: 9296/10581 样本


完整评估进度:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 601/662 [2:39:49<14:59, 14.75s/it]

已处理: 9616/10581 样本


完整评估进度:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 621/662 [2:45:06<10:37, 15.55s/it]

已处理: 9936/10581 样本


完整评估进度:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 641/662 [2:50:23<05:46, 16.48s/it]

已处理: 10256/10581 样本


完整评估进度: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 661/662 [2:55:41<00:16, 16.85s/it]

已处理: 10576/10581 样本


完整评估进度: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 662/662 [2:55:51<00:00, 15.94s/it]

完整测试集评估数据收集完成
计算完整测试集WER...

Whisper LoRA 中文ASR模型 - 完整测试集评估结果
测试样本总数: 10581
词错误率 (WER): 68.67%
词准确率: 31.33%
完整测试集评估完成！
最终结果: {'wer': 0.6867321867321867, 'wer_percent': 68.67321867321867, 'accuracy': 31.32678132678133, 'total_samples': 10581, 'dataset': 'full_test_set'}



