Подготовка

In [1]:
import os, re
from datasets import load_dataset, Audio, config as ds_config


# --- Dataset identity and audio constants ---------------------------------
DATASET_ID = "MikeHonkers/SOVA-audiobooks-100k"
TARGET_SR  = 16000
# 0.20 * TARGET_SR * 2 = 6400 bytes; used below as the minimum payload size.
# Presumably "0.2 s of 16-bit mono PCM at TARGET_SR" - TODO confirm; for
# compressed payloads the byte count is only a rough proxy for duration.
MIN_BYTES  = int(0.20 * TARGET_SR * 2)

# --- Local directory layout (Windows paths) --------------------------------
BASE_DIR   = r"W:\whisper_sova"
DATA_DIR   = fr"{BASE_DIR}\data"

# Flip the `datasets` config flag before anything is loaded.
# NOTE(review): presumably this keeps `datasets` from using torchcodec - confirm.
ds_config.TORCHCODEC_AVAILABLE = False

print("→ Загружаем SOVA-audiobooks-100k (только для val/test) ...")
ds = load_dataset(DATASET_ID, cache_dir=DATA_DIR, split="train")

# Hold out 2% with a fixed seed, then halve the hold-out into val and test.
ds = ds.train_test_split(test_size=0.02, seed=42)
train_ds = ds["train"]
test_holdout = ds["test"]
val_test = test_holdout.train_test_split(test_size=0.5, seed=42)
val_ds = val_test["train"]
test_ds = val_test["test"]

# Cast the audio column with decode=False on every split; the filters further
# down read the `path` / `bytes` entries of each example's audio field.
train_ds, val_ds, test_ds = (
    split.cast_column("audio", Audio(decode=False))
    for split in (train_ds, val_ds, test_ds)
)

def normalize_ru_text(s: str) -> str:
    """Normalize a Russian transcript string.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. fold "ё" to "е" in both cases ("Ё" becomes "Е"), so capitalised
         words are normalized the same way as lowercase ones;
      3. replace the curly and guillemet quotes “ ” « » with a plain '"';
      4. collapse every run of whitespace into a single space.

    Args:
        s: raw transcript text.

    Returns:
        The normalized text.
    """
    s = s.strip().replace("ё", "е").replace("Ё", "Е")
    s = re.sub(r'[“”«»]', '"', s)
    s = re.sub(r"\s+", " ", s)
    return s

def prepare_batched(batch):
    """Batched map function: build the `sentence` column from `text`.

    Each entry of batch["text"] is passed through normalize_ru_text; the
    results are returned, in the same order, under the key "sentence".
    """
    sentences = []
    for raw_text in batch["text"]:
        sentences.append(normalize_ru_text(raw_text))
    return {"sentence": sentences}

print("→ Нормализуем текст для val/test ...")
# Add the normalized `sentence` column to all three splits. The train split
# is mapped as well, even though the progress message mentions only val/test.
train_ds, val_ds, test_ds = (
    split.map(prepare_batched, batched=True, desc=label)
    for split, label in (
        (train_ds, "prepare train (for consistency)"),
        (val_ds, "prepare val"),
        (test_ds, "prepare test"),
    )
)

def has_audio_ref(ex):
    """Tell whether an example carries a usable audio reference.

    Args:
        ex: a dataset example (mapping). Its "audio" entry, when present, is
            expected to be a mapping with optional "path" and "bytes" entries.

    Returns:
        True when "path" or "bytes" is non-empty; False when the "audio" entry
        is missing or None, or when both fields are missing or empty.
    """
    # `or {}` also covers an explicit None stored under "audio", which
    # `ex.get("audio", {})` would hand back unchanged.
    a = ex.get("audio") or {}
    # bool(...) instead of `is not None`: an empty string or an empty bytes
    # object is not a usable reference either.
    return bool(a.get("path") or a.get("bytes"))

def looks_nonempty(ex):
    """Cheap size check used to drop empty or truncated audio.

    In-memory bytes take precedence: when "bytes" is non-empty its length is
    compared with MIN_BYTES. Otherwise "path" must point to an existing
    regular file whose size is at least MIN_BYTES.

    Args:
        ex: a dataset example (mapping) with an optional "audio" mapping that
            holds "path" and/or "bytes".

    Returns:
        True when the payload is at least MIN_BYTES long, False otherwise
        (including when the "audio" entry is missing or None).
    """
    # `or {}` also covers an explicit None stored under "audio".
    a = ex.get("audio") or {}
    payload = a.get("bytes")
    if payload:
        return len(payload) >= MIN_BYTES
    p = a.get("path")
    # isfile rather than exists: a directory path must not pass as audio.
    return bool(p) and os.path.isfile(p) and os.path.getsize(p) >= MIN_BYTES

print("→ Фильтруем пустые/битые аудио ...")
# First pass: keep only examples that carry an audio reference.
train_ds, val_ds, test_ds = (
    split.filter(has_audio_ref) for split in (train_ds, val_ds, test_ds)
)

# Second pass: drop examples whose payload is smaller than MIN_BYTES.
train_ds, val_ds, test_ds = (
    split.filter(looks_nonempty) for split in (train_ds, val_ds, test_ds)
)

print(f"[DATA] train: {len(train_ds)}  val: {len(val_ds)}  test: {len(test_ds)}")



→ Загружаем SOVA-audiobooks-100k (только для val/test) ...


Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

→ Нормализуем текст для val/test ...
→ Фильтруем пустые/битые аудио ...
[DATA] train: 99057  val: 1011  test: 1011


Батч-оценка

In [4]:
import os, io, json, math, random
import numpy as np
import soundfile as sf
import librosa
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel

# Local directory layout for this experiment (Windows paths).
BASE_DIR      = r"W:\whisper_sova"
MODEL_DIR     = fr"{BASE_DIR}\model"
OUTPUT_DIR    = fr"{BASE_DIR}\output"
PROCESSOR_DIR = fr"{OUTPUT_DIR}\processor"
ADAPTER_DIR   = fr"{OUTPUT_DIR}\lora_adapter_fast"
MODEL_ID      = "openai/whisper-small"
TARGET_SR     = 16000

# The processor is loaded from the copy saved in PROCESSOR_DIR, the base
# weights from MODEL_ID (cached under MODEL_DIR), and the PEFT adapter stored
# in ADAPTER_DIR is attached on top of the base model.
processor = WhisperProcessor.from_pretrained(PROCESSOR_DIR)
base_model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, cache_dir=MODEL_DIR, device_map="auto")
ft_model  = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
ft_model.eval()

# ASR pipeline with default generation settings; individual calls add
# `max_new_tokens` through their own generate_kwargs.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ft_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    generate_kwargs={
        "task": "transcribe",
        "language": "russian",
        "num_beams": 1,
        "do_sample": False,
        "temperature": 0.0,
        "no_repeat_ngram_size": 4,
        "repetition_penalty": 1.30,
        # `length_penalty` is intentionally not set: with num_beams=1 it is
        # not a valid generation flag and only produced the warning
        # "The following generation flags are not valid and may be ignored".
        "return_timestamps": False,
    },
)

def load_audio_field(audio_field):
    """Read a dataset audio field into a mono float32 signal.

    Args:
        audio_field: mapping with "bytes" (encoded audio held in memory)
            and/or "path" (file on disk). Non-empty "bytes" take precedence.

    Returns:
        (samples, sr): float32 samples and the sampling rate reported by
        soundfile. Signals with more than one dimension are averaged over
        axis 1 to obtain a single channel.
    """
    raw = audio_field.get("bytes")
    source = io.BytesIO(raw) if raw else audio_field["path"]
    samples, rate = sf.read(source, always_2d=False)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples.astype(np.float32), rate

def strong_vad_trim(y, sr, top_db=40):
    """Normalize the signal and trim it with librosa.effects.trim.

    Args:
        y: audio samples.
        sr: sampling rate of `y`.
        top_db: threshold forwarded to librosa.effects.trim.

    Returns:
        The trimmed signal when it keeps at least 0.5 s of audio; otherwise
        the normalized, untrimmed signal, so that aggressive trimming never
        leaves a near-empty clip.
    """
    normalized = librosa.util.normalize(y, axis=0)
    trimmed = librosa.effects.trim(normalized, top_db=top_db)[0]
    min_samples = int(0.5 * sr)
    if trimmed.size < min_samples:
        return normalized
    return trimmed

def resample_to(y, sr, target_sr=TARGET_SR):
    """Return `y` at `target_sr`.

    The input is returned untouched when it is already at the target rate;
    otherwise it is resampled with librosa.resample.
    """
    if sr == target_sr:
        return y
    return librosa.resample(y, orig_sr=sr, target_sr=target_sr)

def save_tmp_wav(y, sr, fname="_vad_tmp.wav"):
    """Write the signal to OUTPUT_DIR/<fname> with soundfile.

    Args:
        y: audio samples to write.
        sr: sampling rate to store in the file.
        fname: file name inside OUTPUT_DIR; an existing file with the same
            name is overwritten.

    Returns:
        The path of the written file.
    """
    out_path = os.path.join(OUTPUT_DIR, fname)
    sf.write(out_path, y, sr)
    return out_path

def adaptive_max_new_tokens(duration_s, scale=4.2, bias=8, hard_min=10, hard_max=64):
    """Derive a generation budget from the clip duration.

    The raw budget is int(scale * duration_s + bias); it is first capped at
    `hard_max` and then raised to at least `hard_min`.

    Args:
        duration_s: clip duration in seconds.
        scale: budget added per second of audio.
        bias: constant budget added to every clip.
        hard_min: lower bound of the result.
        hard_max: upper bound applied before the lower bound.

    Returns:
        The clamped budget, used by callers as `max_new_tokens`.
    """
    budget = int(scale * duration_s + bias)
    capped = budget if budget < hard_max else hard_max
    return capped if capped > hard_min else hard_min

# Spot-check the fine-tuned model on a fixed random sample of the validation
# split (`val_ds` comes from the data-preparation cell).
random.seed(1)  # fixed seed so the same items are drawn on every run
idxs = random.sample(range(len(val_ds)), k=min(12, len(val_ds)))  # at most 12 items

print(f"Split: val | items: {len(idxs)} | greedy strict v2 | VAD top_db=40")
print("-"*90)
for i, idx in enumerate(idxs):
    ex = val_ds[idx]
    # Prefer the normalized transcript; fall back to the raw text.
    ref = ex.get("sentence") or ex.get("text") or ""
    y, sr = load_audio_field(ex["audio"])
    dur = len(y)/sr  # duration before resampling and trimming
    y = resample_to(y, sr, TARGET_SR)
    y = strong_vad_trim(y, TARGET_SR, top_db=40)
    dur_trim = len(y)/TARGET_SR  # duration after trimming
    # The pipeline is fed a file path; one temp WAV per item stays in OUTPUT_DIR.
    wav_path = save_tmp_wav(y, TARGET_SR, fname=f"_vadv2_{i}.wav")

    # The generation budget scales with the trimmed duration.
    max_new = adaptive_max_new_tokens(dur_trim, scale=4.2, bias=8, hard_min=10, hard_max=64)
    out = pipe(wav_path, generate_kwargs=dict(max_new_tokens=max_new))
    hyp = out["text"].strip()

    # Reference and hypothesis are printed truncated to 220 characters.
    print(f"[# {idx:5d}] dur={dur:.2f}s → {dur_trim:.2f}s | max_new={max_new}")
    print("  REF:", ref[:220])
    print("  HYP:", hyp[:220] + ("…" if len(hyp) > 220 else ""))
    print("-"*90)


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Device set to use cuda:0


Split: val | items: 12 | greedy strict v2 | VAD top_db=40
------------------------------------------------------------------------------------------


  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,
`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
The following generation flags are not valid and may be ignored: ['length_penalty']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[#   137] dur=5.43s → 4.13s | max_new=25
  REF: уже издалека метров за пятьдесят он закричал им чтобы заводили
  HYP: уж и с далека метров за пятьдесят он закричал им чтобы заводили в течение час
------------------------------------------------------------------------------------------
[#   582] dur=5.07s → 3.52s | max_new=22
  REF: а теперь товарищи сказал сноуболл отбрасывая кисточку на нивы
  HYP: а теперь товарищ я сказал сноуболла бросая кисточку на нивы в
------------------------------------------------------------------------------------------
[#   867] dur=3.81s → 2.08s | max_new=16
  REF: три корабля все-таки исчезли
  HYP: три корабля все-таки исчезли и с днём
------------------------------------------------------------------------------------------
[#   821] dur=6.06s → 5.25s | max_new=30
  REF: боюсь что эту идею нам придется с негодованием отвергнуть огорченно сообщил мужчина
  HYP: боюсь что эту идею нам придется с негодованием отвергнуть оберченно сообщил мужчинах а у
--

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[#   779] dur=7.56s → 6.14s | max_new=33
  REF: и возможно что это вызвано какой то причиной которую мы обнаружили слишком поздно
  HYP: и возможно что это вызвано какой-то причиной которую мы обнаружили слишком поздно как вы
------------------------------------------------------------------------------------------
[#   460] dur=7.89s → 7.33s | max_new=38
  REF: боксер опустил копыто и собака повизгивая уползла в сторону
  HYP: боксер опустил копыто и собака повизгивая уползла в сторону его шума на ладону с кем-ли
------------------------------------------------------------------------------------------
[#   483] dur=6.33s → 6.33s | max_new=34
  REF: и как только что-нибудь случится немедленно явимся на место происшествия
  HYP: и как только что-нибудь случится немедленно явимся на место происшествияю и в течение года вы можете увидеть себя так же не очень
------------------------------------------------------------------------------------------


Диаризация

In [None]:
import os
# SECURITY: never commit a real token in this cell. Prefer exporting HF_TOKEN
# in the environment before starting Jupyter. Note that this assignment
# overwrites any HF_TOKEN value that is already set in the environment.
os.environ["HF_TOKEN"] = "******"


In [None]:
import torch
from pyannote.audio import Pipeline
import os

# Token passed to Pipeline.from_pretrained below.
HF_TOKEN = os.environ.get("HF_TOKEN")  # or set it explicitly as a string

# NOTE(review): this rebinds the global name `pipeline`, which the batch
# evaluation cell imports from `transformers`. After this cell runs,
# `pipeline` is the pyannote diarization pipeline; later cells pass it as
# `diar_pipeline=pipeline`, so cell execution order matters.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    token=HF_TOKEN,
)

# Run diarization on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline.to(device)

print("Диаризация инициализирована на:", device)


In [None]:
def diarize_and_transcribe(
    y: np.ndarray,
    sr: int,
    global_wav_path: str,
    diar_pipeline,
    asr_pipe,
    max_tokens_fn=adaptive_max_new_tokens,
    min_seg_dur: float = 0.5,
):
    """
    Diarize a recording and transcribe every sufficiently long speaker turn.

    Parameters
    ----------
    y, sr
        Audio samples and their sampling rate. Turn times returned by the
        diarizer are converted to sample indices with `sr`, so `y` must hold
        the same audio as `global_wav_path`. The slicing treats `y` as 1-D.
    global_wav_path
        Path to the same audio on disk; this is what the diarizer receives.
    diar_pipeline
        Callable diarization pipeline. Its result must either be an
        annotation offering `itertracks(yield_label=True)` or expose one as
        `.speaker_diarization`.
    asr_pipe
        Callable ASR pipeline accepting {"array", "sampling_rate"} input and
        `generate_kwargs`, returning a mapping with a "text" entry.
    max_tokens_fn
        Function mapping a segment duration in seconds to `max_new_tokens`.
    min_seg_dur
        Turns shorter than this many seconds are skipped.

    Returns
    -------
    list of dict
        One entry per transcribed turn, sorted by start time, with keys
        "speaker", "start", "end" (seconds, as reported by the diarizer),
        "text", "duration" (seconds of audio actually transcribed) and
        "max_new_tokens".
    """
    out = diar_pipeline(global_wav_path)
    # Accept both result shapes: a wrapper exposing `.speaker_diarization`,
    # or the annotation itself.
    ann = getattr(out, "speaker_diarization", out)

    n_samples = len(y)
    min_samples = int(min_seg_dur * sr)
    segments = []

    for turn, _, speaker in ann.itertracks(yield_label=True):
        start_s = float(turn.start)
        end_s = float(turn.end)
        if end_s <= start_s or (end_s - start_s) < min_seg_dur:
            continue

        # Clamp to the real signal: a turn ending past the last sample would
        # otherwise report a duration (and get a token budget) longer than
        # the audio that is sliced, and a negative start index would wrap
        # around to the end of the array.
        start_idx = min(max(int(start_s * sr), 0), n_samples)
        end_idx = min(max(int(end_s * sr), 0), n_samples)
        seg = y[start_idx:end_idx]

        if seg.size < min_samples:
            continue

        # Duration of the audio actually handed to the ASR model.
        duration_seg = seg.size / sr
        max_new = max_tokens_fn(duration_seg)

        asr_out = asr_pipe(
            {"array": seg, "sampling_rate": sr},
            generate_kwargs=dict(max_new_tokens=max_new),
        )
        text = asr_out["text"].strip()

        segments.append(
            {
                "speaker": str(speaker),
                "start": start_s,
                "end": end_s,
                "text": text,
                "duration": duration_seg,
                "max_new_tokens": max_new,
            }
        )

    segments.sort(key=lambda s: s["start"])
    return segments


In [None]:
# Repeat the validation spot-check on the same sampled items (`idxs` from the
# batch-evaluation cell), then additionally diarize each clip and transcribe
# it turn by turn.
print(f"Split: val | items: {len(idxs)} | greedy strict v2 | VAD top_db=40 + diarization")
print("-"*90)
for i, idx in enumerate(idxs):
    ex = val_ds[idx]
    # Prefer the normalized transcript; fall back to the raw text.
    ref = ex.get("sentence") or ex.get("text") or ""
    y, sr = load_audio_field(ex["audio"])
    dur = len(y) / sr  # duration before resampling and trimming

    y = resample_to(y, sr, TARGET_SR)
    y = strong_vad_trim(y, TARGET_SR, top_db=40)
    dur_trim = len(y) / TARGET_SR  # duration after trimming

    # Same temp-file names as the batch-evaluation cell, so those files are overwritten.
    wav_path = save_tmp_wav(y, TARGET_SR, fname=f"_vadv2_{i}.wav")

    # Whole-clip transcription, identical to the batch-evaluation cell.
    max_new = adaptive_max_new_tokens(dur_trim, scale=4.2, bias=8, hard_min=10, hard_max=64)
    out = pipe(wav_path, generate_kwargs=dict(max_new_tokens=max_new))
    hyp = out["text"].strip()

    print(f"[# {idx:5d}] dur={dur:.2f}s → {dur_trim:.2f}s | max_new={max_new}")
    print("  REF:", ref[:220])
    print("  HYP:", hyp[:220] + ("…" if len(hyp) > 220 else ""))


    # Per-speaker transcription; `pipeline` must be the pyannote pipeline
    # created in the diarization-initialisation cell.
    diar_segments = diarize_and_transcribe(
        y,
        TARGET_SR,
        wav_path,
        diar_pipeline=pipeline,
        asr_pipe=pipe,
        max_tokens_fn=adaptive_max_new_tokens,
        min_seg_dur=0.7,
    )
    if diar_segments:
        print("  Diarized transcript:")
        for seg in diar_segments:
            # Segment text is printed truncated to 150 characters.
            print(
                f'    [spk {seg["speaker"]}] '
                f'{seg["start"]:6.2f}-{seg["end"]:6.2f}s | '
                f'max_new={seg["max_new_tokens"]:2d} | '
                f'{seg["text"][:150]}{"…" if len(seg["text"]) > 150 else ""}'
            )
    else:
        print("  Diarized transcript: <empty / only noise>")


    print("-"*90)


Свой файл

In [None]:
import os
import numpy as np
import librosa

# Transcribe and diarize the first 20 seconds of a user-supplied recording.
# The path is relative to the notebook's working directory.
AUDIO_FILE = "videoplayback.m4a"

if not os.path.exists(AUDIO_FILE):
    raise FileNotFoundError(f"Не найден {AUDIO_FILE} рядом с ноутбуком. Текущая папка: {os.getcwd()}")

# Load at the native sampling rate with all channels, then downmix by
# averaging over axis 0 (load_audio_field averages axis 1 for soundfile data).
y, sr = librosa.load(AUDIO_FILE, sr=None, mono=False)
if y.ndim > 1:
    y = y.mean(axis=0)
y = y.astype(np.float32)

# Keep at most the first 20 seconds. The whole file is decoded before it is cut.
max_samples = int(20.0 * sr)
if len(y) > max_samples:
    y = y[:max_samples]
dur_raw = len(y) / sr

y16 = resample_to(y, sr, TARGET_SR)
dur16 = len(y16) / TARGET_SR

# No trimming is applied here; the resampled signal is written as-is.
wav_path = save_tmp_wav(y16, TARGET_SR, fname="_film_20s_no_vad.wav")

# NOTE: the message below always says "truncated to 20 s", even when the
# recording is shorter and nothing was cut.
print(f"Файл: {AUDIO_FILE}")
print(f"  исходный sr={sr}, dur≈{dur_raw:.2f}с (обрезано до 20с)")
print(f"  после ресемплинга 16k: dur≈{dur16:.2f}с")
print(f"  временный WAV для анализа: {wav_path}")

# Whole-clip transcription; the cap is raised to 160 tokens because the clip
# can be longer than the validation utterances.
max_new_full = adaptive_max_new_tokens(
    dur16,
    scale=4.2,
    bias=8,
    hard_min=10,
    hard_max=160,
)
asr_full = pipe(wav_path, generate_kwargs=dict(max_new_tokens=max_new_full))
full_text = asr_full["text"].strip()

print("\n=== Общая расшифровка (без диаризации) ===")
print(full_text)

# Per-speaker transcription. Each segment gets its budget from
# adaptive_max_new_tokens with default arguments (hard_max=64).
# `pipeline` must be the pyannote pipeline created in the
# diarization-initialisation cell.
diar_segments = diarize_and_transcribe(
    y16,
    TARGET_SR,
    wav_path,
    diar_pipeline=pipeline,
    asr_pipe=pipe, 
    max_tokens_fn=adaptive_max_new_tokens,
    min_seg_dur=0.3,
)

print("\n=== Диаризация + ASR по спикерам (без внешнего VAD, min_seg_dur=0.3) ===")
if not diar_segments:
    print("<нет голосовых сегментов / только шум>")
else:
    for seg in diar_segments:
        print(
            f'[spk {seg["speaker"]}] '
            f'{seg["start"]:6.2f}-{seg["end"]:6.2f}s '
            f'(dur={seg["duration"]:.2f}s, max_new={seg["max_new_tokens"]})'
        )
        print("   ", seg["text"])


Экспорт

In [None]:
import os
import json

# Export the most recent `diar_segments` as a JSON transcript.
if "diar_segments" not in globals() or not diar_segments:
    raise RuntimeError("diar_segments пуст или не определён — сначала запусти ячейку с диаризацией.")

# Metadata is read through globals() with fallbacks, so the export still
# works when some of the earlier cells were not executed.
audio_file = globals().get("AUDIO_FILE", "unknown")
sample_rate = globals().get("TARGET_SR", 16000)
duration_sec = globals().get("dur16", None)
asr_model_id = globals().get("MODEL_ID", "whisper_lora")
diar_model_id = "pyannote/speaker-diarization-3.1"

# Raw diarizer labels are mapped to compact ids S0, S1, ... in order of first
# appearance; the raw label is kept in every segment as "speaker_raw".
speaker_map = {}
next_spk_idx = 0

segments_json = []

for i, seg in enumerate(diar_segments, start=1):
    raw_spk = str(seg["speaker"])
    if raw_spk not in speaker_map:
        speaker_map[raw_spk] = f"S{next_spk_idx}"
        next_spk_idx += 1
    spk_id = speaker_map[raw_spk]

    seg_id = f"seg_{i:04d}"

    # Times are rounded to two decimals for the export.
    start_t = round(float(seg["start"]), 2)
    end_t   = round(float(seg["end"]), 2)

    segments_json.append(
        {
            "id": seg_id,
            "start": start_t,
            "end": end_t,
            "speaker": spk_id,
            "speaker_raw": raw_spk,
            "text": seg["text"],
            # Placeholders: nothing in this cell fills these fields.
            "conf": None,
            "n_best": [],
            "uncertain_spans": [],
        }
    )

export_obj = {
    "schema_version": 1,
    "language": "ru",
    "meta": {
        "source": audio_file,
        "sample_rate": sample_rate,
        "duration": duration_sec,
        "num_speakers": len(speaker_map),
        "speaker_map": speaker_map,
        "asr_model": asr_model_id,
        "diarization_model": diar_model_id,
    },
    "segments": segments_json,
}


base_out_dir = globals().get("OUTPUT_DIR", os.getcwd())
os.makedirs(base_out_dir, exist_ok=True)

# Derive the file name from the source recording so that exporting another
# AUDIO_FILE does not overwrite an earlier export. For "videoplayback.m4a",
# and when AUDIO_FILE is not defined, the name is the previously hard-coded
# "videoplayback_20s_diarized.json".
source_stem = (
    os.path.splitext(os.path.basename(str(audio_file)))[0]
    if "AUDIO_FILE" in globals()
    else ""
)
json_path = os.path.join(base_out_dir, f"{source_stem or 'videoplayback'}_20s_diarized.json")

with open(json_path, "w", encoding="utf-8") as f:
    json.dump(export_obj, f, ensure_ascii=False, indent=2)

print(f" Экспортирован диаризованный транскрипт в: {json_path}\n")
print("Пример (первые сегменты):")
# Preview only the first three segments to keep the cell output short.
print(json.dumps({**export_obj, "segments": export_obj["segments"][:3]}, ensure_ascii=False, indent=2))
