In [37]:
import os
import numpy as np
import torch
import torchaudio
import subprocess
import pandas as pd
from torchmetrics.audio import SignalDistortionRatio, ScaleInvariantSignalDistortionRatio, PerceptualEvaluationSpeechQuality

In [38]:
import torch
from torchmetrics.audio import PerceptualEvaluationSpeechQuality

# Сгенерируем "чистый" и "noisy"
fs = 16000
clean = torch.randn(1, fs*1)     # 1 секунда, batch=1
noisy = clean + 0.3*torch.randn(1, fs*1)  # немного испорченный

pesq_metric = PerceptualEvaluationSpeechQuality(fs, mode='wb')
val = pesq_metric(noisy, clean)
print('PESQ:', val.item())


PESQ: 4.197354316711426


In [39]:
def mixer(original, noise, snr_db):
    L = len(original)
    if len(noise) < L:
        repeats = int(np.ceil(L / len(noise)))
        noise = np.tile(noise, repeats)
    noise = noise[:L]
    rms_signal = np.sqrt((original**2).mean())
    rms_noise = np.sqrt((noise**2).mean())
    snr_linear = 10 ** (snr_db / 20)
    desired_rms_noise = rms_signal / snr_linear
    noise = noise * (desired_rms_noise / (rms_noise + 1e-8))
    mix = original + noise
    return mix.astype('float32')

def load_audio(path, sr=16000):
    audio, fs = torchaudio.load(path)
    if fs != sr:
        audio = torchaudio.functional.resample(audio, fs, sr)
    return audio[0].numpy(), sr

def compute_metrics(clean, mixed, sr=16000, eval_duration=10):
    # eval_duration в секундах; берём равные короткие фрагменты
    max_samples = int(eval_duration * sr)
    L = min(len(clean), len(mixed), max_samples)
    clean_s = clean[:L]
    mixed_s = mixed[:L]
    clean_torch = torch.tensor(clean_s, dtype=torch.float32).unsqueeze(0)
    mixed_torch = torch.tensor(mixed_s, dtype=torch.float32).unsqueeze(0)
    sdr = SignalDistortionRatio()(mixed_torch, clean_torch).item()
    sisdr = ScaleInvariantSignalDistortionRatio()(mixed_torch, clean_torch).item()
    pesq_metric = PerceptualEvaluationSpeechQuality(sr, mode='wb')
    pesq = pesq_metric(mixed_torch, clean_torch).item()
    return sdr, sisdr, pesq

# Заглушки для NISQA и DNSMOS
#def run_nisqa(audio_file):
#    return [None] * 6
def get_dnsmos(audio_file):
    return [None] * 3


def run_nisqa(audio_file, model_dir='NISQA/nisqa', out_csv='tmp.csv'):
    # Например: python -m nisqa.NISQA MOS_pred --audio_dir=audio/ --data_path=wavs.csv ...
    # wavs.csv имеет колонки: 'file_name'
    # Ваша задача: создать временный csv, скормить cli и считать результат.
    import csv, os
    tmp_csv = 'nisqa_file.csv'
    with open(tmp_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['file_name'])
        writer.writerow([audio_file])
    cmd = ['python', '-m', 'nisqa.NISQA', 
           'MOS_pred', 
           '--audio_dir', os.path.dirname(audio_file), 
           '--data_path', tmp_csv,
           '--pretrained_model_dir', model_dir,
           '--num_workers', '1']
    subprocess.run(cmd)
    # прочитать nisqa_file_output.csv (создаёт nisqa_<timestamp>_MOS_output.csv)
    files = [f for f in os.listdir('.') if f.endswith('_MOS_output.csv')]
    print(f'len(files) = {len(files)}')
    df = pd.read_csv(sorted(files)[-1])
    # Колонки: mos_pred, mos_pred_full, ...
    values = [df.iloc[0][k] for k in ['mos_pred', 'mos_pred_full', 'mos_pred_d', 'mos_pred_mos_sig', 'mos_pred_mos_bak', 'mos_pred_mos_ovr']]
    return values

In [40]:
# === Параметры ===
clean_wav = "../../sounds/voice_audiobook.mp3"
noise_wav = "../../sounds/noise_for_voice.wav"
outdir = "mixed_audio/"
os.makedirs(outdir, exist_ok=True)
SNR_list = [-5, 0, 5, 10]

In [41]:
# === Загрузка аудио ===
assert os.path.exists(clean_wav), f"Файл не найден: {clean_wav}"
assert os.path.exists(noise_wav), f"Файл не найден: {noise_wav}"

clean, sr = load_audio(clean_wav)
print(f'sr1 = {sr}')
assert sr!=0
noise, sr = load_audio(noise_wav, sr)
print(f'sr2 = {sr}')

sr1 = 16000
sr2 = 16000


In [42]:
rows = []
for snr in SNR_list:
    print(f'snr = {snr}')
    try:
        mix = mixer(clean, noise, snr)
        print('1')
        out_path = os.path.join(outdir, f"mix_snr_{snr}db.wav")
        print('2')
        torchaudio.save(out_path, torch.tensor(mix).unsqueeze(0), sr)
        print('3')
        sdr, sisdr, pesq = compute_metrics(clean, mix, sr)
        print('4')
        nisqa_metrics = run_nisqa(out_path)
        print('5')
        dnsmos_metrics = [None] * 3
        print('6')
        # MOS вручную/краудсорс — поставить 0/None если не оцениваете
        print(f"Прослушайте файл {out_path} (SNR={snr}dB) и поставьте MOS (от 1 до 5):")
        mos_manual = None
    except Exception as e:
        print(f"Ошибка на SNR={snr}: {e}")
        
    try:
        row = {
            "файл": out_path,
            "SNR": snr,
            "SDR": sdr,
            "SI-SDR": sisdr,
            "PESQ": pesq,
            "NISQA_mos_pred": nisqa_metrics[0],
            "NISQA_mos_pred_full": nisqa_metrics[1],
            "NISQA_mos_pred_d": nisqa_metrics[2],
            "NISQA_mos_pred_mos_sig": nisqa_metrics[3],
            "NISQA_mos_pred_mos_bak": nisqa_metrics[4],
            "NISQA_mos_pred_mos_ovr": nisqa_metrics[5],
            "DNSMOS_OVRL": dnsmos_metrics[0],
            "DNSMOS_SIG": dnsmos_metrics[1],
            "DNSMOS_BAK": dnsmos_metrics[2],
            "MOS": mos_manual
        }
        rows.append(row)
        print(f"Row for SNR={snr} added.")
    except Exception as e:
        print(f"Ошибка на SNR={snr}: {e}")



snr = -5
1
2
3
4
len(files) = 0
Ошибка на SNR=-5: list index out of range
Ошибка на SNR=-5: name 'nisqa_metrics' is not defined
snr = 0
1
2
3
4
len(files) = 0
Ошибка на SNR=0: list index out of range
Ошибка на SNR=0: name 'nisqa_metrics' is not defined
snr = 5
1
2
3
4
len(files) = 0
Ошибка на SNR=5: list index out of range
Ошибка на SNR=5: name 'nisqa_metrics' is not defined
snr = 10
1
2
3
4
len(files) = 0
Ошибка на SNR=10: list index out of range
Ошибка на SNR=10: name 'nisqa_metrics' is not defined


In [43]:
df = pd.DataFrame(rows)
df.to_csv('results.csv', index=False)
print("\nРезультаты эксперимента:")
print(df.to_string(index=False))


Результаты эксперимента:
Empty DataFrame
Columns: []
Index: []


In [44]:
import numpy as np

def mixer(original, noise, snr_db):
    """
    Смешивает оригинальный сигнал с шумом согласно заданному SNR (в дБ)
    original: np.ndarray (длина T)
    noise: np.ndarray (длина >= T)
    snr_db: float, SNR в децибелах
    Возвращает смешанный сигнал длины T.
    """
    # Обрезать шум до длины original или повторить noise
    if len(noise) < len(original):
        # Повторяем noise, если он короче
        repeats = int(np.ceil(len(original) / len(noise)))
        noise = np.tile(noise, repeats)
    noise = noise[:len(original)]
    
    # RMS (Root Mean Square) амплитуда
    rms_signal = np.sqrt(np.mean(original**2))
    rms_noise = np.sqrt(np.mean(noise**2))
    
    # Считаем по формуле, вывод: K = rms_signal / (10**(snr_db/20) * rms_noise)
    desired_rms_noise = rms_signal / (10**(snr_db/20))
    noise = noise * (desired_rms_noise / (rms_noise + 1e-12))
    mix = original + noise
    return mix.astype(np.float32)


In [45]:
import torchaudio

voice_path = 'corrected_white_noise.wav'  # Путь к чистому голосу
noise_path = 'noise_pink.wav'  # Путь к шуму (шум из DEMAND, UrbanSound8K и т.д.)

voice, sr = torchaudio.load(voice_path)
noise, sr_noise = torchaudio.load(noise_path)
voice = voice[0].numpy()
noise = noise[0].numpy()
assert sr == sr_noise

snr_list = [-5, 0, 5, 10]
mixed_audios = {}
for snr in snr_list:
    mixed = mixer(voice, noise, snr)
    torchaudio.save(f"mix_{snr}dB.wav", torch.from_numpy(mixed).unsqueeze(0), sr)
    mixed_audios[snr] = mixed


LibsndfileError: Error opening 'corrected_white_noise.wav': System error.

In [None]:
import torch
from torchmetrics.audio import SignalDistortionRatio, ScaleInvariantSignalDistortionRatio, PerceptualEvaluationSpeechQuality

sdr_metric = SignalDistortionRatio()
sisdr_metric = ScaleInvariantSignalDistortionRatio()
# Для PESQ нужна поддержка только Wideband (16кГц) и Narrowband (8кГц)
# Скачайте голос и шум подходящей частоты дискретизации!

pesq_metric = PerceptualEvaluationSpeechQuality(16000, mode='wb')

results = []
for snr in snr_list:
    mixture = mixed_audios[snr]
    # Обрезаем на всякий случай длину оригинала
    mixture = mixture[:len(voice)]
    mixture_tensor = torch.from_numpy(mixture)
    voice_tensor = torch.from_numpy(voice)
    sdr = sdr_metric(mixture_tensor, voice_tensor).item()
    sisdr = sisdr_metric(mixture_tensor, voice_tensor).item()
    pesq = pesq_metric(mixture_tensor, voice_tensor).item()
    results.append({'snr': snr, 'SDR': sdr, 'SI-SDR': sisdr, 'PESQ': pesq})


In [None]:
# Во-первых, сохраните файлы для оценки: mix_-5dB.wav и т.д.
# Затем используйте NISQA CLI или функцию Python
# Пример используя os.system (CLI):
import os

for snr in snr_list:
    wave = f"mix_{snr}dB.wav"
    cmd = f"python nisqa/run_nisqa.py --mode predict_file --pretrained_model nisqa/NISQA_MODEL --input_file {wave} --output_csv nisqa_out_{snr}dB.csv"
    os.system(cmd)
