In [1]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pystoi import stoi
from pesq import pesq
from scipy.signal import stft
from tabulate import tabulate


In [2]:
def compute_metrics(clean, denoised, fs):
    """Score a denoised signal against its clean reference.

    Parameters
    ----------
    clean : array
        Reference signal.
    denoised : array
        Signal under evaluation. It is subtracted element-wise from
        ``clean``, so both must be broadcast-compatible (the caller in this
        notebook trims them to the same length first).
    fs : int
        Sampling rate handed to STOI, to the resampler and to the STFT.

    Returns
    -------
    tuple
        ``(stoi_score, pesq_score, lsd, snr, mse)``, in that order.
    """
    # --- STOI: classic (non-extended) variant, evaluated at the native rate.
    stoi_score = stoi(clean, denoised, fs, extended=False)

    # --- PESQ: both signals are brought to 16 kHz and scored in wideband mode.
    pesq_sr = 16000
    clean_16k, denoised_16k = (
        librosa.resample(signal, orig_sr=fs, target_sr=pesq_sr)
        for signal in (clean, denoised)
    )
    pesq_score = pesq(pesq_sr, clean_16k, denoised_16k, 'wb')

    # --- LSD: distance between log-magnitude spectrograms.
    # Only the complex STFT matrix (third return value) is needed.
    spec_clean = stft(clean, fs=fs)[2]
    spec_denoised = stft(denoised, fs=fs)[2]
    eps = 1e-10  # keeps log() finite on empty bins
    log_gap = np.log(np.abs(spec_clean) + eps) - np.log(np.abs(spec_denoised) + eps)
    # Root-mean-square along axis 0, then averaged over what remains.
    rms_along_axis0 = np.sqrt(np.mean(log_gap**2, axis=0))
    lsd = np.mean(rms_along_axis0)

    # --- SNR (dB) and MSE both derive from the same residual.
    residual = clean - denoised
    signal_energy = np.sum(clean**2)
    residual_energy = np.sum(residual**2)
    snr = 10 * np.log10(signal_energy / residual_energy)
    mse = np.mean(residual**2)

    return stoi_score, pesq_score, lsd, snr, mse

def evaluate_denoising(clean_pth, noisy_pth, denoised_pths, sample_rate):
    """Print a table comparing several denoised files against one clean reference.

    Parameters
    ----------
    clean_pth : str
        Path of the clean reference recording.
    noisy_pth : str
        Path of the noisy recording. Kept in the signature for backward
        compatibility; no metric uses the noisy signal, so the file is no
        longer decoded.
    denoised_pths : iterable of str
        Paths of the denoised files to score, one table row per path.
    sample_rate : int
        Rate passed to ``librosa.load`` for every file and forwarded to
        ``compute_metrics``.

    Returns
    -------
    None
        The table is printed; nothing is returned.
    """
    clean, _ = librosa.load(clean_pth, sr=sample_rate)

    results = []

    for denoised_pth in denoised_pths:
        denoised, _ = librosa.load(denoised_pth, sr=sample_rate)

        # The sample-wise metrics need equal lengths: cut both to the shorter one.
        min_length = min(len(clean), len(denoised))
        clean_trimmed = clean[:min_length]
        denoised_trimmed = denoised[:min_length]

        stoi_score, pesq_score, lsd_score, snr_score, mse_score = compute_metrics(
            clean_trimmed, denoised_trimmed, sample_rate
        )

        # MSE goes out in scientific notation: with three fixed decimals the
        # column collapsed to 0.00x / 0.000 and could not rank the systems.
        results.append([
            denoised_pth,
            f"{stoi_score:.3f}",
            f"{pesq_score:.3f}",
            f"{lsd_score:.3f}",
            f"{snr_score:.3f}",
            f"{mse_score:.3e}",
        ])

    # Arrows mark the direction in which each metric improves.
    headers = ["Denoised File", "STOI ↑", "PESQ ↑", "LSD ↓", "SNR (dB) ↑", "MSE ↓"]

    print(tabulate(results, headers=headers, tablefmt="pretty"))


In [3]:
# Utterance p232_014: clean reference and its noisy counterpart.
clean_pth = "ED-Noisy-Speech-Datashare/clean_testset_wav/p232_014.wav"
noisy_pth = "ED-Noisy-Speech-Datashare/noisy_testset_wav/p232_014.wav"

# Denoised renditions to compare, one path per line.
# NOTE(review): the odd spellings ("xVLenght", "deniosed", "denioised") are
# presumably the on-disk names - do not "fix" them without renaming the files.
denoised_pths = [
    "xVLenght/Output/static_bucketing_denoised.wav",
    "xVLenght/Output/dynamic_bucketing_deniosed.wav",
    "xVLenght/Output/PTO-OT_denioised.wav",
    ".Project/Output/ModelA_dynamic_p232_014.wav",
    ".Project/Output/ModelA_static_p232_014.wav",
    ".Project/Output/ModelA_pto_p232_014.wav",
    "separated/htdemucs/p232_014/vocals.wav",
]

# Rate at which every file is loaded for scoring.
sample_rate = 48000

evaluate_denoising(
    clean_pth=clean_pth,
    noisy_pth=noisy_pth,
    denoised_pths=denoised_pths,
    sample_rate=sample_rate,
)

# Reference article (ScienceDirect, reached through an institutional proxy):
#https://www-sciencedirect-com.ejournals.um.edu.mt/science/article/pii/S0167639319304686


+------------------------------------------------+--------+--------+-------+------------+-------+
|                 Denoised File                  | STOI ↑ | PESQ ↑ | LSD ↓ | SNR (dB) ↑ | MSE ↓ |
+------------------------------------------------+--------+--------+-------+------------+-------+
| xVLenght/Output/static_bucketing_denoised.wav  | 0.831  | 1.402  | 3.609 |   0.570    | 0.004 |
| xVLenght/Output/dynamic_bucketing_deniosed.wav | 0.843  | 1.614  | 2.623 |   1.714    | 0.003 |
|      xVLenght/Output/PTO-OT_denioised.wav      | 0.801  | 1.579  | 3.331 |   1.558    | 0.003 |
|  .Project/Output/ModelA_dynamic_p232_014.wav   | 0.761  | 1.166  | 2.446 |   0.397    | 0.004 |
|   .Project/Output/ModelA_static_p232_014.wav   | 0.768  | 1.164  | 2.784 |   0.299    | 0.004 |
|    .Project/Output/ModelA_pto_p232_014.wav     | 0.766  | 1.180  | 2.988 |   0.245    | 0.004 |
|     separated/htdemucs/p232_014/vocals.wav     | 0.926  | 1.910  | 3.386 |   9.793    | 0.000 |
+-------------------