In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import librosa
import os
from pesq import pesq
from pystoi import stoi


def compute_si_snr(preds, target, eps=1e-8):
    """Return the mean scale-invariant signal-to-noise ratio in dB.

    Both inputs are zero-meaned along the last dimension, ``preds`` is
    projected onto ``target`` to obtain the scaled target component, and the
    energy ratio between that component and the residual is converted to
    decibels. The per-row values are then averaged into one scalar.

    Args:
        preds: Estimated signals; the metric is computed along ``dim=-1``.
        target: Reference signals, broadcast-compatible with ``preds``.
        eps: Small constant guarding the divisions and the logarithm.

    Returns:
        A 0-dim tensor with the SI-SNR averaged over all leading dimensions.
    """
    preds = preds - preds.mean(dim=-1, keepdim=True)
    target = target - target.mean(dim=-1, keepdim=True)

    target_energy = torch.sum(target ** 2, dim=-1, keepdim=True)
    scale = torch.sum(preds * target, dim=-1, keepdim=True) / (target_energy + eps)
    s_target = scale * target
    e_noise = preds - s_target

    signal_power = torch.sum(s_target ** 2, dim=-1)
    noise_power = torch.sum(e_noise ** 2, dim=-1)
    # eps is added to the numerator too: a silent or constant prediction has
    # zero projected energy, and log10(0) = -inf would poison the batch mean.
    si_snr = 10 * torch.log10((signal_power + eps) / (noise_power + eps))
    return si_snr.mean()

def compute_snr(preds, target):
    """Return the global signal-to-noise ratio of ``preds`` in dB.

    The energies are summed over every element of the inputs, so a single
    value is produced for the whole batch.

    Args:
        preds: Estimated tensor.
        target: Reference tensor, broadcast-compatible with ``preds``.

    Returns:
        The SNR as a Python float.
    """
    residual = target - preds
    signal_energy = target.pow(2).sum()
    # The small constant keeps the ratio finite for a perfect reconstruction.
    residual_energy = residual.pow(2).sum() + 1e-8
    ratio_db = 10 * torch.log10(signal_energy / residual_energy)
    return ratio_db.item()

def stft_to_tensor(wav_batch, n_fft=512, hop_length=128):
    """Convert a batch of waveforms into stacked magnitude spectrograms.

    Each waveform is transformed with ``torch.stft`` using a Hann window,
    reduced to its magnitude and transposed so frames come first.

    Args:
        wav_batch: Iterable of 1-D waveforms (NumPy arrays, sequences or
            tensors). All waveforms must yield the same number of frames,
            because the results are combined with ``torch.stack``.
        n_fft: FFT size; also the length of the Hann window.
        hop_length: Hop between successive frames, in samples.

    Returns:
        A float32 tensor of shape ``(batch, frames, n_fft // 2 + 1)``.
    """
    specs = []
    window = None
    for wav in wav_batch:
        # as_tensor accepts arrays and tensors alike; torch.tensor() on an
        # existing tensor emits a copy-construct UserWarning.
        wav_tensor = torch.as_tensor(wav, dtype=torch.float32)
        # torch.stft needs the window on the same device as the signal.
        if window is None or window.device != wav_tensor.device:
            window = torch.hann_window(n_fft, device=wav_tensor.device)
        spec = torch.stft(
            wav_tensor,
            n_fft=n_fft,
            hop_length=hop_length,
            window=window,
            return_complex=True,
        )
        # (freq, frames) complex -> (frames, freq) magnitude.
        specs.append(spec.abs().T)
    return torch.stack(specs).to(torch.float32)

def pad_sequences(sequences):
    """Zero-pad sequences along dim 0 to a common length and stack them.

    Args:
        sequences: Iterable of tensors (or a tensor, iterated along dim 0)
            whose shapes agree in every dimension except the first.

    Returns:
        A tensor of shape ``(num_sequences, max_len, *trailing_dims)``.

    Raises:
        ValueError: If ``sequences`` is empty.
    """
    sequences = list(sequences)
    if not sequences:
        raise ValueError("pad_sequences() requires at least one sequence")

    max_len = max(seq.shape[0] for seq in sequences)
    padded = []
    for seq in sequences:
        missing = max_len - seq.shape[0]
        if missing:
            # new_zeros inherits dtype and device from seq, so padding neither
            # promotes integer/half inputs nor breaks for non-CPU tensors.
            filler = seq.new_zeros((missing, *seq.shape[1:]))
            seq = torch.cat([seq, filler], dim=0)
        padded.append(seq)
    return torch.stack(padded)

def spec_to_wav(spec, n_fft=512, hop_length=128):
    """Invert a magnitude spectrogram to a waveform using zero phase.

    Args:
        spec: 2-D tensor that is transposed before inversion and converted
            with ``.numpy()`` (assumes a CPU tensor laid out as
            (frames, freq_bins) - TODO confirm against callers).
        n_fft: Passed to ``librosa.istft`` as ``win_length``.
        hop_length: Hop between frames, in samples.

    Returns:
        The waveform returned by ``librosa.istft``.
    """
    magnitude = spec.T.numpy()
    # No phase information is available, so every bin gets phase 0:
    # exp(1j * 0) == 1 and the complex spectrogram is purely real.
    zero_phase = np.zeros_like(magnitude)
    unit_phasor = np.exp(1j * zero_phase)
    return librosa.istft(
        magnitude * unit_phasor,
        hop_length=hop_length,
        win_length=n_fft,
        window='hann',
    )

def load_dummy_data(n=20):
    """Build a synthetic (noisy, clean) pair of spectrogram batches.

    Every clean example is the same sine wave (25 periods over 16000
    samples); each noisy example adds independent Gaussian noise with
    standard deviation 0.1 drawn from NumPy's global RNG.

    Args:
        n: Number of examples to generate.

    Returns:
        A tuple ``(noisy_specs, clean_specs)`` of padded spectrogram batches.
    """
    num_samples = 16000
    tone = np.sin(np.linspace(0, 50*np.pi, num_samples))

    clean = []
    noisy = []
    for _ in range(n):
        clean.append(tone.copy())
        noisy.append(tone + 0.1*np.random.randn(num_samples))

    clean_specs = stft_to_tensor(clean)
    noisy_specs = stft_to_tensor(noisy)
    return pad_sequences(noisy_specs), pad_sequences(clean_specs)

class MP_SENet_GRU(nn.Module):
    """Two stacked GRU blocks followed by a linear projection.

    The input is processed by ``gru_freq``, then ``gru_time``, with dropout
    after each, and finally mapped back to ``input_size`` features. Both GRUs
    use ``batch_first=True``, so they run along dim 1 of the input.

    Args:
        input_size: Feature width of the input and of the output.
        hidden_size: Hidden width of each GRU direction.
        num_layers: Number of stacked layers inside each GRU.
        dropout: Probability for the dropout layers and, when
            ``num_layers > 1``, for the GRUs' inter-layer dropout.
        bidirectional: Whether both GRUs are bidirectional.
    """

    def __init__(self, input_size=257, hidden_size=128, num_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()
        directions = 2 if bidirectional else 1
        # nn.GRU applies dropout only between stacked layers. With a single
        # layer it is never used and PyTorch warns about it, so pass 0 there.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru_freq = nn.GRU(input_size, hidden_size, num_layers=num_layers,
                               dropout=gru_dropout, batch_first=True, bidirectional=bidirectional)
        self.gru_time = nn.GRU(hidden_size * directions, hidden_size,
                               num_layers=num_layers, dropout=gru_dropout, batch_first=True,
                               bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size * directions, input_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to the same shape."""
        out_freq, _ = self.gru_freq(x)
        out_freq = self.dropout(out_freq)
        out_time, _ = self.gru_time(out_freq)
        out_time = self.dropout(out_time)
        return self.fc(out_time)

def train(model, noisy, clean, epochs=3, lr=1e-3):
    """Fit ``model`` to map ``noisy`` onto ``clean`` with full-batch Adam.

    Every epoch is a single optimisation step on the whole batch using the
    mean-squared error; the loss is printed after each step.

    Args:
        model: Module to optimise; switched to training mode.
        noisy: Input batch fed to the model.
        clean: Target batch compared against the model output.
        epochs: Number of optimisation steps.
        lr: Adam learning rate.

    Returns:
        The same ``model`` instance, updated in place.
    """
    model.train()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch_no in range(1, epochs + 1):
        optimizer.zero_grad()
        loss = criterion(model(noisy), clean)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch_no}: loss = {loss.item():.4f}")

    return model

def run_mc_dropout(model, x, T=10):
    """Estimate predictive mean and spread with Monte-Carlo dropout.

    The model is put into training mode so that dropout stays active, run
    ``T`` times on the same input, and the outputs are summarised.

    Args:
        model: Module containing dropout layers.
        x: Input batch passed unchanged to every forward pass.
        T: Number of stochastic forward passes. With ``T == 1`` the sample
            standard deviation is undefined and comes back as NaN.

    Returns:
        A tuple ``(mean, std)`` computed across the ``T`` passes.
    """
    # Remember each module's mode so the caller's train/eval state survives.
    previous_modes = [(module, module.training) for module in model.modules()]
    model.train()
    try:
        # Sampling needs no gradients; skipping autograd saves T graphs.
        with torch.no_grad():
            preds = torch.stack([model(x) for _ in range(T)], dim=0)
    finally:
        for module, was_training in previous_modes:
            module.training = was_training
    mean = preds.mean(0)
    std = preds.std(0)
    return mean, std

def random_search(noisy, clean, n_trials=5):
    """Randomly sample model configurations and keep the best by SI-SNR.

    For every trial a hidden size, layer count and dropout rate are drawn
    from NumPy's global RNG, a model is trained for three epochs, and it is
    scored with SI-SNR on the same data it was trained on.

    Args:
        noisy: Input batch; its last dimension defines the model input size.
        clean: Target batch matching ``noisy``.
        n_trials: Number of configurations to try (at least 1).

    Returns:
        The trained model with the highest SI-SNR, in eval mode.

    Raises:
        ValueError: If ``n_trials`` is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    # Use the GPU when there is one, otherwise stay on the inputs' device
    # instead of crashing on CPU-only machines.
    device = torch.device("cuda") if torch.cuda.is_available() else noisy.device
    # The data does not change between trials, so transfer it once.
    noisy_dev, clean_dev = noisy.to(device), clean.to(device)

    best_snr = -float('inf')
    best_model = None
    for i in range(n_trials):
        config = {
            "hidden_size": int(np.random.choice([64, 128, 256])),
            "num_layers": int(np.random.choice([1, 2])),
            "dropout": float(np.random.uniform(0.1, 0.5)),
            "bidirectional": True
        }

        print(f"\nTrial {i+1}: {config}")
        model = MP_SENet_GRU(input_size=noisy.shape[-1], **config)
        model.to(device)
        model = train(model, noisy_dev, clean_dev, epochs=3)
        model.eval()
        with torch.no_grad():
            snr = compute_si_snr(model(noisy_dev), clean_dev).item()
        print(f"SI-SNR = {snr:.2f} dB")
        # Accept the first model unconditionally: a NaN score compares False
        # against everything and would otherwise leave best_model as None.
        if best_model is None or snr > best_snr:
            best_snr = snr
            best_model = model
    best_model.eval()
    return best_model

def evaluate_model(model, noisy, clean, n_fft=512, hop_length=128):
    """Print and return SI-SNR, SNR, PESQ and STOI for ``model``.

    SI-SNR and SNR are computed on the tensors produced by the model; PESQ
    and STOI are computed per example on waveforms rebuilt with
    ``spec_to_wav`` and then averaged.

    Args:
        model: Module to evaluate; switched to eval mode.
        noisy: Input batch, on the same device as ``model``.
        clean: Reference batch matching ``noisy``.
        n_fft: Forwarded to ``spec_to_wav``.
        hop_length: Forwarded to ``spec_to_wav``.

    Returns:
        A dict with keys ``"si_snr"``, ``"snr"``, ``"pesq"`` and ``"stoi"``.
        A perceptual mean is NaN when no example could be scored.
    """
    sample_rate = 16000

    model.eval()
    clean_cpu = clean.cpu()
    with torch.no_grad():
        denoised = model(noisy).cpu()
    si_snr_val = compute_si_snr(denoised, clean_cpu).item()
    snr_val = compute_snr(denoised, clean_cpu)

    pesq_scores = []
    stoi_scores = []
    for clean_item, denoised_item in zip(clean_cpu, denoised):
        clean_wav = spec_to_wav(clean_item, n_fft, hop_length)
        denoised_wav = spec_to_wav(denoised_item, n_fft, hop_length)
        # The two metrics are guarded separately so a PESQ failure does not
        # discard a valid STOI score. Failed examples are skipped rather
        # than scored 0, which would drag the mean down. The broad except
        # is deliberate: evaluation is best-effort and only reports failures.
        try:
            pesq_scores.append(pesq(sample_rate, clean_wav, denoised_wav, 'wb'))
        except Exception as e:
            print(f"Warning: PESQ failed: {e}")
        try:
            stoi_scores.append(stoi(clean_wav, denoised_wav, sample_rate, extended=False))
        except Exception as e:
            print(f"Warning: STOI failed: {e}")

    mean_pesq = float(np.mean(pesq_scores)) if pesq_scores else float('nan')
    mean_stoi = float(np.mean(stoi_scores)) if stoi_scores else float('nan')

    print("\nEvaluation metrics on test data:")
    print(f"SI-SNR: {si_snr_val:.3f} dB")
    print(f"SNR: {snr_val:.3f} dB")
    print(f"Mean PESQ: {mean_pesq:.3f}")
    print(f"Mean STOI: {mean_stoi:.3f}")

    return {"si_snr": si_snr_val, "snr": snr_val, "pesq": mean_pesq, "stoi": mean_stoi}

if __name__ == "__main__":
    # Seed both libraries: the dummy noise and the hyper-parameter sampling
    # draw from NumPy's global RNG, which torch.manual_seed does not touch.
    torch.manual_seed(42)
    np.random.seed(42)

    # Fall back to the CPU when no GPU is present instead of failing on .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    noisy_spec, clean_spec = load_dummy_data()
    noisy_spec = noisy_spec.to(device)
    clean_spec = clean_spec.to(device)
    best_model = random_search(noisy_spec, clean_spec, n_trials=3)
    # Uncertainty is estimated on the first example only.
    mean_pred, std_pred = run_mc_dropout(best_model, noisy_spec[:1])
    print(f"\nMC Dropout std avg: {std_pred.mean().item():.4f}")
    evaluate_model(best_model, noisy_spec, clean_spec)

    # Fixed summary text (in Russian); it is not derived from the metrics above.
    print("\nВыводы:")
    print("- Замена TF-Transformer на GRU в MP-SENet показала приемлемую производительность.")
    print("- Random Search помог подобрать гиперпараметры, улучшив качество.")
    print("- MC Dropout позволяет оценить неопределённость модели и её стабильность.")
    print("- Использование PESQ и STOI дополняет оценку качества с точки зрения восприятия звука.")



Trial 1: {'hidden_size': 64, 'num_layers': 2, 'dropout': 0.27329712093492464, 'bidirectional': True}
Epoch 1: loss = 122.8718
Epoch 2: loss = 122.0172
Epoch 3: loss = 121.2394
SI-SNR = -5.78 dB

Trial 2: {'hidden_size': 128, 'num_layers': 1, 'dropout': 0.3036655415227367, 'bidirectional': True}
Epoch 1: loss = 123.0714
Epoch 2: loss = 121.0463
Epoch 3: loss = 119.2523
SI-SNR = 0.33 dB

Trial 3: {'hidden_size': 64, 'num_layers': 1, 'dropout': 0.4243791643114233, 'bidirectional': True}
Epoch 1: loss = 123.0118
Epoch 2: loss = 122.2767
Epoch 3: loss = 121.6068
SI-SNR = -6.26 dB

MC Dropout std avg: 0.2412





Evaluation metrics on test data:
SI-SNR: 0.333 dB
SNR: 0.206 dB
Mean PESQ: 1.020
Mean STOI: -0.137

Выводы:
- Замена TF-Transformer на GRU в MP-SENet показала приемлемую производительность.
- Random Search помог подобрать гиперпараметры, улучшив качество.
- MC Dropout позволяет оценить неопределённость модели и её стабильность.
- Использование PESQ и STOI дополняет оценку качества с точки зрения восприятия звука.
