In [1]:
from audiomentations import AddBackgroundNoise, PolarityInversion, ClippingDistortion, GainTransition, LowPassFilter
from audiomentations import Mp3Compression, PitchShift, RoomSimulator, TimeMask, TimeStretch
from dataset import NoiseDataset, LibriSpeech
from audio import Audio
import pandas as pd
import tqdm
import random
import pesq
import os
from visqol import visqol_lib_py
from visqol.pb2 import visqol_config_pb2
from visqol.pb2 import similarity_result_pb2

In [8]:
# Audio settings shared by the cells below: transforms and metrics are called
# with `sample_rate`, and processed clips are truncated to
# `sample_rate * clip_duration` samples.
sample_rate = 16000
clip_duration = 10

# Split label handed to LibriSpeech; also used to name the CSV written
# after processing.
valid = 'test'

# Dataset helpers (construction order kept: noise first, then speech).
NoiseData = NoiseDataset()
SpeechData = LibriSpeech(valid=valid, max_num_clips=None)

# Labels drawn at random by the single-augmentation loop.
augmentations = [
    'Identity',
    'AddBackgroundNoise',
    'ClippingDistortion',
    'GainTransition',
    'LowPassFilter',
    'Mp3Compression',
    'PitchShift',
    'RoomSimulator',
    'TimeMask',
    'TimeStretch',
]

# Labels drawn at random by the double-augmentation loop ("first+second").
augmentations2 = [
    'Noise+Reverb',
    'Noise+LPF',
    'Noise+TimeStretch',
    'Reverb+Mp3',
    'Pitch+LPF',
    'GainTransition+TimeMask',
]

Loading noises...


../../datasets/noise_demand/NRIVER: 100%|███████| 16/16 [00:00<00:00, 56.15it/s]
../../datasets/noise_demand/PRESTO: 100%|███████| 16/16 [00:00<00:00, 51.60it/s]
../../datasets/noise_demand/NPARK: 100%|████████| 16/16 [00:00<00:00, 48.52it/s]
../../datasets/noise_demand/OMEETING: 100%|█████| 16/16 [00:00<00:00, 50.65it/s]
../../datasets/noise_demand/NFIELD: 100%|███████| 16/16 [00:00<00:00, 47.60it/s]
../../datasets/noise_demand/OOFFICE: 100%|██████| 16/16 [00:00<00:00, 49.00it/s]
../../datasets/noise_demand/PCAFETER: 100%|█████| 16/16 [00:00<00:00, 51.10it/s]
../../datasets/noise_demand/DWASHING: 100%|█████| 16/16 [00:00<00:00, 48.85it/s]
../../datasets/noise_demand/TMETRO: 100%|███████| 16/16 [00:00<00:00, 43.02it/s]
../../datasets/noise_demand/TCAR: 100%|█████████| 16/16 [00:00<00:00, 49.87it/s]
../../datasets/noise_demand/DLIVING: 100%|██████| 16/16 [00:00<00:00, 49.83it/s]
../../datasets/noise_demand/PSTATION: 100%|█████| 16/16 [00:00<00:00, 46.49it/s]
../../datasets/noise_demand/

In [3]:
# Build the ViSQOL configuration: notebook sample rate, speech scoring on.
config = visqol_config_pb2.VisqolConfig()
config.audio.sample_rate = sample_rate
config.options.use_speech_scoring = True

# The model file is resolved inside the "model" directory that sits next to
# the installed visqol_lib_py module.
svr_model_path = "lattice_tcditugenmeetpackhref_ls2_nl60_lr12_bs2048_learn.005_ep2400_train1_7_raw.tflite"
visqol_model_dir = os.path.join(os.path.dirname(visqol_lib_py.__file__), "model")
config.options.svr_model_path = os.path.join(visqol_model_dir, svr_model_path)

# Create the API object used later to score (reference, degraded) pairs.
api = visqol_lib_py.VisqolApi()
api.Create(config)

## Single Augmentation

In [None]:
# Factories for the single augmentations that need no extra bookkeeping.
# Each call builds a fresh transform; p=1.0 is passed to every transform.
_SINGLE_TRANSFORM_FACTORIES = {
    'ClippingDistortion': lambda: ClippingDistortion(
        min_percentile_threshold=10, max_percentile_threshold=40, p=1.0
    ),
    'GainTransition': lambda: GainTransition(
        min_gain_db=-60, max_gain_db=20, p=1.0
    ),
    'LowPassFilter': lambda: LowPassFilter(
        min_cutoff_freq=500, max_cutoff_freq=1000, p=1.0
    ),
    'Mp3Compression': lambda: Mp3Compression(
        min_bitrate=8, max_bitrate=14, p=1.0
    ),
    'PitchShift': lambda: PitchShift(
        min_semitones=-4.0, max_semitones=4.0, p=1.0
    ),
    'RoomSimulator': lambda: RoomSimulator(
        min_target_rt60=0.8, max_target_rt60=1.5, p=1.0
    ),
    'TimeMask': lambda: TimeMask(
        min_band_part=0.2, max_band_part=0.5, fade=True, p=1.0
    ),
    'TimeStretch': lambda: TimeStretch(
        min_rate=0.5, max_rate=2.0, p=1.0
    ),
}


def _build_single_augmentation(name):
    """Build the transform for one single-augmentation label.

    Args:
        name: One of the labels listed in `augmentations`.

    Returns:
        A `(transform, parameters)` tuple. `transform` is None for
        'Identity' (the clip is left untouched). `parameters` is the noise
        path for 'AddBackgroundNoise' and None for every other label.

    Raises:
        ValueError: If `name` is not a supported augmentation label.
    """
    if name == 'Identity':
        return None, None
    if name == 'AddBackgroundNoise':  # has to be 16kHz (original author's note)
        noise_path = NoiseData.get_random_sample_path()
        noise_transform = AddBackgroundNoise(
            sounds_path=noise_path,
            min_snr_in_db=-10.0, max_snr_in_db=15.0,
            noise_transform=PolarityInversion(), p=1.0
        )
        return noise_transform, noise_path
    if name not in _SINGLE_TRANSFORM_FACTORIES:
        raise ValueError(f'{name} not supported!')
    # Always return an explicit None here: the original Mp3Compression branch
    # assigned to a misspelled variable, so its row reused the previous
    # iteration's parameters (or raised NameError when drawn first).
    return _SINGLE_TRANSFORM_FACTORIES[name](), None


# For every speech clip: write the reference WAV, apply one randomly chosen
# augmentation, write the processed WAV, and score it with PESQ and ViSQOL.
# Rows are collected in a list and turned into a DataFrame once, instead of
# concatenating a one-row frame per iteration (quadratic, and it left every
# row with index 0).
single_columns = ['processed', 'reference', 'augmentation', 'parameters', 'pesq', 'visqol']
single_records = []
for path in tqdm.tqdm(SpeechData._paths, total=len(SpeechData._paths), desc='Processing augmentations...'):
    # Strip directory and extension regardless of the extension's length.
    wav_name = os.path.splitext(os.path.basename(path))[0]
    reference_path = f'../../datasets/LibriAugmented/reference/{wav_name}.wav'
    processed_path = f'../../datasets/LibriAugmented/processed/{wav_name}.wav'

    sig = SpeechData.load_wav(path)
    sig.write_wav(reference_path)

    selected_augmentation = random.choice(augmentations)
    transform, parameters = _build_single_augmentation(selected_augmentation)

    if transform is None:
        # 'Identity': the processed clip is the reference itself.
        processed = sig
    else:
        aug_sig = transform(sig.samples[:, 0], sample_rate)
        # Keep at most `clip_duration` seconds of the augmented signal.
        processed = Audio(aug_sig[:int(sample_rate*clip_duration)], sample_rate)
    processed.write_wav(processed_path)

    ref = sig.samples.flatten().astype('float64')
    deg = processed.samples.flatten().astype('float64')
    pesq_score = pesq.pesq(fs=sample_rate, ref=ref, deg=deg)
    visqol_score = api.Measure(ref, deg).moslqo

    single_records.append({
        'processed': processed_path,
        'reference': reference_path,
        'augmentation': selected_augmentation,
        'parameters': parameters,
        'pesq': pesq_score,
        'visqol': visqol_score,
    })

df = pd.DataFrame(single_records, columns=single_columns)
df.to_csv(f'../../datasets/LibriAugmented/{valid}.csv')

## Double Augmentations

In [None]:
# Stage label -> (first stage, second stage) for every supported combination.
_DOUBLE_AUGMENTATION_STAGES = {
    'Noise+Reverb': ('Noise', 'Reverb'),
    'Noise+LPF': ('Noise', 'LPF'),
    'Noise+TimeStretch': ('Noise', 'TimeStretch'),
    'Reverb+Mp3': ('Reverb', 'Mp3'),
    'Pitch+LPF': ('Pitch', 'LPF'),
    'GainTransition+TimeMask': ('GainTransition', 'TimeMask'),
}

# Factories for the stages that need no extra bookkeeping. Each call builds a
# fresh transform; p=1.0 is passed to every transform.
_DOUBLE_STAGE_FACTORIES = {
    'Reverb': lambda: RoomSimulator(
        min_target_rt60=0.8, max_target_rt60=1.5, p=1.0
    ),
    'LPF': lambda: LowPassFilter(
        min_cutoff_freq=500, max_cutoff_freq=1000, p=1.0
    ),
    'TimeStretch': lambda: TimeStretch(
        min_rate=0.5, max_rate=2.0, p=1.0
    ),
    'Mp3': lambda: Mp3Compression(
        min_bitrate=8, max_bitrate=14, p=1.0
    ),
    'Pitch': lambda: PitchShift(
        min_semitones=-4.0, max_semitones=4.0, p=1.0
    ),
    'GainTransition': lambda: GainTransition(
        min_gain_db=-60, max_gain_db=20, p=1.0
    ),
    'TimeMask': lambda: TimeMask(
        min_band_part=0.2, max_band_part=0.5, fade=True, p=1.0
    ),
}


def _make_double_stage(stage):
    """Build the transform for one stage of a double augmentation.

    Args:
        stage: A stage name used in `_DOUBLE_AUGMENTATION_STAGES`.

    Returns:
        A `(transform, parameters)` tuple; `parameters` is the noise path for
        the 'Noise' stage and None for every other stage.
    """
    if stage == 'Noise':
        noise_path = NoiseData.get_random_sample_path()
        noise_transform = AddBackgroundNoise(
            sounds_path=noise_path,
            min_snr_in_db=-10.0, max_snr_in_db=15.0,
            noise_transform=PolarityInversion(), p=1.0
        )
        return noise_transform, noise_path
    return _DOUBLE_STAGE_FACTORIES[stage](), None


# For every speech clip: apply a randomly chosen pair of augmentations in
# sequence, write the result under processed2/, and score it with PESQ and
# ViSQOL against the unmodified clip.
# Rows are collected in a list and turned into a DataFrame once, instead of
# concatenating a one-row frame per iteration.
double_columns = ['processed', 'reference', 'augmentation', 'parameters', 'pesq', 'visqol']
double_records = []
for path in tqdm.tqdm(SpeechData._paths, total=len(SpeechData._paths), desc='Processing augmentations...'):
    # Strip directory and extension regardless of the extension's length.
    wav_name = os.path.splitext(os.path.basename(path))[0]
    # Reference WAVs are not written by this cell; the 'reference' column
    # still points into the reference/ directory.
    reference_path = f'../../datasets/LibriAugmented/reference/{wav_name}.wav'
    processed_path = f'../../datasets/LibriAugmented/processed2/{wav_name}.wav'

    sig = SpeechData.load_wav(path)
    selected_augmentation = random.choice(augmentations2)

    if selected_augmentation not in _DOUBLE_AUGMENTATION_STAGES:
        raise ValueError(f'{selected_augmentation} not supported!')
    first_stage, second_stage = _DOUBLE_AUGMENTATION_STAGES[selected_augmentation]
    # 'Reverb+Mp3' now really starts with RoomSimulator; it previously built
    # AddBackgroundNoise, so the label and the recorded parameters were wrong.
    transform1, first_parameters = _make_double_stage(first_stage)
    transform2, second_parameters = _make_double_stage(second_stage)
    parameters = first_parameters if first_parameters is not None else second_parameters

    aug1_sig = transform1(sig.samples[:, 0], sample_rate)
    aug2_sig = transform2(aug1_sig, sample_rate)
    # Keep at most `clip_duration` seconds of the augmented signal.
    processed = Audio(aug2_sig[:int(sample_rate*clip_duration)], sample_rate)
    processed.write_wav(processed_path)

    ref = sig.samples.flatten().astype('float64')
    deg = processed.samples.flatten().astype('float64')
    pesq_score = pesq.pesq(fs=sample_rate, ref=ref, deg=deg)
    visqol_score = api.Measure(ref, deg).moslqo

    double_records.append({
        'processed': processed_path,
        'reference': reference_path,
        'augmentation': selected_augmentation,
        'parameters': parameters,
        'pesq': pesq_score,
        'visqol': visqol_score,
    })

df = pd.DataFrame(double_records, columns=double_columns)
# Written to its own file (mirroring processed2/): the previous target,
# {valid}.csv, is the file the single-augmentation cell writes, so running
# both cells silently discarded the single-augmentation results.
df.to_csv(f'../../datasets/LibriAugmented/{valid}2.csv')