In [1]:
from scipy.io.wavfile import read
import torchaudio
import torch
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import os
from tqdm import tqdm
import json
import sys
from tqdm import tqdm
from scipy.io.wavfile import read, write
# Scale factor applied to float audio before it is cast to int16 and written as a WAV file.
MAX_WAV_VALUE = 32768.0

In [2]:
def load_wav(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``.

    ``scipy.io.wavfile.read`` returns the rate first; this wrapper swaps the order.
    """
    rate_and_samples = read(full_path)
    return rate_and_samples[1], rate_and_samples[0]

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` (NumPy): floor at ``clip_val``, scale by ``C``, take the natural log."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)

def dynamic_range_decompression(x, C=1):
    """Invert the NumPy log compression: exponentiate ``x`` and divide by ``C``."""
    expanded = np.exp(x)
    return expanded / C

def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress a tensor: floor at ``clip_val``, scale by ``C``, take the natural log."""
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)

def dynamic_range_decompression_torch(x, C=1):
    """Invert the tensor log compression: exponentiate ``x`` and divide by ``C``."""
    expanded = torch.exp(x)
    return expanded / C

def spectral_normalize_torch(magnitudes):
    """Apply the default log dynamic-range compression to spectrogram magnitudes."""
    return dynamic_range_compression_torch(magnitudes)

def spectral_de_normalize_torch(magnitudes):
    """Undo the default log dynamic-range compression on spectrogram magnitudes."""
    return dynamic_range_decompression_torch(magnitudes)

# Caches shared across calls. Each entry is keyed by every parameter that determines the
# cached tensor, so different configurations never pick up each other's filterbank/window.
mel_basis = {}
hann_window = {}

def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram for a batch of waveforms.

    Args:
        y: Float tensor of waveforms, treated as ``(batch, samples)`` (a channel axis is
            inserted only for the reflect padding). Values outside ``[-1, 1]`` are reported
            with a printed message but are not modified.
        n_fft: FFT size used by ``torch.stft`` and by the mel filterbank.
        num_mels: Number of mel filters.
        sampling_rate: Sample rate used to build the mel filterbank.
        hop_size: STFT hop length in samples.
        win_size: Length of the Hann analysis window.
        fmin: Lowest frequency of the mel filterbank.
        fmax: Highest frequency of the mel filterbank.
        center: Forwarded to ``torch.stft``. The default ``False`` relies on the manual
            reflect padding performed here.

    Returns:
        The mel-projected magnitudes after ``spectral_normalize_torch``; for a
        ``(batch, samples)`` input the mel axis is dimension 1.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    # The original guard tested `fmax not in mel_basis` while storing under
    # `str(fmax) + '_' + device`, so it never hit and everything was rebuilt on each call.
    device_key = str(y.device)
    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
    window_key = (win_size, device_key)
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    # Pad both ends by reflection so that, with center=False, frames line up with hop_size.
    pad = int((n_fft-hop_size)/2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex result and view
    # it as (..., 2) real/imag pairs, which is the layout the old call returned.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude, with a small epsilon keeping the sqrt gradient finite at zero energy.
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec

In [3]:
# Directory of source recordings (read by the preprocessing loop) and the directory that
# receives the processed 16 kHz clips.
wav_origin_path = "/blob/v-yuancwang/WavCaps/BBC/wav_origin"
wav_path = "/blob/v-yuancwang/WavCaps/BBC/wav"
# Names of all entries in the source directory.
wav_origin_lists = os.listdir(wav_origin_path)
len(wav_origin_lists)

11755

In [4]:
# Peek at the first few file names.
wav_origin_lists[:5]

['07043127.wav',
 'NHU05021168.wav',
 'NHU05017176.wav',
 '07044066.wav',
 '07043344.wav']

In [4]:
# Directory where the per-clip mel spectrograms are stored as .npy files.
mel_path = "/blob/v-yuancwang/WavCaps/BBC/mel"

In [6]:
# Load the BBC Sound Effects metadata and index each clip's duration and caption by its id.
data_info_json = "/home/v-yuancwang/AUDIT_v2/WavCaps/data/json_files/BBC_Sound_Effects/bbc_final.json"
with open(data_info_json, "r") as f:
    data_infos = json.load(f)["data"]
data_info_dict = {
    info["id"]: {'duration': info['duration'], 'caption': info['caption']}
    for info in data_infos
}

In [7]:
# Spectrogram files already present in mel_path; the preprocessing loop skips these clips.
saved_lists = os.listdir(mel_path)

In [None]:
# Convert every source recording into a fixed-length 16 kHz clip plus its log-mel
# spectrogram, and collect a {"mel", "caption"} manifest entry for each available clip.
TARGET_SR = 16000
CLIP_SAMPLES = TARGET_SR * 10  # every clip is cut or wrap-padded to 10 seconds

# Set membership is O(1); testing against the raw directory listing rescanned it per file.
saved_names = set(saved_lists)

metadata_lists = []
for wav_id in tqdm(wav_origin_lists):
    clip_id = wav_id.replace(".wav", "")
    mel_name = wav_id.replace(".wav", ".npy")
    mel_file = os.path.join(mel_path, mel_name)

    # Already processed in an earlier run: only record the manifest entry.
    if mel_name in saved_names:
        metadata_lists.append({"mel": mel_file, "caption": data_info_dict[clip_id]['caption']})
        continue

    try:
        wav, _ = librosa.load(os.path.join(wav_origin_path, wav_id), sr=TARGET_SR)
    except Exception as err:
        # Still best-effort (unreadable files are skipped), but the skip is reported and
        # KeyboardInterrupt/SystemExit are no longer swallowed as a bare `except:` did.
        tqdm.write(f"skipping {wav_id}: {err}")
        continue

    # Wrap-padding cannot extend an empty signal; skip instead of crashing the whole run.
    if len(wav) == 0:
        tqdm.write(f"skipping {wav_id}: empty audio")
        continue

    if len(wav) >= CLIP_SAMPLES:
        wav = wav[:CLIP_SAMPLES]
    else:
        wav = np.pad(wav, (0, CLIP_SAMPLES - len(wav)), "wrap")
    wav = np.clip(wav, -1, 1)

    x = torch.FloatTensor(wav)
    x = mel_spectrogram(x.unsqueeze(0), n_fft=1024, num_mels=80, sampling_rate=TARGET_SR,
                        hop_size=256, win_size=1024, fmin=0, fmax=8000)
    spec = x.cpu().numpy()[0]

    # A sample of exactly +1.0 scales to 32768, which does not fit in int16 and would
    # overflow on the cast; clamp to the representable range first.
    pcm = np.clip(wav * MAX_WAV_VALUE, -32768, 32767).astype('int16')

    # Write the wav before the mel: the mel file is what marks a clip as done on re-runs,
    # so it must be the last artifact to appear.
    write(os.path.join(wav_path, wav_id), TARGET_SR, pcm)
    np.save(mel_file, spec)

    metadata_lists.append({"mel": mel_file, "caption": data_info_dict[clip_id]['caption']})

print(len(metadata_lists))
with open("/home/v-yuancwang/AUDIT_v2/medata_infos/bbc.json", "w") as f:
    json.dump(metadata_lists, f)

In [6]:
import matplotlib.pyplot as plt
from IPython.display import Audio

In [22]:
# Sanity-check one saved spectrogram. `test_mel` is already an absolute path, so it is loaded
# directly; joining it onto `mel_path` was a no-op because os.path.join discards the prefix.
test_mel = "/blob/v-yuancwang/WavCaps/BBC/mel/07005092.npy"
mel = np.load(test_mel)
# origin="lower" draws row 0 of the array (the first mel filter) at the bottom of the image.
plt.imshow(mel, origin="lower")

<matplotlib.image.AxesImage at 0x7fa0d2555fd0>

In [23]:
# Derive the matching processed WAV path from the spectrogram path and play it inline.
test_wav = test_mel.replace("/mel/", "/wav/").replace(".npy", ".wav")
Audio(test_wav)