# Process FreeSound and FSD50K

In [1]:
from scipy.io.wavfile import read
import torchaudio
import torch
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import os
from tqdm import tqdm
import json
import sys
from tqdm import tqdm
from scipy.io.wavfile import read, write
MAX_WAV_VALUE = 32768.0

In [14]:
def load_wav(full_path):
    """Read a WAV file and return ``(samples, sample_rate)``.

    Thin wrapper around ``scipy.io.wavfile.read`` that swaps the order of the
    returned pair so the sample array comes first.
    """
    rate, samples = read(full_path)
    return samples, rate

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` (NumPy): ``log(max(x, clip_val) * C)``.

    Values below ``clip_val`` are raised to ``clip_val`` first so the
    logarithm never sees zero or negative input.
    """
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)

def dynamic_range_decompression(x, C=1):
    """Invert the NumPy log compression: ``exp(x) / C``."""
    linear = np.exp(x)
    return linear / C

def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress a tensor: ``log(clamp(x, min=clip_val) * C)``.

    Clamping from below keeps the logarithm finite for zero-valued bins.
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)

def dynamic_range_decompression_torch(x, C=1):
    """Invert the tensor log compression: ``exp(x) / C``."""
    linear = torch.exp(x)
    return linear / C

def spectral_normalize_torch(magnitudes):
    """Apply the default log dynamic-range compression to ``magnitudes``."""
    return dynamic_range_compression_torch(magnitudes)

def spectral_de_normalize_torch(magnitudes):
    """Undo ``spectral_normalize_torch`` using the default decompression."""
    return dynamic_range_decompression_torch(magnitudes)

# Module-level caches so the mel filterbank and the analysis window are built
# once per configuration instead of on every call.
#   mel_basis:   (sampling_rate, n_fft, num_mels, fmin, fmax, device) -> filterbank tensor
#   hann_window: (win_size, device) -> window tensor
mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of ``y``.

    Args:
        y: Waveform tensor. It is unsqueezed at dim 1 for reflect padding and
            squeezed back, so a ``(batch, samples)`` layout is assumed
            (TODO confirm against other callers). Values outside [-1, 1] only
            trigger a diagnostic print; they are not clipped here.
        n_fft: FFT size.
        num_mels: Number of mel bands.
        sampling_rate: Sample rate in Hz, used to build the mel filterbank.
        hop_size: STFT hop length in samples.
        win_size: STFT window length in samples.
        fmin: Lowest filterbank frequency in Hz.
        fmax: Highest filterbank frequency in Hz.
        center: Forwarded to ``torch.stft``. The signal is already padded
            manually by ``(n_fft - hop_size) / 2`` on each side.

    Returns:
        Tensor of log-mel magnitudes: the mel filterbank applied to the STFT
        magnitude, passed through ``spectral_normalize_torch``.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    # The previous check (`fmax not in mel_basis`) never matched the stored
    # key, so both tensors were rebuilt on every call. The keys now include
    # every parameter that shapes the cached tensor, so a cache hit can never
    # return a filterbank/window built for a different configuration.
    device_key = str(y.device)
    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)

    window_key = (win_size, device_key)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (..., 2) real/imag pairs, which is the same data.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude; the epsilon keeps sqrt differentiable/finite at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec

In [3]:
# Load the FreeSound metadata JSON and keep only the entries under "data".
fsd_json_file = "/blob/v-yuancwang/WavCaps/fsd_final_2s.json"
with open(fsd_json_file, "r") as f:
    fsd_infos = json.load(f)['data']

# Project every record down to the fields used downstream.
fsd_infos_simple = [
    {
        field: record[field]
        for field in ("id", "file_name", "download_link", "caption", "duration")
    }
    for record in fsd_infos
]

# Lookup table: clip id -> caption.
id_caption_dict = {}
for info in fsd_infos_simple:
    id_caption_dict[info['id']] = info['caption']
len(id_caption_dict)

222935

In [12]:
# Directory of source clips; it is listed and loaded from in the cells below.
fsd_origin_path = "/blob/v-yuancwang/WavCaps/FreeSound/wav_origin"
# Output directory for the processed 16 kHz int16 WAV files.
fsd_wav_path = "/blob/v-yuancwang/WavCaps/FreeSound/wav"
# Output directory for the per-clip mel spectrograms saved as .npy.
fsd_mel_path = "/blob/v-yuancwang/WavCaps/FreeSound/mel"

In [15]:
# Collect every entry name in the source directory (os.listdir order is
# arbitrary and no extension filtering is applied).
fsd_wav_origin_lists = os.listdir(fsd_origin_path)
print(len(fsd_wav_origin_lists))
# Notebook display: peek at the first five entries.
fsd_wav_origin_lists[:5]

25689


['908.wav', '416438.wav', '1985.wav', '542.wav', '1927.wav']

In [None]:
# Resample every source clip to 16 kHz, fit it to a 2 s / 5 s / 10 s bucket,
# then save its log-mel spectrogram (.npy) and an int16 WAV copy. The names of
# the clips that landed in each bucket are written out as JSON at the end.
_SAMPLE_RATE = 16000


def _fit_to_bucket(wav, sr=_SAMPLE_RATE):
    """Crop or wrap-pad ``wav`` to exactly 2, 5 or 10 seconds.

    Bucket boundaries (in seconds of audio):
        >= 7    -> 10 s (cropped if >= 10 s, otherwise wrap-padded)
        >= 2.5  -> 5 s  (cropped if >= 5 s, otherwise wrap-padded)
        else    -> 2 s  (cropped if >= 2 s, otherwise wrap-padded)

    Returns:
        ``(fitted_wav, bucket_seconds)``.
    """
    n_samples = len(wav)
    if n_samples >= sr * 7:
        seconds = 10
    elif n_samples >= sr * 2.5:
        seconds = 5
    else:
        seconds = 2

    target = sr * seconds
    if n_samples >= target:
        return wav[:target], seconds
    # 'wrap' repeats the clip from its start until the target length is reached.
    return np.pad(wav, (0, target - n_samples), 'wrap'), seconds


_2s_lists = []
_5s_lists = []
_10s_lists = []
_bucket_lists = {2: _2s_lists, 5: _5s_lists, 10: _10s_lists}

# np.save and scipy's wavfile.write do not create missing directories.
os.makedirs(fsd_mel_path, exist_ok=True)
os.makedirs(fsd_wav_path, exist_ok=True)

for wav_id in tqdm(fsd_wav_origin_lists):
    try:
        wav, _ = librosa.load(os.path.join(fsd_origin_path, wav_id), sr=_SAMPLE_RATE)
    except Exception as err:
        # Best effort: unreadable files are skipped, but reported, and
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        tqdm.write(f"skipping {wav_id}: {err!r}")
        continue

    if len(wav) == 0:
        # np.pad(mode='wrap') cannot extend an empty array.
        tqdm.write(f"skipping {wav_id}: empty audio")
        continue

    wav, seconds = _fit_to_bucket(wav)
    wav = np.clip(wav, -1, 1)

    x = torch.FloatTensor(wav)
    x = mel_spectrogram(x.unsqueeze(0), n_fft=1024, num_mels=80, sampling_rate=_SAMPLE_RATE,
                        hop_size=256, win_size=1024, fmin=0, fmax=8000)
    spec = x.cpu().numpy()[0]
    np.save(os.path.join(fsd_mel_path, wav_id.replace(".wav", ".npy")), spec)

    # A sample of exactly +1.0 scales to 32768, which does not fit in int16;
    # clamp to the representable range before casting.
    pcm = np.clip(wav * MAX_WAV_VALUE, -MAX_WAV_VALUE, MAX_WAV_VALUE - 1).astype('int16')
    write(os.path.join(fsd_wav_path, wav_id), _SAMPLE_RATE, pcm)

    # NOTE(review): the bucket lists were never filled, so the JSON files
    # below were always written empty. Recording the file name is an
    # assumption about the intended payload; confirm the expected schema.
    _bucket_lists[seconds].append(wav_id)


print(len(_2s_lists))
print(len(_5s_lists))
print(len(_10s_lists))

for bucket, out_path in (
    (_2s_lists, "/home/v-yuancwang/AUDIT_v2/medata_infos/fsd_2s.json"),
    (_5s_lists, "/home/v-yuancwang/AUDIT_v2/medata_infos/fsd_5s.json"),
    (_10s_lists, "/home/v-yuancwang/AUDIT_v2/medata_infos/fsd_10s.json"),
):
    with open(out_path, "w") as f:
        json.dump(bucket, f)