In [None]:
from scipy.io.wavfile import read, write
import torchaudio
import torch
from librosa.util import normalize
from librosa.filters import mel as librosa_mel_fn
import numpy as np
import librosa
from IPython.display import Audio
from tqdm import tqdm
import os
import soundfile as sf
import json

In [None]:
# 32768.0 == 2**15.
# NOTE(review): presumably the int16 full-scale value used to normalise PCM
# samples; it is not referenced in the visible cells -- confirm against callers.
MAX_WAV_VALUE = 32768.0

def load_wav(full_path):
    """Read a WAV file through ``scipy.io.wavfile.read``.

    Args:
        full_path: Location of the WAV file (anything ``read`` accepts).

    Returns:
        A ``(data, sampling_rate)`` pair. Note that this is the reverse of the
        ``(rate, data)`` order returned by ``scipy.io.wavfile.read`` itself.
    """
    rate, samples = read(full_path)
    # Swap the order so the samples come first.
    return samples, rate

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` with NumPy after flooring it at ``clip_val``.

    Args:
        x: Array-like of values to compress.
        C: Multiplicative factor applied after flooring, before the log.
        clip_val: Lower bound applied to ``x`` so the log never sees values
            below it.

    Returns:
        ``log(max(x, clip_val) * C)`` computed element-wise.
    """
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)

def dynamic_range_decompression(x, C=1):
    """Undo ``dynamic_range_compression`` with NumPy (ignoring the clipping).

    Args:
        x: Array-like of log-compressed values.
        C: The compression factor that was used; the result is divided by it.

    Returns:
        ``exp(x) / C`` computed element-wise.
    """
    expanded = np.exp(x)
    return expanded / C

def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress a tensor after flooring it at ``clip_val``.

    Args:
        x: Tensor of values to compress.
        C: Multiplicative factor applied after flooring, before the log.
        clip_val: Lower bound applied to ``x`` so the log never sees values
            below it.

    Returns:
        ``log(clamp(x, min=clip_val) * C)`` computed element-wise.
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)

def dynamic_range_decompression_torch(x, C=1):
    """Undo ``dynamic_range_compression_torch`` (ignoring the clipping).

    Args:
        x: Tensor of log-compressed values.
        C: The compression factor that was used; the result is divided by it.

    Returns:
        ``exp(x) / C`` computed element-wise.
    """
    expanded = torch.exp(x)
    return expanded / C

def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default settings.

    Args:
        magnitudes: Tensor of spectral magnitudes.

    Returns:
        The log-compressed tensor.
    """
    return dynamic_range_compression_torch(magnitudes)

def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default settings.

    Args:
        magnitudes: Tensor of log-compressed spectral magnitudes.

    Returns:
        The decompressed (exponentiated) tensor.
    """
    return dynamic_range_decompression_torch(magnitudes)

# Caches shared across calls to ``mel_spectrogram`` so that the mel filterbank
# and the Hann window are built only once per configuration and device.
# Both are keyed by strings assembled inside ``mel_spectrogram``.
mel_basis = {}
hann_window = {}

def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram with ``torch.stft``.

    Args:
        y: Waveform tensor. A message is printed (no error is raised) when its
            values fall outside [-1, 1].
            NOTE(review): the ``unsqueeze(1)`` / reflect-pad / ``squeeze(1)``
            sequence suggests a 2-D ``(batch, samples)`` layout -- confirm.
        n_fft: FFT size passed to ``torch.stft`` and to the mel filterbank.
        num_mels: Number of mel bands in the filterbank.
        sampling_rate: Sample rate used to build the mel filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length (size of the Hann window).
        fmin: Lowest frequency of the mel filterbank.
        fmax: Highest frequency of the mel filterbank.
        center: Forwarded to ``torch.stft``; the signal is already padded
            manually by ``(n_fft - hop_size) / 2`` on both sides.

    Returns:
        The mel-projected magnitude spectrogram after
        ``spectral_normalize_torch``.
        NOTE(review): presumably shaped ``(batch, num_mels, frames)`` -- confirm.
    """
    lowest, highest = torch.min(y), torch.max(y)
    if lowest < -1.:
        print('min value is ', lowest)
    if highest > 1.:
        print('max value is ', highest)

    global mel_basis, hann_window
    device = y.device
    # The cache keys include every parameter that shapes the cached tensor, so
    # a call with a different configuration can never pick up a stale entry.
    mel_key = '_'.join(str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, device))
    window_key = str(win_size) + '_' + str(device)
    # Look up the same key that is stored; testing ``fmax`` alone never hits,
    # which forces the filterbank to be rebuilt on every call.
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # ``return_complex`` must be given explicitly for real inputs on current
    # PyTorch; ``view_as_real`` restores the trailing (real, imag) axis that
    # the magnitude computation below sums over.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude with a small epsilon so the sqrt stays well-behaved at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec

In [None]:
# Instruction templates for adding sound A at the beginning of the audio.
# Raw strings keep the backslashes in front of the braces as literal
# characters; in a normal string literal `\{` and `\}` are invalid escape
# sequences and trigger a SyntaxWarning on recent Python versions.
# NOTE(review): the placeholder text is literally \{A\} (with backslashes);
# confirm the code that fills these templates matches on that exact text.
a_add_b_1_templates = [
    r"Add a sound effect of \{A\} at the beginning",
    r"Add \{A\} at the beginning",
    r"Insert \{A\} at the start",
    r"Put a \{A\} sound effect at the beginning",
    r"Introduce a sound effect of \{A\} at the start",
    r"Add \{A\} in the beginning",
    r"Add \{A\} to the beginning of the audio",
    r"Add a sound effect of \{A\} to the beginning of the audio",
    r"Combine \{A\} with the initial part of the audio",
    r"Mix \{A\} with the first part of the audio",
    r"Add a short clip of \{A\} in the beginning",
    r"Add a brief sound effect of \{A\} at the start",
    r"Fuse a short sound of \{A\} to the start of the audio track",
    r"Place a short audio of \{A\} at the beginning of the audio",
    r"Add: \{A\} in the beginning",
    r"add: \{A\} in the beginning",
]

In [None]:
# Instruction templates for adding sound A in the middle of the audio.
# Raw strings keep the backslashes in front of the braces as literal
# characters; in a normal string literal `\{` and `\}` are invalid escape
# sequences and trigger a SyntaxWarning on recent Python versions.
# NOTE(review): the placeholder text is literally \{A\} (with backslashes);
# confirm the code that fills these templates matches on that exact text.
a_add_b_2_templates = [
    r"Insert \{A\} in the middle of the audio",
    r"Insert: \{A\} in the middle",
    r"Add a sound effect of \{A\} in the middle",
    r"Add a short clip of \{A\} in the middle",
    r"Add: \{A\} in the middle",
    r"Add \{A\} at the midpoint",
    r"Add a sound effect of \{A\} at the midpoint",
    r"Include:\{A\} in the middle section",
    r"Integrate \{A\} in the middle",
    r"Blend \{A\} with the audio at the halfway point",
    r"Mix \{A\} into the middle part",
    r"Fuse \{A\} into the audio at the midpoint",
    r"Merge \{A\} into the middle section of the audio",
    r"Combine \{A\} with the audio at the middle point",
    r"Place a \{A\} in the middle of the audio track",
    r"Merge: a short clip of \{A\} into the audio at the midpoint",
]

In [None]:
# Instruction templates for adding sound A at the end of the audio.
# Raw strings keep the backslashes in front of the braces as literal
# characters; in a normal string literal `\{` and `\}` are invalid escape
# sequences and trigger a SyntaxWarning on recent Python versions.
# NOTE(review): the placeholder text is literally \{A\} (with backslashes);
# confirm the code that fills these templates matches on that exact text.
a_add_b_3_templates = [
    r"Add \{A\} at the end",
    r"Add: \{A\} with the final part",
    r"Add: \{A\} at the end",
    r"Add a short clip of \{A\} at the end",
    r"Add: a short clip of \{A\} at the end",
    r"Append \{A\} to the end",
    r"Include a sound effect of \{A\} in the end",
    r"Integrate \{A\} as the final sound of the audio",
    r"Blend \{A\} with the last part of the audio",
    r"Mix \{A\} with the final part",
    r"Mix: \{A\} with the final part",
    r"Fuse \{A\} to the end of the audio",
    r"Merge \{A\} with the closing sequence of the audio",
    r"Combine \{A\} with the final part",
    r"Place \{A\} at the end",
    r"Place a sound effect of \{A\} at the end",
    r"Place: \{A\} at the end",
]

In [None]:
# import pandas as pd

In [None]:
# dev_df =pd.read_csv("/blob/v-yuancwang/FSD50K/FSD50K.ground_truth/dev.csv")
# dev_set = {}
# for wav_id, label in zip(dev_df["fname"], dev_df["labels"]):
#     dev_set[str(wav_id)] = label.split(",")[0].replace("_", " ", 5)
# dev_set

In [None]:
# fsd50k_dev_infos = []
# fsd50k_dev_path = "/blob/v-yuancwang/FSD50K/FSD50K.dev_audio"
# fsd50k_dev_list = os.listdir(fsd50k_dev_path)

In [None]:
# for wav_id in tqdm(fsd50k_dev_list[:]):
#     wav_path = os.path.join(fsd50k_dev_path, wav_id)
#     wav, sr = librosa.load(wav_path, sr=16000)
#     if len(wav) < 16000*5.1:
#         fsd50k_dev_infos.append({"wav": wav_path, "caption": dev_set[wav_id.replace(".wav", "")]})
# fsd50k_dev_infos[:10]

In [None]:
# len(fsd50k_dev_infos)

In [None]:
# eval_df =pd.read_csv("/blob/v-yuancwang/FSD50K/FSD50K.ground_truth/eval.csv")
# eval_set = {}
# for wav_id, label in zip(eval_df["fname"], eval_df["labels"]):
#     eval_set[str(wav_id)] = label.split(",")[0].replace("_", " ", 5)
# eval_set

In [None]:
# fsd50k_eval_infos = []
# fsd50k_eval_path = "/blob/v-yuancwang/FSD50K/FSD50K.eval_audio"
# fsd50k_eval_list = os.listdir(fsd50k_eval_path)

In [None]:
# for wav_id in tqdm(fsd50k_eval_list[:]):
#     wav_path = os.path.join(fsd50k_eval_path, wav_id)
#     wav, sr = librosa.load(wav_path, sr=16000)
#     if len(wav) < 16000*5.1:
#         fsd50k_eval_infos.append({"wav": wav_path, "caption": eval_set[wav_id.replace(".wav", "")]})
# fsd50k_eval_infos[:10]

In [None]:
# print(len(fsd50k_eval_infos))

In [None]:
# fsd50k_short_infos = fsd50k_dev_infos + fsd50k_eval_infos

In [None]:
# with open("/home/v-yuancwang/AUDIT_v2/medata_infos/fsd50k_short.json", "w") as f:
#     json.dump(fsd50k_short_infos, f)

In [None]:
# Audio("/blob/v-yuancwang/FSD50K/FSD50K.dev_audio/117942.wav")