In [2]:
import os
import subprocess
import sys

import librosa
import librosa.display
import numpy as np
import soundfile as sf
import torch
import torchaudio
from librosa.filters import mel as librosa_mel_fn
from librosa.util import normalize
from scipy.io.wavfile import read
from tqdm import tqdm

In [3]:
# 2**15, i.e. the magnitude of full-scale signed 16-bit PCM.
# NOTE(review): not referenced in any cell shown here; presumably intended for
# scaling int16 samples into [-1, 1) — confirm or remove.
MAX_WAV_VALUE = 32768.0

def load_wav(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``.

    The tuple order is the reverse of ``scipy.io.wavfile.read``, which yields
    the rate first and the sample array second.
    """
    rate, samples = read(full_path)
    return samples, rate

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (NumPy).

    Values below ``clip_val`` are raised to ``clip_val`` first so the
    logarithm never sees zero or negative input.
    """
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)

def dynamic_range_decompression(x, C=1):
    """Return ``exp(x) / C`` element-wise (NumPy).

    This undoes the logarithm and the ``C`` scaling applied by
    ``dynamic_range_compression``; values that were clipped there are not
    recoverable.
    """
    expanded = np.exp(x)
    return expanded / C

def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (torch).

    The input is clamped from below at ``clip_val`` before the logarithm.
    """
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)

def dynamic_range_decompression_torch(x, C=1):
    """Return ``exp(x) / C`` element-wise (torch)."""
    expanded = torch.exp(x)
    return expanded / C

def spectral_normalize_torch(magnitudes):
    """Log-compress ``magnitudes`` via ``dynamic_range_compression_torch``.

    The helper is called with its default ``C`` and ``clip_val``.
    """
    return dynamic_range_compression_torch(magnitudes)

def spectral_de_normalize_torch(magnitudes):
    """Exponentiate ``magnitudes`` via ``dynamic_range_decompression_torch``.

    The helper is called with its default ``C``.
    """
    return dynamic_range_decompression_torch(magnitudes)

# Lazily filled caches shared by every call to mel_spectrogram.
#   mel_basis:   (sampling_rate, n_fft, num_mels, fmin, fmax, device) -> mel filterbank tensor
#   hann_window: (win_size, device)                                   -> Hann window tensor
mel_basis = {}
hann_window = {}

def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of ``y`` with torch.

    Args:
        y: waveform tensor. The padding below adds a channel axis at dim 1, so
            a 2-D ``(batch, samples)`` tensor is expected (the notebook calls
            this with shape ``(1, num_samples)``). A message is printed when
            samples fall outside [-1, 1]; the values are not modified.
        n_fft: FFT size.
        num_mels: number of mel bands.
        sampling_rate: sample rate handed to the mel filterbank constructor.
        hop_size: STFT hop length in samples.
        win_size: Hann window length in samples.
        fmin, fmax: frequency range of the mel filterbank.
        center: forwarded to ``torch.stft``; the signal is always reflect-padded
            by ``(n_fft - hop_size) / 2`` on each side beforehand.

    Returns:
        Tensor of shape ``(batch, num_mels, frames)`` holding
        ``spectral_normalize_torch(mel_filterbank @ |STFT|)``.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    # The cache keys include every argument that shapes the cached tensor.
    # Previously the lookup tested `fmax` against keys of the form
    # "<fmax>_<device>", which never matched, so both tensors were rebuilt on
    # every call.
    device_key = str(y.device)
    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
    window_key = (win_size, device_key)

    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    # Reflect-pad manually; F.pad with mode='reflect' needs the extra channel axis.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # return_complex=False is deprecated in torch.stft; request the complex
    # result and view it as (..., 2) real/imag pairs, which is the same data.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude with a small epsilon so the sqrt stays differentiable at zero.
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec

In [4]:
# Source VGGSound clips: audio as .flac and video as .mp4, per the directory
# listings printed further down.
# NOTE(review): absolute, user-specific paths — consider a single configurable
# data root.
vgg_audio_path = "/blob/v-yuancwang/DiffAudioImg/VGGSound/data/vggsound/audio"
vgg_video_path = "/blob/v-yuancwang/DiffAudioImg/VGGSound/data/vggsound/video"

In [6]:
# File names (not full paths) in each directory. os.listdir returns them in
# arbitrary order; both lists are sorted in place in a later cell.
vgg_audio_lists = os.listdir(vgg_audio_path)
vgg_video_lists = os.listdir(vgg_video_path)

In [7]:
# Sort both listings in place first, then report the size and the first five
# names of each (audio first, then video).
vgg_audio_lists.sort()
vgg_video_lists.sort()

print(len(vgg_audio_lists))
print(vgg_audio_lists[:5])
print(len(vgg_video_lists))
print(vgg_video_lists[:5])

177056
['---g-f_I2yQ_1000_11000.flac', '--0PQM4-hqg_30000_40000.flac', '--56QUhyDQM_185000_195000.flac', '--8puiAGLhs_30000_40000.flac', '--96EN9NUQM_242000_252000.flac']
176955
['---g-f_I2yQ_1000_11000.mp4', '--0PQM4-hqg_30000_40000.mp4', '--56QUhyDQM_185000_195000.mp4', '--8puiAGLhs_30000_40000.mp4', '--96EN9NUQM_242000_252000.mp4']


In [8]:
# Output directories: vgg_wav_path receives the WAV files written by the
# conversion loop below.
# NOTE(review): vgg_mel_path is not used in the cells shown here; presumably
# the destination for mel spectrograms — confirm.
vgg_wav_path = "/blob/v-yuancwang/DiffAudioImg/VGGSound/data/vggsound/wav"
vgg_mel_path = "/blob/v-yuancwang/DiffAudioImg/VGGSound/data/vggsound/mel"

In [None]:
# Snapshot of the WAV directory contents (file names only, arbitrary order).
# The list is not refreshed automatically when files are later added or renamed.
vgg_wav_lists = os.listdir(vgg_wav_path)

In [11]:
# Sort the snapshot in place and preview the first five names.
vgg_wav_lists.sort()
print(vgg_wav_lists[:5])

['---g-f_I2yQ.wav', '--0PQM4-hqg.wav', '--56QUhyDQM.wav', '--8puiAGLhs.wav', '--96EN9NUQM.wav']


In [18]:
# Group clip ids by the first 11 characters of the file name (the video-id part
# of names such as "---g-f_I2yQ_1000_11000.flac").
# yid_dict[video_id] -> list of clip ids (file name up to the first "."), in
# the order they appear in vgg_audio_lists.
# The loop variable is deliberately not called `id`: a top-level `id` in a
# notebook cell shadows the builtin for the rest of the kernel session.
yid_dict = {}
for audio_name in vgg_audio_lists:
    clip_id = audio_name.split(".")[0]
    yid_dict.setdefault(clip_id[:11], []).append(clip_id)

In [None]:
# Rename each converted "<video id>.wav" to "<clip id>.wav".
#
# The loop must walk the WAV directory listing: the FLAC listing holds names
# like "<clip id>.flac", which do not exist inside vgg_wav_path, so using them
# as rename sources raises FileNotFoundError.
#
# When several clips share a video id, the last clip id is used (same choice as
# before: yid_dict[...][-1]).
for wav_name in tqdm(vgg_wav_lists):
    yid = wav_name[:11]
    # Only touch files still named exactly "<video id>.wav"; anything else is
    # either already renamed or not produced by the conversion step.
    if wav_name != yid + ".wav" or yid not in yid_dict:
        continue
    src = os.path.join(vgg_wav_path, wav_name)
    dst = os.path.join(vgg_wav_path, yid_dict[yid][-1] + ".wav")
    try:
        os.rename(src, dst)
    except FileNotFoundError:
        # vgg_wav_lists is a snapshot; the file may have been renamed by an
        # earlier run of this cell. Skip so the cell can be re-run safely.
        continue

In [None]:
# Re-read the WAV directory before previewing it: the existing vgg_wav_lists is
# a snapshot taken before the rename step, so sorting and printing it again
# would only show the old names.
vgg_wav_lists = sorted(os.listdir(vgg_wav_path))
print(vgg_wav_lists[:5])

In [12]:
def flac_to_wav(file_name, save_file_name):
    """Convert ``file_name`` to ``save_file_name`` with ffmpeg, overwriting the output.

    ffmpeg is invoked as ``/usr/bin/ffmpeg -y -i <file_name> <save_file_name>``.
    The arguments are passed as a list, without a shell, so paths containing
    spaces or shell metacharacters are handled literally instead of being
    split or interpreted.

    Args:
        file_name: path of the input audio file.
        save_file_name: path of the output file; ``-y`` overwrites it if present.

    Returns:
        ffmpeg's exit code (0 on success). A non-zero code is returned, not raised.

    Raises:
        FileNotFoundError: if ``/usr/bin/ffmpeg`` does not exist.
    """
    completed = subprocess.run(["/usr/bin/ffmpeg", "-y", "-i", file_name, save_file_name])
    return completed.returncode

In [2]:
class HiddenPrints:
    """Context manager that discards Python-level writes to ``sys.stdout``.

    While active, ``sys.stdout`` is replaced by a handle on ``os.devnull``; the
    previous object is restored on exit. Only the Python ``sys.stdout`` object
    is swapped — the process-level file descriptor is untouched, so output
    written by child processes is not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference so __exit__ closes exactly this handle.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable even if closing fails, and close
        # the handle we opened rather than whatever sys.stdout is now.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [3]:
# Convert the first 100 FLAC clips to WAV. Each output file is named after the
# first 11 characters of the source file name, so clips sharing that prefix
# write to the same output path.
for f in tqdm(vgg_audio_lists[:100]):
    file_name = os.path.join(vgg_audio_path, f)
    save_file_name = os.path.join(vgg_wav_path, f[:11] + ".wav")
    with HiddenPrints():
        flac_to_wav(file_name, save_file_name)

In [None]:
# Smoke test: inspect one converted WAV and run it through mel_spectrogram.
test_wav_path = "/blob/v-yuancwang/DiffAudioImg/VGGSound/data/vggsound/wav/--8puiAGLhs.wav"
# Container-level metadata as reported by torchaudio.
metadata = torchaudio.info(test_wav_path)
print(metadata)

# Load the audio at a requested sample rate of 16 kHz.
wav, sr = librosa.load(test_wav_path, sr=16000)
print(len(wav), sr)
print(wav.shape)
# `wav` is rebound here from the loaded array to a float32 tensor.
wav = torch.FloatTensor(wav)
# mel_spectrogram is given a leading batch axis: shape (1, num_samples).
print(wav.unsqueeze(0).shape)
x = mel_spectrogram(wav.unsqueeze(0), n_fft=1024, num_mels=80, sampling_rate=16000,
                    hop_size=256, win_size=1024, fmin=0, fmax=8000)
# Shape and value range of the log-mel output, then the full tensor.
print(x.shape, x.max(), x.min())
print(x)

In [12]:
# Move to CPU, convert to NumPy and take the first (only) batch item.
# The recorded output shape is (80, 625): 80 mel bands by 625 frames.
spec = x.cpu().numpy()[0]
print(spec.shape)

(80, 625)
