In [1]:

import shutil
import pandas as pd
import subprocess 
import os

import librosa
import numpy as np
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

import matplotlib.pyplot as plt
import math

In [9]:
# Load the validated-clips metadata and keep the rows whose sentence contains
# one of the target words (substring match, as before).
df = pd.read_csv('../../data/validated.tsv', delimiter='\t')
# na=False: a missing sentence counts as "no match" instead of putting NaN in
# the boolean mask, which pandas refuses to index with.
df = df[df['sentence'].str.contains(r'un|deux|trois|quatre|oui|non', na=False)]

# shutil.copy does not create the destination folder, so make sure it exists.
os.makedirs("../../data/clips", exist_ok=True)
for clip_name in df['path']: # keep only files which contain the words we are interested in
    shutil.copy("../../data/all_clips/" + clip_name, "../../data/clips/" + clip_name)

# regex=False: replace the literal ".mp3"; as a regex the dot would match any character.
df["path"] = df["path"].str.replace(".mp3", ".wav", regex=False)
df = df.drop(columns=['client_id', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])
df.to_csv('../../data/data.csv', index=False)

In [2]:
# Both clip folders live under the same data root.
data_root = '../../data'
directory = data_root + '/clips'          # MP3 clips
wav_directory = data_root + '/clips_wav'  # WAV conversions of those clips

In [14]:
# Convert every file found in `directory` to WAV with ffmpeg, writing the
# result under `wav_directory` with the same base name.
os.makedirs(wav_directory, exist_ok=True)  # ffmpeg does not create missing output folders

failed_conversions = []
for filename in os.listdir(directory):
    input_file = os.path.join(directory, filename)
    if not os.path.isfile(input_file):
        continue
    # Swap only the real extension, not any ".mp3" substring inside the name.
    stem, _ = os.path.splitext(filename)
    output_file = os.path.join(wav_directory, stem + ".wav")
    return_code = subprocess.call(['ffmpeg', '-v', '0', '-y', '-i', input_file, output_file])
    # '-v 0' silences ffmpeg, so the exit status is the only sign of a failure.
    if return_code != 0:
        failed_conversions.append(filename)

if failed_conversions:
    print(f'ffmpeg failed on {len(failed_conversions)} file(s), e.g. {failed_conversions[:5]}')
  

In [3]:
# Report how many entries each clip folder contains.
mp3_count = len(os.listdir(directory))
print(f'mp3 files: {mp3_count}')

wav_count = len(os.listdir(wav_directory))
print(f'wav files: {wav_count}')

mp3 files: 10767
wav files: 8652


In [4]:
def audio_to_power_spectrogram(audio_file, sample_rate=22050, n_fft=2048, hop_length=512):
    """Load an audio file and compute its power spectrogram.

    Args:
        audio_file: Path of the audio file handed to ``librosa.load``.
        sample_rate: Sampling rate passed to ``librosa.load`` as ``sr``.
        n_fft: FFT size passed to ``librosa.stft``.
        hop_length: Hop between STFT frames passed to ``librosa.stft``.

    Returns:
        A ``(spectrogram, y)`` tuple: the squared magnitude of the STFT and
        the waveform returned by ``librosa.load``.
    """
    waveform, _ = librosa.load(audio_file, sr=sample_rate)
    stft_magnitude = np.abs(librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length))
    return stft_magnitude ** 2, waveform

def audio_to_mel_spectrogram_db(audio_file, sample_rate=22050, n_fft=2048, hop_length=512):
    """Compute the mel spectrogram of an audio file, expressed in decibels.

    The power spectrogram comes from ``audio_to_power_spectrogram`` and is
    handed to ``librosa.feature.melspectrogram`` through its ``S`` argument.

    Args:
        audio_file: Path of the audio file to analyse.
        sample_rate: Sampling rate used for loading and for the mel projection.
        n_fft: FFT size; also forwarded as ``win_length``.
        hop_length: Hop between STFT frames.

    Returns:
        The result of ``librosa.power_to_db`` applied to the mel spectrogram
        with ``ref=np.max``.
    """
    power_spectrogram, waveform = audio_to_power_spectrogram(audio_file, sample_rate, n_fft, hop_length)
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        S=power_spectrogram,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
    )
    # ref=np.max: decibel values are expressed relative to this spectrogram's maximum.
    return librosa.power_to_db(mel_power, ref=np.max)

In [20]:
# Sanity check on one clip: print the shape of its power spectrogram.
file = '../../data/clips_wav/common_voice_fr_21894154.wav'
# 22050 Hz matches the helpers' default rate; the previous 22020 looked like a typo.
# Re-run this cell to refresh the printed shape.
spectre = audio_to_power_spectrogram(file, sample_rate=22050, n_fft=512, hop_length=512)[0]
print(spectre.shape)

(257, 123)


In [5]:
# Reload the filtered metadata from data.csv and preview its first six rows.
df = pd.read_csv('../../data/data.csv', sep=',')
df.head(6)

Unnamed: 0,path,sentence
0,common_voice_fr_22098482.wav,trois
1,common_voice_fr_21955578.wav,quatre
2,common_voice_fr_22500710.wav,un
3,common_voice_fr_21964070.wav,non
4,common_voice_fr_22357111.wav,trois
5,common_voice_fr_22591161.wav,oui


In [6]:
# STFT settings, expressed in samples.
# NOTE(review): the spectrogram cells visible in this notebook call the helpers
# with their own defaults and never pass n_fft / hop_length - confirm whether
# these values are meant to be forwarded.
sample_rate = 22050
n_fft = 512       # window in num. of samples
hop_length = 512  # in num. of samples

# Attach the on-disk location of each clip in both formats.
df = pd.read_csv('../../data/data.csv')
clip_names = df['path']
df['path_wav'] = clip_names.map(lambda name: '../../data/clips_wav/' + name)
df['path_mp3'] = clip_names.map(lambda name: '../../data/clips/' + name.replace(".wav", ".mp3"))

In [7]:
def pad(spectrograms, value=0):
    """Pad 2-D tensors so that they all share the largest height and width.

    Each tensor is extended on the bottom (dimension 0) and on the right
    (dimension 1) with a constant until it reaches the largest size found
    along each dimension.

    Args:
        spectrograms: Iterable of 2-D ``torch.Tensor`` objects.
        value: Constant used to fill the padded area. Defaults to 0, the
            historical behaviour. For decibel spectrograms scaled so that the
            maximum is 0 dB, a low value (e.g. -80) is probably more
            appropriate - verify against the consumer of the tensors.

    Returns:
        A list of padded tensors, in the same order as the input. An empty
        input yields an empty list.
    """
    # Materialise once so one-shot iterables (generators) can be traversed twice.
    spectrograms = list(spectrograms)

    max_x = max((s.shape[0] for s in spectrograms), default=0)
    max_y = max((s.shape[1] for s in spectrograms), default=0)

    print('max shape 0 size : ', max_x)
    print('max shape 1 size : ', max_y)

    # F.pad takes (left, right, top, bottom) for the last two dimensions.
    return [
        F.pad(
            input=s,
            pad=(0, max_y - s.shape[1], 0, max_x - s.shape[0]),
            mode='constant',
            value=value,
        )
        for s in spectrograms
    ]

def spectrogram_to_tensor_save(spectrogram, filename, dtype=torch.float16, directory='../../data'):
    """Convert spectrograms to tensors, pad them to a common shape and save them.

    Args:
        spectrogram: Iterable of array-like spectrograms; each is converted
            with ``torch.tensor``.
        filename: Base name of the output file, without the ``.pt`` extension.
        dtype: Torch dtype of the stored tensors. Defaults to ``torch.float16``.
            NOTE(review): float16 cannot hold finite values above 65504; pass
            ``torch.float32`` if the spectrogram values can exceed that -
            verify against the data.
        directory: Folder that receives ``<filename>.pt``. Defaults to
            ``'../../data'``.
    """
    tensors = [torch.tensor(data, dtype=dtype) for data in spectrogram]
    # pad() brings every tensor to the same shape, which torch.stack requires.
    stacked = torch.stack(pad(tensors))
    torch.save(stacked, f'{directory}/{filename}.pt')
    


In [12]:

# Benchmark: time the power-spectrogram computation over every WAV clip.
# perf_counter is a monotonic clock, so the measurement is not affected by
# system clock adjustments the way time.time() is.
start_time = time.perf_counter()
spectrogram_wav = df['path_wav'].apply(lambda x: audio_to_power_spectrogram(x)[0])
spectrogram_wav_time = time.perf_counter() - start_time

print(f'wav to spectrogram : {spectrogram_wav_time} seconds')

# Uncomment to pad, stack and save the spectrograms:
# spectrogram_to_tensor_save(spectrogram_wav, 'spectrogram_wav')

# clear variable for RAM space
del spectrogram_wav

wav to spectrogram : 53.9186429977417 seconds


In [13]:
# Benchmark: time the dB mel-spectrogram computation over every WAV clip.
# perf_counter is a monotonic clock, so the measurement is not affected by
# system clock adjustments the way time.time() is.
start_time = time.perf_counter()
mel_spectrogram_wav = df['path_wav'].apply(audio_to_mel_spectrogram_db)
mel_spectrogram_wav_time = time.perf_counter() - start_time

print(f'wav to mel spectrogram : {mel_spectrogram_wav_time} seconds')

# Uncomment to pad, stack and save the spectrograms:
# spectrogram_to_tensor_save(mel_spectrogram_wav, 'mel_spectrogram_wav')

# clear variable for RAM space
del mel_spectrogram_wav

wav to mel spectrogram : 86.52624773979187 seconds


In [14]:
# Benchmark: time the power-spectrogram computation over every MP3 clip.
# perf_counter is a monotonic clock, so the measurement is not affected by
# system clock adjustments the way time.time() is.
start_time = time.perf_counter()
spectrogram_mp3 = df['path_mp3'].apply(lambda x: audio_to_power_spectrogram(x)[0])
spectrogram_mp3_time = time.perf_counter() - start_time

print(f'mp3 to spectrogram : {spectrogram_mp3_time} seconds')

# Uncomment to pad, stack and save the spectrograms:
# spectrogram_to_tensor_save(spectrogram_mp3, 'spectrogram_mp3')

# clear variable for RAM space
del spectrogram_mp3

mp3 to spectrogram : 68.04297113418579 seconds


In [15]:
# Benchmark: time the dB mel-spectrogram computation over every MP3 clip.
# perf_counter is a monotonic clock, so the measurement is not affected by
# system clock adjustments the way time.time() is.
start_time = time.perf_counter()
mel_spectrogram_mp3 = df['path_mp3'].apply(audio_to_mel_spectrogram_db)
mel_spectrogram_mp3_time = time.perf_counter() - start_time

print(f'mp3 to mel spectrogram : {mel_spectrogram_mp3_time} seconds')

# Uncomment to pad, stack and save the spectrograms:
# spectrogram_to_tensor_save(mel_spectrogram_mp3, 'mel_spectrogram_mp3')

# clear variable for RAM space
del mel_spectrogram_mp3

mp3 to mel spectrogram : 101.2370855808258 seconds
