In [1]:
#imports 
import sys
import numpy as np
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import librosa
import soundfile as sound

# Report the versions of the two audio libraries this notebook relies on.
for label, module in (("Librosa version = ", librosa),
                      ("Pysoundfile version = ", sound)):
    print(label, module.__version__)


Librosa version =  0.6.3
Pysoundfile version =  0.10.2


In [2]:
# Root folder of the development dataset; every other path is built from it.
BasePath = '../../Data/TAU-urban-acoustic-scenes-2020-3class-development/'

# Fold-1 split lists (evaluate / train) located under evaluation_setup/.
ValFile = BasePath + 'evaluation_setup/fold1_evaluate.csv'
TrainFile = BasePath + 'evaluation_setup/fold1_train.csv'

# Audio format used for reading clips and computing features.
num_audio_channels = 2
sr = 48000

In [3]:
  
SampleDuration = 10  # multiplied by sr to get the number of samples read per clip

#log-mel spectrogram parameters
NumFreqBins = 256 #128
NumFFTPoints = 4096 #2048
HopLength = NumFFTPoints // 4 #NumFFTPoints // 2
# librosa's STFT centres its frames by default (the signal is padded by
# n_fft // 2 on each side), so N samples produce 1 + N // hop frames for an
# even n_fft. ceil(N / hop) gives the same number only when hop does not
# divide N; it is one frame short otherwise, which would make the
# pre-allocated feature arrays the wrong size.
NumTimeBins = 1 + (SampleDuration * sr) // HopLength


In [4]:


#load filenames and labels
dev_train_df = pd.read_csv(TrainFile,sep='\t', encoding='ASCII')
dev_val_df = pd.read_csv(ValFile,sep='\t', encoding='ASCII')
wavpaths_train = dev_train_df['filename'].tolist()
wavpaths_val = dev_val_df['filename'].tolist()

# The class list is taken from the training split; np.unique returns it sorted.
ClassNames = np.unique(dev_train_df['scene_label'])
NumClasses = len(ClassNames)

# Encode both splits against the same ClassNames ordering. Category-coding
# each split on its own only yields matching integers when both splits happen
# to contain exactly the same set of labels.
y_train_labels = pd.Categorical(dev_train_df['scene_label'], categories=ClassNames).codes.copy()
y_val_labels = pd.Categorical(dev_val_df['scene_label'], categories=ClassNames).codes.copy()

# A label missing from ClassNames is coded as -1; fail loudly instead of
# passing an invalid class index downstream.
if (y_val_labels < 0).any():
    raise ValueError('validation split contains scene labels that are absent from the training split')



In [5]:
# Show how many training rows carry each scene label (class balance check).
dev_train_df['scene_label'].value_counts()

outdoor           3757
transportation    2724
indoor            2704
Name: scene_label, dtype: int64

In [5]:
# Display the unique scene labels found in the training split.
ClassNames

array(['indoor', 'outdoor', 'transportation'], dtype=object)

In [6]:
# Mel spectrograms for the training split, one per clip and audio channel.
# Array layout: (clip, mel bin, time frame, channel).
LM_train = np.zeros((len(wavpaths_train),NumFreqBins,NumTimeBins,num_audio_channels),'float32')
for i, wavpath in enumerate(wavpaths_train):
    # Read at most SampleDuration*sr samples from the start of the file.
    stereo,fs = sound.read(BasePath + wavpath,stop=SampleDuration*sr)

    # Every parameter below assumes the file is sampled at `sr`; a different
    # rate would silently change both the clip duration and the mel scale.
    if fs != sr:
        raise ValueError('%s has sample rate %d, expected %d' % (wavpath, fs, sr))

    # Single-channel files are returned as a 1-D array; give them an explicit
    # channel axis once, before the channel loop, so the indexing is uniform.
    if stereo.ndim == 1:
        stereo = np.expand_dims(stereo,-1)

    for channel in range(num_audio_channels):
        # `y` is passed by keyword because newer librosa releases no longer
        # accept it positionally. The column slice is strided, so it is copied
        # into a contiguous buffer (some librosa releases reject
        # non-contiguous audio); the sample values are unchanged.
        LM_train[i,:,:,channel]= librosa.feature.melspectrogram(
                                       y=np.ascontiguousarray(stereo[:,channel]),
                                       sr=sr,
                                       n_fft=NumFFTPoints,
                                       hop_length=HopLength,
                                       n_mels=NumFreqBins,
                                       fmin=0.0,
                                       fmax=sr/2,
                                       htk=True,
                                       norm=None)

In [7]:
# Shape of the audio array from the most recently read file.
stereo.shape

(480000, 2)

In [8]:
# Sanity check: (number of clips, NumFreqBins, NumTimeBins, num_audio_channels).
LM_train.shape

(9185, 256, 469, 2)

In [9]:

# Mel spectrograms for the validation split, one per clip and audio channel.
# Array layout: (clip, mel bin, time frame, channel).
LM_val = np.zeros((len(wavpaths_val),NumFreqBins,NumTimeBins,num_audio_channels),'float32')
for i, wavpath in enumerate(wavpaths_val):
    # Read at most SampleDuration*sr samples from the start of the file.
    stereo,fs = sound.read(BasePath + wavpath,stop=SampleDuration*sr)

    # Every parameter below assumes the file is sampled at `sr`; a different
    # rate would silently change both the clip duration and the mel scale.
    if fs != sr:
        raise ValueError('%s has sample rate %d, expected %d' % (wavpath, fs, sr))

    # Single-channel files are returned as a 1-D array; give them an explicit
    # channel axis once, before the channel loop, so the indexing is uniform.
    if stereo.ndim == 1:
        stereo = np.expand_dims(stereo,-1)

    for channel in range(num_audio_channels):
        # `y` is passed by keyword because newer librosa releases no longer
        # accept it positionally. The column slice is strided, so it is copied
        # into a contiguous buffer (some librosa releases reject
        # non-contiguous audio); the sample values are unchanged.
        LM_val[i,:,:,channel]= librosa.feature.melspectrogram(
                                       y=np.ascontiguousarray(stereo[:,channel]),
                                       sr=sr,
                                       n_fft=NumFFTPoints,
                                       hop_length=HopLength,
                                       n_mels=NumFreqBins,
                                       fmin=0.0,
                                       fmax=sr/2,
                                       htk=True,
                                       norm=None)


In [10]:
# Sanity check: (number of clips, NumFreqBins, NumTimeBins, num_audio_channels).
LM_val.shape

(4185, 256, 469, 2)

In [11]:
# Save the training features. The file name is built from the mel-bin count
# and FFT size so it always matches the parameters actually used
# (e.g. Task1b_LM_train_256_4096.npy).
np.save('Task1b_LM_train_{}_{}.npy'.format(NumFreqBins, NumFFTPoints), LM_train)

In [12]:
# Save the validation features. The file name is built from the mel-bin count
# and FFT size so it always matches the parameters actually used
# (e.g. Task1b_LM_val_256_4096.npy).
np.save('Task1b_LM_val_{}_{}.npy'.format(NumFreqBins, NumFFTPoints), LM_val)

In [None]:
#make a copy with all the data
# meta.csv sits directly under BasePath, so build the path from it rather than
# repeating the dataset directory.
All_data_df =  pd.read_csv(BasePath + 'meta.csv',sep='\t', encoding='ASCII')
# The second '-'-separated field of each filename is stored as the city.
# Iterating the Series directly replaces Series.get_values(), which newer
# pandas releases no longer provide.
All_data_df['city']=[name.split('-')[1] for name in All_data_df['filename']]

wavpaths = All_data_df['filename'].tolist()
y_labels = All_data_df['scene_label'].astype('category').cat.codes.values

# Mel spectrograms for every clip listed in meta.csv.
# Array layout: (clip, mel bin, time frame, channel).
LM = np.zeros((len(wavpaths),NumFreqBins,NumTimeBins,num_audio_channels),'float32')
for i, wavpath in enumerate(wavpaths):
    # Read at most SampleDuration*sr samples from the start of the file.
    stereo,fs = sound.read(BasePath + wavpath,stop=SampleDuration*sr)

    # Every parameter below assumes the file is sampled at `sr`; a different
    # rate would silently change both the clip duration and the mel scale.
    if fs != sr:
        raise ValueError('%s has sample rate %d, expected %d' % (wavpath, fs, sr))

    # Single-channel files are returned as a 1-D array; give them an explicit
    # channel axis once, before the channel loop, so the indexing is uniform.
    if stereo.ndim == 1:
        stereo = np.expand_dims(stereo,-1)

    for channel in range(num_audio_channels):
        # `y` is passed by keyword because newer librosa releases no longer
        # accept it positionally. The column slice is strided, so it is copied
        # into a contiguous buffer (some librosa releases reject
        # non-contiguous audio); the sample values are unchanged.
        LM[i,:,:,channel]= librosa.feature.melspectrogram(
                                           y=np.ascontiguousarray(stereo[:,channel]),
                                           sr=sr,
                                           n_fft=NumFFTPoints,
                                           hop_length=HopLength,
                                           n_mels=NumFreqBins,
                                           fmin=0.0,
                                           fmax=sr/2,
                                           htk=True,
                                           norm=None)

# File name is derived from the feature parameters
# (e.g. Task1b_LM_train_256_4096_all.npy).
np.save('Task1b_LM_train_{}_{}_all.npy'.format(NumFreqBins, NumFFTPoints), LM)