In [1]:
#from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, sys, re, pickle, glob
import urllib.request
import zipfile
import soundfile


#from IPython.display import Audio
import IPython.display as ipd
from tqdm import tqdm
import librosa
#drive.mount('/content/drive')

In [203]:
def padArra(an_array):
    np.array(an_array)
    shape = np.shape(an_array)
    print(shape)
    if shape[0] < 129:
    
        padded_array = np.zeros((128))
        padded_array[:shape[0]] = an_array
        print("padded: ", padded_array.shape)
        return(padded_array.reshape(-1,1))

In [179]:
def feature_chromagram(waveform, sample_rate):
    # STFT computed here explicitly; mel spectrogram and MFCC functions do this under the hood
    stft_spectrogram=np.abs(librosa.stft(waveform))
    # Produce the chromagram for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
    chromagram=np.mean(librosa.feature.chroma_stft(S=stft_spectrogram, sr=sample_rate,hop_length=512,n_fft=2048).T,axis=0)
    return chromagram

def feature_melspectrogram(waveform, sample_rate):
    # Produce the mel spectrogram for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
    # Using 8khz as upper frequency bound should be enough for most speech classification tasks
    melspectrogram=np.mean(librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128, fmax=sample_rate, hop_length=512,n_fft=2048).T,axis=0) ###
    return melspectrogram

def feature_mfcc(waveform, sample_rate):
    # Compute the MFCCs for all STFT frames and get the mean of each column of the resulting matrix to create a feature array
    # 40 filterbanks = 40 coefficients
    mfc_coefficients=np.mean(librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40).T, axis=0) 
    return mfc_coefficients



def getPitch(x,fs,winLen=0.02):
  #winLen = 0.02 
    p = winLen*fs
    frame_length = int(2**int(p-1).bit_length())
    hop_length = frame_length//2
    f0, voiced_flag, voiced_probs = librosa.pyin(y=x, fmin=80, fmax=450, sr=fs, frame_length=frame_length,hop_length=hop_length)
    return f0,voiced_flag



def get_features(file):
    # load an individual soundfile
     with soundfile.SoundFile(file) as audio:
        waveform = audio.read(dtype="float32")
        sample_rate = audio.samplerate
        # compute features of soundfile
        chromagram = padArra(feature_chromagram(waveform, sample_rate))
        melspectrogram = padArra(feature_melspectrogram(waveform, sample_rate))
        mfc_coefficients = padArra(feature_mfcc(waveform, sample_rate))

        # my added features
        #######
        y = waveform
        sr = sample_rate
        stft_=np.abs(librosa.stft(waveform))
        #print(chromagram.shape)
        cent = padArra(np.mean( librosa.feature.spectral_centroid(y=y, sr=sr,hop_length=512,n_fft=2048).T, axis=0))
        
        contrast = padArra(np.mean( librosa.feature.spectral_contrast(S=stft_, sr=sr,hop_length=512,n_fft=2048).T, axis=0))
        #tonnetz = np.mean( librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sample_rate ,hop_length=512).T,axis=0)#
        rms = padArra(np.mean( librosa.feature.rms(y=y,frame_length=2048 ,hop_length=512).T, axis=0))
        spec_bw = padArra(np.mean( librosa.feature.spectral_bandwidth(y=y, sr=sr ,n_fft=2048, hop_length=512).T, axis=0))
        rolloff = padArra(np.mean( librosa.feature.spectral_rolloff(y=y, sr=sr ,n_fft=2048, hop_length=512).T, axis=0))
        zcr = padArra(np.mean( librosa.feature.zero_crossing_rate(y, frame_length=2048, hop_length=512).T, axis=0))
        #######
    
        #feature_matrix=np.array([])
        # use np.hstack to stack our feature arrays horizontally to create a feature matrix
        feature_matrix = np.column_stack((chromagram, np.array(melspectrogram).reshape(-1,1), mfc_coefficients , cent, contrast, rms, spec_bw, rolloff, zcr))
        
        return feature_matrix

In [211]:
def getXy(files,labels_file,scale_audio=False, onlySingleDigit=False):
    X,y =[],[]
    for file in tqdm(files):
        file = file.replace("\\", "/")
        fileID = file.split("/")[-1]
        #print(fileID)
        yi = list(labels_file[labels_file['File ID']==fileID]['digit_label'])[0]
        label = list(labels_file[labels_file['File ID']==fileID]['digit_label'])[0]
        if onlySingleDigit and yi>9:
            continue
        else:
            fs = None # if None, fs would be 22050
            x, fs = librosa.load(file,sr=fs)
            if scale_audio: x = x/np.max(np.abs(x))
            f0, voiced_flag = getPitch(x,fs,winLen=0.02)

            print( (np.sum(x**2)/len(x).type))

            
            power = np.sum(x**2)/len(x)
            pitch_mean = np.nanmean(f0) if np.mean(np.isnan(f0))<1 else 0
            pitch_std  = np.nanstd(f0) if np.mean(np.isnan(f0))<1 else 0
            voiced_fr = np.mean(voiced_flag)

            #added 
            #features = get_features(file)

            #xi = [power,pitch_mean,pitch_std,voiced_fr]
            print(features.shape)
            
            xi = np.column_stack((power,pitch_mean,pitch_std,voiced_fr,features))

            X.append(xi)
            y.append(label)
    return np.array(X),np.array(y)

In [212]:
labels = pd.read_csv('./Data/MLEnd/trainingMLEnd.csv')
labels


Unnamed: 0,File ID,digit_label,participant,intonation
0,0000000.wav,4,S73,question
1,0000001.wav,2,S88,excited
2,0000002.wav,70,S5,neutral
3,0000003.wav,2,S85,bored
4,0000004.wav,4,S30,excited
...,...,...,...,...
19995,0019995.wav,90,S163,excited
19996,0019996.wav,10,S99,question
19997,0019997.wav,90,S46,question
19998,0019998.wav,19,S13,neutral


In [213]:
files = glob.glob("./Data/MLEnd/training/Training/*.wav" )




In [214]:
X,y = getXy(files[:1],labels_file=labels,scale_audio=True, onlySingleDigit=True)

#a,b = getXy(files[:1000],labels_file=labels,scale_audio=True, onlySingleDigit=True)

  0%|          | 0/1 [00:00<?, ?it/s]


AttributeError: 'int' object has no attribute 'type'

In [196]:
X.shape

(1, 4)

In [198]:
X

array([[1.01288382e-02, 2.08882834e+02, 1.18829623e+02, 2.15277778e-01]])