In [1]:
#from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, sys, re, pickle, glob
import urllib.request
import zipfile
import soundfile


#from IPython.display import Audio
import IPython.display as ipd
from tqdm import tqdm
import librosa
#drive.mount('/content/drive')

In [295]:
def padArra(an_array, target_len=128):
    """Zero-pad a feature vector into a fixed-length column vector.

    Parameters
    ----------
    an_array : array-like
        Values to pad. The input is flattened first, so scalars, 1-D vectors
        and ``(n, 1)`` / ``(1, 1)`` columns are all accepted.
    target_len : int, default 128
        Number of rows in the returned column.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(target_len, 1)`` holding the input values
        followed by zeros.

    Raises
    ------
    ValueError
        If the input holds more than ``target_len`` values. Raising here gives
        a clear error instead of implicitly returning ``None``.
    """
    values = np.asarray(an_array, dtype=float).ravel()
    if values.size > target_len:
        raise ValueError(
            f"cannot pad {values.size} values into a vector of length {target_len}"
        )

    padded_array = np.zeros(target_len)
    padded_array[:values.size] = values
    return padded_array.reshape(-1, 1)

In [296]:
def feature_chromagram(waveform, sample_rate):
    """Return the frame-averaged chromagram of ``waveform``.

    The STFT magnitude is computed explicitly and handed to
    ``librosa.feature.chroma_stft``; the result is transposed and averaged
    over axis 0, giving one value per chroma bin.
    """
    # Magnitude of the short-time Fourier transform (librosa defaults).
    magnitude = np.abs(librosa.stft(waveform))
    chroma_frames = librosa.feature.chroma_stft(
        S=magnitude,
        sr=sample_rate,
        hop_length=512,
        n_fft=2048,
    )
    # Collapse the frame axis so every file yields a fixed-size vector.
    return np.mean(chroma_frames.T, axis=0)

def feature_melspectrogram(waveform, sample_rate):
    """Return the frame-averaged mel spectrogram of ``waveform`` (128 mel bands).

    Parameters
    ----------
    waveform : numpy.ndarray
        Audio samples passed to librosa as ``y``.
    sample_rate : int or float
        Sampling rate of ``waveform`` in Hz.

    Returns
    -------
    numpy.ndarray
        Mean over frames of the mel spectrogram, one value per mel band.
    """
    # 8 kHz is enough for most speech classification tasks, but the upper
    # bound must never exceed the Nyquist frequency (sample_rate / 2):
    # mel filters placed above Nyquist have no FFT bins to cover and are empty.
    fmax = min(8000.0, sample_rate / 2)
    mel_frames = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=128,
        fmax=fmax,
        hop_length=512,
        n_fft=2048,
    )
    # Average over frames so every file yields a fixed-size vector.
    return np.mean(mel_frames.T, axis=0)

def feature_mfcc(waveform, sample_rate):
    """Return the frame-averaged MFCC vector of ``waveform``.

    40 coefficients are requested from ``librosa.feature.mfcc``; the result is
    transposed and averaged over axis 0, giving one value per coefficient.
    """
    mfcc_frames = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40)
    # Collapse the frame axis so every file yields a fixed-size vector.
    return np.mean(mfcc_frames.T, axis=0)



def getPitch(x,fs,winLen=0.02):
    """Estimate the pitch track of ``x`` with ``librosa.pyin``.

    ``winLen`` is the analysis window length in seconds and ``fs`` the
    sampling rate. Returns ``(f0, voiced_flag)``; the third value produced by
    ``librosa.pyin`` is discarded.
    """
    samples_per_window = winLen * fs
    # Power-of-two frame size derived from the bit length of the window size
    # in samples; consecutive frames overlap by half.
    exponent = int(samples_per_window - 1).bit_length()
    frame_length = int(2 ** exponent)
    hop_length = frame_length // 2
    f0, voiced_flag, _ = librosa.pyin(
        y=x,
        fmin=80,
        fmax=450,
        sr=fs,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return f0, voiced_flag



def get_features(file):
    """Build the padded spectral feature matrix for one sound file.

    Parameters
    ----------
    file : str
        Path of the sound file, opened with ``soundfile.SoundFile``.

    Returns
    -------
    numpy.ndarray
        Matrix with one column per feature, in this order: chromagram,
        mel spectrogram, MFCCs, spectral centroid, spectral contrast, RMS,
        spectral bandwidth, spectral roll-off, zero-crossing rate. Each column
        is the frame-averaged feature zero-padded by ``padArra``.
    """
    # Read the samples, then release the file handle before the (slow)
    # feature extraction starts.
    with soundfile.SoundFile(file) as audio:
        waveform = audio.read(dtype="float32")
        sample_rate = audio.samplerate

    # soundfile returns multi-channel audio as (frames, channels), which the
    # librosa calls below would misread. Average the channels so this matches
    # the mono signal that getXy obtains from librosa.load.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    def _frame_mean(frames):
        # Average a librosa feature over its frames and pad it to a column.
        return padArra(np.mean(frames.T, axis=0))

    chromagram = padArra(feature_chromagram(waveform, sample_rate))
    melspectrogram = padArra(feature_melspectrogram(waveform, sample_rate))
    mfc_coefficients = padArra(feature_mfcc(waveform, sample_rate))

    # Additional spectral / energy descriptors.
    stft_magnitude = np.abs(librosa.stft(waveform))
    cent = _frame_mean(librosa.feature.spectral_centroid(y=waveform, sr=sample_rate, hop_length=512, n_fft=2048))
    contrast = _frame_mean(librosa.feature.spectral_contrast(S=stft_magnitude, sr=sample_rate, hop_length=512, n_fft=2048))
    rms = _frame_mean(librosa.feature.rms(y=waveform, frame_length=2048, hop_length=512))
    spec_bw = _frame_mean(librosa.feature.spectral_bandwidth(y=waveform, sr=sample_rate, n_fft=2048, hop_length=512))
    rolloff = _frame_mean(librosa.feature.spectral_rolloff(y=waveform, sr=sample_rate, n_fft=2048, hop_length=512))
    zcr = _frame_mean(librosa.feature.zero_crossing_rate(waveform, frame_length=2048, hop_length=512))

    # Every piece is already a padded column, so stacking them side by side
    # yields one column per feature.
    feature_matrix = np.column_stack(
        (chromagram, melspectrogram, mfc_coefficients, cent, contrast, rms, spec_bw, rolloff, zcr)
    )
    return feature_matrix

In [297]:
def getXy(files,labels_file,scale_audio=False, onlySingleDigit=False):
    """Build the feature tensor ``X`` and label vector ``y`` for a list of files.

    Parameters
    ----------
    files : iterable of str
        Paths of the audio files; the last path component is the file ID.
    labels_file : pandas.DataFrame
        Table with at least the columns ``'File ID'`` and ``'digit_label'``.
    scale_audio : bool, default False
        If True, peak-normalise the signal loaded here before computing power
        and pitch. ``get_features`` re-reads the file from disk, so this
        scaling does not reach the spectral feature columns.
    onlySingleDigit : bool, default False
        If True, skip files whose label is greater than 9.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` stacks one matrix per kept file (power, pitch mean, pitch std,
        voiced fraction, then the columns from ``get_features``); ``y`` holds
        the matching labels.

    Raises
    ------
    IndexError
        If a file ID has no row in ``labels_file``.
    """
    X, y = [], []
    for file in tqdm(files):
        # Normalise Windows separators so the file ID can be split off.
        file = file.replace("\\", "/")
        fileID = file.split("/")[-1]

        # Single label lookup per file.
        matches = labels_file.loc[labels_file['File ID'] == fileID, 'digit_label']
        if matches.empty:
            raise IndexError(f"no 'digit_label' entry for file ID {fileID!r}")
        label = matches.tolist()[0]
        if onlySingleDigit and label > 9:
            continue

        # sr=None keeps the file's native sampling rate (no resampling).
        x, fs = librosa.load(file, sr=None)
        if scale_audio:
            peak = np.max(np.abs(x))
            # A silent clip has peak 0; dividing would fill x with NaNs.
            if peak > 0:
                x = x / peak
        f0, voiced_flag = getPitch(x, fs, winLen=0.02)

        # Pitch statistics are only defined when at least one frame has a pitch.
        has_pitch = not np.isnan(f0).all()

        power = padArra(np.array(np.sum(x**2) / len(x)).reshape(-1, 1))
        pitch_mean = padArra(np.array(np.nanmean(f0) if has_pitch else 0).reshape(-1, 1))
        pitch_std = padArra(np.array(np.nanstd(f0) if has_pitch else 0).reshape(-1, 1))
        voiced_fr = padArra(np.array(np.mean(voiced_flag)).reshape(-1, 1))

        features = get_features(file)

        # Four scalar columns followed by the spectral feature columns.
        xi = np.column_stack((power, pitch_mean, pitch_std, voiced_fr, features))

        X.append(xi)
        y.append(label)
    return np.array(X), np.array(y)

In [298]:
# Load the training label table; getXy matches on its 'File ID' column and
# reads the target from 'digit_label'.
labels = pd.read_csv('./Data/MLEnd/trainingMLEnd.csv')
labels


Unnamed: 0,File ID,digit_label,participant,intonation
0,0000000.wav,4,S73,question
1,0000001.wav,2,S88,excited
2,0000002.wav,70,S5,neutral
3,0000003.wav,2,S85,bored
4,0000004.wav,4,S30,excited
...,...,...,...,...
19995,0019995.wav,90,S163,excited
19996,0019996.wav,10,S99,question
19997,0019997.wav,90,S46,question
19998,0019998.wav,19,S13,neutral


In [299]:
# glob returns paths in an OS-dependent order; sort them so that slices such
# as files[:500] select the same files on every machine and every run.
files = sorted(glob.glob("./Data/MLEnd/training/Training/*.wav" ))




In [301]:
# Build the dataset from the first 500 files only. With onlySingleDigit=True,
# getXy skips every file whose label is greater than 9, so fewer than 500
# samples are returned.
X,y = getXy(files[:500],labels_file=labels,scale_audio=True, onlySingleDigit=True)

#a,b = getXy(files[:1000],labels_file=labels,scale_audio=True, onlySingleDigit=True)

100%|██████████| 500/500 [01:16<00:00,  6.53it/s]


In [302]:
# Sanity check: (kept files, 128 padded rows, 13 columns = 4 scalar columns
# from getXy + 9 feature columns from get_features).
X.shape

(166, 128, 13)

In [303]:
# One label per kept file; the length must equal X.shape[0].
y.shape

(166,)

In [304]:
# Inspect the labels; only values 0-9 remain because onlySingleDigit=True.
y

array([4, 2, 2, 4, 2, 1, 3, 6, 9, 9, 2, 5, 4, 7, 9, 1, 9, 9, 1, 9, 5, 4,
       5, 6, 3, 9, 7, 3, 5, 0, 6, 4, 9, 9, 0, 9, 0, 7, 3, 9, 1, 7, 1, 6,
       6, 3, 9, 9, 4, 7, 6, 8, 2, 8, 2, 3, 4, 1, 1, 0, 1, 3, 5, 5, 5, 1,
       1, 0, 2, 0, 2, 8, 2, 4, 2, 4, 9, 3, 6, 2, 5, 0, 1, 5, 7, 4, 8, 8,
       8, 6, 6, 0, 1, 4, 4, 6, 3, 3, 1, 8, 2, 5, 9, 3, 5, 6, 5, 0, 3, 6,
       8, 3, 3, 6, 3, 1, 4, 1, 6, 6, 8, 4, 1, 4, 8, 3, 9, 4, 6, 5, 6, 9,
       3, 8, 3, 4, 0, 2, 1, 3, 7, 9, 9, 9, 3, 4, 8, 7, 1, 7, 8, 0, 7, 1,
       6, 1, 8, 3, 4, 3, 3, 6, 3, 3, 7, 6])

array([[[1.01288382e-02, 2.08882834e+02, 1.18829623e+02, ...,
         2.19799573e+03, 4.69393921e+03, 1.59003364e-01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],

       [[3.53392653e-02, 1.57289085e+02, 3.17844155e+01, ...,
         2.43435264e+03, 5.33570107e+03, 1.55646073e-01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 

In [307]:
import torch
import torchvision
import torch.nn.functional as F
import torch.nn as nn
import time
import torch.optim as optim

In [308]:
# Demonstrate that `Tensor.view` returns a tensor backed by the same memory.
t = torch.rand(4, 4)
b = t.view(2, 8)
# `t` and `b` share the same underlying data. This is asserted because a bare
# comparison in the middle of a cell is evaluated and then thrown away (only
# the last expression of a cell is displayed).
# NOTE(review): newer PyTorch releases flag Tensor.storage() as deprecated in
# favour of untyped_storage() - confirm against the installed version.
assert t.storage().data_ptr() == b.storage().data_ptr()
# Modifying view tensor changes base tensor as well.
b[0][0] = 3.14
t[0][0]


tensor(3.1400)

In [312]:
# Start from an all-zero tensor of shape (2, 3, 4).
a = torch.zeros(2, 3, 4)

# Collapse the last two dimensions twice: once through `view`,
# once through `torch.flatten`.
b = a.view(2, 12)
c = torch.flatten(a, start_dim=1)

print(a)
# Write a different marker value through each flattened tensor.
b[0, 2] = 1
c[0, 4] = 2

# Report, pair by pair, whether the tensors still hold identical data.
print("Tensors A and B data match?", torch.equal(a.view(-1), b.view(-1)))
print("Tensors A and C data match?", torch.equal(a.view(-1), c.view(-1)))
print("Tensors B and C data match?", torch.equal(b.view(-1), c.view(-1)))

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])
Tensors A and B data match? True
Tensors A and C data match? True
Tensors B and C data match? True
