This notebook prepares the dataset for training and testing. Feature embeddings are extracted as MFCC coefficients using librosa, and uninformative features are then removed. Finally, the preprocessed dataset is saved to disk.

### Loading important modules

In [None]:
import librosa
import numpy as np
import os

### Loading and preprocessing dataset

In [None]:

def load_data(main_directory):
    """Load one recording per sub-folder and return padded, flattened MFCCs.

    Expected layout: ``main_directory/<class folder>/<sub-folder>/<audio file>``.
    For every class folder, at most the first six sub-folders are visited and
    the first file inside each one is loaded.

    Parameters
    ----------
    main_directory : str
        Root folder of the dataset.

    Returns
    -------
    data : numpy.ndarray
        One row per loaded recording: the MFCC matrix, zero-padded along the
        frame axis to the longest recording, then flattened.
    labels : numpy.ndarray
        Integer label per row; the index of the class folder in sorted order.

    Raises
    ------
    ValueError
        If no recording is found under ``main_directory``.
    """
    mfccs = []
    labels = []
    label = 0
    # os.listdir() returns entries in arbitrary order; sorting makes the label
    # assignment and the choice of recordings reproducible across platforms.
    for class_name in sorted(os.listdir(main_directory)):
        class_dir = os.path.join(main_directory, class_name)
        if not os.path.isdir(class_dir):
            # Stray files (e.g. thumbnails, notes) are not classes.
            continue
        for sub_name in sorted(os.listdir(class_dir))[:6]:
            sub_dir = os.path.join(class_dir, sub_name)
            first_file = sorted(os.listdir(sub_dir))[0]
            wave, _ = librosa.load(
                os.path.join(sub_dir, first_file), mono=True, sr=None
            )
            # Keep every third sample to shrink the signal.
            wave = wave[::3]
            # NOTE(review): sr=16000 presumably assumes 48 kHz source audio
            # (48000 / 3) -- confirm against the dataset.
            # The audio must be passed as `y=`: recent librosa releases made
            # it keyword-only.
            mfccs.append(librosa.feature.mfcc(y=wave, sr=16000))
            labels.append(label)
        print(label)
        label += 1

    if not mfccs:
        raise ValueError(
            "no recordings found under {!r}".format(main_directory)
        )

    # Right-pad every MFCC matrix with zeros to a common number of frames so
    # that all flattened rows have the same length.
    max_pad_len = max(mfcc.shape[1] for mfcc in mfccs)
    data = [
        np.pad(
            mfcc,
            pad_width=((0, 0), (0, max_pad_len - mfcc.shape[1])),
            mode='constant',
        ).flatten()
        for mfcc in mfccs
    ]
    return np.array(data), np.array(labels)
# Build the raw feature matrix and label vector from the 'Data' folder.
X, y = load_data('Data')

In [None]:
# Keep only the feature columns whose mean absolute value is non-zero,
# i.e. drop columns that are zero for every sample, then cache the result.
X = X[:, np.abs(X).mean(axis=0) != 0]
np.save('X_Data.npy', X)
np.save('Y_Data.npy', y)

### Extracting important features

In [10]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

# Reload the cached feature matrix and labels from disk.
X = np.load('X_Data.npy')
y = np.load('Y_Data.npy')

# Reduce to 20 principal components, then expand them with all
# polynomial terms up to degree 2.
pca = PCA(n_components=20)
X = pca.fit_transform(X)
P = PolynomialFeatures(degree=2)
X = P.fit_transform(X)
print(X.shape)

(708, 231)


In [11]:
# Rearrange the dataset into consecutive groups of six samples
def arrange_data(X, Y, group_size=6):
    """Split consecutive rows of ``X`` and ``Y`` into fixed-size groups.

    Rows ``0..group_size-1`` form group 0, the next ``group_size`` rows form
    group 1, and so on.

    Parameters
    ----------
    X : array-like
        Samples; the first axis indexes samples.
    Y : array-like
        Labels, with the same length as ``X`` along the first axis.
    group_size : int, optional
        Number of consecutive rows per group (default 6).

    Returns
    -------
    tuple of numpy.ndarray
        ``X`` with shape ``(n_groups, group_size, ...)`` and ``Y`` with
        shape ``(n_groups, group_size, ...)``. Both are new arrays, not
        views of the inputs.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive, if ``X`` and ``Y`` differ in
        length, or if the number of rows is not a multiple of
        ``group_size`` (a partial trailing group cannot be stacked into a
        regular array).
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")
    n_rows = X.shape[0]
    if Y.shape[0] != n_rows:
        raise ValueError(
            "X and Y must have the same number of rows "
            "({} != {})".format(n_rows, Y.shape[0])
        )
    if n_rows % group_size != 0:
        raise ValueError(
            "number of rows ({}) is not a multiple of group_size ({})".format(
                n_rows, group_size
            )
        )
    n_groups = n_rows // group_size
    # copy() keeps the result independent of the caller's arrays.
    grouped_x = X.reshape((n_groups, group_size) + X.shape[1:]).copy()
    grouped_y = Y.reshape((n_groups, group_size) + Y.shape[1:]).copy()
    return grouped_x, grouped_y
# Group the flat sample matrix and labels, then report the resulting shapes.
X, y = arrange_data(X, y)
print(X.shape, y.shape)

(118, 6, 231) (118, 6)


### Generating the dataset in the required format

In [12]:
def generate_data(X, y, no_of_examples):
    """Build triplet examples labelled by whether all samples share a person.

    Every example is the concatenation of three flattened sample vectors.
    The examples are produced in three consecutive segments:

    1. ``floor(0.5 * no_of_examples)`` positives (label 1): three samples
       of the same person, persons visited round-robin.
    2. ``floor(0.25 * no_of_examples)`` negatives (label 0): two samples of
       one person (round-robin) followed by one sample of a different,
       randomly chosen person.
    3. ``floor(0.25 * no_of_examples)`` negatives (label 0): one sample from
       each of three distinct, randomly chosen persons.

    Samples within a person are drawn with replacement, so the same sample
    may appear more than once in an example.

    Parameters
    ----------
    X : numpy.ndarray
        Array indexed as ``X[person, sample, ...]``.
    y : numpy.ndarray
        Per-sample labels grouped like ``X``. Accepted for interface
        compatibility; not used to build the examples.
    no_of_examples : int
        Requested total number of examples.

    Returns
    -------
    tuple of numpy.ndarray
        The example matrix and the 0/1 label vector.

    Raises
    ------
    ValueError
        If examples are requested but ``X`` has no persons or samples, or if
        negatives are requested with fewer than three persons.
    """
    X = np.asarray(X)
    n_persons = X.shape[0] if X.ndim > 0 else 0
    n_samples = X.shape[1] if X.ndim > 1 else 0
    n_positive = int(np.floor(no_of_examples * 0.5))
    n_negative = int(np.floor(no_of_examples * 0.25))

    if (n_positive > 0 or n_negative > 0) and (n_persons == 0 or n_samples == 0):
        raise ValueError("X must contain at least one person and one sample")
    if n_negative > 0 and n_persons < 3:
        # Three distinct persons are needed for the all-different negatives;
        # without this check the sampling below could never succeed.
        raise ValueError("at least 3 persons are required to build negatives")

    def pick(person, how_many):
        """Return `how_many` random samples of `person`, flattened."""
        rows = np.random.randint(0, n_samples, how_many)
        return X[person][rows].flatten()

    x_new = []
    y_new = []

    # Segment 1: positives, all three samples from the same person.
    for k in range(n_positive):
        x_new.append(pick(k % n_persons, 3))
        y_new.append(1)

    # Segment 2: negatives, two samples from one person plus one from another.
    for k in range(n_negative):
        person = k % n_persons
        same_pair = pick(person, 2)
        # Draw uniformly from the other persons: sample from n_persons - 1
        # slots and skip over `person`, so the odd one out is guaranteed to
        # differ from the pair.
        other = np.random.randint(0, n_persons - 1)
        if other >= person:
            other += 1
        x_new.append(np.concatenate([same_pair, pick(other, 1)]))
        y_new.append(0)

    # Segment 3: negatives, one sample from each of three distinct persons.
    for _ in range(n_negative):
        persons = np.random.choice(n_persons, 3, replace=False)
        x_new.append(np.concatenate([pick(p, 1) for p in persons]))
        y_new.append(0)

    return np.array(x_new), np.array(y_new)
# Build 10000 labelled examples from the grouped data.
X, y = generate_data(X, y, 10000)

In [13]:
# Write the generated examples and their labels to disk.
np.save('final_x.npy', X)
np.save('final_y.npy', y)