# Extract audio features for subset of data (90 deg. rm 1 and rm 2)

In [1]:
import os 
import librosa
from glob import glob
import librosa.display 
import matplotlib.pyplot as plt
import random
import IPython.display as ipd
import fnmatch
import itertools
import numpy as np
from types import *
import pandas as pd

%matplotlib inline

In [2]:
files = os.listdir()

In [3]:
data_cols  = ['room', 'noise', 'Speaker', 'mic_id', 'mic_type', 'location', 'spk_angle']

In [4]:
tmp_ls = []
for i in np.arange(len(files)):  
    tmp_ls.append([int(files[i][19:20]), files[i][21:25], int(files[i][28:32]), files[i][51:53], files[i][54:57], 
                         files[i][58:61], files[i][64:67]])

In [5]:
df = pd.DataFrame(tmp_ls, columns = data_cols)
df.head()

Unnamed: 0,room,noise,Speaker,mic_id,mic_type,location,spk_angle
0,1,musi,5126,7,stu,beh,90
1,1,none,3549,3,stu,mid,90
2,2,babb,4331,7,stu,beh,90
3,2,tele,196,1,stu,clo,90
4,2,musi,2289,3,stu,mid,90


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4096 entries, 0 to 4095
Data columns (total 7 columns):
room         4096 non-null int64
noise        4096 non-null object
Speaker      4096 non-null int64
mic_id       4096 non-null object
mic_type     4096 non-null object
location     4096 non-null object
spk_angle    4096 non-null object
dtypes: int64(2), object(5)
memory usage: 224.1+ KB


In [7]:
# import speaker info
df_spkr = pd.read_csv('Lab41-SRI-VOiCES-speaker-gender-dataset.tbl', sep = ' ')

In [8]:
df_spkr.head()

Unnamed: 0,Speaker,Gender,DataSet
0,32,F,train-clean-100
1,83,F,train-clean-100
2,93,F,train-clean-360
3,112,F,train-clean-360
4,115,F,train-clean-360


In [9]:
df_spkr.dtypes

Speaker     int64
Gender     object
DataSet    object
dtype: object

In [10]:
df1 = pd.merge(df, df_spkr, how='left', on='Speaker')
df1.head()

Unnamed: 0,room,noise,Speaker,mic_id,mic_type,location,spk_angle,Gender,DataSet
0,1,musi,5126,7,stu,beh,90,M,train-clean-360
1,1,none,3549,3,stu,mid,90,F,train-clean-360
2,2,babb,4331,7,stu,beh,90,F,train-clean-360
3,2,tele,196,1,stu,clo,90,M,train-clean-100
4,2,musi,2289,3,stu,mid,90,M,train-clean-100


In [11]:
df1.count()

room         4096
noise        4096
Speaker      4096
mic_id       4096
mic_type     4096
location     4096
spk_angle    4096
Gender       4096
DataSet      4096
dtype: int64

## Extract feaures 

In [12]:
from collections import Iterable
def flatten(items):
    """Yield items from any nested iterable; see Reference."""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten(x):
                yield sub_x
        else:
            yield x

In [13]:
feature_cols = ['Centroid', 'variance','skewness','kurtosis',['mfcc'+str(i) for i in np.arange(12)+1], 
                'roll_off_max', 'roll_off_min']
feature_cols = list(flatten(feature_cols))
#feature_cols

In [14]:
tmp_ftr = []

#freq_range = [0, 1000]

for i in np.arange(len(files)): 
    y, sr = librosa.load(files[i])
    
    #features calculated over all time, thus taking averages over time interval (all wav file here)
    cntrd = librosa.feature.spectral_centroid(y=y, sr=sr)
    centroid_mean = np.mean(cntrd) 
    #centroid_std = np.std(cntrd)
    
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)
    mfccs_mean = np.mean(mfccs, axis = 1)
    
    bandwidth_2 =librosa.feature.spectral_bandwidth(y=y, sr=sr, p = 2)
    bandwidth_3 =librosa.feature.spectral_bandwidth(y=y, sr=sr, p = 3)
    bandwidth_4 =librosa.feature.spectral_bandwidth(y=y, sr=sr, p = 4)
    
    #sp_ft = librosa.stft(y)
    #sp_db = librosa.amplitude_to_db(abs(sp_ft))
    #ln = np.mean(sp_db[freq[0]:freq[1], :], axis = 1) # lineout selec freq over all time
    #freqs = freq_range[0]+np.arange(len(ln))*(freq_range[1]-freq_range[0])/(len(ln) -1)
    #max_f = freqs[np.where(ln == np.max(ln))[0][0]]
    
    roll_off_max = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent = 0.8)
    roll_off_min = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent = 0.1)
    
    #max_f
    feature_ls = [centroid_mean, np.mean(bandwidth_2), np.mean(bandwidth_3), 
                  np.mean(bandwidth_4), mfccs_mean, np.mean(roll_off_max), 
                  np.mean(roll_off_min)]
    feature_ls = list(flatten(feature_ls))
    

    tmp_ftr.append(feature_ls)

In [15]:
df_features = pd.DataFrame(tmp_ftr, columns = feature_cols )
df_features.head()

Unnamed: 0,Centroid,variance,skewness,kurtosis,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,roll_off_max,roll_off_min
0,1369.398345,1414.508183,1808.999734,2195.872352,-286.760703,157.717446,-37.476527,47.580181,-19.780722,8.274728,-8.022123,0.586187,-12.076546,-0.450236,-3.663054,-10.178352,2267.065004,175.390908
1,2009.319289,1612.224503,1974.578827,2318.955283,-262.874208,114.245265,-46.646418,21.524778,-24.670042,-2.313114,-16.068898,4.275575,-0.543397,5.348371,-3.721555,1.623454,3150.364701,379.463243
2,1164.829415,1162.326446,1581.974782,1992.371902,-286.703191,160.501955,-27.973229,-0.923948,-21.554743,-14.068063,-6.621132,-14.529982,-8.319861,-0.395488,-3.755954,-9.38157,1765.189656,251.926294
3,1609.693173,1432.359345,1794.042862,2140.470324,-315.849672,142.718227,-46.678627,27.265026,-22.227656,6.975141,-13.284871,-11.583312,0.705184,-6.407151,-10.479363,-7.361547,2532.617001,264.891274
4,2058.137285,1673.071654,2036.431046,2372.0221,-309.738487,125.265817,-38.425686,58.686303,-29.806671,26.878554,-15.679705,10.373796,-1.904526,6.741255,-1.635499,-3.869155,3264.408186,295.557505


In [16]:
df_features.describe()

Unnamed: 0,Centroid,variance,skewness,kurtosis,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,roll_off_max,roll_off_min
count,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0,4096.0
mean,1661.738969,1478.151514,1846.267046,2200.252933,-302.274888,135.322976,-36.769503,32.041328,-19.739777,2.489604,-14.415792,-5.868783,-7.184822,-2.619034,-5.441512,-6.379128,2681.974866,288.819853
std,323.28202,152.30083,138.954451,128.187016,31.965732,18.396419,15.806769,13.680775,10.218901,10.461162,8.563248,8.068022,6.648379,5.837547,5.346953,5.285814,524.177324,89.479639
min,873.782626,998.995973,1385.448222,1771.041971,-431.293771,71.285723,-84.662746,-13.974083,-50.055808,-36.501196,-49.173392,-27.741027,-27.199249,-23.20337,-22.153866,-21.2704,1326.987493,112.7036
25%,1437.164048,1382.914599,1758.960011,2120.981412,-323.241957,123.049466,-47.843609,22.989147,-27.217149,-4.019385,-19.813713,-11.737609,-11.643138,-6.622703,-9.245014,-10.07563,2314.868957,224.661825
50%,1636.409416,1481.397328,1844.931271,2196.717028,-302.837164,136.005029,-37.332574,31.956496,-19.265299,2.281686,-14.164433,-6.002367,-7.551522,-2.603387,-5.6504,-6.533956,2651.959041,275.733565
75%,1860.92974,1575.494467,1931.762684,2276.294073,-281.734136,148.000827,-26.203018,41.265104,-12.549569,8.498091,-8.671322,-0.331543,-3.113734,1.284646,-1.799247,-2.80561,3030.689639,336.603473
max,2704.351186,2165.136338,2515.531112,2810.649686,-180.630591,190.252638,11.590553,80.30762,16.797211,41.530383,7.255857,19.649625,17.446494,16.390635,12.080619,13.193112,4540.569513,856.673974


In [17]:
df_final = pd.concat([df1, df_features], axis=1, join='inner')
df_final.head()

Unnamed: 0,room,noise,Speaker,mic_id,mic_type,location,spk_angle,Gender,DataSet,Centroid,...,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,roll_off_max,roll_off_min
0,1,musi,5126,7,stu,beh,90,M,train-clean-360,1369.398345,...,-19.780722,8.274728,-8.022123,0.586187,-12.076546,-0.450236,-3.663054,-10.178352,2267.065004,175.390908
1,1,none,3549,3,stu,mid,90,F,train-clean-360,2009.319289,...,-24.670042,-2.313114,-16.068898,4.275575,-0.543397,5.348371,-3.721555,1.623454,3150.364701,379.463243
2,2,babb,4331,7,stu,beh,90,F,train-clean-360,1164.829415,...,-21.554743,-14.068063,-6.621132,-14.529982,-8.319861,-0.395488,-3.755954,-9.38157,1765.189656,251.926294
3,2,tele,196,1,stu,clo,90,M,train-clean-100,1609.693173,...,-22.227656,6.975141,-13.284871,-11.583312,0.705184,-6.407151,-10.479363,-7.361547,2532.617001,264.891274
4,2,musi,2289,3,stu,mid,90,M,train-clean-100,2058.137285,...,-29.806671,26.878554,-15.679705,10.373796,-1.904526,6.741255,-1.635499,-3.869155,3264.408186,295.557505


In [19]:
df_final.to_csv(path+'VOiCES_90deg_features.csv')

