# RANDOM CROP - MLP

## Imports and data loading from .csv file

In [1]:
import csv
import os
import random
import time
from random import randrange

import librosa as lb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sn  #Per heatmap
from IPython.display import display

#sklearn
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder,OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

Import data from csv file

In [2]:
# Read the metadata table, using the first CSV column as the row index.
data = pd.read_csv('./data/development.csv', index_col=0)
# Keep only the second '/'-separated component of every path
# (the bare file name, e.g. '1.wav', in the table displayed below).
data['path'] = data['path'].apply(lambda full_path: full_path.split('/')[1])
display(data)

Unnamed: 0_level_0,sampling_rate,age,gender,ethnicity,mean_pitch,max_pitch,min_pitch,jitter,shimmer,energy,zcr_mean,spectral_centroid_mean,tempo,hnr,num_words,num_characters,num_pauses,silence_duration,path
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,22050,24.0,female,arabic,1821.69060,3999.7170,145.43066,0.013795,0.082725,0.002254,0.210093,3112.257251,[151.99908088],-123.999726,69,281,39,23.846893,1.wav
1,22050,22.5,female,hungarian,1297.81870,3998.8590,145.37268,0.025349,0.096242,0.007819,0.078849,1688.016389,[129.19921875],-86.928478,69,281,21,19.388662,2.wav
2,22050,22.0,female,portuguese,1332.85240,3998.8025,145.42395,0.019067,0.119456,0.002974,0.105365,2576.901706,[117.45383523],-98.450670,69,281,1,21.640998,3.wav
3,22050,22.0,female,english,1430.34990,3998.4510,147.98083,0.017004,0.102389,0.022371,0.173701,3269.751413,[117.45383523],-56.459762,69,281,9,19.644127,4.wav
4,22050,22.0,male,dutch,1688.72340,3998.6113,145.44772,0.028027,0.124831,0.005369,0.107279,1930.897375,[112.34714674],-80.349204,69,281,11,18.041905,5.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928,22050,24.0,male,english,1641.14930,3999.1616,145.39359,0.023647,0.115361,0.001879,0.111799,2188.853478,[184.5703125],-100.921055,69,281,11,17.461406,2929.wav
2929,22050,15.0,female,igbo,1089.60050,3984.6550,145.58409,0.015317,0.126740,0.000339,0.070508,2712.362323,[83.35433468],6.757283,0,0,1,1.509206,2930.wav
2930,22050,17.0,female,igbo,994.46484,3989.1785,148.97475,0.009677,0.103535,0.001464,0.058442,2248.698477,[89.10290948],-53.913449,1,9,1,1.645034,2931.wav
2931,22050,18.0,male,arabic,1600.00820,3999.7559,145.36101,0.019571,0.100946,0.004451,0.115139,1834.596924,[143.5546875],-96.143090,69,281,19,16.346848,2932.wav


## Sentence subgroups identification (grouping by `num_words` and `num_characters`)

In [3]:
# One (num_words, num_characters) pair per recording; presumably the pair
# identifies which sentence was read -- TODO confirm.
sentence_feat = data[['num_words', 'num_characters']].to_numpy()
# uniq:  distinct pairs, sorted row-wise
# ind:   for every recording, the position of its pair inside uniq
# count: number of recordings sharing each pair
uniq, ind, count = np.unique(
    sentence_feat,
    axis=0,
    return_inverse=True,
    return_counts=True,
)
display(count)
print(uniq[-1])

array([ 409,    2,    1,   39,    8,    5,    2,    3,    3,    3,    1,
          8,    5,   19,    9,   16,    7,    7,    1,    4,    1,    2,
          9,    8,   27,   66,   26,    6,    5,    7,    2,   58,    1,
         52,   56,    6,   51,    5,    9,    4,    8,    4,    1,    2,
          3,    3,   15,    1,    6,    2,   28,    3,   55,   28,    3,
          1,   44,    1,    1,    5,    4,    5,    1,    1,    1,    2,
          6,    3,   11,   16,    1,    1,    1,    1,    1,    1, 1710])

[ 69 281]


#### Divide between long and short audios

In [4]:
# The long recordings are the ones belonging to the last row of `uniq`
# (printed above as [69 281]). Derive that group index from `uniq` instead of
# hard-coding 76, so the cell keeps working if the number of distinct
# sentences in the CSV changes.
long_group = len(uniq) - 1
# The 'ethnicity' column is reused as a long(1)/short(0) flag. Replacing the
# whole column in one assignment avoids writing integers into a string column
# through .loc; np.ravel guards against a 2-D inverse array from np.unique.
data['ethnicity'] = (np.ravel(ind) == long_group).astype(int)
# Drop the sentence descriptors (they are constant within the long group) and
# the remaining columns that are not used as model features.
clean_data = data.drop(columns=['num_words','num_characters','num_pauses','sampling_rate','max_pitch','min_pitch'])
# 'tempo' is stored as a bracketed string such as '[151.99908088]': strip the brackets.
clean_data['tempo'] = clean_data['tempo'].map(lambda x: float(x[1:-1]))
# Binary gender encoding: 'male' -> 1, every other value -> 0.
clean_data['gender'] = clean_data['gender'].map(lambda x: 1 if x=='male' else 0)
display(clean_data)

Unnamed: 0_level_0,age,gender,ethnicity,mean_pitch,jitter,shimmer,energy,zcr_mean,spectral_centroid_mean,tempo,hnr,silence_duration,path
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,24.0,0,1,1821.69060,0.013795,0.082725,0.002254,0.210093,3112.257251,151.999081,-123.999726,23.846893,1.wav
1,22.5,0,1,1297.81870,0.025349,0.096242,0.007819,0.078849,1688.016389,129.199219,-86.928478,19.388662,2.wav
2,22.0,0,1,1332.85240,0.019067,0.119456,0.002974,0.105365,2576.901706,117.453835,-98.450670,21.640998,3.wav
3,22.0,0,1,1430.34990,0.017004,0.102389,0.022371,0.173701,3269.751413,117.453835,-56.459762,19.644127,4.wav
4,22.0,1,1,1688.72340,0.028027,0.124831,0.005369,0.107279,1930.897375,112.347147,-80.349204,18.041905,5.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928,24.0,1,1,1641.14930,0.023647,0.115361,0.001879,0.111799,2188.853478,184.570312,-100.921055,17.461406,2929.wav
2929,15.0,0,0,1089.60050,0.015317,0.126740,0.000339,0.070508,2712.362323,83.354335,6.757283,1.509206,2930.wav
2930,17.0,0,0,994.46484,0.009677,0.103535,0.001464,0.058442,2248.698477,89.102909,-53.913449,1.645034,2931.wav
2931,18.0,1,1,1600.00820,0.019571,0.100946,0.004451,0.115139,1834.596924,143.554688,-96.143090,16.346848,2932.wav


In [5]:
# At this point 'ethnicity' holds the long(1)/short(0) flag: split on it and
# drop the flag, which is constant within each subset.
data_long = clean_data.loc[clean_data['ethnicity'].eq(1)].drop(columns='ethnicity')
data_short = clean_data.loc[clean_data['ethnicity'].eq(0)].drop(columns='ethnicity')

In [6]:
# Seed passed as random_state to train_test_split and to MLPRegressor.
RANDOM_SEED = 42
# Sample rate handed to load_data and to lb.feature.mfcc (sr=...); it matches
# the value shown in the 'sampling_rate' column of the metadata table.
SAMPLE_RATE = 22050

#### Divide in train and validation sets

In [7]:
# Shuffled 80/20 train/validation split of the long recordings, seeded with RANDOM_SEED.
data_long_train, data_long_val = train_test_split(data_long, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

## Data extraction - Functions

In [8]:
def silence_removal(audio_array, threshold=60, ref=np.max, aggregate=np.max):
    """Remove the samples that are quieter than ``threshold`` dB below ``ref``.

    Parameters
    ----------
    audio_array : np.ndarray
        Audio samples. The last axis is the one that gets filtered; any
        leading axes (e.g. channels) are reduced with ``aggregate`` to decide
        which positions to keep.
    threshold : number
        A position is kept when its level is above ``-threshold`` dB.
    ref : scalar or callable
        Reference forwarded unchanged to ``lb.core.amplitude_to_db``.
    aggregate : callable
        Reduction applied over the leading axes; only used when the input has
        more than one dimension.

    Returns
    -------
    np.ndarray
        ``audio_array`` restricted, along its last axis, to the kept positions.
    """
    db = lb.core.amplitude_to_db(audio_array, ref=ref, top_db=None)
    if db.ndim > 1:
        # Collapse every leading axis so that one level per time position remains.
        leading_axes = range(db.ndim - 1)
        db = np.apply_over_axes(aggregate, db, leading_axes)
        db = np.squeeze(db, axis=tuple(leading_axes))
    keep = np.flatnonzero(db > -threshold)
    # `keep` holds positions on the last axis, so index that axis explicitly.
    # For 1-D input this is identical to audio_array[keep]; for multi-channel
    # input, plain audio_array[keep] would index the channel axis instead.
    return audio_array[..., keep]

In [9]:
def load_data(audio_files, folder_path, sample_rate,silence_threshold=60):
    """Load ``.wav`` files, trim their edges and remove the silent samples.

    Parameters
    ----------
    audio_files : iterable of str
        File names; entries that do not end with ``".wav"`` are skipped.
    folder_path : str
        Directory that contains the files.
    sample_rate : int
        Sample rate requested from ``lb.load``.
    silence_threshold : number
        Threshold (dB) forwarded to ``silence_removal``.

    Returns
    -------
    (dict, np.ndarray)
        A dict mapping each loaded file name to its processed samples, and an
        array with the number of samples left in each processed recording
        (same order as the dict).
    """
    time_stamps = []
    audio_arrays = {}
    for file_name in audio_files:
        if not file_name.endswith(".wav"):
            continue
        # Pass the requested rate explicitly: without sr=, lb.load falls back
        # to its own default and the sample_rate argument is ignored.
        samples, _ = lb.load(os.path.join(folder_path, file_name), sr=sample_rate)
        # Trim leading and trailing silence
        trimmed, _ = lb.effects.trim(samples)
        # Remove silence within the audio. The reference level stays the
        # (signed) maximum sample of the trimmed signal; `aggregate` must be
        # a callable, so np.max is passed rather than a scalar.
        voiced = silence_removal(
            trimmed,
            threshold=silence_threshold,
            ref=np.max(trimmed),
            aggregate=np.max,
        )
        time_stamps.append(len(voiced))
        audio_arrays[file_name] = voiced
    return (audio_arrays, np.array(time_stamps))

In [10]:
def standardize_time_mfcc(mfcc,num_buckets):
    """Reduce the time axis of an MFCC matrix to exactly ``num_buckets`` columns.

    The frames are split into ``num_buckets`` consecutive groups of
    ``n_frames // num_buckets`` frames each and every group is replaced by its
    mean. Frames left over at the end (``n_frames % num_buckets``) are ignored.
    With fewer frames than buckets the group width is 0 and ``range`` raises
    ``ValueError``.

    Returns an array of shape ``(mfcc.shape[0], num_buckets)``.
    """
    n_frames = mfcc.shape[1]
    width = n_frames // num_buckets
    usable = n_frames - n_frames % num_buckets
    bucket_means = [
        np.mean(mfcc[:, start:start + width], axis=1).flatten()
        for start in range(0, usable, width)
    ]
    # One row per bucket so far; transpose back to (coefficients, buckets).
    return np.array(bucket_means).transpose()

In [11]:
def standardize_all(mfcc_dict,num_buckets):
    """Apply ``standardize_time_mfcc`` to every entry of ``mfcc_dict``.

    Returns a new dict with the same keys, each MFCC matrix reduced to
    ``num_buckets`` time columns.
    """
    return {
        name: standardize_time_mfcc(mfcc, num_buckets)
        for name, mfcc in mfcc_dict.items()
    }

In [12]:
def reshape_all(mfcc_dict):
    """Flatten every MFCC matrix into a single row of a DataFrame.

    The resulting frame has one row per dict key (used as the index) and one
    column per element of the flattened matrix.
    """
    rows = {name: mfcc.flatten() for name, mfcc in mfcc_dict.items()}
    return pd.DataFrame(rows).transpose()

In [13]:
def flatten_all(mfcc_dict):
    """Average every MFCC matrix along its time axis (axis 1).

    Returns a DataFrame with one row per dict key (used as the index) and one
    column per MFCC coefficient.
    """
    time_means = {name: mfcc.mean(axis=1).flatten() for name, mfcc in mfcc_dict.items()}
    return pd.DataFrame(time_means).transpose()

In [14]:
def random_crop(mfcc_dict,crop_dim, rng=None):
    """Cut one random window of ``crop_dim`` consecutive frames from each MFCC.

    Parameters
    ----------
    mfcc_dict : dict
        Maps a name to a 2-D MFCC array whose second axis is cropped.
    crop_dim : int
        Number of frames in every crop.
    rng : object with a ``randrange`` method, optional
        Source of randomness (e.g. ``random.Random(seed)``) for reproducible
        crops. When omitted, the module-level ``randrange`` is used exactly as
        before, so seeding the ``random`` module also works.

    Returns
    -------
    dict
        Same keys as ``mfcc_dict``; each value has ``crop_dim`` columns.

    Raises
    ------
    ValueError
        If an MFCC has fewer than ``crop_dim`` frames; the message names the
        offending entry.
    """
    draw = randrange if rng is None else rng.randrange
    random_mfccs = {}
    for name, mfcc in mfcc_dict.items():
        n_frames = mfcc.shape[1]
        if n_frames < crop_dim:
            # randrange would raise an anonymous "empty range" ValueError here;
            # report which recording is too short instead.
            raise ValueError(
                f"cannot crop {crop_dim} frames from '{name}': it only has {n_frames}"
            )
        start = draw(n_frames - crop_dim + 1)
        random_mfccs[name] = mfcc[:, start:start + crop_dim]
    return random_mfccs

In [15]:
def divide_audio(mfcc,crop_dim):
    """Divide an MFCC matrix into consecutive crops of ``crop_dim`` frames.

    Frames left over after the last full crop are discarded. Returns a list of
    arrays, each with ``crop_dim`` columns. When the matrix has fewer than
    ``crop_dim`` frames, ``np.split`` is asked for zero sections and raises.
    """
    n_crops = mfcc.shape[1] // crop_dim
    usable = n_crops * crop_dim
    return np.split(mfcc[:, :usable], n_crops, axis=1)

In [16]:
def create_evaluation_input(mfcc_dict, crop_dim, features_path):
    """Build the evaluation matrix: one row per fixed-length crop of every MFCC.

    Each MFCC is divided with ``divide_audio``; every crop is averaged along
    its time axis and joined (on ``'path'``) with the per-file columns of
    ``features_path``.

    Parameters
    ----------
    mfcc_dict : dict
        Maps a file name to its 2-D MFCC array.
    crop_dim : int
        Number of frames per crop.
    features_path : pd.DataFrame
        Per-file table containing a ``'path'`` column.

    Returns
    -------
    (pd.Series, pd.DataFrame)
        The file name of every row, and the feature rows themselves with
        string column labels sorted alphabetically. Both are taken from the
        same joined frame, so position i of the Series always describes row i
        of the DataFrame.
    """
    names=[]
    evaluation_input = {}
    count=0
    for name, mfcc in mfcc_dict.items():
        for crop in divide_audio(mfcc, crop_dim):
            evaluation_input[count]=crop.mean(axis=1).flatten()
            names.append(name)
            count+=1
    evaluation_df = pd.DataFrame(evaluation_input).T.sort_index()
    evaluation_df['path']=names
    joined = evaluation_df.join(features_path.set_index('path'), on='path', how='right')
    # A right join orders its rows by features_path, not by evaluation_df.
    # Reading the paths from the joined frame (instead of evaluation_df) keeps
    # them aligned with the feature rows whatever the two orders are, so the
    # per-file averaging of predictions downstream cannot mix up files.
    paths = joined['path']
    joined_data = joined.drop(columns=['path'])
    joined_data.columns = joined_data.columns.astype(str)
    return paths, joined_data.sort_index(axis=1)

In [17]:
def find_score(data,paths,y_pred,name='-'):
    """Average the per-crop predictions of each file and score them against ``age``.

    ``paths`` and ``y_pred`` are matched by position. Predictions are averaged
    per ``'path'``, joined onto ``data`` and compared with ``data['age']``
    through the root mean squared error, which is also printed.

    Returns a tuple ``(score, per-file predictions as a Series named 'Pred')``.
    """
    per_crop = pd.concat(
        [paths.reset_index(drop=True), pd.Series(y_pred, name='Pred')],
        axis=1,
    )
    per_file = per_crop.groupby('path').mean()
    scored = data.join(per_file, on='path')
    score = root_mean_squared_error(scored['age'], scored['Pred'])
    print(f"\tRoot mean squared error ({name}): ",score)
    return score, scored['Pred']

Load audio files

In [18]:
# Training recordings: file names taken from the metadata, then loaded and de-silenced.
audio_files_train = data_long_train['path'].tolist()
audio_arrays_train, time_sampl_train = load_data(
    audio_files_train,
    '../data/audios_development/',
    SAMPLE_RATE,
)

# Validation recordings, processed the same way.
audio_files_val = data_long_val['path'].tolist()
audio_arrays_val, time_sampl_val = load_data(
    audio_files_val,
    '../data/audios_development/',
    SAMPLE_RATE,
)

In [19]:
#audio_files_train_age = data_long_train[(data_long_train['age']<18) | (data_long_train['age']>70)]['path'].values.tolist()
#audio_arrays_train_age, time_sampl_train_age = load_data(audio_files_train_age, '../data/audios_development/', SAMPLE_RATE)

#audio_files_val_age = data_long_val[(data_long_val['age']<18) | (data_long_val['age']>70)]['path'].values.tolist()
#audio_arrays_val_age, time_sampl_val_age = load_data(audio_files_val_age, '../data/audios_development/', SAMPLE_RATE)

## Training data preparation and MLP model

In [20]:
# HYPERPARAMETERS

# MFCC extraction. n_fft and hop_length are 3x and 1x SAMPLE_RATE (22050).
n_mfcc = [35]
n_fft = 66150
hop_length = 22050
#num_time_buckets = 20

# Training-loop settings; the training cell iterates over every list below.
crop_dim = [5]
max_iter = [300]
layerSize = [200]
layers = [3]

# One architecture per (width, depth) pair: `depth` hidden layers of `width` units.
hiddenLayerSizes = [[width] * depth for width in layerSize for depth in layers]

# Candidate MLPRegressor parameters, expanded with ParameterGrid in the training cell.
GRID = {
    'hidden_layer_sizes': hiddenLayerSizes,
    'activation': ['relu'],
    # Strength of the L2 regularization term. The L2 regularization term is
    # divided by the sample size when added to the loss.
    'alpha': [0.00005],
    'learning_rate': ['adaptive'],
    'batch_size': ['auto'],
    'n_iter_no_change': [10],
    'solver': ['adam'],
    'learning_rate_init': [0.001],
    'tol': [0.0001],
    'epsilon': [1e-08],
}


#### Dataset and mfcc data

In [21]:
# NOTE(review): find_score returns (score, per-file predictions), so each entry
# appended to these lists is that tuple, not a bare score. Kept as-is because
# later cells may index into it -- confirm and unpack if only the score is needed.
train_scores =[]
val_scores =[]
# Iteration on possible numbers of mfcc rows
for mfcc_rows in n_mfcc:
    print('\n\tNumber of mfcc rows: ',mfcc_rows)
    # Compute mfcc for train recordings
    mfccs_train = {}
    for name_train,audio_train in audio_arrays_train.items():
        mfccs_train[name_train] = lb.feature.mfcc(y=audio_train, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)
    # Compute mfcc for validation recordings
    mfccs_val = {}
    for name_val,audio_val in audio_arrays_val.items():
        mfccs_val[name_val] = lb.feature.mfcc(y=audio_val, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)
    # Iteration on possible crop widths
    for crop in crop_dim:
        print('\n\tCrop dimension: ', crop)
        # Iteration on possible numbers of iterations
        for it in max_iter:
            print('\n\tMax iterations: ', it)
            # Iterate on each combination of the grid search
            for g in ParameterGrid(GRID):
                print(g)
                # Initialize model
                mlp_mfcc = MLPRegressor(random_state=RANDOM_SEED, shuffle=True,)
                mlp_mfcc.set_params(**g)
                # random_crop draws from the stdlib `random` module, which
                # random_state does not control. Re-seed it for every model so
                # each configuration sees the same sequence of crops and the
                # whole run is reproducible.
                random.seed(RANDOM_SEED)
                # One partial_fit pass per epoch, each on a fresh random crop
                # of every training recording. The loop variable is not called
                # `iter`, so the builtin is no longer shadowed in the notebook.
                for epoch in range(it):
                    # From mfcc crop to features
                    reshaped_mfccs = flatten_all(random_crop(mfccs_train,crop))
                    joined_data = data_long_train.join(reshaped_mfccs, on='path').drop(columns=['path'])
                    X = joined_data.drop(columns=['age'])
                    # Column labels are a mix of str and int: make them all str
                    # and sort them so that training and evaluation inputs
                    # share the same column order.
                    X.columns = X.columns.astype(str)
                    mlp_mfcc.partial_fit(X.sort_index(axis=1), joined_data['age'])
                # Train prediction
                paths_train, X_train = create_evaluation_input(mfccs_train, crop, data_long_train)
                y_pred_train = mlp_mfcc.predict(X_train.drop(columns=['age']))
                score_train = find_score(data_long_train,paths_train,y_pred_train,name='train')
                train_scores.append(score_train)
                # Validation prediction
                paths_val, X_val = create_evaluation_input(mfccs_val, crop, data_long_val)
                y_pred_val = mlp_mfcc.predict(X_val.drop(columns=['age']))
                score_val = find_score(data_long_val,paths_val,y_pred_val,name='val')
                val_scores.append(score_val)
                print()



	Number of mfcc rows:  35

	Crop dimension:  5

	Max iterations:  300
{'activation': 'relu', 'alpha': 5e-05, 'batch_size': 'auto', 'epsilon': 1e-08, 'hidden_layer_sizes': [200, 200, 200], 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'n_iter_no_change': 10, 'solver': 'adam', 'tol': 0.0001}
	Root mean squared error (train):  10.888800930051223
	Root mean squared error (val):  13.517563181462533



In [22]:
'''
train_scores =[]
val_scores =[]
for mfcc_rows in n_mfcc:
    print('\n\tNumber of mfcc rows: ',mfcc_rows)
    mfccs_train = {}
    mfccs_train_age = {}
    for name_train,audio_train in audio_arrays_train.items():
        mfccs_train[name_train] = lb.feature.mfcc(y=audio_train, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)
    for name_train,audio_train in audio_arrays_train_age.items():
        mfccs_train_age[name_train] = lb.feature.mfcc(y=audio_train, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)
    mfccs_val = {}
    mfccs_val_age = {}
    for name_val,audio_val in audio_arrays_val.items():
        mfccs_val[name_val] = lb.feature.mfcc(y=audio_val, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)
    for name_val,audio_val in audio_arrays_val_age.items():
        mfccs_val_age[name_val] = lb.feature.mfcc(y=audio_val, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)
    
    for crop in crop_dim:
        print('\n\tCrop dimension: ', crop)
        for it in max_iter:
            print('\n\tMax iterations: ', it)
            for g in ParameterGrid(GRID):
                print(g)
                mlp_mfcc = MLPRegressor(random_state=RANDOM_SEED, shuffle=True,)
                mlp_mfcc.set_params(**g)
                for iter in range(it):
                    if iter%5 == 0:
                        reshaped_mfccs = flatten_all(random_crop(mfccs_train,crop))
                        reshaped_mfccs_age = flatten_all(random_crop(mfccs_train_age,crop))
                        joined_data = data_long_train.join(reshaped_mfccs, on='path').drop(columns=['path'])
                        joined_data_age = data_long_train[(data_long_train['age']<18) | (data_long_train['age']>70)].join(reshaped_mfccs_age, on='path').drop(columns=['path'])
                        join = pd.concat([joined_data, joined_data_age, joined_data_age, joined_data_age])
                        X = join.drop(columns=['age'])
                        X.columns = X.columns.astype(str)
                    mlp_mfcc.partial_fit(X.sort_index(axis=1), join['age'])
                #Train
                paths_train, X_train = create_evaluation_input(mfccs_train, crop, data_long_train)
                y_pred_train = mlp_mfcc.predict(X_train.drop(columns=['age']))
                score_train = find_score(data_long_train,paths_train,y_pred_train,name='train')
                train_scores.append(score_train)
                #Validation
                paths_val, X_val = create_evaluation_input(mfccs_val, crop, data_long_val)
                y_pred_val = mlp_mfcc.predict(X_val.drop(columns=['age']))
                score_val = find_score(data_long_val,paths_val,y_pred_val,name='val')
                val_scores.append(score_val)
                print()
'''

"\ntrain_scores =[]\nval_scores =[]\nfor mfcc_rows in n_mfcc:\n    print('\n\tNumber of mfcc rows: ',mfcc_rows)\n    mfccs_train = {}\n    mfccs_train_age = {}\n    for name_train,audio_train in audio_arrays_train.items():\n        mfccs_train[name_train] = lb.feature.mfcc(y=audio_train, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)\n    for name_train,audio_train in audio_arrays_train_age.items():\n        mfccs_train_age[name_train] = lb.feature.mfcc(y=audio_train, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)\n    mfccs_val = {}\n    mfccs_val_age = {}\n    for name_val,audio_val in audio_arrays_val.items():\n        mfccs_val[name_val] = lb.feature.mfcc(y=audio_val, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)\n    for name_val,audio_val in audio_arrays_val_age.items():\n        mfccs_val_age[name_val] = lb.feature.mfcc(y=audio_val, sr=SAMPLE_RATE, n_mfcc=mfcc_rows, n_fft=n_fft,hop_length=hop_length)\n    \n    f