In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from math import sqrt


from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error

In [2]:
def load_audio_dataset(data):
    """Split a table into a feature matrix and a label vector.

    Every column except the last is treated as a feature; the last column
    is treated as the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose final column holds the label.

    Returns
    -------
    tuple
        ``(features, label)``, each obtained with ``.values`` from the
        corresponding column selection.
    """
    column_names = data.columns

    # Select by column label (not by position), exactly as the caller expects.
    feature_block = data[column_names[:-1]]
    label_column = data[column_names[-1]]

    return feature_block.values, label_column.values

In [3]:
def rmse(y, y_pred):
    """Return the square root of ``mean_squared_error(y, y_pred)``.

    Parameters
    ----------
    y
        Reference values, forwarded unchanged to ``mean_squared_error``.
    y_pred
        Predicted values, forwarded unchanged to ``mean_squared_error``.

    Returns
    -------
    float
        Root-mean-squared error.
    """
    mse = mean_squared_error(y, y_pred)
    return sqrt(mse)

In [4]:
def cross_val_regression_RMSE(regressors, features, labels, preprocessfunc, cv=10):
    """Cross-validate each regressor and tabulate its mean test RMSE.

    Parameters
    ----------
    regressors : dict
        Maps a display name to an estimator. The names become the columns
        of the returned table.
    features, labels
        Forwarded to ``cross_validate`` as the data and the target.
    preprocessfunc : sequence
        Pipeline steps placed in front of each regressor by
        ``make_pipeline``.
    cv : optional
        Forwarded to ``cross_validate``. Defaults to 10, the value that
        used to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'RMSE'`` with one column per regressor
        name; each cell is the mean of ``test_rmse`` over the folds.
    """
    scores = pd.DataFrame(columns=list(regressors.keys()), index=['RMSE'])

    for reg_name, reg in regressors.items():
        scorer = {'rmse': make_scorer(rmse)}
        # Keep the bare estimator and the full pipeline under separate names
        # instead of rebinding the loop variable.
        pipeline = make_pipeline(*preprocessfunc, reg)
        fold_results = cross_validate(
            pipeline,
            features,
            labels,
            scoring=scorer,
            cv=cv,
            return_train_score=False,
        )
        scores.loc['RMSE', reg_name] = fold_results['test_rmse'].mean()

    return scores

In [13]:
def backward(df, p_threshold=0.05):
    """Select features by backward elimination on OLS p-values.

    The last column of ``df`` is the target and every other column is a
    candidate feature. An OLS model with an intercept is fitted on the
    current candidates; while the largest feature p-value exceeds
    ``p_threshold`` the corresponding feature is removed and the model is
    refitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate features followed by the target as the last column.
    p_threshold : float, optional
        A feature is dropped while its p-value is the largest one and is
        greater than this value. Defaults to 0.05, the value that used to
        be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The retained feature columns followed by the target column.

    Notes
    -----
    Prints the number and the names of the retained features.
    """
    target = list(df.columns.values)[-1]
    X = df.drop(columns=[target])  # feature matrix
    y = df[target]  # target variable

    cols = list(X.columns)
    while cols:
        # has_constant='add' always prepends the intercept column. With the
        # default ('skip') no intercept is added when a feature is itself
        # constant, so pvalues[1:] would be one entry short of `cols`.
        design = sm.add_constant(X[cols].values, has_constant='add')
        model = sm.OLS(y, design).fit()
        p = pd.Series(model.pvalues.values[1:], index=cols)

        # Series.max() ignores NaN just like idxmax(), whereas the built-in
        # max() gives an order-dependent answer when NaN is present.
        pmax = p.max()
        if not pmax > p_threshold:
            break
        cols.remove(p.idxmax())

    selected_features_BE = cols
    print('number of features for ', target, ': ', len(selected_features_BE))
    print('features:\n', selected_features_BE, '\n')

    # np.intersect1d returns the names sorted, so the feature columns of the
    # result are in sorted order rather than in their order within `df`.
    backward_df = df[np.intersect1d(df.columns, selected_features_BE)]
    backward_df = backward_df.join(y)

    return backward_df

In [14]:
def MER_regression(features_dir, VA_dir, models=None):
    """Run feature selection and cross-validated regression for each target.

    Reads the audio features, the EDA features and the annotation tables,
    merges them on ``music_ID``, and then, for each of the four annotation
    columns, keeps that column as the target, runs ``backward`` feature
    selection and scores every regressor with
    ``cross_val_regression_RMSE``.

    Parameters
    ----------
    features_dir : str
        Directory containing ``static_features.csv`` and
        ``static_features_EDA.csv``.
    VA_dir : str
        Directory containing ``static_annotations.csv`` and
        ``static_annotations_std.csv``.
    models : dict, optional
        Name -> estimator mapping to evaluate. When omitted, the
        notebook-level ``regressors`` dict is used, which is what this
        function always read before the parameter existed.

    Returns
    -------
    tuple
        Four score tables, in the order Valence(mean), Arousal(mean),
        Valence(std), Arousal(std).
    """
    if models is None:
        # Backward-compatible fallback to the global defined in the notebook.
        models = regressors

    lab = ['music_ID']
    targets = ['Valence(mean)', 'Arousal(mean)', 'Valence(std)', 'Arousal(std)']

    VA_mean_df = pd.read_csv(VA_dir + '/static_annotations.csv')
    VA_std_df = pd.read_csv(VA_dir + '/static_annotations_std.csv')
    audio_df = pd.read_csv(features_dir + '/static_features.csv')
    eda_df = pd.read_csv(features_dir + '/static_features_EDA.csv')

    # Average the EDA features per music_ID across subjects
    # (the original note says 10 subjects).
    eda_df = eda_df.drop(columns=['subject_ID'])
    eda_df = eda_df.groupby(by=lab, as_index=False).mean()

    data_VA_df = pd.merge(audio_df, eda_df, on=lab)
    data_VA_df = pd.merge(data_VA_df, VA_mean_df, on=lab)
    data_VA_df = pd.merge(data_VA_df, VA_std_df, on=lab)

    # Discard every column that contains at least one missing value.
    data_VA_df = data_VA_df.dropna(axis='columns')

    # First pass: feature selection for every target. backward() uses the
    # last remaining column as the target, so the other three annotation
    # columns are removed beforehand.
    # NOTE(review): this relies on the annotation columns being the trailing
    # columns after the merges - confirm against the CSV layout.
    selected_frames = []
    for target in targets:
        other_targets = [name for name in targets if name != target]
        candidate = data_VA_df.drop(columns=lab).drop(columns=other_targets)
        selected_frames.append(backward(candidate))

    # Second pass: cross-validated scoring with the same preprocessing steps.
    prefunc = [StandardScaler()]
    score_tables = []
    for frame in selected_frames:
        features, labels = load_audio_dataset(frame)
        score_tables.append(cross_val_regression_RMSE(models, features, labels, prefunc))

    return tuple(score_tables)


In [15]:
# NOTE(review): absolute, machine-specific locations; adjust before running
# on another machine.
f_dir = '/Users/gioelepozzi/Desktop/data/features_thesis'
VA_dir = '/Users/gioelepozzi/Desktop/data/annotations_thesis'

# Parameters to tune:
#   solver   -> makes no difference
#   max_iter -> makes no difference
#   alpha    -> smaller is better, but not too small

# Ridge regressors to compare, keyed by the label used in the result tables.
regressors = dict([
    ('Ridge_a=1e-10', Ridge(alpha=1e-10)),
    ('Ridge_a=0.001', Ridge(alpha=0.001)),
    ('Ridge', Ridge()),
    ('Ridge_a=10', Ridge(alpha=10)),
    ('Ridge_a=100', Ridge(alpha=100)),
])

In [16]:
data = []

# MER_regression returns one single-row score table per target, in the
# order V(mean), A(mean), V(std), A(std).
a = MER_regression(f_dir, VA_dir)
scores_v_mean_RMSE, scores_a_mean_RMSE, scores_v_std_RMSE, scores_a_std_RMSE = a

# Regressor names, taken from the columns of each score table.
regressor_v_mean_RMSE = list(scores_v_mean_RMSE.columns)
regressor_a_mean_RMSE = list(scores_a_mean_RMSE.columns)
regressor_v_std_RMSE = list(scores_v_std_RMSE.columns)
regressor_a_std_RMSE = list(scores_a_std_RMSE.columns)

# Read each score by its 'RMSE' row label. The previous `series[0]` lookup
# relied on integer keys falling back to positional access on a
# string-labelled Series, which pandas has deprecated.
rmse_v_mean = [scores_v_mean_RMSE.loc['RMSE', name] for name in regressor_v_mean_RMSE]
rmse_a_mean = [scores_a_mean_RMSE.loc['RMSE', name] for name in regressor_a_mean_RMSE]
rmse_v_std = [scores_v_std_RMSE.loc['RMSE', name] for name in regressor_v_std_RMSE]
rmse_a_std = [scores_a_std_RMSE.loc['RMSE', name] for name in regressor_a_std_RMSE]

# One row per (regressor, target), grouped by regressor position.
per_target = (
    ('V(mean)', regressor_v_mean_RMSE, rmse_v_mean),
    ('A(mean)', regressor_a_mean_RMSE, rmse_a_mean),
    ('V(std)', regressor_v_std_RMSE, rmse_v_std),
    ('A(std)', regressor_a_std_RMSE, rmse_a_std),
)
for j in range(len(regressor_v_mean_RMSE)):
    for tag, names, values in per_target:
        data.append([names[j], tag, values[j]])

df = pd.DataFrame(data, columns = ['regressor', 'VA', 'RMSE'])
df_ordered_RMSE = df.sort_values(by='RMSE', ascending=True)
df_ordered_RMSE

number of features for  Valence(mean) :  54
features:
 ['chroma_stft_mean', 'chroma_cq_mean', 'chroma_cq_var', 'chroma_cens_std', 'melspectrogram_mean', 'contrast_std', 'contrast_var', 'rolloff_mean', 'poly_std', 'poly_var', 'zcr_std', 'zcr_var', 'harm_std', 'harm_var', 'perc_std', 'frame_std', 'frame_var', 'meanMFCC[0]_x', 'stdMFCC[0]_x', 'kurtMFCC[0]_x', 'skewMFCC[0]_x', 'medianMFCC[1]_x', 'skewMFCC[1]_x', 'stdMFCC[3]_x', 'skewMFCC[3]_x', 'kurtMFCC[4]_x', 'skewMFCC[4]_x', 'stdMFCC[5]_x', 'medianMFCC[5]_x', 'skewMFCC[5]_x', 'kurtMFCC[7]_x', 'stdMFCC[8]_x', 'medianMFCC[8]_x', 'meanMFCC[9]_x', 'meanMFCC[11]_x', 'ZCR', 'power_inband_4', 'power_inband_5', 'peak_inband_3', 'peak_inband_5', 'medianMFCC[0]_y', 'meanMFCC[2]_y', 'medianMFCC[2]_y', 'meanMFCC[3]_y', 'stdMFCC[4]_y', 'medianMFCC[4]_y', 'kurtMFCC[4]_y', 'medianMFCC[5]_y', 'skewMFCC[5]_y', 'kurtMFCC[6]_y', 'skewMFCC[6]_y', 'stdMFCC[7]_y', 'meanMFCC[9]_y', 'meanMFCC[11]_y'] 

number of features for  Arousal(mean) :  51
features:
 ['c

Unnamed: 0,regressor,VA,RMSE
7,Ridge_a=0.001,A(std),0.041865
3,Ridge_a=1e-10,A(std),0.041867
11,Ridge,A(std),0.041975
15,Ridge_a=10,A(std),0.042066
19,Ridge_a=100,A(std),0.042927
2,Ridge_a=1e-10,V(std),0.043523
6,Ridge_a=0.001,V(std),0.043524
10,Ridge,V(std),0.043944
14,Ridge_a=10,V(std),0.044318
18,Ridge_a=100,V(std),0.04466
