In [1]:
import os
from math import sqrt

import IPython.display as ipd
import numpy as np
import pandas as pd
import statsmodels.api as sm
from tqdm import tqdm

from sklearn.linear_model import Lasso, ElasticNet, Ridge, LinearRegression, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate, train_test_split

In [2]:
def load_audio_dataset(data):
    """Split a table into a feature matrix and a label vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the labels; every other column is
        treated as a feature.

    Returns
    -------
    tuple
        ``(features, labels)`` as returned by ``.values`` on the feature
        columns and on the last column respectively. No scaling is applied.
    """
    column_names = data.columns
    feature_matrix = data[column_names[:-1]].values
    label_vector = data[column_names[-1]].values
    return feature_matrix, label_vector

In [3]:
def rmse(y, y_pred):
    """Return the root of ``mean_squared_error(y, y_pred)``.

    Parameters
    ----------
    y : array-like
        Reference values.
    y_pred : array-like
        Predicted values.
    """
    squared_error = mean_squared_error(y, y_pred)
    return sqrt(squared_error)

# Candidate regressors, keyed by the short name that is used as the column
# header of the score tables.
# The tree-based models draw random numbers while fitting; their random_state
# is pinned (same seed as the train/test split used for RFE) so that re-running
# the notebook reproduces the same scores.
regressors = {
    'LR': LinearRegression(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Ridge': Ridge(),
    'kNN': KNeighborsRegressor(),
    'SVRrbf': SVR(kernel='rbf', gamma='scale'),
    'SVRpoly': SVR(kernel='poly', gamma='scale'),
    'SVRlinear': SVR(kernel='linear', gamma='scale'),
    'DT': DecisionTreeRegressor(max_depth=5, random_state=0),
    'RF': RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1, random_state=0),
}

In [4]:
def cross_val_regression(regressors, features, labels, preprocessfunc, cv=10):
    """Cross-validate every regressor and collect the mean test RMSE of each.

    Parameters
    ----------
    regressors : dict
        Maps a display name to an estimator instance.
    features, labels : array-like
        Data passed unchanged to ``cross_validate``.
    preprocessfunc : iterable
        Pipeline steps placed, in order, in front of each regressor.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_validate``; the default keeps the original
        10-fold behaviour.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``'RMSE'`` with one float column per regressor,
        followed by ``'Mean'`` and ``'std'`` computed across the regressors.
    """
    # The scorer does not depend on the regressor, so build it once.
    scorer = {'rmse': make_scorer(rmse)}

    rmse_by_regressor = {}
    for reg_name, reg in regressors.items():
        pipeline = make_pipeline(*preprocessfunc, reg)
        cv_result = cross_validate(pipeline, features, labels, scoring=scorer, cv=cv,
                                   return_train_score=False)
        rmse_by_regressor[reg_name] = cv_result['test_rmse'].mean()

    # Build the table in one go with a float dtype; filling an empty frame
    # cell by cell leaves object-dtype columns, which makes the reductions
    # below and later formatting depend on pandas' object handling.
    scores = pd.DataFrame(rmse_by_regressor, index=['RMSE'], dtype=float)

    # Compute both summaries before adding either column so that 'Mean' does
    # not leak into 'std'.
    mean_rmse = scores.mean(axis=1)
    std_rmse = scores.std(axis=1)
    scores['Mean'] = mean_rmse
    scores['std'] = std_rmse
    return scores

def format_scores(scores):
    """Return a Styler for ``scores`` with the lowest value of each row highlighted.

    The highlight is computed over every column up to and including the
    second-to-last one (the last column is left out), and all cells are
    rendered with three decimals.

    Parameters
    ----------
    scores : pandas.DataFrame
        Score table with at least two columns.

    Returns
    -------
    pandas.io.formats.style.Styler
    """
    def highlight_row_minimum(row):
        # Series.min() skips NaN. The builtin min() returns NaN whenever the
        # first entry is NaN, and then no cell at all would be highlighted.
        lowest = row.min()
        return ['background-color: yellow' if value == lowest else '' for value in row]

    highlighted_columns = pd.IndexSlice[:, :scores.columns[-2]]
    styled = scores.style.apply(highlight_row_minimum, axis=1, subset=highlighted_columns)
    return styled.format('{:.3f}')

In [6]:
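# Parameters of feature_selector:
# df               = DataFrame whose last column is the target: 'Valence(mean)', 'Valence(std)', 'Arousal(mean)' or 'Arousal(std)'
# selection_method = 'Pearson', 'backward', 'RFE' or 'embedded'
# pearson_treshold = value in [0, 1]; minimum absolute correlation with the target, only used by the 'Pearson' method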

def _pearson_selected_features(X, y, threshold):
    """Return the columns of X whose absolute Pearson correlation with y exceeds threshold."""
    # Only the correlations with the target are needed, not the full matrix.
    abs_corr = X.corrwith(y).abs()
    # NaN correlations compare False and are therefore left out.
    return list(abs_corr[abs_corr > threshold].index)


def _backward_selected_features(X, y, alpha=0.05):
    """Backward elimination: repeatedly drop the feature with the largest OLS p-value above alpha."""
    cols = list(X.columns)
    while cols:
        # has_constant='add' always prepends an intercept column, so the
        # p-values after the first entry line up one-to-one with `cols` even
        # when X already contains a constant feature.
        design = sm.add_constant(X[cols], has_constant='add')
        model = sm.OLS(y, design).fit()
        p_values = pd.Series(model.pvalues.values[1:], index=cols)
        worst_p = p_values.max()  # skips NaN, unlike the builtin max()
        if not worst_p > alpha:
            break
        cols.remove(p_values.idxmax())
    return cols


def _rfe_selected_features(X, y):
    """Choose the feature count by held-out R^2, then run RFE with that count on all the data."""
    feature_names = list(X.columns)

    # The split is seeded, hence identical for every candidate count: do it once.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # R^2 on held-out data can be negative for every candidate, so the running
    # best must start below any attainable score rather than at 0.
    best_score = -np.inf
    best_count = None
    for n_selected in range(1, len(feature_names)):
        # n_features_to_select is passed by keyword: recent scikit-learn
        # releases no longer accept it positionally.
        rfe = RFE(LinearRegression(), n_features_to_select=n_selected)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        score = LinearRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
        if score > best_score:
            best_score = score
            best_count = n_selected

    if best_count is None:
        # Fewer than two features: there is nothing to eliminate.
        return feature_names

    rfe = RFE(LinearRegression(), n_features_to_select=best_count).fit(X, y)
    return [name for name, keep in zip(feature_names, rfe.support_) if keep]


def _embedded_selected_features(X, y):
    """Return the features that receive a non-zero coefficient from a cross-validated Lasso."""
    lasso = LassoCV()
    lasso.fit(X, y)
    coef = pd.Series(lasso.coef_, index=X.columns)
    return list(coef[coef != 0].index)


def feature_selector(df, selection_method, pearson_treshold=0):
    """Return ``df`` reduced to the selected feature columns plus the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target as the last column.
    selection_method : str
        ``'Pearson'``, ``'backward'``, ``'RFE'`` or ``'embedded'``.
    pearson_treshold : float, default 0
        Only used by ``'Pearson'``: a feature is kept when its absolute
        correlation with the target is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        The selected features, in the column order of ``df``, followed by the
        target column.

    Raises
    ------
    ValueError
        If ``selection_method`` is not one of the supported names.
    """
    target = df.columns[-1]
    X = df.drop(columns=[target])  # feature matrix
    y = df[target]  # target variable

    if selection_method == 'Pearson':
        selected = _pearson_selected_features(X, y, pearson_treshold)
    elif selection_method == 'backward':
        selected = _backward_selected_features(X, y)
    elif selection_method == 'RFE':
        selected = _rfe_selected_features(X, y)
    elif selection_method == 'embedded':
        selected = _embedded_selected_features(X, y)
    else:
        raise ValueError(
            "selection_method must be 'Pearson', 'backward', 'RFE' or 'embedded', got %r"
            % (selection_method,)
        )

    # Plain column selection keeps the rows untouched and the target last.
    return df[selected + [target]]
    

In [7]:
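# Parameters of MER_dynamic_regression:
# data_type        = 'audio', 'eda' or 'fusion': the kind of features the regression is run on
# feature_sel      = True or False: whether feature selection is applied before the regression
# selection_method = 'Pearson', 'backward', 'RFE' or 'embedded' (only used when feature_sel is True)
# pearson_treshold = value in [0, 1], forwarded to the 'Pearson' selection method
# Targets evaluated: 'Valence(mean)', 'Valence(std)', 'Arousal(mean)', 'Arousal(std)'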


def _read_dynamic_audio_features(features_dir):
    """Read the dynamic audio feature table from features_dir."""
    return pd.read_csv(features_dir + '/dynamic_features.csv')


def _read_dynamic_eda_features(features_dir):
    """Read the dynamic EDA feature table and average it over subjects."""
    eda_df = pd.read_csv(features_dir + '/dynamic_features_EDA.csv')
    eda_df = eda_df.drop(columns=['subject_ID'])
    # Group on both keys: grouping on 'music_ID' alone would also average the
    # frames of a song together (including the 'frame' column itself), and the
    # later merge on ['music_ID', 'frame'] could no longer line up the rows.
    return eda_df.groupby(by=['music_ID', 'frame'], as_index=False).mean()


def MER_dynamic_regression(features_dir, VA_dir, data_type, feature_sel, selection_method='', pearson_treshold=0):
    """Cross-validate all regressors on the dynamic features, once per annotation target.

    The feature table selected by ``data_type`` is merged with the mean and
    std annotation tables on ``['music_ID', 'frame']``. For each of the four
    targets a dataset is built, optionally reduced with ``feature_selector``,
    evaluated with ``cross_val_regression`` (features standardised inside the
    pipeline) and the formatted score table is displayed.

    Parameters
    ----------
    features_dir : str
        Directory holding ``dynamic_features.csv`` and ``dynamic_features_EDA.csv``.
    VA_dir : str
        Directory holding ``dynamic_annotations.csv`` and ``dynamic_annotations_std.csv``.
    data_type : str
        ``'audio'``, ``'eda'`` or ``'fusion'`` (audio and EDA features joined).
    feature_sel : bool
        Whether to apply feature selection before the regression.
    selection_method : str, default ''
        Forwarded to ``feature_selector`` when ``feature_sel`` is true.
    pearson_treshold : float, default 0
        Forwarded to ``feature_selector`` when ``feature_sel`` is true.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported names.
    """
    key_columns = ['music_ID', 'frame']

    VA_mean_df = pd.read_csv(VA_dir + '/dynamic_annotations.csv')
    VA_std_df = pd.read_csv(VA_dir + '/dynamic_annotations_std.csv')

    if data_type == 'audio':
        data_df = _read_dynamic_audio_features(features_dir)
    elif data_type == 'eda':
        data_df = _read_dynamic_eda_features(features_dir)
    elif data_type == 'fusion':
        data_df = pd.merge(
            _read_dynamic_audio_features(features_dir),
            _read_dynamic_eda_features(features_dir),
            on=key_columns,
        )
    else:
        raise ValueError("data_type must be 'audio', 'eda' or 'fusion', got %r" % (data_type,))

    data_VA_df = pd.merge(data_df, VA_mean_df, on=key_columns)
    data_VA_df = pd.merge(data_VA_df, VA_std_df, on=key_columns)

    # (progress message, target column), in the order the results are shown.
    runs = [
        ('In Arousal (mean) dimension...', 'Arousal(mean)'),
        ('In Valence (mean) dimension ...', 'Valence(mean)'),
        ('In Arousal (std) dimension...', 'Arousal(std)'),
        ('In Valence (std) dimension...', 'Valence(std)'),
    ]
    target_columns = [target for _, target in runs]

    # One dataset per target: all feature columns, then the target appended
    # explicitly as the last column, which is where feature_selector and
    # load_audio_dataset look for it.
    feature_frame = data_VA_df.drop(columns=key_columns + target_columns)
    datasets = {
        target: feature_frame.assign(**{target: data_VA_df[target]})
        for target in target_columns
    }

    if feature_sel:
        datasets = {
            target: feature_selector(dataset, selection_method, pearson_treshold)
            for target, dataset in datasets.items()
        }
        print(str(data_type),'features with', selection_method, 'feature selection:\n')
    else:
        print(str(data_type),'features with no feature selection:\n')

    prefunc = [StandardScaler()]

    for message, target in runs:
        print(message)
        features, labels = load_audio_dataset(datasets[target])
        scores = cross_val_regression(regressors, features, labels, prefunc)
        ipd.display(format_scores(scores))

In [8]:
# Locations of the feature and annotation CSV files. Set the MER_FEATURES_DIR /
# MER_VA_DIR environment variables to run the notebook on another machine; when
# they are unset the original absolute paths are used.
features_dir = os.environ.get('MER_FEATURES_DIR', '/Users/gioelepozzi/Desktop/data/features_thesis')
VA_dir = os.environ.get('MER_VA_DIR', '/Users/gioelepozzi/Desktop/data/annotations_thesis')

In [None]:
# For every data type: a baseline run without feature selection, then one run
# per feature-selection method. Only the Pearson method takes a threshold, and
# it differs per data type.
pearson_thresholds = {'audio': 0.1, 'eda': 0.05, 'fusion': 0.1}

for data_type, threshold in pearson_thresholds.items():
    MER_dynamic_regression(features_dir, VA_dir, data_type, False)
    MER_dynamic_regression(features_dir, VA_dir, data_type, feature_sel=True,
                           selection_method='Pearson', pearson_treshold=threshold)
    for method in ('backward', 'RFE', 'embedded'):
        MER_dynamic_regression(features_dir, VA_dir, data_type, feature_sel=True,
                               selection_method=method)

In [9]:

# NOTE(review): MER_static_regression is not defined in the cells visible here;
# presumably it is defined elsewhere — confirm before running this cell. The
# output recorded for this cell ends in a ValueError about NaN/infinite inputs.
MER_static_regression(features_dir, VA_dir, 'audio', False)



audio features with no feature selection:

In Arousal (mean) dimension...




ValueError: Input contains NaN, infinity or a value too large for dtype('float64').