## Regression problem

Best regressors for the MER problem are Ridge and Linear Regressor

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [3]:
f_dir = '/Users/gioelepozzi/Desktop/data/features_thesis'
VA_dir = '/Users/gioelepozzi/Desktop/data/annotations_thesis'

In [4]:
audio_dir = f_dir + '/static_features.csv'
eda_dir = f_dir + '/static_features_EDA.csv'
VA_mean_dir = VA_dir + '/static_annotations.csv'
VA_std_dir = VA_dir + '/static_annotations_std.csv'

In [5]:
audio_df = pd.read_csv(audio_dir)
eda_df = pd.read_csv(eda_dir)
eda_df = eda_df.drop(columns=['subject_ID'])
eda_df = eda_df.groupby(by=['music_ID'], as_index=False).mean() # mean over 10 subjects

fusion_df = pd.merge(audio_df, eda_df, on=['music_ID'])

va_mean_df = pd.read_csv(VA_mean_dir)
va_std_df = pd.read_csv(VA_std_dir)

data_df = pd.merge(fusion_df, va_mean_df, on=['music_ID'])
data_df = pd.merge(data_df, va_std_df, on=['music_ID'])

`audio_df`dataframe of audio features with music_ID (108 features)

`eda_df` dataframe of EDA features with music_ID (91 features)

`fusion_df` dataframe with audio and EDA features

`va_mean_df` dataframe with mean values of Valence and Arousal

`va_std_df` dataframe with standard deviation values of Valence and Arousal

`data_df` dataframe with audio features, EDA features, Valence and Arousal mean and standard values

### Useful functions:

`backward` -> takes the dataframe with features and at -1 the target value

`cross_val_regression` -> make RMSE evaluation for Linear Regression and Ridge

In [35]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler

from math import sqrt

regressors = {
    'LR': LinearRegression(),
    'Ridge': Ridge(alpha=0.1),
}

def rmse(y, y_pred):
    return sqrt(mean_squared_error(y, y_pred))

def cross_val_regression(regressors, features, labels, preprocessfunc):
    columns = list(regressors.keys())
    scores = pd.DataFrame(columns=columns, index=['RMSE'])
    for reg_name, reg in regressors.items():
        scorer = {'rmse': make_scorer(rmse)}
        reg = make_pipeline(*preprocessfunc, reg)
        reg_score = cross_validate(reg, features, labels, scoring=scorer, cv=10, return_train_score=False) 
        scores.loc['RMSE', reg_name] = reg_score['test_rmse'].mean()
    return scores


# Backward Elimination

def backward(df):
    a = list(df.columns.values)
    target = a[-1]
    X = df.drop(columns=[target]) # feature matrix
    y = df[target] # target variable
    
    cols = list(X.columns)
    pmax = 1
    while (len(cols)>0):
        p = []
        X_1 = X[cols]
        X_1 = sm.add_constant(X_1.values)
        model = sm.OLS(y,X_1).fit()
        p = pd.Series(model.pvalues.values[1:],index = cols)      
        pmax = max(p)
        feature_with_p_max = p.idxmax()
        if(pmax>0.05):
            cols.remove(feature_with_p_max)
        else:
            break
    
    selected_features_BE = cols
    #print('number of features selected:',len(selected_features_BE), '\nfeatures:', selected_features_BE)
    backward_df = df[np.intersect1d(df.columns, selected_features_BE)]
    backward_df = backward_df.join(y)
    return backward_df


### For audio Backward features selection

In [37]:
#va_df -> df con v-a-mean-std
#df -> df con audio/eda/fusion, v-a-mean-std
def reg_results(df, va_df):
    


    # audio valence mean
    v_mean = va_mean_df[['music_ID', 'Valence(mean)']]
audio_v_mean = pd.merge(audio_df, v_mean, on=['music_ID'])
audio_v_mean = audio_v_mean.drop(columns=['music_ID'])
print('AUDIO VALENCE MEAN')
bw_v_mean = backward(audio_v_mean)
features_v_mean = bw_v_mean[bw_v_mean.columns[:-1]].values
label_v_mean = bw_v_mean[bw_v_mean.columns[-1]].values
prefunc = [StandardScaler()]
RMSE_v_mean = cross_val_regression(regressors, features_v_mean, label_v_mean, prefunc)
print('Ridge RMSE:', RMSE_v_mean['Ridge'][0])
print('LR RMSE:', RMSE_v_mean['LR'][0], '\n')


# audio arousal mean
a_mean = va_mean_df[['music_ID', 'Arousal(mean)']]
audio_a_mean = pd.merge(audio_df, a_mean, on=['music_ID'])
audio_a_mean = audio_a_mean.drop(columns=['music_ID'])
print('AUDIO AROUSAL MEAN')
bw_a_mean = backward(audio_a_mean)
features_a_mean = bw_a_mean[bw_a_mean.columns[:-1]].values
label_a_mean = bw_a_mean[bw_a_mean.columns[-1]].values
prefunc = [StandardScaler()]
RMSE_a_mean = cross_val_regression(regressors, features_a_mean, label_a_mean, prefunc)
print('Ridge RMSE:', RMSE_a_mean['Ridge'][0])
print('LR RMSE:', RMSE_a_mean['LR'][0], '\n')


# audio valence std
v_std = va_std_df[['music_ID', 'Valence(std)']]
audio_v_std = pd.merge(audio_df, v_std, on=['music_ID'])
audio_v_std = audio_v_std.drop(columns=['music_ID'])
print('AUDIO VALENCE STANDARD DEVIATION')
bw_v_std = backward(audio_v_std)
features_v_std = bw_v_std[bw_v_std.columns[:-1]].values
label_v_std = bw_v_std[bw_v_std.columns[-1]].values
prefunc = [StandardScaler()]
RMSE_v_std = cross_val_regression(regressors, features_v_std, label_v_std, prefunc)
print('Ridge RMSE:', RMSE_v_std['Ridge'][0])
print('LR RMSE:', RMSE_v_std['LR'][0], '\n')


# audio arousal std
a_std = va_std_df[['music_ID', 'Arousal(std)']]
audio_a_std = pd.merge(audio_df, a_std, on=['music_ID'])
audio_a_std = audio_a_std.drop(columns=['music_ID'])
print('AUDIO AROUSAL STANDARD DEVIATION')
bw_a_std = backward(audio_a_std)
features_a_std = bw_v_std[bw_v_std.columns[:-1]].values
label_a_std = bw_a_std[bw_a_std.columns[-1]].values
RMSE_a_std = cross_val_regression(regressors, features_a_std, label_a_std, prefunc)
print('Ridge RMSE:', RMSE_a_std['Ridge'][0])
print('LR RMSE:', RMSE_a_std['LR'][0])

AUDIO VALENCE MEAN
Ridge RMSE: 0.10995555574622791
LR RMSE: 0.11000369922758502 

AUDIO AROUSAL MEAN
Ridge RMSE: 0.09323701375775093
LR RMSE: 0.09326857119630548 

AUDIO VALENCE STANDARD DEVIATION
Ridge RMSE: 0.04426190722703359
LR RMSE: 0.044145563147221865 

AUDIO AROUSAL STANDARD DEVIATION
Ridge RMSE: 0.044757957843901214
LR RMSE: 0.044799270143477087


In [None]:
# Prova sito

In [None]:
X = audio_v_mean.iloc[:, :-1].values
y = audio_v_mean.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [None]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,Y_train)
print(model.score(X_test,Y_test))

In [None]:
X_train = np.append(arr=np.ones([X_train.shape[0],1]).astype(int), values = X_train, axis = 1)

In [None]:
def column_index(df, query_cols):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]

X_opt = column_index(audio_df, selected_features_BE)

In [None]:

regressor = sm.OLS(Y_train, X_train[:,X_opt]).fit() # OLS -> Ordinary Least Squares
print(regressor.summary())

In [None]:
from sklearn.preprocessing import StandardScaler
XX = X[X_opt]
X_train, X_test, Y_train, Y_test = train_test_split(XX, y[X_opt], test_size = 0.2, random_state = 0)

model = LinearRegression()
model.fit(X_train,Y_train)
print('Model score: '+str(model.score(X_test,Y_test)))