## Audio model evaluation

In [48]:
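# Setup: imports, label list and experiment output directories.
# The following cells load the ground truth (single-label + multi-label),
# the predicted probabilities and the one-hot outputs as dataframes.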
import pandas as pd
from sklearn.metrics import cohen_kappa_score,f1_score,accuracy_score, precision_score, recall_score, classification_report, roc_auc_score, \
    hamming_loss
import numpy as np

# Context labels under evaluation. The order matters: it is used as the
# column order of the per-label results tables built further down.
LABELS_LIST = [
    'car',
    'gym',
    'happy',
    'night',
    'relax',
    'running',
    'sad',
    'summer',
    'work',
    'workout',
]

# [TODO] edit paths to match audio experiment output [single-label case]
# Output directory of the single-label audio experiment (raw predictions are read from here).
exp_dir = "/src_code/repo/experiments_results/audio_system_single_label/2020-10-02_11-06-43"
# Output directory of the multi-label audio experiment (its results_report.csv is read below).
audio_multi_exp_dir = "/src_code/repo/experiments_results/audio_system_multilabel/2020-10-01_11-34-29"
# Output directory of the user-aware experiment (its results_report.csv is read below).
user_exp_dir = "/src_code/repo/experiments_results/user_aware_system/2020-10-02_12-09-30"

In [44]:
# Reference test split; its column layout serves as the template for every
# dataframe assembled in this cell.
test_ground_truth = pd.read_csv("/src_code/repo/GroundTruth/test_set.csv")

# Comma-delimited arrays written by the single-label experiment in `exp_dir`.
test_groundtruth_from_model = np.loadtxt(exp_dir + "/test_ground_truth_classes.txt", delimiter=',')
user_ids = np.loadtxt(exp_dir + "/user_ids.txt", delimiter=',')
track_ids = np.loadtxt(exp_dir + "/tracks_ids.txt", delimiter=',')
test_output = np.loadtxt(exp_dir + "/predictions.out", delimiter=',')
# NOTE(review): the original author marked this file as unclear ("WHAAAT").
# Presumably it holds the one-hot decisions derived from predictions.out -- confirm.
test_output_one_hot = np.loadtxt(exp_dir + "/test_output_one_hot.out", delimiter=',')


def _fill_template(label_values):
    """Return a copy of ``test_ground_truth`` re-populated with model-side data.

    ``song_id`` and ``user_id`` are overwritten with the ids saved by the
    experiment, and every column from position 2 onward is replaced by
    ``label_values``.  Assumes the rows of the saved arrays line up with
    each other and that the first two columns are the id columns -- TODO
    confirm against test_set.csv.
    """
    frame = test_ground_truth.copy()
    frame.song_id = track_ids
    frame.user_id = user_ids
    frame.iloc[:, 2:] = label_values
    return frame


# Same layout three times: ground truth as seen by the model, raw predictions,
# and the one-hot output.
model_ground_truth = _fill_template(test_groundtruth_from_model)
model_predictions = _fill_template(test_output)
output_one_hot = _fill_template(test_output_one_hot)

### Single-output-single-groundtruth Protocol (SO-SG)

In [45]:
## Per label evaluation of single-output-single-groundtruth
# Columns from position 2 onward hold the per-label values that the loading
# cell wrote into each frame; the first two columns are skipped.
y_true = model_ground_truth.values[:, 2:]
y_score = model_predictions.values[:, 2:]
y_pred = output_one_hot.values[:, 2:]

# Fraction of test samples that are positive for each label
# (column-wise sum instead of the builtin sum() over rows).
percentage_of_positives_perclass = y_true.sum(axis=0) / len(model_ground_truth)

# compute additional metrics (AUC,f1,recall,precision)
# AUC is computed from the raw scores; precision/recall/F1 from the one-hot output.
auc_roc_per_label = roc_auc_score(y_true, y_score, average=None)
precision_perlabel = precision_score(y_true, y_pred, average=None)
recall_perlabel = recall_score(y_true, y_pred, average=None)
f1_perlabel = f1_score(y_true, y_pred, average=None)

# Build the table in one step: DataFrame.append was removed in pandas 2.0,
# and an explicit float dtype keeps the row means and rounding well-defined.
results_df = pd.DataFrame(
    [
        percentage_of_positives_perclass,
        auc_roc_per_label,
        recall_perlabel,
        precision_perlabel,
        f1_perlabel,
    ],
    index=['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"],
    columns=LABELS_LIST,
    dtype=float,
)
# Mean of each metric across labels.
results_df['average'] = results_df.mean(numeric_only=True, axis=1)
# Labels as rows, metrics as columns (last expression: displayed by the notebook).
results_df.round(3).T

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.085,0.54,0.0,0.0,0.0
gym,0.114,0.665,0.439,0.178,0.254
happy,0.065,0.582,0.0,0.0,0.0
night,0.092,0.572,0.002,0.082,0.004
relax,0.116,0.736,0.595,0.233,0.334
running,0.113,0.605,0.054,0.179,0.083
sad,0.092,0.742,0.001,0.185,0.002
summer,0.154,0.58,0.415,0.19,0.261
work,0.084,0.526,0.0,0.0,0.0
workout,0.085,0.706,0.112,0.191,0.141


### Multi-output-single-groundtruth Protocol (MO-SG)

In [46]:
"""
    Evaluate on multi-label output and single-label groundtruth
"""
# Columns from position 2 onward hold the per-label values that the loading
# cell wrote into each frame; the first two columns are skipped.
y_true = model_ground_truth.values[:, 2:]
y_score = model_predictions.values[:, 2:]

# Binarise every label independently: round the score to the nearest integer,
# then clamp to {0, 1} so out-of-range scores cannot yield other classes.
model_output_rounded = np.round(y_score)
model_output_rounded = np.clip(model_output_rounded, 0, 1)

# Fraction of test samples that are positive for each label
# (column-wise sum instead of the builtin sum() over rows).
percentage_of_positives_perclass = y_true.sum(axis=0) / len(model_ground_truth)

# compute additional metrics (AUC,f1,recall,precision)
# AUC is computed from the raw scores; precision/recall/F1 from the rounded output.
auc_roc_per_label = roc_auc_score(y_true, y_score, average=None)
precision_perlabel = precision_score(y_true, model_output_rounded, average=None)
recall_perlabel = recall_score(y_true, model_output_rounded, average=None)
f1_perlabel = f1_score(y_true, model_output_rounded, average=None)

# Build the table in one step: DataFrame.append was removed in pandas 2.0,
# and an explicit float dtype keeps the row means and rounding well-defined.
results_df = pd.DataFrame(
    [
        percentage_of_positives_perclass,
        auc_roc_per_label,
        recall_perlabel,
        precision_perlabel,
        f1_perlabel,
    ],
    index=['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"],
    columns=LABELS_LIST,
    dtype=float,
)
# Mean of each metric across labels.
results_df['average'] = results_df.mean(numeric_only=True, axis=1)
# Labels as rows, metrics as columns (last expression: displayed by the notebook).
results_df.round(3).T

Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.085,0.54,0.992,0.086,0.158
gym,0.114,0.665,0.931,0.133,0.232
happy,0.065,0.582,0.98,0.066,0.124
night,0.092,0.572,0.983,0.094,0.172
relax,0.116,0.736,0.899,0.16,0.272
running,0.113,0.605,0.962,0.12,0.213
sad,0.092,0.742,0.862,0.142,0.244
summer,0.154,0.58,0.998,0.155,0.268
work,0.084,0.526,1.0,0.084,0.155
workout,0.085,0.706,0.916,0.111,0.197


## Display the computed evaluation of the user model and the MO-MG protocol

### Multi-output-multi-groundtruth Protocol (MO-MG)

In [47]:
# Report saved by the multi-label audio experiment; the first CSV column
# becomes the index.
MOMG_results = pd.read_csv(audio_multi_exp_dir + "/results_report.csv", index_col=0)
# Show only the metric columns, in the same order as the tables above.
MOMG_results.loc[:, ['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"]]

Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.46,0.56,0.99,0.46,0.63
gym,0.5,0.71,0.9,0.56,0.69
happy,0.35,0.6,0.98,0.36,0.52
night,0.47,0.6,0.98,0.48,0.64
relax,0.45,0.77,0.85,0.58,0.69
running,0.51,0.65,0.95,0.54,0.69
sad,0.33,0.77,0.81,0.48,0.6
summer,0.6,0.6,1.0,0.6,0.75
work,0.47,0.53,1.0,0.47,0.64
workout,0.38,0.75,0.89,0.49,0.63


### User-aware model results

In [50]:
# Report saved by the user-aware experiment; the first CSV column becomes
# the index.
user_results = pd.read_csv(user_exp_dir + "/results_report.csv", index_col=0)
# Show only the metric columns, in the same order as the tables above.
user_results.loc[:, ['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"]]

Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.09,0.61,0.29,0.15,0.19
gym,0.11,0.7,0.22,0.26,0.24
happy,0.06,0.61,0.08,0.12,0.1
night,0.09,0.61,0.1,0.15,0.12
relax,0.12,0.74,0.36,0.28,0.31
running,0.11,0.67,0.15,0.27,0.19
sad,0.09,0.78,0.49,0.26,0.34
summer,0.15,0.64,0.17,0.27,0.21
work,0.08,0.58,0.09,0.12,0.1
workout,0.08,0.7,0.21,0.2,0.21
