## Audio model evaluation

In [20]:
# Load the ground truth (single-label and multilabel), the predicted probabilities
# and the one-hot outputs, each as a dataframe.
import pandas as pd
from sklearn.metrics import cohen_kappa_score,f1_score,accuracy_score, precision_score, recall_score, classification_report, roc_auc_score, \
    hamming_loss
import numpy as np

# Labels used as the columns of the per-label result tables built in this notebook.
LABELS_LIST = [
    'car',
    'gym',
    'happy',
    'night',
    'relax',
    'running',
    'sad',
    'summer',
    'work',
    'workout',
]

# [TODO] edit the paths below so they point at your own experiment outputs.
# Output directory of the audio experiment (single-label case).
exp_dir = (
    "/src_code/repo/experiments_results/audio_system_single_label/2020-10-01_11-10-06"
)

# Output directory of the multilabel audio experiment.
audio_multi_exp_dir = (
    "/src_code/repo/experiments_results/audio_system_multilabel/2020-10-01_10-12-32"
)
# Output directory of the user-aware experiment.
user_exp_dir = (
    "/src_code/repo/experiments_results/user_aware_system/2020-09-01_14-18-10"
)

In [16]:
# Load the reference test set plus the arrays stored in the experiment directory.
test_ground_truth = pd.read_csv("/src_code/repo/GroundTruth/test_set.csv")


def _load_experiment_array(relative_name):
    """Read one comma-separated array stored under ``exp_dir``."""
    return np.loadtxt(exp_dir + relative_name, delimiter=',')


test_groundtruth_from_model = _load_experiment_array("/test_ground_truth_classes.txt")
user_ids = _load_experiment_array("/user_ids.txt")
track_ids = _load_experiment_array("/tracks_ids.txt")
test_output = _load_experiment_array("/predictions.out")
# NOTE(review): the original author flagged this file with "WHAAAT" -- its
# provenance/contents should be confirmed.
test_output_one_hot = _load_experiment_array("/test_output_one_hot.out")


def _as_test_set_frame(label_values):
    """Return a copy of the test-set frame refilled with the experiment's ids and values.

    ``song_id`` and ``user_id`` are replaced by the ids loaded from the
    experiment directory, and every column from position 2 onward is
    overwritten with ``label_values``.
    """
    frame = test_ground_truth.copy()
    frame.song_id = track_ids
    frame.user_id = user_ids
    frame.iloc[:, 2:] = label_values
    return frame


# Same layout for the ground truth, the raw predictions and the one-hot output.
model_ground_truth = _as_test_set_frame(test_groundtruth_from_model)
model_predictions = _as_test_set_frame(test_output)
output_one_hot = _as_test_set_frame(test_output_one_hot)

### Single-output-single-groundtruth Protocol (SO-SG)

In [19]:
## Per label evaluation of single-output-single-groundtruth
# Columns from position 2 onward are treated as the per-label values (the same
# slice that is filled when the frames are built).
y_true = model_ground_truth.values[:, 2:]
y_score = model_predictions.values[:, 2:]
y_pred = output_one_hot.values[:, 2:]

# Fraction of samples that are positive for each label.
percentage_of_positives_perclass = np.sum(y_true, axis=0) / len(model_ground_truth)

# compute additional metrics (AUC,f1,recall,precision); average=None keeps one value per label
auc_roc_per_label = roc_auc_score(y_true, y_score, average=None)
precision_perlabel = precision_score(y_true, y_pred, average=None)
recall_perlabel = recall_score(y_true, y_pred, average=None)
f1_perlabel = f1_score(y_true, y_pred, average=None)

# Build the table in one step: DataFrame.append no longer exists in pandas >= 2.0,
# and an explicit float matrix guarantees that every label column is numeric so
# the row-wise mean below takes all of them into account.
results_df = pd.DataFrame(
    np.asarray(
        [
            percentage_of_positives_perclass,
            auc_roc_per_label,
            recall_perlabel,
            precision_perlabel,
            f1_perlabel,
        ],
        dtype=float,
    ),
    index=['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"],
    columns=LABELS_LIST,
)
# Average of each metric over all labels.
results_df['average'] = results_df.mean(numeric_only=True, axis=1)
# Display with labels as rows and metrics as columns.
results_df.round(3).T

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Unnamed: 0,Ratio of positive samples,Recall,Precision,f1-score
car,0.0,0.0,0.0,0.0
gym,0.0,0.0,0.0,0.0
happy,0.0,0.0,0.0,0.0
night,0.0,0.0,0.0,0.0
relax,0.0,0.0,0.0,0.0
running,0.0,1.0,0.281,0.439
sad,0.0,0.0,0.0,0.0
summer,0.0,0.0,0.0,0.0
work,0.0,0.0,0.0,0.0
workout,0.0,0.0,0.0,0.0


### Multi-output-single-groundtruth Protocol (MO-SG)

In [9]:
# Evaluate on multi-label output and single-label groundtruth.
# Columns from position 2 onward are treated as the per-label values (the same
# slice that is filled when the frames are built).
y_true = model_ground_truth.values[:, 2:]
y_score = model_predictions.values[:, 2:]

# Binarize the predictions: round to the nearest integer, then clip into [0, 1].
model_output_rounded = np.clip(np.round(y_score), 0, 1)

# Fraction of samples that are positive for each label.
percentage_of_positives_perclass = np.sum(y_true, axis=0) / len(model_ground_truth)

# compute additional metrics (AUC,f1,recall,precision); average=None keeps one value per label
auc_roc_per_label = roc_auc_score(y_true, y_score, average=None)
precision_perlabel = precision_score(y_true, model_output_rounded, average=None)
recall_perlabel = recall_score(y_true, model_output_rounded, average=None)
f1_perlabel = f1_score(y_true, model_output_rounded, average=None)

# Build the table in one step: DataFrame.append no longer exists in pandas >= 2.0,
# and an explicit float matrix guarantees that every label column is numeric so
# the row-wise mean below takes all of them into account.
results_df = pd.DataFrame(
    np.asarray(
        [
            percentage_of_positives_perclass,
            auc_roc_per_label,
            recall_perlabel,
            precision_perlabel,
            f1_perlabel,
        ],
        dtype=float,
    ),
    index=['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"],
    columns=LABELS_LIST,
)
# Average of each metric over all labels.
results_df['average'] = results_df.mean(numeric_only=True, axis=1)
# Display with labels as rows and metrics as columns.
results_df.round(3).T

Unnamed: 0,Ratio of positive samples,Recall,Precision,f1-score
car,0.0,1.0,0.062,0.118
gym,0.0,0.0,0.0,0.0
happy,0.0,1.0,0.094,0.171
night,0.0,1.0,0.094,0.171
relax,0.0,0.0,0.0,0.0
running,0.0,1.0,0.281,0.439
sad,0.0,0.0,0.0,0.0
summer,0.0,0.333,0.2,0.25
work,0.0,1.0,0.188,0.316
workout,0.0,0.0,0.0,0.0


## Display the computed evaluation of the user model and the MO-MG protocol

### Multi-output-multi-groundtruth Protocol (MO-MG)

In [30]:
# Read results_report.csv from the multilabel audio experiment directory, using
# the first CSV column as the index, then show only the metric columns below.
momg_report_path = audio_multi_exp_dir + "/results_report.csv"
momg_metric_columns = ['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"]
MOMG_results = pd.read_csv(momg_report_path, index_col=0)
MOMG_results[momg_metric_columns]

Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.46,0.51,1.0,0.46,0.63
gym,0.5,0.53,0.0,0.0,0.0
happy,0.35,0.51,1.0,0.35,0.52
night,0.47,0.48,0.99,0.47,0.64
relax,0.45,0.48,0.01,0.45,0.02
running,0.51,0.5,1.0,0.51,0.68
sad,0.33,0.56,0.04,0.56,0.08
summer,0.6,0.52,0.35,0.6,0.44
work,0.47,0.5,1.0,0.47,0.64
workout,0.38,0.65,1.0,0.38,0.55


### User-aware model results

In [29]:
# Read results_report.csv from the user-aware experiment directory, using the
# first CSV column as the index, and keep only the metric columns below.
# NOTE(review): the selection is stored but not displayed by the lines visible here.
user_report_path = user_exp_dir + "/results_report.csv"
user_metric_columns = ['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"]
user_results = pd.read_csv(user_report_path, index_col=0)
user_results = user_results[user_metric_columns]