## Audio model evaluation

In [41]:
# load groundtruth (single label + multilabel), probabilities, one hot
# load as dataframe
import pandas as pd
from sklearn.metrics import cohen_kappa_score,f1_score,accuracy_score, precision_score, recall_score, classification_report, roc_auc_score, \
    hamming_loss
import numpy as np

# Label names; the evaluation cells below use them as the column names of the results tables.
LABELS_LIST = [
    'car', 'gym', 'happy', 'night', 'relax',
    'running', 'sad', 'summer', 'work', 'workout',
]

# [TODO] edit paths to match audio experiment output
exp_dir = "/srv/workspace/research/user_based_contexts_tagging/experiments_results/classic_updated_dataset_long/2020-05-08_13-17-43"

# Load the three experiment CSVs as dataframes. Judging by the file names they hold
# the ground truth, the predicted probabilities and the one-hot encoded predictions.
our_ground_truth, our_predictions, one_hoted_df = (
    pd.read_csv(exp_dir + csv_suffix)
    for csv_suffix in (
        "/groundtruth_withIDS.csv",
        "/probabilities_withIDS.csv",
        "/one_hoted_withIDS.csv",
    )
)

              precision    recall  f1-score   support

         car       0.00      0.00      0.00      4355
         gym       0.18      0.38      0.25      5769
       happy       0.00      0.00      0.00      3299
       night       0.05      0.00      0.00      4814
       relax       0.24      0.64      0.35      6262
     running       0.18      0.04      0.06      5577
         sad       0.42      0.00      0.00      4678
      summer       0.19      0.41      0.26      7987
        work       0.11      0.00      0.00      4325
     workout       0.19      0.19      0.19      4038

   micro avg       0.20      0.20      0.20     51104
   macro avg       0.16      0.17      0.11     51104
weighted avg       0.17      0.20      0.13     51104
 samples avg       0.20      0.20      0.20     51104



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Single-output-single-groundtruth Protocol (SO-SG)

In [43]:
## Per label evaluation of single-output-single-groundtruth
# Every matrix below skips the first two CSV columns (`[:, 2:]`); the remaining
# columns are assumed to be in LABELS_LIST order -- TODO confirm against the CSV headers.

# Share of samples marked positive for each label.
percentage_of_positives_perclass = np.asarray(
    our_ground_truth.values[:, 2:].sum(axis=0) / len(our_ground_truth), dtype=float)

# compute additional metrics (AUC,f1,recall,precision)
# AUC is computed from the probabilities; the thresholded metrics use the one-hot predictions.
auc_roc_per_label = roc_auc_score(our_ground_truth.values[:,2:], our_predictions.values[:,2:], average=None)
precision_perlabel = precision_score(our_ground_truth.values[:,2:], one_hoted_df.values[:,2:], average=None)
recall_perlabel = recall_score(our_ground_truth.values[:,2:], one_hoted_df.values[:,2:], average=None)
f1_perlabel = f1_score(our_ground_truth.values[:,2:], one_hoted_df.values[:,2:], average=None)

# Build the whole table in one constructor call. `DataFrame.append` no longer exists
# in pandas >= 2.0, and an explicit float dtype keeps the row-wise mean below from
# depending on how object columns are treated.
results_df = pd.DataFrame(
    [percentage_of_positives_perclass, auc_roc_per_label, recall_perlabel, precision_perlabel, f1_perlabel],
    index=['Ratio of positive samples', "AUC", "Recall", "Precision", "f1-score"],
    columns=LABELS_LIST,
    dtype=float,
)
# Unweighted mean over the labels for each metric row.
results_df['average'] = results_df.mean(numeric_only=True, axis=1)
results_df.round(3).T

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.085,0.545,0.0,0.0,0.0
gym,0.113,0.677,0.378,0.181,0.245
happy,0.065,0.563,0.0,0.0,0.0
night,0.094,0.575,0.0,0.051,0.001
relax,0.122,0.74,0.639,0.241,0.35
running,0.109,0.612,0.039,0.178,0.064
sad,0.092,0.741,0.001,0.417,0.002
summer,0.156,0.577,0.414,0.192,0.262
work,0.085,0.526,0.0,0.105,0.001
workout,0.079,0.717,0.186,0.193,0.189


## Multi-output-single-groundtruth Protocol (MO-SG)

In [49]:
"""
    Evaluate on multi-label output and single-label grountruth
"""
# Create a dataframe where we keep all the evaluations
our_predictions=  pd.read_csv(exp_dir+"/probabilities_withIDS.csv",)
model_output_rounded = np.round(our_predictions.values[:,2:])
model_output_rounded = np.clip(model_output_rounded, 0, 1)
results_df = pd.DataFrame(columns=LABELS_LIST)
results_df.index.astype(str, copy=False)
percentage_of_positives_perclass = sum(our_ground_truth.values[:,2:]) / len(our_ground_truth)
results_df.loc[0] = percentage_of_positives_perclass
results_df.index = ['Ratio of positive samples']

# compute additional metrics (AUC,f1,recall,precision)
auc_roc_per_label = roc_auc_score(our_ground_truth.values[:,2:], our_predictions.values[:,2:], average=None)
precision_perlabel = precision_score(our_ground_truth.values[:,2:], model_output_rounded, average=None)
recall_perlabel = recall_score(our_ground_truth.values[:,2:], model_output_rounded, average=None)
f1_perlabel = f1_score(our_ground_truth.values[:,2:], model_output_rounded, average=None)

results_df = results_df.append(
    pd.DataFrame([auc_roc_per_label,recall_perlabel, precision_perlabel, f1_perlabel], columns=LABELS_LIST))
results_df.index = ['Ratio of positive samples',"AUC", "Recall", "Precision", "f1-score"]
results_df['average'] = results_df.mean(numeric_only=True, axis=1)
results_df.round(3).T
# get plots of confusion matrix

Unnamed: 0,Ratio of positive samples,AUC,Recall,Precision,f1-score
car,0.085,0.545,0.973,0.088,0.162
gym,0.113,0.677,0.934,0.137,0.238
happy,0.065,0.563,0.975,0.066,0.124
night,0.094,0.575,0.993,0.095,0.173
relax,0.122,0.74,0.926,0.163,0.277
running,0.109,0.612,0.957,0.118,0.21
sad,0.092,0.741,0.894,0.137,0.237
summer,0.156,0.577,1.0,0.156,0.27
work,0.085,0.526,1.0,0.085,0.156
workout,0.079,0.717,0.913,0.107,0.192
