In [1]:
import json
import pandas as pd
import numpy as np

from tabulate import tabulate 
from sklearn.metrics import f1_score, precision_score, recall_score

pd.set_option('display.max_colwidth', None)

In [2]:
mace_annotation_df = pd.read_pickle('./data/annotation-results/MACE-measure/final_mace_predictions.pkl')

In [3]:
mace_annotation_df.exp_act_label.value_counts()

(E03) Provide an explanation                679
(E07) Providing Feedback                    285
(E04) Ask for an explanation                142
(E05) Signaling understanding               141
(E02) Testing prior knowledge               112
(E10) Other                                  59
(E01) Testing understanding                  56
(E09) Introducing Extraneous Information     48
(E06) Signaling non-understanding            17
(E08) Providing Assessment                   11
Name: exp_act_label, dtype: int64

In [4]:
def eval_preds(df, models_names, gt_clms, pred_clms):
    results_table = []
    for label in zip(gt_clms, pred_clms, models_names):
        ground_truths = df[label[0]].tolist()
        predictions   = df[label[1]].tolist()
        model_name = label[2]
        
        class_names = df[label[0]].unique()

        prc_scores = precision_score(ground_truths, predictions, average=None, labels=class_names)
        rec_scores = recall_score(ground_truths, predictions, average=None, labels=class_names)
        f1_scores  = f1_score(ground_truths, predictions, average=None, labels=class_names)
        
        macro_prc_scores = precision_score(predictions, ground_truths, average='macro', labels=class_names)
        macro_rec_scores = recall_score(predictions, ground_truths, average='macro', labels=class_names)
        macro_f1 = f1_score(predictions, ground_truths, average='macro', labels=class_names)
        
        scores ={}
        for i, c in enumerate(class_names):
            scores[c] = {'prec': round(prc_scores[i],2), 'recall': round(rec_scores[i],2), 'f1': round(f1_scores[i],2)}
        
        scores['Macro AVG.'] = {'prec': round(macro_prc_scores,2), 'recall': round(macro_rec_scores,2), 'f1': round(macro_f1,2)}
        
        results_table.append([model_name, label[0], scores])
    
    return results_table

In [5]:
mace_annotation_with_seq_preds_df = pd.read_pickle('./model_sequence_labeling/sequence_labeling_preds.pkl')

mace_annotation_with_mt_preds_df  = pd.read_pickle('./multi_task_learning/mt_final_preds.pkl')
mace_annotation_with_mt_preds_df.drop(['topic_func_label', 'dlg_act_label', 'exp_act_label', 'topic'],axis=1, inplace=True)

mace_annotatoin_with_basic_per_preds_df = pd.read_pickle('./majority_and_basic_bert_pred.pkl')
mace_annotatoin_with_basic_per_preds_df.drop(['topic_func_label', 'dlg_act_label', 'exp_act_label', 'topic'],axis=1, inplace=True)

mace_annotation_preds = pd.merge(mace_annotation_with_seq_preds_df, mace_annotatoin_with_basic_per_preds_df, on=['task_id', 'turn_id'])
mace_annotation_preds = pd.merge(mace_annotation_preds, mace_annotation_with_mt_preds_df, on=['task_id', 'turn_id'])

In [6]:
print_scores = ['f1']

In [8]:
results_table = eval_preds(mace_annotation_preds, ['Majority', 'BERT', 'BERT-SEQ', 'BERT-MT'], ['topic_func_label', 'topic_func_label', 'topic_func_label', 'topic_func_label'], 
                           ['topic_func_maj_pred', 'topic_func_label_bert_pred', 'topic_func_label_seq_bert_pred', 'topic-func-lable_mt_pred'])
class_names = sorted(mace_annotation_preds['topic_func_label'].unique().tolist())

print(tabulate([[r[0]] + [r[2][class_name][m] for class_name in class_names + ['Macro AVG.'] for m in print_scores] for r in results_table],
     headers=['app']+ class_names + ['Macro Avg']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


app         (T01) It is the main topic    (T02) A subtopic    (T03) A related topic    (T04) Other - No topic was introduced    Macro Avg
--------  ----------------------------  ------------------  -----------------------  ---------------------------------------  -----------
Majority                          0                   0                        0                                        0.66         0.17
BERT                              0.58                0.11                     0.44                                     0.89         0.51
BERT-SEQ                          0.61                0.13                     0.44                                     0.89         0.52
BERT-MT                           0.39                0.13                     0.32                                     0.73         0.39


In [9]:
results_table = eval_preds(mace_annotation_preds, ['Majority', 'BERT', 'BERT-SEQ', 'BERT-MT'], ['dlg_act_label', 'dlg_act_label', 'dlg_act_label', 'dlg_act_label'], 
                           ['dlg_act_maj_pred', 'dlg_act_label_bert_pred', 'dlg_act_label_seq_bert_pred', 'dlg-act-lable_mt_pred'])
class_names = sorted(mace_annotation_preds['dlg_act_label'].unique().tolist())

print(tabulate([[r[0]] + [r[2][class_name][s] for class_name in class_names + ['Macro AVG.'] for s in print_scores] for r in results_table],
     headers=['app']+[x[0:5] for x in class_names]+ ['Macro Avg']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


app         (D01)    (D02)    (D03)    (D04)    (D05)    (D06)    (D07)    (D08)    (D09)    (D10)    Macro Avg
--------  -------  -------  -------  -------  -------  -------  -------  -------  -------  -------  -----------
Majority     0        0        0        0        0        0        0           0     0.62     0            0.06
BERT         0.76     0.73     0        0.33     0.67     0        0.51        0     0.87     0.57         0.44
BERT-SEQ     0.76     0.72     0        0.35     0.67     0        0.69        0     0.87     0.61         0.47
BERT-MT      0.43     0.46     0.06     0.18     0.19     0.03     0.24        0     0.78     0.23         0.26


In [10]:
results_table = eval_preds(mace_annotation_preds, ['Majority', 'BERT', 'BERT-SEQ', 'BERT-SEQ'], ['exp_act_label', 'exp_act_label', 'exp_act_label', 'exp_act_label'], 
                           ['exp_act_maj_pred', 'exp_act_label_bert_pred', 'exp_act_label_seq_bert_pred', 'exp-act-lable_mt_pred'])
class_names = sorted(mace_annotation_preds['exp_act_label'].unique().tolist())

print(tabulate([[r[0]] + [r[2][class_name][s] for class_name in class_names + ['Macro AVG.'] for s in print_scores] for r in results_table],
     headers=['app']+[x[0:5] for x in class_names]+ ['Macro Avg']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


app         (E01)    (E02)    (E03)    (E04)    (E05)    (E06)    (E07)    (E08)    (E09)    (E10)    Macro Avg
--------  -------  -------  -------  -------  -------  -------  -------  -------  -------  -------  -----------
Majority     0        0        0.61     0        0        0        0        0        0        0            0.06
BERT         0.27     0.64     0.84     0.6      0.29     0.34     0.51     0        0.11     0.5          0.41
BERT-SEQ     0.27     0.64     0.84     0.64     0.33     0.21     0.6      0.15     0.08     0.56         0.43
BERT-SEQ     0.14     0.28     0.78     0.3      0.12     0.07     0.23     0        0.04     0.16         0.21


## Compute significancy:

In [164]:
def check_sig(v1s, v2s, alpha=0.05):
    from scipy import stats
    from statsmodels.stats import weightstats 

    diff = list(map(lambda x1 , x2: x1 - x2, v1s, v2s))
    is_normal = stats.shapiro(diff)[1] > alpha
    
    if is_normal:
        #print('Distribution is normal, so using ttest_rel')
        tstat, pvalue, df = weightstats.ttest_ind(v1s, v2s, alternative='larger')
        #print(tstat, pvalue)
        if tstat >=0:
            if (pvalue) <= alpha:
                return True
            else:
                return False
        else:
            return False

    else:
        #print('Distribution is not normal, so using wilcoxon')
        ttest = stats.wilcoxon(v1s, v2s, alternative='greater')
        
        if ttest.statistic >=0:
            if (ttest.pvalue) <= alpha:
                return True
            else:
                return False
        else:
            return False

#### For Topic labels:

In [180]:
#1. compute f1 scores for 20 slices of the dataframe
#2. extract the f1 scores from the results
#3. compute significancy

results_table = [
    eval_preds(df_slice, ['Majority', 'BERT', 'BERT-SEQ', 'BERT-MT'], ['topic_func_label', 'topic_func_label', 'topic_func_label', 'topic_func_label'], 
                           ['topic_func_maj_pred', 'topic_func_label_bert_pred', 'topic_func_label_seq_bert_pred', 'topic-func-lable_mt_pred'])
for df_slice in np.array_split(mace_annotation_preds, 5)]

bert_seq_res   = [{x[0]: x[1]['f1'] for x in res_item[2][2].items()} for res_item in results_table]
bert_basic_res = [{x[0]: x[1]['f1'] for x in res_item[1][2].items()} for res_item in results_table]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [181]:
for clm in list(bert_basic_res[0].keys()):
    bert_seq_vals = [v[clm] for v in bert_seq_res]
    bert_basic_vals = [v[clm] for v in bert_basic_res]
    print('----- {} -----'.format(clm))
    print(bert_basic_vals)
    print(bert_seq_vals)
    print('Significancy for {} is {}'.format(clm, check_sig(bert_seq_vals, bert_basic_vals, alpha=0.1)))

----- (T04) Other - No topic was introduced -----
[0.87, 0.91, 0.87, 0.93, 0.87]
[0.87, 0.89, 0.88, 0.91, 0.88]
Significancy for (T04) Other - No topic was introduced is False
----- (T01) It is the main topic -----
[0.6, 0.6, 0.77, 0.24, 0.52]
[0.61, 0.72, 0.72, 0.37, 0.54]
Significancy for (T01) It is the main topic is False
----- (T03) A related topic -----
[0.3, 0.46, 0.48, 0.5, 0.41]
[0.38, 0.4, 0.49, 0.53, 0.35]
Significancy for (T03) A related topic is False
----- (T02) A subtopic -----
[0.0, 0.11, 0.19, 0.14, 0.11]
[0.23, 0.17, 0.18, 0.0, 0.0]
Significancy for (T02) A subtopic is False
----- Macro AVG. -----
[0.44, 0.52, 0.58, 0.45, 0.48]
[0.52, 0.54, 0.57, 0.45, 0.44]
Significancy for Macro AVG. is False




-----

### For Dialogue label:

In [182]:
results_table = [
    eval_preds(df_slice, ['Majority', 'BERT', 'BERT-SEQ', 'BERT-MT'], ['dlg_act_label', 'dlg_act_label', 'dlg_act_label', 'dlg_act_label'], 
                           ['dlg_act_maj_pred', 'dlg_act_label_bert_pred', 'dlg_act_label_seq_bert_pred', 'dlg-act-lable_mt_pred'])
    for df_slice in np.array_split(mace_annotation_preds, 5)]

bert_seq_res   = [{x[0]: x[1]['f1'] for x in res_item[2][2].items()} for res_item in results_table]
bert_basic_res = [{x[0]: x[1]['f1'] for x in res_item[1][2].items()} for res_item in results_table]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [183]:
for clm in list(bert_basic_res[0].keys()):
    bert_seq_vals = [v[clm] if clm in v else 0 for v in bert_seq_res]
    bert_basic_vals = [v[clm] if clm in v else 0 for v in bert_basic_res]
    print('----- {} -----'.format(clm))
    print(bert_basic_vals)
    print(bert_seq_vals)
    print('Significancy for {} is {}'.format(clm, check_sig(bert_seq_vals, bert_basic_vals, alpha=0.05)))

----- (D09) To provide informing statement -----
[0.93, 0.84, 0.88, 0.88, 0.81]
[0.9, 0.84, 0.86, 0.89, 0.82]
Significancy for (D09) To provide informing statement is False
----- (D07) To provide agreement statement -----
[0.59, 0.46, 0.49, 0.59, 0.43]
[0.74, 0.7, 0.63, 0.79, 0.6]
Significancy for (D07) To provide agreement statement is True
----- (D01) To ask a check question -----
[0.72, 0.68, 0.8, 0.75, 0.82]
[0.7, 0.7, 0.77, 0.77, 0.81]
Significancy for (D01) To ask a check question is False
----- (D05) To answer a question by disconfirming -----
[0.67, 0.75, 0.84, 0.67, 0.33]
[0.67, 0.75, 0.78, 0.67, 0.46]
Significancy for (D05) To answer a question by disconfirming is False
----- (D02) To ask what/how question -----
[0.71, 0.74, 0.81, 0.65, 0.72]
[0.73, 0.77, 0.74, 0.62, 0.71]
Significancy for (D02) To ask what/how question is False
----- (D06) To answer - Other -----
[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]
Significancy for (D06) To answer - Other is False
----- (D04)

  tstat = (value1 - value2 - diff) / std_diff


-------

### For Explanation move labels:

In [184]:
results_table = [ 
    eval_preds(df_slice, ['Majority', 'BERT', 'BERT-SEQ', 'BERT-SEQ'], ['exp_act_label', 'exp_act_label', 'exp_act_label', 'exp_act_label'], 
                           ['exp_act_maj_pred', 'exp_act_label_bert_pred', 'exp_act_label_seq_bert_pred', 'exp-act-lable_mt_pred'])
    for df_slice in np.array_split(mace_annotation_preds, 5)]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [186]:
bert_seq_res   = [{x[0]: x[1]['f1'] for x in res_item[2][2].items()} for res_item in results_table]
bert_basic_res = [{x[0]: x[1]['f1'] for x in res_item[1][2].items()} for res_item in results_table]

for clm in list(bert_basic_res[0].keys()):
    bert_seq_vals = [v[clm] if clm in v else 0 for v in bert_seq_res]
    bert_basic_vals = [v[clm] if clm in v else 0 for v in bert_basic_res]
    print('----- {} -----'.format(clm))
    print(bert_basic_vals)
    print(bert_seq_vals)
    print('Significancy for {} is {}'.format(clm, check_sig(bert_seq_vals, bert_basic_vals, alpha=0.1)))

----- (E02) Testing prior knowledge -----
[0.47, 0.77, 0.73, 0.52, 0.67]
[0.5, 0.71, 0.72, 0.62, 0.62]
Significancy for (E02) Testing prior knowledge is False
----- (E03) Provide an explanation -----
[0.88, 0.81, 0.82, 0.84, 0.86]
[0.88, 0.8, 0.82, 0.83, 0.85]
Significancy for (E03) Provide an explanation is False
----- (E05) Signaling understanding -----
[0.19, 0.3, 0.23, 0.33, 0.34]
[0.19, 0.53, 0.05, 0.39, 0.14]
Significancy for (E05) Signaling understanding is False
----- (E09) Introducing Extraneous Information -----
[0.13, 0.22, 0.08, 0.0, 0]
[0.15, 0.0, 0.19, 0.0, 0]
Significancy for (E09) Introducing Extraneous Information is False
----- (E07) Providing Feedback -----
[0.63, 0.39, 0.54, 0.52, 0.51]
[0.73, 0.42, 0.57, 0.62, 0.63]
Significancy for (E07) Providing Feedback is True
----- (E01) Testing understanding -----
[0.2, 0.29, 0.35, 0.29, 0.22]
[0.18, 0.27, 0.2, 0.25, 0.39]
Significancy for (E01) Testing understanding is False
----- (E10) Other -----
[0.67, 0.41, 0.63, 0.35, 

