In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Saved test-set logits (.npy), one file per model, stored under the
# "*_without_EF" and "*_with_EF" directories. A later cell loads each file
# with np.load and takes the argmax over axis 1 to get predicted labels.
MACBERT_LOGITS_PATH = "./macbert_large_without_EF/predictionB/logits_B_test.npy"
MACBERT_EF_LOGITS_PATH = "./macbert_large_with_EF/predictionB/logits_B_test.npy"

ROBERTA_LOGITS_PATH = "./roberta_large_without_EF/predictionB/logits_B_test.npy"
ROBERTA_EF_LOGITS_PATH = "./roberta_large_with_EF/predictionB/logits_B_test.npy"

BERT_CHN_LOGITS_PATH = "./bert_base_chinese_without_EF/predictionB/logits_B_test.npy"
BERT_CHN_EF_LOGITS_PATH = "./bert_base_chinese_with_EF/predictionB/logits_B_test.npy"

BERT_MULLING_LOGITS_PATH = "./bert_base_multilingual_without_EF/predictionB/logits_B_test.npy"
BERT_MULLING_EF_LOGITS_PATH = "./bert_base_multilingual_with_EF/predictionB/logits_B_test.npy"

# Saved label arrays ("bayes_prediction_label.npy") for the same model
# directories. A later cell loads them directly as the 'label' column of the
# "*_bip" prediction frames (no argmax is applied to these).
MACBERT_PRED_L_PATH = "./macbert_large_without_EF/predictionB/bayes_prediction_label.npy"
MACBERT_EF_PRED_L_PATH = "./macbert_large_with_EF/predictionB/bayes_prediction_label.npy"

ROBERTA_PRED_L_PATH = "./roberta_large_without_EF/predictionB/bayes_prediction_label.npy"
ROBERTA_EF_PRED_L_PATH = "./roberta_large_with_EF/predictionB/bayes_prediction_label.npy"

BERT_CHN_PRED_L_PATH = "./bert_base_chinese_without_EF/predictionB/bayes_prediction_label.npy"
BERT_CHN_EF_PRED_L_PATH = "./bert_base_chinese_with_EF/predictionB/bayes_prediction_label.npy"

BERT_MULLING_PRED_L_PATH = "./bert_base_multilingual_without_EF/predictionB/bayes_prediction_label.npy"
BERT_MULLING_EF_PRED_L_PATH = "./bert_base_multilingual_with_EF/predictionB/bayes_prediction_label.npy"

In [None]:
GOLD_DATA_PATH  = './GOLD_DATA_PATH/Data.txt'
# The gold file is read as JSON Lines: one JSON object per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding (JSON text is UTF-8).
with open(GOLD_DATA_PATH, 'r', encoding='utf-8') as file:
    lines = file.readlines()
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of being handed to json.loads, which would raise on them.
data_B = pd.DataFrame([json.loads(line) for line in lines if line.strip()])
data_B

### Evaluation of model prediction and interpretability

In [None]:
def Jaccard(pred_label, turb_pred_label, gold_label):
    '''
    Consistency of correct predictions between original and perturbed samples.

    The score is
        (# pairs where BOTH predictions equal the gold label)
        / (# pairs where AT LEAST ONE prediction equals the gold label).

    pred_label: list of labels predicted for the original samples
    turb_pred_label: list of labels predicted for the perturbed samples
    gold_label: list of gold labels, compared against both prediction lists

    All three lists must have the same length (checked with an assert).
    Returns a float in [0, 1]; returns 0.0 when no prediction is correct in
    any pair (including empty input), where the ratio would otherwise be 0/0.
    '''
    assert len(pred_label) == len(turb_pred_label) == len(gold_label)
    gold = np.asarray(gold_label)
    ori_correct = np.asarray(pred_label) == gold
    turb_correct = np.asarray(turb_pred_label) == gold
    both_correct = np.sum(ori_correct & turb_correct)
    # "at least one correct" is the same count as N minus "both wrong".
    either_correct = np.sum(ori_correct | turb_correct)
    if either_correct == 0:
        return 0.0
    return float(both_correct / either_correct)

In [None]:
def accuracy(gold_label, pred_label):
    '''
    Score a single prediction: 1.0 if it equals the gold label, otherwise 0.0.

    gold_label: int
    pred_label: int
    '''
    is_match = (gold_label == pred_label)
    # Multiplying by 1.0 turns the comparison result into a float score.
    return is_match * 1.0

import collections
def rationale_macro_f1(gold_rationale, pred_rationale):
    '''
    F1 overlap between one gold rationale and one predicted rationale.

    gold_rationale: list of index
    pred_rationale: list of index

    The overlap is a multiset intersection: an index that appears several
    times counts as many times as it appears in both lists.
    Returns 0.0 when the prediction is empty or nothing overlaps.
    '''
    gold_counts = collections.Counter(gold_rationale)
    pred_counts = collections.Counter(pred_rationale)
    # Counter & Counter keeps the minimum count per index.
    overlap = sum((gold_counts & pred_counts).values())
    if len(pred_rationale) == 0 or overlap == 0:
        return 0.0
    precision = overlap / len(pred_rationale)
    recall = overlap / len(gold_rationale)
    return 2 * (precision * recall) / (precision + recall)

def greater(gold_rationale, pred_rationale):
    '''
    IoU match indicator for one gold / predicted rationale pair.

    gold_rationale: list of index
    pred_rationale: list of index

    Both lists are treated as sets. Returns 1.0 when
    intersection / union is strictly greater than 0.5, otherwise 0.0
    (an IoU of exactly 0.5 is NOT counted as a match).
    Returns 0.0 when both rationales are empty, where the ratio is undefined.

    NOTE(review): some definitions of this metric use ">= 0.5"; confirm which
    threshold is intended before changing the comparison.
    '''
    gold_set = set(gold_rationale)
    union = gold_set.union(pred_rationale)
    if not union:
        # Both inputs empty: avoid dividing by zero.
        return 0.0
    inter = gold_set.intersection(pred_rationale)
    s = len(inter) / len(union)
    return (s > 0.5) * 1.0

def mean_avg_precision(ori_pred_rationale, turb_pred_rationale):
    '''
    Rank agreement between the rationale of an original sample and the
    rationale of its perturbed counterpart.

    ori_pred_rationale: list of index
    turb_pred_rationale: list of index

    For every prefix length k (1..len(turb_pred_rationale)) the fraction of
    the first k perturbed items that also occur among the first k original
    items is computed; the fractions are summed and divided by
    len(turb_pred_rationale) + 1e-6.
    Returns 0.0 for an empty perturbed rationale.
    '''
    n_turb = len(turb_pred_rationale)
    score = 0.0
    if n_turb == 0:
        return score
    for k in range(1, n_turb + 1):
        ori_top_k = ori_pred_rationale[:k]
        hits = sum(1.0 for item in turb_pred_rationale[:k] if item in ori_top_k)
        score += hits / k
    score /= (n_turb + 1e-6)
    return score


In [None]:
def _tokens_at(token_entries, indices):
    '''
    Return element 0 of token_entries[k] for every position k in indices.
    Raises IndexError when a position is outside token_entries.
    '''
    return [token_entries[k][0] for k in indices]


def test_dataframe(pred_B, data_B, return_lists=False):
    '''
    Score one system's predictions against the gold data.

    pred_B: pd.DataFrame including columns {id, label, (optional) rationale_q, (optional) rationale_t}
    data_B: pd.DataFrame including columns {sent_id, sent_label, sample_type, rel_ids,
            rationale_q_idx, rationale_t_idx, text_q_token, text_t_token}
    return_lists: when True, the per-sample score lists and the ids whose
            rationale indices were out of range are added to the result.

    Rows are matched by id (pred_B.id == data_B.sent_id), both for the
    rationale metrics and for the classification metrics. Each id must occur
    exactly once in both frames (Series.item() raises otherwise).

    Returns a dict of percentages rounded to one decimal:
        CLS_Accuracy, CLS_Macro_f1, CLS_Jaccard, Recall (class '1'),
        TNR (recall of class '0'), Rationale_Macro_f1, Rationale_IOU_f1,
        Rationale_MAP.
    With return_lists=True it additionally contains
        Rationale_Macro_f1_list, Rationale_IOU_f1_list, Rationale_MAP_list,
        Rationale_invalid_ids.
    Rationale metrics are 0 when pred_B carries no rationale columns.
    '''
    rationale_macro_f1_list = []
    map_list = []
    pred_l = []
    turb_pred_l = []
    gold_l = []
    greater_list = []
    wrong_list = []
    # Gold / predicted labels paired by id, used for the classification report.
    cls_gold = []
    cls_pred = []

    for sent_id in pred_B.id:
        gold_line = data_B[data_B['sent_id'] == sent_id]
        gold_label = gold_line['sent_label'].item()

        pred_line = pred_B[pred_B['id'] == sent_id]
        pred_label = pred_line['label'].item()

        cls_gold.append(gold_label)
        cls_pred.append(pred_label)

        # Reset per sample so a failed lookup can never leak the tokens of a
        # previously processed sample into the MAP computation below.
        ori_q_tokens = None
        ori_t_tokens = None
        try:
            gold_rationale_q = gold_line['rationale_q_idx'].item()
            gold_rationale_t = gold_line['rationale_t_idx'].item()
            pred_rationale_q = pred_line['rationale_q'].item()
            pred_rationale_t = pred_line['rationale_t'].item()
            q_tokens = _tokens_at(gold_line['text_q_token'].item(), pred_rationale_q)
            t_tokens = _tokens_at(gold_line['text_t_token'].item(), pred_rationale_t)
        except KeyError:   # no rationale reported
            pass
        except IndexError:  # predicted index outside the token list
            wrong_list.append(sent_id)
        else:
            ori_q_tokens, ori_t_tokens = q_tokens, t_tokens
            rationale_macro_f1_list.append(rationale_macro_f1(gold_rationale_q, pred_rationale_q))
            rationale_macro_f1_list.append(rationale_macro_f1(gold_rationale_t, pred_rationale_t))
            greater_list.append(greater(gold_rationale_q, pred_rationale_q))
            greater_list.append(greater(gold_rationale_t, pred_rationale_t))

        # Only original samples list their perturbed counterparts in rel_ids.
        if gold_line.sample_type.item() == 'ori':
            for rel_id in gold_line['rel_ids'].item():
                gold_line_turb = data_B[data_B['sent_id'] == rel_id]
                pred_line_turb = pred_B[pred_B['id'] == rel_id]

                turb_pred_l.append(pred_line_turb['label'].item())
                pred_l.append(pred_label)
                gold_l.append(gold_label)

                try:
                    turb_rationale_q = pred_line_turb['rationale_q'].item()
                    turb_rationale_t = pred_line_turb['rationale_t'].item()
                    turb_q_tokens = _tokens_at(gold_line_turb['text_q_token'].item(), turb_rationale_q)
                    turb_t_tokens = _tokens_at(gold_line_turb['text_t_token'].item(), turb_rationale_t)
                except KeyError:   # no rationale reported
                    continue
                except IndexError:
                    wrong_list.append(rel_id)
                    continue

                if ori_q_tokens is None:
                    # The original sample's rationale could not be resolved,
                    # so there is nothing valid to compare this one against.
                    continue
                map_list.append(mean_avg_precision(ori_q_tokens, turb_q_tokens))
                map_list.append(mean_avg_precision(ori_t_tokens, turb_t_tokens))

    # One report over id-aligned labels; every classification metric reads from it.
    report = classification_report(cls_gold, cls_pred, output_dict=True)

    # `values or [0]` makes an empty score list average to 0.
    metrics = {
            'CLS_Accuracy': round(report['accuracy']*100,1),
            'CLS_Macro_f1': round(report['macro avg']['f1-score']*100,1),
            'CLS_Jaccard': round(Jaccard(pred_l, turb_pred_l, gold_l)*100,1),
            'Recall': round(report['1']['recall']*100,1),
            'TNR': round(report['0']['recall']*100,1),
            'Rationale_Macro_f1': round(np.mean(rationale_macro_f1_list or [0])*100,1),
            'Rationale_IOU_f1': round(np.mean(greater_list or [0])*100,1),
            'Rationale_MAP': round(np.mean(map_list or [0])*100,1),
            }
    if return_lists:
        metrics['Rationale_Macro_f1_list'] = rationale_macro_f1_list
        metrics['Rationale_IOU_f1_list'] = greater_list
        metrics['Rationale_MAP_list'] = map_list
        metrics['Rationale_invalid_ids'] = wrong_list
    return metrics

### Read predicted data

In [None]:
def _parse_index_list(field):
    '''Parse a comma-separated field of integers; an empty field yields [].'''
    return [int(tok) for tok in field.split(',')] if field else []


def _read_rationale_tsv(path):
    '''
    Load a tab-separated prediction file into a DataFrame.

    Each non-blank line holds four tab-separated fields:
    id, label, comma-separated rationale_q indices, comma-separated
    rationale_t indices (either index field may be empty).
    Returns a DataFrame with columns id, label, rationale_q, rationale_t.
    '''
    rows = []
    with open(path, 'r') as f:
        for raw in f:
            if not raw.strip():
                # Skip blank lines (e.g. a trailing newline) instead of failing on int('').
                continue
            fields = raw.replace('\n', '').split('\t')
            rows.append([int(fields[0]), int(fields[1]),
                         _parse_index_list(fields[2]), _parse_index_list(fields[3])])
    return pd.DataFrame(rows, columns=['id','label','rationale_q', 'rationale_t'])


def _read_gpt_answers(path):
    '''
    Load a JSON-lines answer file into a DataFrame.

    Every record must carry a 'rationale' entry whose items 0 and 1 become
    the 'rationale_q' and 'rationale_t' columns; the 'rationale' column
    itself is dropped. All other keys of the records are kept as columns.
    '''
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            if not raw.strip():
                continue
            record = json.loads(raw)
            record['rationale_q'], record['rationale_t'] = record['rationale'][0], record['rationale'][1]
            records.append(record)
    return pd.DataFrame(records).drop(columns=['rationale'])


# Rationale predictions stored as tab-separated text, one frame per method.
model_pred_ERNIEbase_path = "./rationale_results/sim_rationale_erniebase.txt"
pred_ERNIEbase = _read_rationale_tsv(model_pred_ERNIEbase_path)

model_pred_shap_path = "./rationale_results/sim_rationale_shap.txt"
pred_shap = _read_rationale_tsv(model_pred_shap_path)

model_pred_shapprop_path = "./rationale_results/sim_rationale_proportional_shap.txt"
pred_shap_prop = _read_rationale_tsv(model_pred_shapprop_path)

model_pred_ig_path = "./rationale_results/sim_rationale_ig.txt"
pred_ig = _read_rationale_tsv(model_pred_ig_path)

model_pred_igprop_path = "./rationale_results/sim_rationale_proportional_ig.txt"
pred_ig_prop = _read_rationale_tsv(model_pred_igprop_path)

model_pred_lime_path = "./rationale_results/sim_rationale_lime.txt"
pred_lime = _read_rationale_tsv(model_pred_lime_path)

model_pred_limeprop_path = "./rationale_results/sim_rationale_proportional_lime.txt"
pred_lime_prop = _read_rationale_tsv(model_pred_limeprop_path)

model_pred_lexicality_path = "./rationale_results/sim_rationale_lexicality.txt"
pred_lexicality = _read_rationale_tsv(model_pred_lexicality_path)

model_pred_wo_denoise_path = "./rationale_results/sim_rationale_wo_denoising.txt"
wo_denoise_pred = _read_rationale_tsv(model_pred_wo_denoise_path)

model_pred_path_k3 = "./rationale_results/sim_rationale_dual_ranking.txt"
ours_pred_k3 = _read_rationale_tsv(model_pred_path_k3)

# GPT answers stored as JSON lines.
gpt4_pred_path = "./gpt/gpt4_answers/GPT4_ans.txt"
pred_GPT4 = _read_gpt_answers(gpt4_pred_path)

chatgpt_pred_path = "./gpt/chatgpt_answers/ChatGPT_ans.txt"
pred_chatgpt = _read_gpt_answers(chatgpt_pred_path)

In [None]:
# Evaluate every rationale-producing system against the gold data and print
# one metrics dict per system. The labels are padded so the dicts line up.
_rationale_runs = [
    ('macbert_positive_ig    ', pred_ig),
    ('macbert_prop_ig        ', pred_ig_prop),
    ('pred_ChatGPT           ', pred_chatgpt),
    ('pred_GPT4              ', pred_GPT4),
    ('pred_ERNIEbase         ', pred_ERNIEbase),
    ('macbert_positive_shap  ', pred_shap),
    ('macbert_prop_shap      ', pred_shap_prop),
    ('macbert_positive_lime  ', pred_lime),
    ('macbert_prop_lime      ', pred_lime_prop),
    ('macbert_pred_lexicality', pred_lexicality),
    ('macbert_wo_denoise_pred', wo_denoise_pred),
    ('macbert_dual_ranking   ', ours_pred_k3),
]
for _run_label, _run_frame in _rationale_runs:
    print(_run_label, test_dataframe(_run_frame, data_B))

In [None]:
def _frame_from_logits(logits_path):
    '''
    Load saved logits and turn them into a prediction frame.

    Returns (logits, labels, frame): the loaded array, its argmax over
    axis 1, and a DataFrame with columns id (data_B['sent_id']) and label.
    Labels are attached to ids by position, so the saved rows are assumed to
    be in the same order as data_B — TODO confirm.
    '''
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code; only load files from a trusted source.
    logits = np.load(logits_path, allow_pickle=True)
    labels = np.argmax(logits, axis=1)
    frame = pd.DataFrame({'id': data_B['sent_id'], 'label': labels})
    return logits, labels, frame


def _frame_from_labels(labels_path):
    '''
    Load a saved label array and pair it with data_B['sent_id'] by position
    (same row-order assumption as for the logits — TODO confirm).
    '''
    # NOTE(review): same allow_pickle caveat as above; trusted files only.
    return pd.DataFrame({'id': data_B['sent_id'], 'label': np.load(labels_path, allow_pickle=True)})


#MacBERT
macbert_logits, macbert_logits_pred_label, pred_macbert = _frame_from_logits(MACBERT_LOGITS_PATH)
pred_macbert_bip = _frame_from_labels(MACBERT_PRED_L_PATH)
#MacBERT + EF
macbert_EF_logits, macbert_EF_logits_pred_label, pred_EF_macbert = _frame_from_logits(MACBERT_EF_LOGITS_PATH)
pred_EF_macbert_bip = _frame_from_labels(MACBERT_EF_PRED_L_PATH)

#RoBERTa
roberta_logits, roberta_logits_pred_label, pred_roberta = _frame_from_logits(ROBERTA_LOGITS_PATH)
pred_roberta_bip = _frame_from_labels(ROBERTA_PRED_L_PATH)
#RoBERTa + EF
roberta_EF_logits, roberta_EF_logits_pred_label, pred_EF_roberta = _frame_from_logits(ROBERTA_EF_LOGITS_PATH)
pred_EF_roberta_bip = _frame_from_labels(ROBERTA_EF_PRED_L_PATH)

#BERT Chinese
bert_chinese_logits, bert_chinese_logits_pred_label, pred_bert_chinese = _frame_from_logits(BERT_CHN_LOGITS_PATH)
pred_bert_chinese_bip = _frame_from_labels(BERT_CHN_PRED_L_PATH)
#BERT Chinese + EF
bert_chinese_EF_logits, bert_chinese_EF_logits_pred_label, pred_EF_bert_chinese = _frame_from_logits(BERT_CHN_EF_LOGITS_PATH)
pred_EF_bert_chinese_bip = _frame_from_labels(BERT_CHN_EF_PRED_L_PATH)

#BERT multilingual
bert_multilingual_logits, bert_multilingual_logits_pred_label, pred_bert_multilingual = _frame_from_logits(BERT_MULLING_LOGITS_PATH)
pred_bert_multilingual_bip = _frame_from_labels(BERT_MULLING_PRED_L_PATH)
#BERT multilingual + EF
bert_multilingual_EF_logits, bert_multilingual_EF_logits_pred_label, pred_EF_bert_multilingual = _frame_from_logits(BERT_MULLING_EF_LOGITS_PATH)
pred_EF_bert_multilingual_bip = _frame_from_labels(BERT_MULLING_EF_PRED_L_PATH)

In [None]:
# Print the metrics of every classifier variant, one group per base model,
# with a separator line between groups.
_classifier_groups = [
    [
        ('pred_macbert       ', pred_macbert),
        ('pred_macbert+EF    ', pred_EF_macbert),
        ('pred_macbert+BIP   ', pred_macbert_bip),
        ('pred_macbert+EF+BIP', pred_EF_macbert_bip),
    ],
    [
        ('pred_roberta       ', pred_roberta),
        ('pred_roberta+EF    ', pred_EF_roberta),
        ('pred_roberta+BIP   ', pred_roberta_bip),
        ('pred_roberta+EF+BIP', pred_EF_roberta_bip),
    ],
    [
        ('pred_chinese       ', pred_bert_chinese),
        ('pred_chinese+EF    ', pred_EF_bert_chinese),
        ('pred_chinese+BIP   ', pred_bert_chinese_bip),
        ('pred_chinese+EF+BIP', pred_EF_bert_chinese_bip),
    ],
    [
        ('pred_multilingual       ', pred_bert_multilingual),
        ('pred_multilingual+EF    ', pred_EF_bert_multilingual),
        ('pred_multilingual+BIP   ', pred_bert_multilingual_bip),
        ('pred_multilingual+EF+BIP', pred_EF_bert_multilingual_bip),
    ],
]
for _group_no, _group in enumerate(_classifier_groups):
    if _group_no > 0:
        print('-------------------------------------------------')
    for _variant_label, _variant_frame in _group:
        print(_variant_label, test_dataframe(_variant_frame, data_B))

## GPT4 vs. Ours correlation

In [None]:
# Metrics for the dual-ranking predictions (ours_pred_k3).
# NOTE(review): the cells below read the 'Rationale_Macro_f1_list' and
# 'Rationale_MAP_list' entries of this dict; they raise KeyError unless
# test_dataframe includes those per-sample lists in its result.
record_ours = test_dataframe(pred_B=ours_pred_k3, data_B=data_B)

In [None]:
# Metrics for the GPT4 predictions (pred_GPT4).
# NOTE(review): the cells below read the 'Rationale_Macro_f1_list' and
# 'Rationale_MAP_list' entries of this dict; they raise KeyError unless
# test_dataframe includes those per-sample lists in its result.
record_GPT4 = test_dataframe(pred_B=pred_GPT4, data_B=data_B)

In [None]:
# Pair the per-sample scores of both systems column-wise for the plots below.
# NOTE(review): needs the '*_list' entries in record_ours / record_GPT4
# (KeyError otherwise), and pd.DataFrame requires both lists of a pair to
# have the same length.
Ours_and_GPT4_Rationale_Macro_f1 = pd.DataFrame({
    'ours_Rationale_Macro_f1': record_ours['Rationale_Macro_f1_list'],
    'GPT4_Rationale_Macro_f1': record_GPT4['Rationale_Macro_f1_list'],
})
Ours_and_GPT4_Rationale_MAP = pd.DataFrame({
    'ours_Rationale_MAP': record_ours['Rationale_MAP_list'],
    'GPT4_Rationale_MAP': record_GPT4['Rationale_MAP_list'],
})

In [None]:
def _save_correlation_plot(frame, x_col, y_col, x_label, y_label, out_path):
    '''
    Draw a scatter plot with a regression line of frame[y_col] against
    frame[x_col], save it as a PNG to out_path, then show it.

    Each call creates its own figure and axes, so consecutive plots never
    end up on the same axes regardless of how the backend handles plt.show().
    '''
    fig, ax = plt.subplots()
    sns.regplot(x=x_col, y=y_col,
                data=frame,
                line_kws={"color": "C3"}, scatter_kws={'alpha':0.2},
                ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    fig.savefig(out_path, format="png", dpi=700, bbox_inches='tight')
    plt.show()


_save_correlation_plot(Ours_and_GPT4_Rationale_Macro_f1,
                       "ours_Rationale_Macro_f1", "GPT4_Rationale_Macro_f1",
                       "Our Rationale Macro F1", "GPT4 Rationale Macro F1",
                       "./GPT4_vs_Ours_Rationale_F1.png")

_save_correlation_plot(Ours_and_GPT4_Rationale_MAP,
                       "ours_Rationale_MAP", "GPT4_Rationale_MAP",
                       "Our Rationale MAP", "GPT4 Rationale MAP",
                       "./GPT4_vs_Ours_Rationale_MAP.png")