# Evaluate Explanations

This notebook evaluates the explanations generated by GPT-3 (and, for comparison, GPT-2) using the metrics suggested by Kunz (2022), "Human Ratings Do Not Reflect Downstream Utility".

## Import Libraries

In [4]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import openai
import re
import time
from scripts.prepare_data_helpers import prepare_examples, create_query
import spacy
import textacy
from dotenv import load_dotenv
from bert_score import score
import transformers

# Load variables from a local .env file into the process environment
# (presumably API credentials used elsewhere in the project — TODO confirm).
load_dotenv()
# Small English spaCy pipeline; used further down to lemmatise explanations
# and to filter out punctuation tokens.
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import xgboost as xgb
import shap

  from .autonotebook import tqdm as notebook_tqdm


## Prepare Data

In [5]:
# Read the e-SNLI splits; the training data comes as two CSV parts.
train1 = pd.read_csv('../../e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../../e-SNLI/dataset/esnli_train_2.csv')
dev = pd.read_csv('../../e-SNLI/dataset/esnli_dev.csv')
test = pd.read_csv('../../e-SNLI/dataset/esnli_test.csv')

# Stack both training parts, discard rows that lack either sentence or the
# first explanation, and renumber the surviving rows from 0.
train = (
    pd.concat([train1, train2])
    .dropna(subset=['Sentence1', 'Sentence2', 'Explanation_1'])
    .reset_index(drop=True)
)

# Seeded draw of 1000 distinct test rows so the subsample is reproducible.
np.random.seed(12345) # seed for numpy package
test_indices = list(np.random.choice(test.index.values, size=1000, replace=False))
test = test.loc[test_indices].reset_index(drop=True)

In [6]:
gpt_pred = pd.read_csv("../output_data/svo_structure_4.csv", sep=";")
gpt2 = pd.read_csv("../output_data/svo_structure_gpt2.csv", sep=";")


def flatten_list_string(text):
    """Turn a stringified list such as "['a', 'b']" into the string "a b".

    Quotes and *all* spaces are removed before the commas are turned into
    separators, so an item that itself contains a space is fused into one word.
    """
    without_quotes = text.replace("'", " ")
    without_spaces = without_quotes.replace(" ", "")
    without_brackets = without_spaces.strip("][")
    return without_brackets.replace(",", " ")


# Transform list to single string

# GPT-3 predictions and gold explanations
for column in ("reduced_expl", "pos_expl", "reduced_gold_expl", "pos_gold_expl"):
    gpt_pred[column] = [flatten_list_string(cell) for cell in gpt_pred[column]]

# GPT-2
for column in ("reduced_expl", "pos_expl"):
    gpt2[column] = [flatten_list_string(cell) for cell in gpt2[column]]

# Expose the GPT-2 labels under the same column name the GPT-3 frame uses.
gpt2["pred_label"] = gpt2.labels

## BERTScore

In [40]:
# BERTScore of every generated explanation against its reference explanation.
# score() returns (precision, recall, F1); only the F1 tensor is kept.
_, _, F1_gpt3 = score(
    cands=gpt_pred.pred_explanation.tolist(),
    refs=gpt_pred.Explanation_1.tolist(),
    lang="en",
    verbose=True,
)
_, _, F1_gpt2 = score(
    cands=gpt2.generated.tolist(),
    refs=gpt2.explanations.tolist(),
    lang="en",
    verbose=True,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


100%|██████████| 32/32 [01:17<00:00,  2.43s/it]


computing greedy matching.


100%|██████████| 16/16 [00:00<00:00, 115.11it/s]


done in 78.02 seconds, 12.82 sentences/sec


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


100%|██████████| 31/31 [01:08<00:00,  2.21s/it]


computing greedy matching.


100%|██████████| 16/16 [00:00<00:00, 124.90it/s]


done in 68.60 seconds, 14.58 sentences/sec


In [41]:
# Mean BERTScore F1 per model, printed in the order GPT-2, then GPT-3.
for f1_scores in (F1_gpt2, F1_gpt3):
    print(f1_scores.mean())

tensor(0.9136)
tensor(0.9025)


## Surface Features

In [44]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Explanation length in words: GPT-3 prediction, gold reference, GPT-2 output.
nwords = gpt_pred.pred_explanation.apply(count_words)
nwords_gold = gpt_pred.Explanation_1.apply(count_words)
nwords_gpt2 = gpt2.generated.apply(count_words)

# Explanation length in characters for the same three sources.
nchars = gpt_pred.pred_explanation.apply(len)
nchars_gold = gpt_pred.Explanation_1.apply(len)
nchars_gpt2 = gpt2.generated.apply(len)

# Report the averages, words first, then characters.
print(f"Average number of words predicted explanations {nwords.mean()}")
print(f"Average number of words gold explanations {nwords_gold.mean()}")
print(f"Average number of words gpt2 explanations {nwords_gpt2.mean()}")
print(" ")

print(f"Average number of characters predicted explanations {nchars.mean()}")
print(f"Average number of characters gold explanations {nchars_gold.mean()}")
print(f"Average number of characters gpt2 explanations {nchars_gpt2.mean()}")

Average number of words predicted explanations 14.382
Average number of words gold explanations 12.916
Average number of words gpt2 explanations 11.782
 
Average number of characters predicted explanations 76.974
Average number of characters gold explanations 67.122
Average number of characters gpt2 explanations 60.293


## GPT-3

In [30]:
def lemma_vocabulary(text):
    """Return the sorted, de-duplicated lemmas of ``text``, punctuation excluded."""
    lemmas = [tok.lemma_ for tok in nlp(text) if not tok.is_punct]
    return np.unique(lemmas)


# Per-example vocabularies of the GPT-3 explanations and the gold explanations.
pred_expl_vocab = gpt_pred.pred_explanation.apply(lemma_vocabulary)
Expl1_vocab = gpt_pred.Explanation_1.apply(lemma_vocabulary)

In [46]:
# Per-example lexical overlap: the share of the predicted explanation's unique
# lemmas that also occur in the gold explanation.
overlap_scores = []
for pred, gold in zip(pred_expl_vocab, Expl1_vocab):
    shared = len(set(pred) & set(gold))
    # A prediction that is empty or consists only of punctuation has an empty
    # vocabulary; score it 0.0 instead of raising ZeroDivisionError.
    overlap_scores.append(shared / len(pred) if len(pred) else 0.0)
overlap = np.array(overlap_scores)

In [48]:
# Average per-example overlap across all explanations.
np.mean(overlap)

0.47741691306320316

In [55]:
# Corpus-level vocabularies: the union of all per-example lemma arrays.
pred_expl_vocab_total = set().union(*pred_expl_vocab)
Expl1_vocab_total = set().union(*Expl1_vocab)

# Share of the predicted vocabulary that also appears in the gold vocabulary.
shared_vocab = pred_expl_vocab_total & Expl1_vocab_total
overlap_total = len(shared_vocab) / len(pred_expl_vocab_total)

print(f"Total vocabulary size predicted explanations: {len(pred_expl_vocab_total)}")
print(f"Total vocabulary size gold explanations: {len(Expl1_vocab_total)}")
print(f"Total overlap: {overlap_total}")

Total vocabulary size predicted explanations: 1664
Total vocabulary size gold explanations: 1616
Total overlap: 0.7379807692307693


In [60]:
# Label accuracy: fraction of rows whose predicted label equals the gold label.
(gpt_pred.pred_label == gpt_pred.gold_label).mean()

0.79

## GPT-2

In [45]:
def unique_lemmas(text):
    """Return the sorted unique lemmas of ``text`` after dropping punctuation tokens."""
    content_tokens = filter(lambda tok: not tok.is_punct, nlp(text))
    return np.unique([tok.lemma_ for tok in content_tokens])


# Per-example vocabularies of the GPT-2 explanations and their references.
pred_expl_vocab = gpt2.generated.apply(unique_lemmas)
Expl1_vocab = gpt2.explanations.apply(unique_lemmas)

In [46]:
# Per-example lexical overlap: the share of the generated explanation's unique
# lemmas that also occur in the reference explanation.
overlap_scores = []
for pred, gold in zip(pred_expl_vocab, Expl1_vocab):
    shared = len(set(pred) & set(gold))
    # A generation that is empty or consists only of punctuation has an empty
    # vocabulary; score it 0.0 instead of raising ZeroDivisionError.
    overlap_scores.append(shared / len(pred) if len(pred) else 0.0)
overlap = np.array(overlap_scores)

In [47]:
# Average per-example overlap across all explanations.
np.mean(overlap)

0.6079847222075813

In [48]:
# Corpus-level vocabularies: every lemma seen in any per-example array.
pred_expl_vocab_total = {lemma for vocab in pred_expl_vocab for lemma in vocab}
Expl1_vocab_total = {lemma for vocab in Expl1_vocab for lemma in vocab}

# Share of the generated vocabulary that also appears in the reference vocabulary.
n_shared = len(pred_expl_vocab_total.intersection(Expl1_vocab_total))
overlap_total = n_shared / len(pred_expl_vocab_total)

print(f"Total vocabulary size predicted explanations: {len(pred_expl_vocab_total)}")
print(f"Total vocabulary size gold explanations: {len(Expl1_vocab_total)}")
print(f"Total overlap: {overlap_total}")

Total vocabulary size predicted explanations: 1462
Total vocabulary size gold explanations: 1620
Total overlap: 0.8091655266757866
