# Download e-SNLI dev and test sets


In [None]:
%cd ../

In [3]:
!wget https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_test.csv

--2022-01-02 11:17:55--  https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7438107 (7.1M) [text/plain]
Saving to: ‘esnli_test.csv’


2022-01-02 11:17:57 (4.90 MB/s) - ‘esnli_test.csv’ saved [7438107/7438107]



In [4]:
!wget https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_dev.csv

--2022-01-02 11:18:26--  https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_dev.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7501310 (7.2M) [text/plain]
Saving to: ‘esnli_dev.csv’


2022-01-02 11:18:29 (3.55 MB/s) - ‘esnli_dev.csv’ saved [7501310/7501310]



# Multilingual Plausiblity

## Find threshold on e-SNLI test set

In [2]:
from eval import multilingual_plausibility
from explainli.config import AttributionMethods, AttributionConfig, AggregationMethods, ForwardScoringOptions
from explainli.explainli import NLIAttribution

model_name = 'textattack/bert-base-uncased-snli'

In [3]:
attr_config = AttributionConfig(AttributionMethods.Saliency, remove_pad_tokens=True,
                                             remove_cls_token=True, remove_sep_tokens=True, join_subwords=True,
                                             normalize_scores=True,
                                             forward_scoring= ForwardScoringOptions.LOSS,
                                             aggregation_method=AggregationMethods.L2,
                                             label_names=['entailment', 'neutral', 'contradiction'])

attribution = NLIAttribution(model_name=model_name, config=attr_config)


In [4]:
f1, threshold = multilingual_plausibility.find_threshold_on_esnli(attribution, 'esnli_test.csv', 96)
print(f"best F1: {f1}, best threshold: {threshold}")

100%|██████████| 103/103 [00:43<00:00,  2.38it/s]


best F1: 0.48759531736655565, best threshold: 0.1673865020275116


## Create multilingual rationales dataset

### Initialize multilingual NLI model, attribution method and load XNLI test split

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

nli_model_name = '../explainli/bert-base-multilingual-finetuned-mnli'

dataset = load_dataset('xnli', 'all_languages', split='test')


attr_config = AttributionConfig(AttributionMethods.Saliency, remove_pad_tokens=True,
                                             remove_cls_token=True, remove_sep_tokens=False, join_subwords=True,
                                             normalize_scores=True,
                                             forward_scoring= ForwardScoringOptions.LOSS,
                                             aggregation_method=AggregationMethods.L2,
                                             label_names=['entailment', 'neutral', 'contradiction'])

attribution = NLIAttribution(model_name=nli_model_name, config=attr_config)

Reusing dataset xnli (/home/pyanardag/.cache/huggingface/datasets/xnli/all_languages/1.1.0/243f155ecab4d4f6e82e4eeab62b8c6b1f7abfcb8ed7fcc1661be8e25b117404)


### Extract rationales for source language, English by default, part of XNLI, align highlights to other languages and create dataset

In [6]:
word_aligner = '../explainli/awesome-align-finetuned-wo-co'

pairs, labels, src_highlight_idxs = multilingual_plausibility.extract_rationales(attribution, dataset, threshold, 10)
pairs, labels, translated_highlight_idxs = multilingual_plausibility.align_rationales(pairs, labels, src_highlight_idxs, word_aligner)

tokenizer = AutoTokenizer.from_pretrained(word_aligner)

multilingual_plausibility.create_dataset('exnli_test.csv', attribution, pairs, labels, translated_highlight_idxs, tokenizer)

5010it [00:00, 107744.22it/s]
100%|██████████| 501/501 [00:36<00:00, 13.74it/s]
Some weights of the model checkpoint at ../explainli/awesome-align-finetuned-wo-co were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'guide_layer.linear.weight', 'guide_layer.linear2.weight', 'guide_layer.linear.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'psi_cls.bias', 'cls.predictions.decoder.bias', 'psi_cls.transform.bias', 'psi_cls.decoder.weight', 'guide_layer.linear2.bias', 'cls.predictions.bias', 'psi_cls.transform.weight', 'psi_cls.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the che

### Sample from dataset

In [8]:
import pandas as pd
df = pd.read_csv('exnli_test.csv', engine='python', encoding='utf-8')
df[df.language == 'en']

Unnamed: 0,language,label,premise,hypothesis,premise_highlighted,hypothesis_highlighted
20040,en,contradiction,"Well, I wasn't even thinking about that, but I...",I havent spoken to him again.,"Well , I wasn ' t even thinking about that , b...",I *havent* *spoken* to him *again* .
20041,en,entailment,"Well, I wasn't even thinking about that, but I...",I was so upset that I just started talking to ...,"Well , I wasn ' t even thinking about that , b...",I was so *upset* that I *just* *started* talki...
20042,en,neutral,"Well, I wasn't even thinking about that, but I...",We had a great talk.,"Well , I wasn ' t even thinking about that , b...",We had a *great* *talk* .
20043,en,neutral,"And I thought that was a privilege, and it's s...",I was not aware that I was not the only person...,"And I *thought* that was a *privilege* , and i...",I was not *aware* that I was not the only pers...
20044,en,entailment,"And I thought that was a privilege, and it's s...",I was under the impression that I was the only...,"And I *thought* that was a *privilege* , and i...",I was *under* the *impression* that I was the ...
...,...,...,...,...,...,...
25045,en,entailment,Davidson should not adopt the pronunciation of...,Davidson shouldn't talk in a way where bone an...,Davidson should not *adopt* the *pronunciation...,Davidson *shouldn* ' t *talk* in a way where b...
25046,en,contradiction,Davidson should not adopt the pronunciation of...,It would be better if Davidson rhymed the word...,Davidson *should* not *adopt* the *pronunciati...,It *would* be *better* *if* Davidson *rhymed* ...
25047,en,neutral,"The average novel of 200,000 words for $25 wor...","A 200,000 word novel at $25 is a fair price.","The average novel of 200 , 000 words for $ 25 ...","A 200 , 000 *word* novel at $ 25 is a *fair* *..."
25048,en,contradiction,"The average novel of 200,000 words for $25 wor...","A 200,000 word novel for $25 is 4,000 words pe...","The average novel of 200 , 000 words for $ 25 ...","A 200 , 000 *word* novel for $ 25 is *4* , 000..."


## Evaluate Multilingual Plausibility for all languages

In [9]:
# clear previous attribution records
attribution.records.clear()
attribution.config.remove_sep_tokens = True
# get xnli languages
languages = dataset['hypothesis'][0]['language']
map_scores = multilingual_plausibility.evaluate_multilingual_plausibility('exnli_test.csv', attribution, languages, 8)

ar


100%|██████████| 627/627 [00:46<00:00, 13.54it/s]


MAP score for ar: 0.6632450105527579
bg


100%|██████████| 627/627 [00:48<00:00, 12.96it/s]


MAP score for bg: 0.7005667630172827
de


100%|██████████| 627/627 [00:44<00:00, 14.04it/s]


MAP score for de: 0.7320776542448928
el


100%|██████████| 627/627 [00:58<00:00, 10.67it/s]


MAP score for el: 0.6959715779373153
en


100%|██████████| 627/627 [00:43<00:00, 14.34it/s]


MAP score for en: 1.0
es


100%|██████████| 627/627 [00:44<00:00, 14.06it/s]


MAP score for es: 0.7664286772544779
fr


100%|██████████| 627/627 [00:45<00:00, 13.68it/s]


MAP score for fr: 0.7391015385448011
hi


100%|██████████| 627/627 [00:52<00:00, 11.96it/s]


MAP score for hi: 0.6040199596274182
ru


100%|██████████| 627/627 [00:47<00:00, 13.17it/s]
  recall = tps / tps[-1]


MAP score for ru: 0.6857346896929614
sw


100%|██████████| 627/627 [00:47<00:00, 13.19it/s]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


MAP score for sw: 0.5798971496373715
th


100%|██████████| 627/627 [01:13<00:00,  8.50it/s]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


MAP score for th: 0.931514867206828
tr


100%|██████████| 627/627 [00:46<00:00, 13.50it/s]


MAP score for tr: 0.6652962202865744
ur


100%|██████████| 627/627 [00:50<00:00, 12.50it/s]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]
  recall = tps / tps[-1]


MAP score for ur: 0.5745013451420691
vi


100%|██████████| 627/627 [00:45<00:00, 13.83it/s]


MAP score for vi: 0.5717117424783905
zh


100%|██████████| 627/627 [00:45<00:00, 13.74it/s]
  recall = tps / tps[-1]


MAP score for zh: 0.5432442886485477


In [10]:
for lang, map_score in map_scores.items():
  print(f"{lang} MAP score: {map_score} ")

ar MAP score: 0.6632450105527579 
bg MAP score: 0.7005667630172827 
de MAP score: 0.7320776542448928 
el MAP score: 0.6959715779373153 
en MAP score: 1.0 
es MAP score: 0.7664286772544779 
fr MAP score: 0.7391015385448011 
hi MAP score: 0.6040199596274182 
ru MAP score: 0.6857346896929614 
sw MAP score: 0.5798971496373715 
th MAP score: 0.931514867206828 
tr MAP score: 0.6652962202865744 
ur MAP score: 0.5745013451420691 
vi MAP score: 0.5717117424783905 
zh MAP score: 0.5432442886485477 
