# Download e-SNLI dev and test sets


In [None]:
# Switch the notebook's working directory to the parent folder; the relative
# paths and the `eval` / `explainli` imports in later cells are resolved from there.
%cd ../

In [None]:
# Download the e-SNLI test split CSV from the e-SNLI GitHub repository into the
# current working directory; it is read later as 'esnli_test.csv' by the threshold search.
!wget https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_test.csv

In [None]:
# Download the e-SNLI dev split CSV from the e-SNLI GitHub repository into the
# current working directory.
!wget https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_dev.csv

# Multilingual Plausibility

## Find threshold on e-SNLI test set

In [None]:
from eval import multilingual_plausibility
from explainli.config import AttributionMethods, AttributionConfig, AggregationMethods, ForwardScoringOptions
from explainli.explainli import NLIAttribution

# Model identifier handed to NLIAttribution for the e-SNLI threshold search.
# Presumably a Hugging Face Hub checkpoint (BERT fine-tuned on SNLI, judging by the name) — TODO confirm.
model_name = 'textattack/bert-base-uncased-snli'

In [None]:
# Saliency attribution settings used for the e-SNLI threshold search:
# pad, [CLS] and [SEP] tokens are all removed, subwords are joined back
# together and the scores are normalized.
attr_config = AttributionConfig(
    AttributionMethods.Saliency,
    remove_pad_tokens=True,
    remove_cls_token=True,
    remove_sep_tokens=True,
    join_subwords=True,
    normalize_scores=True,
    forward_scoring=ForwardScoringOptions.LOSS,
    aggregation_method=AggregationMethods.L2,
    # NOTE(review): assumes this order matches the checkpoint's output label
    # indices — confirm against the model's label mapping.
    label_names=['entailment', 'neutral', 'contradiction'],
)

# Attribution wrapper built around the SNLI model named by `model_name`.
attribution = NLIAttribution(model_name=model_name, config=attr_config)


In [None]:
# Search for the best-scoring threshold on the e-SNLI test split downloaded above.
# The last positional argument (96) is presumably the batch size — the recorded
# run below shows 103 progress steps — TODO confirm against find_threshold_on_esnli.
f1, threshold = multilingual_plausibility.find_threshold_on_esnli(
    attribution,
    'esnli_test.csv',
    96,
)
print(f"best F1: {f1}, best threshold: {threshold}")

100%|██████████| 103/103 [00:43<00:00,  2.38it/s]


best F1: 0.48759531736655565, best threshold: 0.1673865020275116


## Create multilingual rationales dataset

### Initialize multilingual NLI model, attribution method and load XNLI test split

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# Path to the fine-tuned multilingual NLI checkpoint, relative to the working directory.
nli_model_name = '../explainli/bert-base-multilingual-finetuned-mnli'

# XNLI test split with every language bundled into each example.
dataset = load_dataset('xnli', 'all_languages', split='test')

# Same saliency settings as for the threshold search, except that separator
# tokens are kept here (remove_sep_tokens=False); the evaluation cell further
# down switches this flag back to True on the live config.
attr_config = AttributionConfig(
    AttributionMethods.Saliency,
    remove_pad_tokens=True,
    remove_cls_token=True,
    remove_sep_tokens=False,
    join_subwords=True,
    normalize_scores=True,
    forward_scoring=ForwardScoringOptions.LOSS,
    aggregation_method=AggregationMethods.L2,
    # NOTE(review): assumes this order matches the checkpoint's output label
    # indices — confirm against the model's label mapping.
    label_names=['entailment', 'neutral', 'contradiction'],
)

# Rebinds `attribution` to the multilingual model; the SNLI-based instance is replaced.
attribution = NLIAttribution(model_name=nli_model_name, config=attr_config)

### Extract rationales for the source language (English by default) from XNLI, align the highlights to the other languages, and create the dataset

In [None]:
# Path to the word-aligner checkpoint; it is used both for aligning the
# rationales and for loading the tokenizer passed to create_dataset.
word_aligner = '../explainli/awesome-align-finetuned-wo-co'

# Step 1: extract source-language rationales using the threshold found on e-SNLI.
# The last positional argument (10) is presumably a batch size — TODO confirm.
pairs, labels, src_highlight_idxs = multilingual_plausibility.extract_rationales(
    attribution,
    dataset,
    threshold,
    10,
)

# Step 2: align the source highlights to the translations.
# `pairs` and `labels` are rebound to the values returned by the aligner.
pairs, labels, translated_highlight_idxs = multilingual_plausibility.align_rationales(
    pairs,
    labels,
    src_highlight_idxs,
    word_aligner,
)

# Step 3: write the aligned rationales out as 'exnli_test.csv'.
tokenizer = AutoTokenizer.from_pretrained(word_aligner)

multilingual_plausibility.create_dataset(
    'exnli_test.csv',
    attribution,
    pairs,
    labels,
    translated_highlight_idxs,
    tokenizer,
)

### Sample from dataset

In [None]:
import pandas as pd

# Load the generated rationale dataset and display only its English rows.
df = pd.read_csv('exnli_test.csv', engine='python', encoding='utf-8')
df.loc[df['language'] == 'en']

Unnamed: 0,language,label,premise,hypothesis,premise_highlighted,hypothesis_highlighted
20040,en,contradiction,"Well, I wasn't even thinking about that, but I...",I havent spoken to him again.,"Well , I wasn ' t even thinking about that , b...",I *havent* *spoken* to him *again* .
20041,en,entailment,"Well, I wasn't even thinking about that, but I...",I was so upset that I just started talking to ...,"Well , I wasn ' t even thinking about that , b...",I was so *upset* that I *just* *started* talki...
20042,en,neutral,"Well, I wasn't even thinking about that, but I...",We had a great talk.,"Well , I wasn ' t even thinking about that , b...",We had a *great* *talk* .
20043,en,neutral,"And I thought that was a privilege, and it's s...",I was not aware that I was not the only person...,"And I *thought* that was a *privilege* , and i...",I was not *aware* that I was not the only pers...
20044,en,entailment,"And I thought that was a privilege, and it's s...",I was under the impression that I was the only...,"And I *thought* that was a *privilege* , and i...",I was *under* the *impression* that I was the ...
...,...,...,...,...,...,...
25045,en,entailment,Davidson should not adopt the pronunciation of...,Davidson shouldn't talk in a way where bone an...,Davidson should not *adopt* the *pronunciation...,Davidson *shouldn* ' t *talk* in a way where b...
25046,en,contradiction,Davidson should not adopt the pronunciation of...,It would be better if Davidson rhymed the word...,Davidson *should* not *adopt* the *pronunciati...,It *would* be *better* *if* Davidson *rhymed* ...
25047,en,neutral,"The average novel of 200,000 words for $25 wor...","A 200,000 word novel at $25 is a fair price.","The average novel of 200 , 000 words for $ 25 ...","A 200 , 000 *word* novel at $ 25 is a *fair* *..."
25048,en,contradiction,"The average novel of 200,000 words for $25 wor...","A 200,000 word novel for $25 is 4,000 words pe...","The average novel of 200 , 000 words for $ 25 ...","A 200 , 000 *word* novel for $ 25 is *4* , 000..."


## Evaluate Multilingual Plausibility for all languages

In [None]:
# Clear previous attribution records before running the evaluation.
attribution.records.clear()
# The config was created with remove_sep_tokens=False for dataset creation;
# switch to removing separator tokens for the evaluation.
attribution.config.remove_sep_tokens = True
# Get the XNLI language codes from the first example. Indexing the row first
# (dataset[0][...]) reads a single example instead of pulling the whole
# 'hypothesis' column just to look at its first entry.
languages = dataset[0]['hypothesis']['language']
# The last positional argument (8) is presumably a batch size — TODO confirm.
map_scores = multilingual_plausibility.evaluate_multilingual_plausibility('exnli_test.csv', attribution, languages, 8)

In [None]:
# Report the MAP score obtained for each language, one line per language.
for language_code, score in map_scores.items():
    print(f"{language_code} MAP score: {score} ")

ar MAP score: 0.6632450105527579 
bg MAP score: 0.7005667630172827 
de MAP score: 0.7320776542448928 
el MAP score: 0.6959715779373153 
en MAP score: 1.0 
es MAP score: 0.7664286772544779 
fr MAP score: 0.7391015385448011 
hi MAP score: 0.6040199596274182 
ru MAP score: 0.6857346896929614 
sw MAP score: 0.5798971496373715 
th MAP score: 0.931514867206828 
tr MAP score: 0.6652962202865744 
ur MAP score: 0.5745013451420691 
vi MAP score: 0.5717117424783905 
zh MAP score: 0.5432442886485477 
