In [1]:
import torch
from custom_models import load_custom_class
import seaborn as sns
import matplotlib.pyplot as plt

# sub class from modelling_bert "RobertaForSequenceClassification" and override the forward method
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [2]:
from example_tokenization import custom_tokenization
from custom_tokenizer_abstract import CustomTokenizerGeneral

repo_link_nli = "cross-encoder/nli-distilroberta-base"
# repo_link_nli = "sentence-transformers/nli-bert-base"

model_args = {}

tokenizer_nli, model_nli = load_custom_class(repo_link_nli, device, **model_args)
vocabulary_id2tok = {tok_id:tok for tok, tok_id in tokenizer_nli.vocab.items()}

# BERT
# custom_tokenizer = CustomTokenizerGeneral(tokenizer_nli, custom_tokenization, separator_marker="##", special_space_token="")
# RoBERTa
custom_tokenizer = CustomTokenizerGeneral(tokenizer_nli, custom_tokenization, separator_marker="", special_space_token="Ġ")

Tensors and operations will be done on cuda:0.


In [3]:
import json
import pandas as pd

# data_path = "data/" + "/snli_1.0" + "/snli_1.0_test.jsonl"
data_path = "data/" + "/multinli_1.0" + "/multinli_1.0_dev_mismatched.jsonl"

data = []
limit = 500_000
with open(data_path, "r") as file:
    for _ in range(limit):
        json_obj = file.readline()
        if json_obj != "":
            data += [json.loads(json_obj)]
        else:
            break


In [4]:
data_df = {
    "label": [],
    "sent1": [],
    "sent2": []
}

for datum in data:
    data_df["label"] += [datum["gold_label"]]
    data_df["sent1"] += [datum["sentence1"]] # premise
    data_df["sent2"] += [datum["sentence2"]] # hypothesis

data_df = pd.DataFrame(data_df).iloc[:3]
print(data_df.shape)
data_df.head()

(3, 3)


Unnamed: 0,label,sent1,sent2
0,contradiction,Your contribution helped make it possible for ...,Your contributions were of no help with our st...
1,contradiction,"The answer has nothing to do with their cause,...",Dictionaries are indeed exercises in bi-unique...
2,entailment,We serve a classic Tuscan meal that includes ...,We serve a meal of Florentine terrine.


In [5]:
from prediction_utilities import get_prediction

tokenizer_args_normal = {
    "return_tensors": "pt"
}
tokenizer_args_custom = {
    "do_lowercase": True
}

responses = {
    "custom": [],
    "normal": []
}

def df_predict(row, model_nli, tokenizer,  is_custom=True, **tokenizer_args):
    if is_custom:
        input = (row["sent1"], row["sent2"])
    else:
        input = row["sent1"] + " " + row["sent2"]
    prediction = get_prediction(input, model_nli, tokenizer, **tokenizer_args)
    
    return prediction["label"], prediction["prob"]

from tqdm import tqdm
tqdm.pandas()
# to see progress during operation: progress_apply instead of apply
results_custom = data_df.apply(df_predict, axis=1, model_nli=model_nli, tokenizer=custom_tokenizer, is_custom=True, **tokenizer_args_custom)
# results_normal = data_df.apply(df_predict, axis=1, model_nli=model_nli, tokenizer=tokenizer_nli, is_custom=False, **tokenizer_args_normal)

83 55 138
519 58 577
84 32 116


In [8]:
results_custom.to_json("results_custom_tokenizer.json")

In [9]:
# import numpy as np
# labels = np.asarray([label for label, _ in results_normal])

In [10]:
# sum(labels == labels_gt)/len(labels)

In [6]:
idx_example = 9990
example = data_df.iloc[idx_example]["sent1"] + " " + data_df.iloc[idx_example]["sent2"]
example_custom = (data_df.iloc[idx_example]["sent1"], data_df.iloc[idx_example]["sent2"])

In [8]:
import shap
from prediction_utilities import get_prediction_model_outputs
from functools import partial

tokenizer_args_normal = {
    "return_tensors": "pt",
    "padding": "longest"
}

import numpy as np
print(get_prediction(premise_hypothesis=[example], device=device, model_nli=model_nli, custom_tokenizer=tokenizer_nli, **tokenizer_args_normal))

explainer = shap.Explainer(partial(get_prediction_model_outputs, model=model_nli, tokenizer=tokenizer_nli, **tokenizer_args_normal), tokenizer_nli, output_names=list(model_nli.config.label2id.keys()))
result = explainer([example])
from IPython.display import display, HTML

shap_plot = shap.plots.text(result, display=False)
print(f"Actual label: {data_df.iloc[idx_example]['label']}")
display(HTML(shap_plot))

{'label': 'contradiction', 'prob': 0.9257515668869019, 'all_probs': {'contradiction': 0.9257515668869019, 'neutral': 0.07057809084653854, 'entailment': 0.0036703646183013916}}
Actual label: contradiction


In [9]:
import shap
from functools import partial

import shap.maskers

tokenizer_args_custom = {
    "do_lowercase": False,
    "return_tensors": "pt",
    "padding": "longest",
    "return_offsets_mapping": True
}

import numpy as np
# print(get_prediction(premise_hypothesis=example_custom, device=device, model_nli=model_nli, custom_tokenizer=custom_tokenizer, **tokenizer_args_normal))

masker = shap.maskers.Text(custom_tokenizer, mask_token=tokenizer_nli.mask_token)
explainer = shap.Explainer(partial(get_prediction_model_outputs, model=model_nli, tokenizer=custom_tokenizer, **tokenizer_args_custom), masker, output_names=list(model_nli.config.label2id.keys()))
result = explainer([example])

  aux_token_list += self.get_token_id(token)


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [04:07, 247.89s/it]              


In [10]:
from IPython.display import display, HTML

# GLOBAL SHAPLEY COMPUTATIONS - WHICH TOKENS CONTRIBUTE MOST TO CERTAIN LABELS
shap_plot = shap.plots.text(result, display=False)
print(f"Actual label: {data_df.iloc[idx_example]['label']}")
display(HTML(shap_plot))

Actual label: contradiction
