In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.append('../../../')
from stud.modelsTests.utils.print_infos import print_summary, display_history, plot_confusion_matrix, print_classification_report

In [3]:
# Checkpoint selection. The line below is a disabled alternative checkpoint id;
# only the assignment that follows it is active.
# model_id = "vasudevgupta/bigbird-roberta-natural-questions"
model_id = "facebook/bart-large-mnli"
# Load the sequence-classification model and its tokenizer from the same
# checkpoint id so they stay in sync. Both objects are module-level names that
# the scoring functions in later cells read directly.
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
# Build the zero-shot pipeline from the `model` and `tokenizer` objects that
# were already loaded from `model_id`. Passing the checkpoint name instead would
# make `pipeline` load a second copy of the same weights into memory.
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

In [4]:
import csv
def read_dataset(file_path):
    """Read a tab-separated coreference file into a flat list of samples.

    Every data row describes one pronoun and two candidate entities. The row is
    expanded into two dicts (candidate 1 first, then candidate 2) that share the
    pronoun fields and differ in the entity fields.

    Columns are read by position:
        0 id, 1 text, 2 pronoun, 3 pronoun offset,
        4-6 entity / offset / coref flag of candidate 1,
        7-9 entity / offset / coref flag of candidate 2.

    Args:
        file_path: path of the TSV file to read.

    Returns:
        A list of dicts with keys 'id', 'text', 'pron', 'p_offset', 'entity',
        'offset' and 'is_coref'. Offsets are converted to int; 'is_coref' is
        kept as the raw string found in the file.

    Raises:
        IndexError: if a non-empty row has fewer than 10 columns.
        ValueError: if an offset column is not an integer.
    """
    data = []
    # Explicit UTF-8 keeps decoding independent of the platform default
    # encoding; newline='' is what the csv module asks for so that it can do
    # its own newline handling.
    with open(file_path, encoding='utf-8', newline='') as file:
        tsv_file = csv.reader(file, delimiter="\t")
        for sample in tsv_file:
            # csv.reader yields [] for blank lines; indexing them would raise.
            if not sample:
                continue
            # Header row: recognised by the name of its 4th column.
            if sample[3] == 'Pronoun-offset':
                continue
            p_offset = int(sample[3])
            # Candidate 1 starts at column 4, candidate 2 at column 7; each
            # candidate occupies three consecutive columns.
            for first_col in (4, 7):
                data.append({
                    'id': sample[0],
                    'text': sample[1],
                    'pron': sample[2], 'p_offset': p_offset,
                    'entity': sample[first_col],
                    'offset': int(sample[first_col + 1]),
                    'is_coref': sample[first_col + 2],
                })
    return data

# Evaluation samples: the list of per-candidate dicts returned by read_dataset
# for the dev split (path is relative to the notebook's working directory).
data_test = read_dataset('../../../../data/dev.tsv')

In [5]:
def select_model_coref(sample):
    """Score "<pron> is not <entity>." and "<pron> is <entity>." for one sample.

    The sample text is used as the NLI premise and each candidate sentence as
    the hypothesis. Relies on the module-level `tokenizer` and `model`.

    Args:
        sample: dict with at least the keys 'text', 'pron' and 'entity'.

    Returns:
        (labels, scores): `labels` is ['not <entity>', '<entity>'] and `scores`
        holds, in the same order, the probability assigned to the hypothesis
        being true.
    """
    candidates = ['not ' + sample['entity'], sample['entity']]
    # The premise is the same for both hypotheses, so build it once.
    premise = sample['text']
    labels, scores = [], []
    for label in candidates:
        hypothesis = f"{sample['pron']} is {label}."

        # run through model pre-trained on MNLI; `truncation='only_first'`
        # replaces the deprecated `truncation_strategy=` keyword and shortens
        # only the premise when the pair is too long.
        x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
                             truncation='only_first')
        with torch.no_grad():
            logits = model(x)[0]

        # we throw away "neutral" (dim 1) and take the probability of
        # "entailment" (2) as the probability of the label being true
        entail_contradiction_logits = logits[:, [0, 2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = probs[:, 1].tolist()[0]

        labels.append(label)
        scores.append(prob_label_is_true)

    return labels, scores

def forward(sample):
    """Turn the two hypothesis scores of one sample into a 0/1 prediction.

    Returns 0 when neither score is above 0.5; otherwise returns the position
    of the highest score in the list produced by select_model_coref.
    """
    _, scores = select_model_coref(sample)
    nothing_confident = all(score <= 0.5 for score in scores)
    return 0 if nothing_confident else scores.index(max(scores))

In [6]:
# Index of the sample to inspect; the bare expression on the last line makes
# the notebook display that sample dict.
id_n = 5
data_test[id_n]

{'id': 'validation-3',
 'text': 'When she returns to her hotel room, a Liberian man (Tony Todd) forces her to smuggle $20 million worth of conflict diamonds to New York, or else fellow fight attendant and friend Angela will die. She is caught before she can board the flight, and the team now have nine hours until the plane lands, and save Angela. After the confiscated diamonds are stolen by the brother of Kaleo (Jason Scott Lee), whom Danny put away for murdering his partner last year, Five-0 and Chief Fryer team up and enlist the help of August March (Ed Asner), who served a 30-year sentence for smuggling diamonds.',
 'pron': 'his',
 'p_offset': 435,
 'entity': 'Danny',
 'offset': 406,
 'is_coref': 'TRUE'}

In [7]:
def select_model_coref_multi(data, batch_size=16):
    """Batched version of the per-sample hypothesis scoring.

    For every sample two hypotheses are built, "<pron> is not <entity>." and
    "<pron> is <entity>.", each paired with the sample text as premise. The
    pairs are tokenized and run through the module-level `model` one batch of
    samples at a time.

    Args:
        data: sequence of sample dicts with the keys 'text', 'pron', 'entity'.
        batch_size: number of samples (not hypotheses) per forward pass.

    Returns:
        (labels, scores): two lists with one entry per sample. `labels[i]` is
        ['not <entity>', '<entity>'] and `scores[i]` holds the matching two
        probabilities, in the same order.
    """
    labels, scores = [], []

    for start in range(0, len(data), batch_size):
        samples = data[start:start + batch_size]

        # The input lists must start empty for every batch. Keeping them
        # across iterations would re-encode all earlier batches each time and
        # append their scores again, misaligning `scores` with `labels`.
        premises_inputs, hypothesis_inputs = [], []
        for sample in samples:
            candidates = ['not ' + sample['entity'], sample['entity']]
            labels.append(candidates)
            for label in candidates:
                premises_inputs.append(sample['text'])
                hypothesis_inputs.append(f"{sample['pron']} is {label}.")

        # run through model pre-trained on MNLI; only the premise is truncated
        # when a pair is too long, matching the single-sample scorer.
        encoded = tokenizer(premises_inputs, hypothesis_inputs, return_tensors='pt',
                            padding=True, truncation='only_first')
        with torch.no_grad():
            # The attention mask is passed along with the ids so that the
            # padding added by `padding=True` is ignored by the model.
            logits = model(input_ids=encoded['input_ids'],
                           attention_mask=encoded['attention_mask'])[0]

        # Drop the middle logit, softmax over the remaining two and keep the
        # probability of the last class as "hypothesis is true".
        probs = logits[:, [0, 2]].softmax(dim=-1)[:, 1]
        # Hypotheses were appended in pairs, so consecutive pairs of
        # probabilities belong to the same sample.
        scores += probs.reshape(-1, 2).tolist()

    return labels, scores

def forward(samples):
    """Predict 0/1 for one sample or for a sequence of samples.

    This definition replaces the earlier single-sample `forward` in the kernel
    namespace, so it accepts both call styles: a single sample dict returns a
    single int, a sequence of sample dicts returns a list of ints.

    Args:
        samples: one sample dict, or a sequence of sample dicts.

    Returns:
        For each sample, 0 when neither hypothesis score is above 0.5,
        otherwise the index of the highest score.
    """
    is_single = isinstance(samples, dict)
    batch = [samples] if is_single else samples

    results = []
    labels_multi, scores_multi = select_model_coref_multi(batch)
    for _labels, scores in zip(labels_multi, scores_multi):
        res = 0 if all(x <= 0.5 for x in scores) else scores.index(max(scores))
        results.append(res)

    # Mirror the input shape: a lone dict gets a lone prediction back.
    return results[0] if is_single else results

In [8]:
# Gold labels: 1 when a sample's 'is_coref' field is the string 'TRUE', else 0.
true_labels = [1 if s['is_coref']=='TRUE' else 0 for s in data_test]
correctly_predicted = 0

# Score the whole evaluation set with the batched scorer.
labels_multi, scores_multi = select_model_coref_multi(data_test)

# Apply the decision rule inline: predict 0 when no score is above 0.5,
# otherwise the index of the highest score. `labels` is unpacked but unused.
# Note that zip stops at the shortest of its inputs, so a length mismatch
# between the three lists would pass silently.
for t_label, labels, scores in zip(true_labels, labels_multi, scores_multi):
    res = 0 if all(x <= 0.5 for x in scores) else scores.index( max(scores) )
    if res == t_label:
        correctly_predicted += 1

KeyboardInterrupt: 

In [44]:
# Accuracy over data_test: `p` counts predictions that match the gold label,
# `t` counts samples seen.
p, t = 0, 0
for e in data_test:
    # `forward` is called with one sample dict at a time.
    # NOTE(review): `forward` is defined twice in this notebook (a single-sample
    # and a batched signature) - confirm which definition is live when this
    # cell runs.
    r = forward(e)
    # Gold label: 1 when 'is_coref' is the string 'TRUE', else 0.
    l = 1 if e['is_coref'] == 'TRUE' else 0
    if r == l:
        p+=1
    t+=1
# Displayed as the cell result: (correct, total, accuracy).
p,t, p/t

(611, 908, 0.6729074889867841)