In [37]:
import torch
from transformers import AutoModel, AutoTokenizer, BigBirdForQuestionAnswering, BigBirdTokenizer, AutoModelForQuestionAnswering, pipeline

In [55]:
import sys
sys.path.append('../../../')
from stud.modelsTests.utils.print_infos import print_summary, display_history, plot_confusion_matrix, print_classification_report

In [32]:
# Checkpoint shared by the tokenizer, the QA model and the pipeline below.
# The commented-out id is an alternative checkpoint kept for reference.
# model_id = "vasudevgupta/bigbird-roberta-natural-questions"
model_id = "deepset/roberta-base-squad2"
# Both objects are module-level globals read by answer_question_coref.
model = AutoModelForQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [38]:
# Build the QA pipeline from the model and tokenizer objects loaded above,
# so the checkpoint is not loaded into memory a second time by id.
pipeline_model = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [42]:
def answer_question_coref(sentence, pron, possible_coref, return_outputs=True, max_answer_tokens=5):
    """Ask the QA model which span of ``sentence`` the pronoun refers to.

    The pronoun is passed as the "question" and the sentence as the
    "context" to the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: Context text that is searched for the answer span.
        pron: Pronoun used as the question (formatted to ``str``).
        possible_coref: Gold entity; it is passed through unchanged so the
            prediction can be compared against it.
        return_outputs: When True (the default) the raw model output is
            returned. This is the behaviour the inspection cells rely on
            (they read ``.start_logits`` / ``.end_logits``). Pass False to
            get the decoded ``(answer, possible_coref)`` tuple instead.
        max_answer_tokens: Longest accepted answer span, in tokens.

    Returns:
        The raw model output if ``return_outputs`` is True, otherwise a tuple
        ``(answer, possible_coref)`` where ``answer`` is the decoded span or
        ``None`` when the predicted span is empty, inverted, or too long.
    """
    tokenized_input = tokenizer(f"{pron}", sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokenized_input)

    if return_outputs:
        return outputs

    answer_start_index = int(outputs.start_logits.argmax())
    answer_end_index = int(outputs.end_logits.argmax())
    span_length = answer_end_index + 1 - answer_start_index

    answer_model = None
    # A span whose end precedes its start has a non-positive length; without
    # the lower bound it would pass the length check and decode to ''.
    # NOTE(review): a (0, 0) span points at the first special token, which for
    # SQuAD2-style checkpoints presumably means "no answer" -- confirm whether
    # that case should also map to None.
    if 1 <= span_length <= max_answer_tokens:
        predict_answer_tokens = tokenized_input.input_ids[0, answer_start_index : answer_end_index + 1]
        answer_model = tokenizer.decode(predict_answer_tokens)

    return (answer_model, possible_coref)

def answer_pipeline_coref(sentence,pron,possible_coref):
    """Query the module-level QA pipeline with the pronoun as the question.

    Returns a tuple ``(prediction, possible_coref)``: ``prediction`` is
    whatever ``pipeline_model`` returns for the question/context payload and
    ``possible_coref`` is handed back unchanged.
    """
    prediction = pipeline_model({'question': pron, 'context': sentence})
    return (prediction, possible_coref)

In [34]:
import csv
def read_dataset(file_path):
    """Read a tab-separated coreference file into a list of sample dicts.

    Columns are read by position: 0 = id, 1 = text, 2 = pronoun,
    3 = pronoun offset, 4 / 7 = the two candidate names, 6 / 9 = their
    ``'TRUE'`` flags. Rows whose fourth column is the literal header
    ``'Pronoun-offset'`` and completely blank rows are skipped.

    Args:
        file_path: Path of the TSV file.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``pron``, ``p_offset``
        (int) and ``entity``. ``entity`` is the first candidate flagged
        ``'TRUE'``, or ``None`` when neither candidate is flagged.

    Raises:
        IndexError: If a non-blank row has fewer columns than are read.
        ValueError: If the pronoun offset is not an integer.
    """
    data = []
    # newline="" is what the csv module expects from the file object; the
    # explicit encoding keeps parsing independent of the platform locale.
    with open(file_path, newline="", encoding="utf-8") as file:
        tsv_file = csv.reader(file, delimiter="\t")
        for sample in tsv_file:
            # csv.reader yields [] for blank lines, which would otherwise
            # crash on the column lookups below.
            if not sample:
                continue
            if sample[3] == 'Pronoun-offset':
                continue
            if sample[6] == 'TRUE':
                entity = sample[4]
            elif sample[9] == 'TRUE':
                entity = sample[7]
            else:
                entity = None
            data.append({
                'id': sample[0],
                'text': sample[1],
                'pron': sample[2],
                'p_offset': int(sample[3]),
                'entity': entity,
            })
    return data

# Parse the dev file once; the cells below index into and iterate over this list.
data_test = read_dataset('../../../../data/dev.tsv')

In [35]:
# Show one parsed sample to check the fields produced by read_dataset.
data_test[1]

{'id': 'validation-2',
 'text': "Kathleen Nott was born in Camberwell, London. Her father, Philip, was a lithographic printer, and her mother, Ellen, ran a boarding house in Brixton; Kathleen was their third daughter. She was educated at Mary Datchelor Girls' School (now closed), London, before attending King's College, London.",
 'pron': 'She',
 'p_offset': 185,
 'entity': 'Kathleen'}

In [40]:
# Index of the sample used by this cell and by the raw-model cell further down.
id_n = 1
# Pull the three fields the helper needs, in argument order, from that sample.
answer_pipeline_coref(*(data_test[id_n][field] for field in ('text', 'pron', 'entity')))

({'score': 0.0008882174151949584,
  'start': 150,
  'end': 183,
  'answer': 'Kathleen was their third daughter'},
 'Kathleen')

In [43]:
# Run the raw-model helper on the same sample (id_n) and keep the result so
# the following cells can inspect it.
dwada = answer_question_coref(data_test[id_n]['text'], data_test[id_n]['pron'], data_test[id_n]['entity'])

In [46]:
# Display the value returned by answer_question_coref for the sample above.
dwada

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 0.2113, -6.9608, -8.9482, -9.2454, -2.9486, -5.5707, -7.1608, -5.9401,
         -7.9667, -7.3378, -6.0768, -7.8847, -5.5627, -8.3789, -8.1483, -8.1586,
         -4.8637, -7.0746, -6.1378, -5.7220, -8.5980, -4.6735, -8.9260, -7.9592,
         -7.1347, -6.2822, -8.3666, -5.4385, -9.0219, -8.1133, -6.1346, -5.9671,
         -8.9614, -4.6513, -9.1758, -7.5530, -8.0446, -7.1105, -8.5883, -8.2964,
         -6.0017, -7.8218, -7.1155, -2.9285, -7.4679, -5.7129, -4.6358, -5.7354,
         -7.0751, -4.2725, -6.8378, -6.5235, -8.3858, -4.0208, -6.1957, -7.5050,
         -8.0050, -6.6290, -8.8395, -7.2864, -7.8868, -8.0800, -8.0854, -8.4041,
         -5.5269, -8.7176, -7.4721, -6.9913, -4.1658, -8.4755, -7.3071, -8.7859,
         -5.4515, -7.0750, -9.0985]]), end_logits=tensor([[ 0.5932, -8.2112, -8.2273, -8.5653, -7.8470, -7.8072, -3.9050, -8.2495,
         -3.3142, -8.0263, -5.9013, -8.3137, -8.6883, -8.3963, -4.7972, -7.2745,
      

In [52]:
# Position with the highest start probability (softmax over the last axis).
torch.softmax(dwada.start_logits, dim=-1).argmax()

tensor(0)

In [53]:
# Position with the highest end probability (softmax over the last axis).
torch.softmax(dwada.end_logits, dim=-1).argmax()

tensor(0)

In [21]:
# One answer_question_coref result per sample, in dataset order.
answesds = [
    answer_question_coref(e['text'], e['pron'], e['entity'])
    for e in data_test
]

In [22]:
# Display every collected result (one entry per sample in data_test).
answesds

[('', None),
 (None, 'Kathleen'),
 ('Kaleo', 'Danny'),
 (None, 'Reucassel'),
 (None, 'Beryl Markham'),
 ('', 'Jos* Alvarez'),
 (None, 'Faik Pasha'),
 (None, 'Jake Burns'),
 (None, 'Cowan'),
 (None, 'Beverley Callard'),
 (None, 'Kallergis'),
 (None, 'Nicole'),
 (None, 'Queen'),
 (None, 'Michael Kidd'),
 (None, 'Herring'),
 ('', 'Wright'),
 ('', 'Robert Fripp'),
 (None, 'Lenin'),
 (None, 'Andy'),
 ('Grassdale', 'David W. Taylor'),
 (None, 'Joe Christmas'),
 ('', 'Hicks'),
 (None, 'Bonavia'),
 ('', 'Marcia'),
 (None, 'Martin O*Malley'),
 (None, 'Shahjahan'),
 (None, 'Sheikh Isa Qassim'),
 (None, 'Wade'),
 ('', 'Haqqani'),
 (None, 'Wozniak'),
 (None, 'Thomas Coats'),
 (None, 'Mark Wright'),
 (None, 'Clarence Doust'),
 (None, 'Albert Blithe'),
 (None, 'Arun'),
 (None, 'Novak'),
 (None, 'Bawa Ardalan'),
 (None, 'Hamza Aziz'),
 (None, 'Kelder'),
 (None, 'Paul'),
 (None, 'George William'),
 (None, 'Sadiq Khan'),
 ('', 'Nicole'),
 ('', 'Walter Freeman'),
 (None, 'Beryl Agnes Farry'),
 ('', 'Ilv

In [3]:
# Hand-written example: the pronoun is the "question", the passage is the context.
sentence = "Twenty years ago, Lorenzo Uribe discovered true love with Maria Herrera and began a romance. Lorenzo was rich, married, and had a young son: Lautaro. Maria was poor and unknown to Lorenzo, had a daughter called Renata. Maria's mother, Gracia, wanted her daughter to catch this rich man at all costs and convinced her that pregnancy would assure this."
question = "her"

tokenized_input = tokenizer(question, sentence, return_tensors="pt")
# Decode the ids that were just produced to see how the pair is laid out.
tokenizer.decode(tokenized_input.input_ids[0])

"[CLS] her[SEP] Twenty years ago, Lorenzo Uribe discovered true love with Maria Herrera and began a romance. Lorenzo was rich, married, and had a young son: Lautaro. Maria was poor and unknown to Lorenzo, had a daughter called Renata. Maria's mother, Gracia, wanted her daughter to catch this rich man at all costs and convinced her that pregnancy would assure this.[SEP]"

In [4]:
# Display the encoded question/context pair built in the previous cell.
tokenized_input

{'input_ids': tensor([[   65,   708,    66, 22482,   913,  2185,   112, 41822,   572,  4993,
          5172,  2182,  1943,   452, 14301, 46609,   391,  2641,   358, 19762,
           114, 41822,   474,  5628,   112,  6506,   112,   391,   651,   358,
          1963,  3468,   126,   507,  2407, 12123,   114, 14301,   474,  3696,
           391,  6540,   385, 41822,   112,   651,   358,  5058,  1545,  7253,
          1146,   114, 14301,   439,  2903,   112,  2003, 47532,   112,  2328,
           708,  5058,   385,  5030,   529,  5628,   683,   480,   578,  3585,
           391,  9532,   708,   427, 10342,   662, 19933,   529,   114,    66]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
# Forward pass with gradient tracking disabled; `outputs` holds the start/end
# logits that the next cell reads.
with torch.no_grad():
    outputs = model(**tokenized_input)

Attention type 'block_sparse' is not possible if sequence_length: 80 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


In [6]:
# Pick the highest-scoring start and end positions independently of each other.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()
# The end index is inclusive, hence the +1 on the slice bound.
predict_answer_tokens = tokenized_input.input_ids[0, answer_start_index : answer_end_index + 1]
# Turn the selected token ids back into text.
tokenizer.decode(predict_answer_tokens)

'Maria Herrera'

In [7]:
# Number of token ids in the encoded question + context pair.
len(tokenized_input.input_ids[0])

80