In [35]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
sep_model = torch.load("sep_DistilBertForQuestionAnswering.pth")
offical_model =  torch.load("DistilBertForQuestionAnswering.pth")
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id


def predict(model, inputs):
    output = model(inputs)
    return output.start_logits, output.end_logits


def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
    question_ids = tokenizer.encode(question, add_special_tokens=False)
    text_ids = tokenizer.encode(text, add_special_tokens=False)

    # construct input token ids
    input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]

    # construct reference token ids
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
                    [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)

def predict_qt(model, text, question):
    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)

    indices = input_ids[0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(indices)

    ground_truth = '13'


    start_scores, end_scores = predict(model, input_ids)


    return (' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])), float(torch.max(torch.softmax(start_scores[0], dim=0))), float(torch.max(torch.softmax(end_scores[0], dim=0)))
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


question = """When did Beyonce start becoming popular?"""
text = """Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy"."""
answer, start_scores, end_scores= predict_qt(offical_model, question, text)
print(answer,start_scores, end_scores )

 0.34193170070648193 0.4017501771450043


In [31]:
answer

''

In [33]:
question = """What areas did Beyonce compete in when she was growing up?"""

answer, start_scores, end_scores = predict_qt(offical_model, question, text)
print(answer,start_scores, end_scores )

singing and dancing 0.9882990121841431 0.8989322185516357


In [36]:
import nltk as tk
import re
tokens = tk.sent_tokenize(text)
for token in tokens:
    print(token)

    answer, start_scores, end_scores = predict_qt(sep_model, question, token)
    print("========>", answer,start_scores, end_scores)
    print("*"*72)

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.
************************************************************************
Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.
************************************************************************
Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.
************************************************************************
Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
************************************************************************


In [1]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad(r'D:\software\github\GZK_Code\XAI\2022.03.03\squad\train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad(r'D:\software\github\GZK_Code\XAI\2022.03.03\squad\dev-v2.0.json')


In [22]:
len(train_contexts)

86821

In [21]:

sep_train_contexts = []
sep_train_questions = []
sep_train_answers = []
import nltk as tk
import re
temp = 0
null_answer = {'text': '[NULL]', 'answer_start': 0}
for i in range(len(train_contexts)):
    tokens = tk.sent_tokenize(train_contexts[i])
    for token in tokens:
        if train_answers[i]['text'] in token:
            try:
                answer_start = re.search(train_answers[i]['text'], token)
                answer = {'text': train_answers[i]['text'], 'answer_start':  answer_start.span()[0]}
                sep_train_contexts.append(token)

            
                sep_train_answers.append(answer)
                sep_train_questions.append(train_questions[i])
            except:
                temp = temp + 1
print(temp)

            


1753


In [20]:
tokens = tk.sent_tokenize(train_contexts[636])
print(tokens)
print(train_answers[636]['text'])
for token in tokens:
    if train_answers[636]['text'] in token:
        answer_start = re.search(train_answers[i]['text'], token)
        answer = {'text': train_answers[i]['text'], 'answer_start':  answer_start.span()[0]}
        print('text', train_answers[i]['text'])

["Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations.", '"Single Ladies (Put a Ring on It)" won Song of the Year in 2010 while "Say My Name" and "Crazy in Love" had previously won Best R&B Song.', "Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album.", 'Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012.', 'Following her role in Dreamgirls she was nominated for Best Original Song for "Listen" and Best Actress at the Golden Globe Awards, and Outstanding Actress in a Motion Picture at the NAACP Image Awards.', 'Beyoncé won two awards at the Broadcast Film C

AttributeError: 'NoneType' object has no attribute 'span'

In [None]:
for i in range(len(sep_train_contexts)):
    print("CONTEXT ==>", sep_train_contexts[i])
    print("QUESTION ==>", sep_train_questions[i])
    print("ANSWERS ==>", sep_train_answers[i])
    print("*"*74)