In [1]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad(r'D:\software\github\GZK_Code\XAI\2022.03.03\squad\train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad(r'D:\software\github\GZK_Code\XAI\2022.03.03\squad\dev-v2.0.json')


In [2]:

sep_train_contexts = []
sep_train_questions = []
sep_train_answers = []

In [17]:
train_contexts[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [11]:
train_answers[1]

{'text': 'singing and dancing', 'answer_start': 207}

In [18]:
train_questions[0]

'When did Beyonce start becoming popular?'

In [13]:
sep_train_contexts[1]

"Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."

In [3]:
import nltk as tk
import re
null_answer = {'text': '[NULL]', 'answer_start': 0}
for i in range(len(train_contexts)):
    tokens = tk.sent_tokenize(train_contexts[i])
    for token in tokens:
        if train_answers[i]['text'] in token:
            try:
                answer_start = re.search(train_answers[i]['text'], token)
                answer = {'text': train_answers[i]['text'], 'answer_start':  answer_start.span()[0]}
                sep_train_contexts.append(token)

            
                sep_train_answers.append(answer)
                sep_train_questions.append(train_questions[i])
            except:
                # print(i)
        # else:
        #     sep_train_contexts.append('[NULL]' + token)
        #     sep_train_answers.append(null_answer)
        #     sep_train_questions.append(train_questions[i])
            


636
640
647
648
726
729
734
1859
1968
1970
1988
1991
1993
1994
1995
1997
2035
2039
2286
2462
2505
2506
2507
2508
2511
2518
2522
2523
2530
2532
2781
2783
2835
2839
2945
2952
2965
2968
2972
3046
3047
3228
3485
3494
3495
3496
3497
3503
3508
3510
3518
3519
3524
3534
3538
3540
3546
3563
3570
3617
3680
3790
3890
3891
3943
4145
4186
4187
4209
4214
4222
4269
4276
4277
4308
4339
4356
4424
4657
4741
4746
4815
4818
4819
4826
4835
4948
4956
4978
4993
5005
5035
5040
5041
5054
5062
5066
5711
5828
6015
6128
6128
6129
6131
6131
6134
6135
6136
6221
6319
7332
7333
7361
7376
7465
7718
7753
7766
7767
7779
7866
7921
7965
7986
8010
8075
8076
8080
8179
8182
8281
8284
8942
8963
8980
8987
9059
9072
9108
9634
9635
9636
9650
9654
9678
9695
9696
9742
9816
9824
9828
9843
9874
9922
9929
9941
10022
10224
10225
10226
10257
10601
10606
10609
10635
10731
10752
11148
11152
11158
11159
11161
11174
11623
11845
11913
11935
12193
12220
12229
12244
12258
12261
12281
12441
12512
12928
12983
12984
12985
12986
13019
13023
13033

In [7]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters
        else:
            answer['answer_end'] = end_idx
add_end_idx(sep_train_answers, sep_train_contexts)
add_end_idx(val_answers, val_contexts)

In [8]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(sep_train_contexts, sep_train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [9]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    print("len len(answers) : ",len(answers))
    for i in range(len(answers)):
        print(i, encodings.char_to_token(i, answers[i]['answer_start']), answers[i]['answer_start'])

        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))

        
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
        
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, sep_train_answers)
add_token_positions(val_encodings, val_answers)

len len(answers) :  96715
0 25 122
1 13 60
2 17 76
3 5 19
4 27 129
5 39 173
6 13 55
7 6 23
8 27 129
9 30 143
10 13 55
11 17 76
12 30 140
13 30 143
14 13 55
15 5 27
16 5 23
17 52 219
18 15 80
19 56 240
20 8 36
21 1 0
22 1 0
23 8 45
24 14 60
25 13 48
26 23 95
27 15 80
28 5 23
29 29 120
30 22 104
31 8 35
32 1 0
33 18 111
34 22 100
35 1 0
36 6 18
37 14 70
38 12 57
39 23 109
40 12 57
41 4 12
42 18 111
43 2 3
44 14 70
45 13 72
46 1 0
47 17 73
48 9 31
49 3 10
50 6 20
51 41 152
52 29 101
53 6 25
54 14 62
55 41 152
56 33 117
57 6 25
58 14 62
59 6 20
60 11 49
61 9 56
62 18 88
63 7 39
64 10 42
65 14 65
66 26 125
67 11 49
68 9 56
69 18 95
70 10 42
71 10 37
72 3 8
73 12 69
74 10 37
75 2 3
76 5 14
77 15 77
78 12 69
79 2 3
80 3 7
81 12 56
82 1 0
83 10 37
84 2 3
85 5 14
86 7 20
87 26 114
88 43 154
89 1 0
90 10 47
91 12 51
92 20 85
93 26 114
94 44 155
95 2 1
96 10 47
97 20 85
98 26 114
99 21 101
100 62 250
101 18 82
102 38 162
103 10 47
104 3 20
105 2 4
106 10 53
107 5 18
108 6 29
109 6 45
110 9 43
111

In [10]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [1]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [2]:
total = sum([param.nelement() for param in model.parameters()])

print("Number of parameter: %.2fM" % (total/1e6))

Number of parameter: 66.36M


In [12]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()
torch.save(model, "DistilBertForQuestionAnswering.pth")



In [13]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = torch.load("DistilBertForQuestionAnswering.pth")
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id


def predict(inputs):
    output = model(inputs)
    return output.start_logits, output.end_logits


def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
    question_ids = tokenizer.encode(question, add_special_tokens=False)
    text_ids = tokenizer.encode(text, add_special_tokens=False)

    # construct input token ids
    input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]

    # construct reference token ids
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
                    [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)

def predict_qt(question, text):
    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)

    indices = input_ids[0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(indices)

    ground_truth = '13'


    start_scores, end_scores = predict(input_ids)


    return (' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


question = """In what country is Normandy located?"""
text = """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France."""
answer = predict_qt(question, text)
print( answer)

france
