In [1]:
from datasets import load_dataset

squad = load_dataset("squad")

Reusing dataset squad (C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [3]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [4]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Loading cached processed dataset at C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-1ec1cd9b8a7518e3.arrow
Loading cached processed dataset at C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-82bbdcbaee462804.arrow


In [5]:
from transformers import default_data_collator

data_collator = default_data_collator

In [6]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [9]:
total = sum([param.nelement() for param in model.parameters()])

print("Number of parameter: %.2fM" % (total/1e6))

Number of parameter: 66.36M


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.2458,1.141883,60.8304,173.762
2,0.9164,1.09507,60.3606,175.114


In [3]:
import torch
# torch.save(model, 'net.pkl')  # save entire net # save only the parameters
model = torch.load('net.pkl')  # 加载神经网络

In [7]:
from transformers import DistilBertTokenizer, DistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# replace <PATd:/spofrte/modeH-TO-SAVED-MODEL> with the real path of the saved model
model.to(device)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input.to(device)
output = model(**encoded_input)
def construct_input_ref_pos_id_pair(input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
    # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)

    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_idsprint(output)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-4.9537, -0.9243, -3.4084, -1.8041, -0.0718, -0.9083, -3.7310, -7.3723,
         -5.4588, -4.8745, -2.5794, -7.2470]], device='cuda:0',
       grad_fn=<SqueezeBackward1>), end_logits=tensor([[-6.3243, -4.3402, -3.5235, -6.2792, -5.2571, -0.5210, -6.0681, -4.8629,
         -6.4497, -2.6533, -2.0360, -4.7456]], device='cuda:0',
       grad_fn=<SqueezeBackward1>), hidden_states=None, attentions=None)


In [34]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id


def predict(inputs):
    output = model(inputs)
    return output.start_logits, output.end_logits


def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
    question_ids = tokenizer.encode(question, add_special_tokens=False)
    text_ids = tokenizer.encode(text, add_special_tokens=False)

    # construct input token ids
    input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]

    # construct reference token ids
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
                    [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)

def predict_qt(question, text):
    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)

    indices = input_ids[0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(indices)

    ground_truth = '13'


    start_scores, end_scores = predict(input_ids)


    return (' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [37]:
question = """who is the boy in red"""
text = "the boy in red si zikun"
answer = predict_qt(question, text)
print(answer)

the boy in red si z ##ik ##un


In [40]:
datasets = load_dataset('squad')
f1score = 0
for i in range(len(datasets['validation'])):
    text = datasets['validation'][i]['context']
    question = datasets['validation'][i]['question']
    answers = datasets['validation'][i]['answers']['text'][0]

    answer = predict_qt(question, text)
    f1score += compute_f1(answer, answers)
    print(i)




Reusing dataset squad (C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [23]:
import nltk as tk
from captum.attr import LayerConductance, LayerIntegratedGradients

def predict_qt(question, text):
    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)

    indices = input_ids[0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(indices)

    ground_truth = '13'


    start_scores, end_scores = predict(input_ids)


    print('Question: ', question)
    print('Predicted Answer: ', ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
    return input_ids, ref_input_ids, None, None, None, start_scores, end_scores, ground_truth, all_tokens,
def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
    pred = predict(inputs,
                   token_type_ids=token_type_ids,
                   position_ids=position_ids,
                   attention_mask=attention_mask)
    pred = pred[position]
    return pred.max(1).values
def explain(input_ids, ref_input_ids, token_type_ids, position_ids, attention_mask, start_scores, end_scores,
            ground_truth, all_tokens, ):
    lig = LayerIntegratedGradients(squad_pos_forward_func, model.bert.embeddings)

    attributions_start, delta_start = lig.attribute(inputs=input_ids,
                                                    baselines=ref_input_ids,
                                                    additional_forward_args=(
                                                        token_type_ids, position_ids, attention_mask, 0),
                                                    internal_batch_size=4,
                                                    return_convergence_delta=True)
    attributions_end, delta_end = lig.attribute(inputs=input_ids, baselines=ref_input_ids,
                                                additional_forward_args=(
                                                    token_type_ids, position_ids, attention_mask, 1),
                                                internal_batch_size=4,
                                                return_convergence_delta=True)

    attributions_start_sum = summarize_attributions(attributions_start)
    attributions_end_sum = summarize_attributions(attributions_end)
    # storing couple samples in an array for visualization purposes
    start_position_vis = viz.VisualizationDataRecord(
        attributions_start_sum,
        torch.max(torch.softmax(start_scores[0], dim=0)),
        torch.argmax(start_scores),
        torch.argmax(start_scores),
        str(ground_truth),
        attributions_start_sum.sum(),
        all_tokens,
        delta_start)

    end_position_vis = viz.VisualizationDataRecord(
        attributions_end_sum,
        torch.max(torch.softmax(end_scores[0], dim=0)),
        torch.argmax(end_scores),
        torch.argmax(end_scores),
        str(ground_truth),
        attributions_end_sum.sum(),
        all_tokens,
        delta_end)
    print(all_tokens)
    print('\033[1m', 'Visualizations For Start Position', '\033[0m')
    viz.visualize_text([start_position_vis])

    print('\033[1m', 'Visualizations For End Position', '\033[0m')

    print("attributions_start_sum:   ", len(attributions_start_sum))
    print("all tokens:    ", len(all_tokens))

    print(torch.max(torch.softmax(end_scores[0], dim=0)),
          torch.argmax(end_scores),
          torch.argmax(end_scores),
          torch.max(torch.softmax(start_scores[0], dim=0)))

    return all_tokens, attributions_start_sum


def pred_explain(question, text):
    input_ids, ref_input_ids, token_type_ids, position_ids, attention_mask, start_scores, end_scores, ground_truth, all_tokens, = predict_qt(
        question, text)

    all_tokens, attributions_start_sum = explain(input_ids, ref_input_ids, token_type_ids, position_ids, attention_mask,
                                                 start_scores, end_scores, ground_truth, all_tokens, )

    return all_tokens, attributions_start_sum, start_scores, end_scores


def generate_explain(i, question, text, answers):
    exp = {}
    l_attributions = []
    l_start_score = []
    l_end_score = []
    l_pred = []

    tokens = tk.sent_tokenize(text)
    all_tokens, attributions_start_sum, start_scores, end_scores = pred_explain(question, text)
    end_score = float(torch.max(torch.softmax(end_scores[0], dim=0)))
    start_score = float(torch.max(torch.softmax(start_scores[0], dim=0)))
    l_attributions.append(torch.sum(attributions_start_sum) / len(all_tokens))
    l_start_score.append(start_score)
    l_end_score.append(end_score)
    l_pred.append(' '.join(all_tokens[torch.argmax(start_scores): torch.argmax(end_scores) + 1]))

    for token in tokens:
        all_tokens, attributions_start_sum, start_scores, end_scores = pred_explain(question, token)
        end_score = float(torch.max(torch.softmax(end_scores[0], dim=0)))
        start_score = float(torch.max(torch.softmax(start_scores[0], dim=0)))
        l_attributions.append(torch.sum(attributions_start_sum) / len(all_tokens))
        l_start_score.append(start_score)
        l_end_score.append(end_score)
        l_pred.append(' '.join(all_tokens[torch.argmax(start_scores): torch.argmax(end_scores) + 1]))

    #转换成float
    l_attributions = [float(i) for i in l_attributions]

    exp["ids"] = i
    exp["preds"] = l_pred
    exp["attributions"] = l_attributions
    exp["start_score"] = l_start_score
    exp["end_score"] = l_end_score
    exp["answer"] = answers
    return exp
def write_explain(exp):
    exp_str = json.dumps(exp)
    with open(r"small_answers.txt", "a")as f:
        f.write(exp_str)
        f.write("\r\n")
    f.close()

def create_multi_bars(i, answers, labels, datas, tick_step=1, group_gap=0.2, bar_gap=0, ):
    '''
    labels : x轴坐标标签序列
    datas ：数据集，二维列表，要求列表每个元素的长度必须与labels的长度一致
    tick_step ：默认x轴刻度步长为1，通过tick_step可调整x轴刻度步长。
    group_gap : 柱子组与组之间的间隙，最好为正值，否则组与组之间重叠
    bar_gap ：每组柱子之间的空隙，默认为0，每组柱子紧挨，正值每组柱子之间有间隙，负值每组柱子之间重叠
    '''
    # ticks为x轴刻度
    ticks = np.arange(len(labels)) * tick_step
    # group_num为数据的组数，即每组柱子的柱子个数
    group_num = len(datas)
    # group_width为每组柱子的总宽度，group_gap 为柱子组与组之间的间隙。
    group_width = tick_step - group_gap
    # bar_span为每组柱子之间在x轴上的距离，即柱子宽度和间隙的总和
    bar_span = group_width / group_num
    # bar_width为每个柱子的实际宽度
    bar_width = bar_span - bar_gap
    # baseline_x为每组柱子第一个柱子的基准x轴位置，随后的柱子依次递增bar_span即可
    baseline_x = ticks - (group_width - bar_span) / 2
    for index, y in enumerate(datas):
        if index == 0:
            plt.bar(baseline_x + index*bar_span, y, bar_width, label='attribution score')
        if index == 1:
            plt.bar(baseline_x + index*bar_span, y, bar_width, label='forword attention score')
        if index == 2:
            plt.bar(baseline_x + index*bar_span, y, bar_width, label='backword attention score')
    plt.ylabel('Scores')
    plt.title('LONG SHORT TEST')
    # x轴刻度标签位置与x轴刻度一致
    plt.xticks(ticks, labels)
    plt.legend()

    plt.savefig('./longshorttest/' + str(i)+ '.jpg')
    #画布清除
    plt.clf()


def generate_directory(i):
    folder = "./longshorttest/" + str(i)
    # 获取此py文件路径，在此路径选创建在new_folder文件夹中的test文件夹
    save_path = folder  # './output/Modify_algorithm'
    try:
        if not os.path.exists(save_path):
            os.makedirs(save_path)

    except OSError:
        print('Error: Creating directory. ' + save_path)
def write_image(i, exp, answers):
    li = []
    # exp["preds"] = l_pred
    # exp["attributions"] = l_attributions
    # exp["start_score"] = l_start_score
    # exp["end_score"] = l_end_score
    l_attributions = exp["attributions"]
    l_start_score = exp["start_score"]
    l_end_score = exp["end_score"]

    for attribution in l_attributions:
        temp = attribution
        li.append(temp)
    l_attributions = li

    l_attributions = [i*6 for i in l_attributions]

    generate_directory(i)
    label = []
    for attribution in range(len(l_attributions)):
        label.append("sen" + str(attribution))

    data = [l_attributions, l_start_score, l_end_score]
    create_multi_bars(i, answers, label, data, bar_gap=0.1)

def write_exp(i, exp,answers):
    write_explain(exp)
    write_image(i, exp, answers)

from datasets import load_dataset

datasets = load_dataset('squad')

for i in range(len(datasets['validation'])):
    text = datasets['validation'][i]['context']
    question = datasets['validation'][i]['question']
    answers = datasets['validation'][i]['answers']['text'][0]

    exp = generate_explain(i, question, text, answers)

    write_exp(i, exp, answers)



Reusing dataset squad (C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Question:  Which NFL team represented the AFC at Super Bowl 50?
Predicted Answer:  denver broncos


AttributeError: 'DistilBertForQuestionAnswering' object has no attribute 'bert'