In [198]:
from datasets import load_dataset

squad = load_dataset("squad")
sep_squad = load_dataset('csv', data_files={'train': './sep_squad_train.csv', 'validation': './sep_squad_validation.csv'})

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
sep_tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

from transformers import default_data_collator

data_collator = default_data_collator

from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer



training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


Reusing dataset squad (C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

Using custom data configuration default-0ae3a52d115f18f2
Reusing dataset csv (C:\Users\GZK\.cache\huggingface\datasets\csv\default-0ae3a52d115f18f2\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\GZK/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\GZK/.cache\huggingface\transformers\0e1bbfda7f63a99

  0%|          | 0/11 [00:00<?, ?ba/s]

Loading cached processed dataset at C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-55e3b8927d96553d.arrow
Loading cached processed dataset at C:\Users\GZK\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-c50c38f20d2316ae.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
model_name = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + 'full_squadh'
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
torch.save(model, './models/' + model_name)
for epoch in range(10):
    model = torch.load("./models/" + model_name)
    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    )
    trainer.train()
    answers = {}
    for org_id in org:
        if org_id in ids:
            text = texts[ids.index(org_id)]
            if len(text) >=512:
                text = text[:512]
            question = ques[ids.index(org_id)]
            answer, start_scores, end_scores= predict_qt(model, text, question)
            answers[org_id] = answer
        elif org_id not in ids:
            print(org_id, "didn't found")
            answers[org_id] = ''
    json_str = json.dumps(answers)
    with open('./answers/'+model_name+'my_answers.txt', 'w') as json_file:
        json_file.write(json_str)
    model_name = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + 'full_squad.pth'
    torch.save(model, './models/' + model_name )
    
    

In [None]:

for epoch in range(10):
    model = torch.load("./models/" + model_name)
    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sep_tokenized_squad["train"],
    eval_dataset=sep_tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    )
    trainer.train()
    answers = {}
    for org_id in org:
        if org_id in ids:
            text = texts[ids.index(org_id)]
            if len(text) >=512:
                text = text[:512]
            question = ques[ids.index(org_id)]
            answer, start_scores, end_scores= predict_qt(model, text, question)
            answers[org_id] = answer
        elif org_id not in ids:
            print(org_id, "didn't found")
            answers[org_id] = ''
    json_str = json.dumps(answers)
    with open('./answers/'+model_name+'my_answers.txt', 'w') as json_file:
        json_file.write(json_str)
    model_name = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + 'sep_squad.pth'
    torch.save(model, './models/' + model_name )
    
    

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sep_tokenized_squad["train"],
    eval_dataset=sep_tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
model_name = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + 'full_squadh'
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
torch.save(model, './models/' + model_name)
for epoch in range(10):
    model = torch.load("./models/" + model_name)
    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    )
    trainer.train()
    model_name = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + 'full_squad.pth'
    torch.save(model, './models/' + model_name )
    

In [71]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def predict(model, inputs):
    output = model(inputs)
    return output.start_logits, output.end_logits


def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
    question_ids = tokenizer.encode(question, add_special_tokens=False)
    text_ids = tokenizer.encode(text, add_special_tokens=False)

    # construct input token ids
    input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]

    # construct reference token ids
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
                    [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)

def predict_qt(model, text, question):
    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id)

    indices = input_ids[0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(indices)

    start_scores, end_scores = predict(model, input_ids)


    return (' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])), float(torch.max(torch.softmax(start_scores[0], dim=0))), float(torch.max(torch.softmax(end_scores[0], dim=0)))




In [95]:
ids = []
ques = []
texts = []
for i in squad["validation"]:
    ids.append(i['id'])
    ques.append(i['question'])
    texts.append(i['context'])
import json
with open("worksheets.codalab.org.txt", "r+")as f:
    org = json.load(f)
print("org : ", len(org))
print("ids : ", len(ids))


In [None]:
answers = {}
for org_id in org:
    if org_id in ids:
        text = texts[ids.index(org_id)]
        if len(text) >=512:
            text = text[:512]
        question = ques[ids.index(org_id)]
        answer, start_scores, end_scores= predict_qt(model, text, question)
        answers[org_id] = answer
    elif org_id not in ids:
        print(org_id, "didn't found")
        answers[org_id] = ''
json_str = json.dumps(answers)
with open('my_answers.txt', 'w') as json_file:
    json_file.write(json_str)

In [104]:
with open("my_answers.txt", "w", encoding='utf-8')as f:
    f.write(str(answers), )

In [105]:
json_str = json.dumps(answers)
with open('my_answers.txt', 'w') as json_file:
    json_file.write(json_str)

In [110]:
from transformers import DistilBertTokenizerFast
import torch
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\GZK/.cache\huggingface\transformers\0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at C:\Users\GZK/.cache\huggingface\transformers\75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at C:\Users\GZK/.cache\huggingface\transformers\8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430b

In [123]:
# from transformers import DistilBertForQuestionAnswering
# model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# model.to(device)
model = torch.load('./models/epoch72022-03-20-23-56-16.pth')

In [124]:
import datetime
time = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
print(time)
answers = {}
for org_id in org:
    if org_id in ids:
        text = texts[ids.index(org_id)]
        if len(text) >=512:
            text = text[:512]
        question = ques[ids.index(org_id)]
        answer, start_scores, end_scores= predict_qt(model, text, question)
        answers[org_id] = answer
    elif org_id not in ids:
        #print(org_id, "didn't found")
        answers[org_id] = ''
json_str = json.dumps(answers)
with open(time + 'my_answers.txt', 'w') as json_file:
    json_file.write(json_str)

2022-03-23-20-45-43


In [186]:
train_contexts = []
train_questions = []
train_answers = []
train_titles = []
train_ids = []

validation_contexts = []
validation_questions = []
validation_answers = []
validation_titles = []
validation_ids = []
for i in squad['train']:
    train_contexts.append(i['context'])
    train_questions.append(i['question'])
    train_answers.append(i['answers'])
    train_titles.append(i['title'])
    train_ids.append(i['id'])
    
    
for i in squad['validation']:
    validation_contexts.append(i['context'])
    validation_questions.append(i['question'])
    validation_answers.append(i['answers'])
    validation_titles.append(i['title'])
    validation_ids.append(i['id'])

In [140]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': './train.csv'})

Using custom data configuration default-201d4e794bdec093


Downloading and preparing dataset csv/default to C:\Users\GZK\.cache\huggingface\datasets\csv\default-201d4e794bdec093\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\GZK\.cache\huggingface\datasets\csv\default-201d4e794bdec093\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [187]:
import nltk as tk
import re

sep_train_contexts = []
sep_train_questions = []
sep_train_answers = []
sep_train_ids = []
sep_train_titles = []

sep_validation_contexts = []
sep_validation_questions = []
sep_validation_answers = []
sep_validation_ids = []
sep_validation_titles = []

error_index = []
null_answer = {'text': '[NULL]', 'answer_start': 0}
for i in range(len(train_contexts)):
    tokens = tk.sent_tokenize(train_contexts[i])
    for token in tokens:
        if train_answers[i]['text'][0] in token:
            try:
                answer_start = re.search(train_answers[i]['text'][0], token)
                answer = {'text': train_answers[i]['text'], 'answer_start': answer_start.span()[0]}
                sep_train_contexts.append(token)

                sep_train_answers.append(answer)
                sep_train_questions.append(train_questions[i])
                sep_train_ids.append(train_ids[i])
                sep_train_titles.append(train_titles[i])
            except:
                error_index.append(i)

                # print(i)
        # else:
        #     sep_train_contexts.append('[NULL]' + token)
        #     sep_train_answers.append(null_answer)
        #     sep_train_questions.append(train_questions[i])
print("error_index : ", len(error_index))

error_index = []
null_answer = {'text': '[NULL]', 'answer_start': 0}
for i in range(len(validation_contexts)):
    tokens = tk.sent_tokenize(validation_contexts[i])
    for token in tokens:
        if validation_answers[i]['text'][0] in token:
            try:
                answer_start = re.search(validation_answers[i]['text'][0], token)
                answer = {'text': validation_answers[i]['text'], 'answer_start': answer_start.span()[0]}
                sep_validation_contexts.append(token)

                sep_validation_answers.append(answer)
                sep_validation_questions.append(validation_questions[i])
                sep_validation_ids.append(validation_ids[i])
                sep_validation_titles.append(validation_titles[i])
            except:
                error_index.append(i)

                # print(i)
        # else:
        #     sep_train_contexts.append('[NULL]' + token)
        #     sep_train_answers.append(null_answer)
        #     sep_train_questions.append(train_questions[i])
print("error_index : ", len(error_index))

error_index :  1770
error_index :  213


In [None]:
print(len(sep_train_contexts), len(sep_train_questions), lsep_train_answers = []

In [168]:
import pandas as pd



#字典中的key值即为csv中列名
dataframe = pd.DataFrame({'id':train_ids,'title':train_titles, 'context':train_contexts, 'question':train_questions, 'answers':train_answers})

#将DataFrame存储为csv,index表示是否显示行名，default=True
dataframe.to_csv("full_squad_train.csv",index=False,sep=',')



In [190]:
sep_dataframe = pd.DataFrame({'id':sep_train_ids,'title':sep_train_titles, 'context':sep_train_contexts, 'question':sep_train_questions, 'answers':sep_train_answers})

#将DataFrame存储为csv,index表示是否显示行名，default=True
sep_dataframe.to_csv("sep_squad_train.csv",index=False,sep=',')

In [191]:
sep_dataframe = pd.DataFrame({'id':sep_validation_ids,
                              'title':sep_validation_titles, 
                              'context':sep_validation_contexts, 
                              'question':sep_validation_questions, 
                              'answers':sep_validation_answers})

#将DataFrame存储为csv,index表示是否显示行名，default=True
sep_dataframe.to_csv("sep_squad_validation.csv",index=False,sep=',')

In [194]:
squad = load_dataset('csv', data_files={'train': './sep_squad_train.csv', 'validation': './sep_squad_validation.csv'})

Using custom data configuration default-0ae3a52d115f18f2


Downloading and preparing dataset csv/default to C:\Users\GZK\.cache\huggingface\datasets\csv\default-0ae3a52d115f18f2\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\GZK\.cache\huggingface\datasets\csv\default-0ae3a52d115f18f2\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
print(os.popen("python evaluate-v2.0.py dev-v2.0.json worksheets.codalab.org.txt").read())

OPTS.na_prob_thresh-------> <class 'float'> 1.0
exact-------> 59.159919028340084
f1-------> 64.7655368790259
total-------> 5928
exact-------> 70.4457527333894
f1-------> 70.4457527333894
total-------> 5945
{
  "exact": 64.81091552261434,
  "f1": 67.60971132981278,
  "total": 11873,
  "HasAns_exact": 59.159919028340084,
  "HasAns_f1": 64.7655368790259,
  "HasAns_total": 5928,
  "NoAns_exact": 70.4457527333894,
  "NoAns_f1": 70.4457527333894,
  "NoAns_total": 5945
}



In [2]:
import os

def list_dir(text_list,dir_path):
    dir_files = os.listdir(dir_path)  # 得到该文件夹下所有的文件
    for file in dir_files:
        file_path = os.path.join(dir_path, file)  # 路径拼接成绝对路径
        if os.path.isfile(file_path):  # 如果是文件，就打印这个文件路径
            if file_path.endswith(".txt"):
                text_list.append(file_path)
        if os.path.isdir(file_path):  # 如果目录，就递归子目录
            list_dir(text_list,file_path)
    return text_list


if __name__ == '__main__':
    all_txt = []
    thesaurus_path = r"D:\software\github\GZK_Code\XAI\2022.03.23\answers"
    text_list = list_dir(all_txt,thesaurus_path)
    for text in text_list:
        print(text)


D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-23-22-21-00full_squadhmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-23-22-49-31full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-23-23-18-11full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-23-23-46-30full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-24-00-14-43full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-24-00-43-04full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-24-01-11-19full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-24-01-39-32full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-24-02-07-47full_squad.pthmy_answers.txt
D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-24-02-36-00full_squad.pthmy_answers.txt
D:\software\g

In [25]:
for text in text_list:
    cmd = "python evaluate-v2.0.py dev-v2.0.json " + text
    print(cmd)
    print(os.popen(cmd).read())

python evaluate-v2.0.py dev-v2.0.json D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-23-22-21-00full_squadhmy_answers.txt
OPTS.na_prob_thresh-------> <class 'float'> 1.0
exact-------> 33.28272604588394
f1-------> 45.0971401088748
total-------> 5928
exact-------> 100.0
f1-------> 100.0
total-------> 5945
{
  "exact": 66.68912658974143,
  "f1": 72.58787556349768,
  "total": 11873,
  "HasAns_exact": 33.28272604588394,
  "HasAns_f1": 45.0971401088748,
  "HasAns_total": 5928,
  "NoAns_exact": 100.0,
  "NoAns_f1": 100.0,
  "NoAns_total": 5945
}

python evaluate-v2.0.py dev-v2.0.json D:\software\github\GZK_Code\XAI\2022.03.23\answers\2022-03-23-22-49-31full_squad.pthmy_answers.txt
OPTS.na_prob_thresh-------> <class 'float'> 1.0
exact-------> 34.64912280701754
f1-------> 46.52270055274909
total-------> 5928
exact-------> 100.0
f1-------> 100.0
total-------> 5945
{
  "exact": 67.37134675313737,
  "f1": 73.29963521238894,
  "total": 11873,
  "HasAns_exact": 34.64912280701754,
  "HasA

In [3]:
# 导入模块
import itchat
import datetime
import time
import itchat

itchat.auto_login()

itchat.send('hi', toUserName='小仙女')

ModuleNotFoundError: No module named 'itchat'

In [2]:
! pip install itchat

