In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat Apr 24 01:49:46 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
path = "/content/drive/MyDrive/NLP2"
os.chdir(path)

In [3]:
! pip install transformers
! pip install seqeval
import pickle as pkl
import pandas as pd
import numpy as np
from itertools import chain
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import BertForTokenClassification, AutoTokenizer
from tqdm import tqdm, trange
from seqeval.metrics import f1_score,accuracy_score

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 7.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 37.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 51.8MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting seqeval
[?25l  Downloading https://files.pythonh

In [4]:
train_dict = pkl.load(open("./train.pkl", "rb"))
val_dict = pkl.load(open("./val.pkl", "rb"))
test_dict = pkl.load(open("./test.pkl", "rb"))

In [5]:
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

keys in train_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in val_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in test_dict: dict_keys(['id', 'word_seq'])


In [6]:
print("index:", train_dict["id"][0])
print(*zip(train_dict["word_seq"][0], train_dict["tag_seq"][0]))

index: 0
('Protection', 'O') ('of', 'O') ('calves', 'LIVESTOCK') ('against', 'O') ('fatal', 'O') ('enteric', 'DISEASE_OR_SYNDROME') ('colibacillosis', 'DISEASE_OR_SYNDROME') ('by', 'O') ('orally', 'GENE_OR_GENOME') ('administered', 'GENE_OR_GENOME') ('Escherichia', 'GENE_OR_GENOME') ('coli', 'GENE_OR_GENOME') ('K99', 'GENE_OR_GENOME') ('-', 'O') ('specific', 'CARDINAL') ('monoclonal', 'CARDINAL') ('antibody', 'CARDINAL') ('.', 'O') ('A', 'O') ('monoclonal', 'CHEMICAL') ('antibody', 'CHEMICAL') ('(', 'O') ('MCA', 'GENE_OR_GENOME') (')', 'O') ('to', 'O') ('enterotoxigenic', 'CHEMICAL') ('Escherichia', 'CHEMICAL') ('coli', 'CHEMICAL') ('K99', 'O') ('antigen', 'O') ('agglutinated', 'O') ('K99+', 'GENE_OR_GENOME') ('enterotoxigenic', 'GENE_OR_GENOME') ('E', 'GENE_OR_GENOME') ('.', 'O') ('coli', 'CHEMICAL') ('strains', 'CHEMICAL') ('B44', 'CHEMICAL') ('(', 'O') ('O9', 'O') (':', 'O') ('K30', 'O') (';', 'O') ('K99', 'O') (';', 'O') ('F41', 'O') (':', 'O') ('H-', 'O') (')', 'O') ('and', 'O') (

In [7]:
#  tag2idx： tag内容 -> tag index
#  添加了bert的special tokens([CLS],[SEP])，一共67个tag
taglist = set(chain(*train_dict["tag_seq"]))
tag2idx = {}
tag2idx['_t_pad_'] = 0
tag2idx['[CLS]'] = 1
tag2idx['[SEP]'] = 2
for tag in taglist:
  if tag not in tag2idx:
    tag2idx[tag] = len(tag2idx)
tag2idx['PAD'] = 67

#  tags_vals保存每个tag index对应的tag内容
tags_vals = list(tag2idx.keys())

In [8]:
train_sentences = [word for word in train_dict['word_seq']]

In [9]:
train_labels = [labelline for labelline in train_dict["tag_seq"]]
train_labels = [[tag2idx.get(l) for l in lab] for lab in train_labels]

In [10]:
val_sentences = [word for word in val_dict['word_seq']]
val_labels = [labelline for labelline in val_dict["tag_seq"]]
val_labels = [[tag2idx.get(l) for l in lab] for lab in val_labels]

In [11]:
model_name = 'bert-base-cased'
MAX_LEN = 250
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
LEARNING_RATE = 3e-5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




In [13]:
#  添加token 以免Berttokenizer把 '_w_pad_' 分裂成多个token
tokenizer.add_tokens(['_w_pad_','_unk_'])

2

In [16]:
#  可以参照这个例子，对于不在词汇表里的token，Berttokenizer会把通过他们分裂成多个token来处理
#  这样label的数量和tokenize之后的token数量会对不上
#  处理的方法是计算一个token分裂出来的子串的数量，然后把该token对应的label也延长相应的长度
#  比如 "calvas" : "O"
#     "ca", "##lves":  "O","O"
#  因为这个数据集里有大量不在词汇表里专业名词，所以这种情况还挺多的，可能用BioBERT或者Bertlarge效果会好一点
tokenizer.tokenize(' '.join(train_dict['word_seq'][0]))

['Protection',
 'of',
 'ca',
 '##lves',
 'against',
 'fatal',
 'enter',
 '##ic',
 'co',
 '##li',
 '##ba',
 '##ci',
 '##llo',
 '##sis',
 'by',
 'oral',
 '##ly',
 'administered',
 'E',
 '##scher',
 '##ichi',
 '##a',
 'co',
 '##li',
 'K',
 '##9',
 '##9',
 '-',
 'specific',
 'mon',
 '##oc',
 '##lon',
 '##al',
 'anti',
 '##body',
 '.',
 'A',
 'mon',
 '##oc',
 '##lon',
 '##al',
 'anti',
 '##body',
 '(',
 'MCA',
 ')',
 'to',
 'enter',
 '##oto',
 '##xi',
 '##genic',
 'E',
 '##scher',
 '##ichi',
 '##a',
 'co',
 '##li',
 'K',
 '##9',
 '##9',
 'anti',
 '##gen',
 'a',
 '##gg',
 '##lut',
 '##inated',
 'K',
 '##9',
 '##9',
 '+',
 'enter',
 '##oto',
 '##xi',
 '##genic',
 'E',
 '.',
 'co',
 '##li',
 'strains',
 'B',
 '##44',
 '(',
 'O',
 '##9',
 ':',
 'K',
 '##30',
 ';',
 'K',
 '##9',
 '##9',
 ';',
 'F',
 '##41',
 ':',
 'H',
 '-',
 ')',
 'and',
 'B',
 '##41',
 '(',
 'O',
 '##10',
 '##1',
 ':',
 'K',
 '##9',
 '##9',
 ';',
 'F',
 '##41',
 ':',
 'H',
 '-',
 ')',
 'grown',
 'at',
 '37',
 'degrees',
 'C',


In [17]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        sentence = self.sentences[index]
        s = str(' '.join(sentence))
        label_s = self.labels[index]
        new_labels = []
        new_labels.extend([1])  # 添加[CLS]对应的label
        for word, label in zip(sentence, label_s):
          tokenized_word = tokenizer.tokenize(word)
          count_subwords = len(tokenized_word)  # 计算tokenize之后的subwords数量
          new_labels.extend([label] * count_subwords)  # 相应地延长对应的label
        inputs = self.tokenizer.encode_plus(
            s,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        new_labels.extend([2])  # 添加[SEP]对应的label
        #  对于后面的padding部分也添加label 保证label的长度和padding后的句子长度一致 (67:'PAD')
        new_labels.extend([67]*MAX_LEN)
        new_labels=new_labels[:MAX_LEN]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tags': torch.tensor(new_labels, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [19]:
# Creating the dataset and dataloader for the neural network
print("TRAIN Dataset: {}".format(len(train_sentences)))
print("TEST Dataset: {}".format(len(val_sentences)))

training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)  # 训练集
testing_set = CustomDataset(tokenizer, val_sentences, val_labels, MAX_LEN)  # 验证集

TRAIN Dataset: 23600
TEST Dataset: 2950


In [20]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [21]:
# 示例
training_set[0]

{'ids': tensor([  101,  8063,  1104, 11019, 14455,  1222, 11874,  3873,  1596,  1884,
          2646,  2822,  6617,  6643,  4863,  1118,  9619,  1193,  8318,   142,
         27826, 11985,  1161,  1884,  2646,   148,  1580,  1580,   118,  2747,
         19863, 13335,  4934,  1348,  2848, 14637,   119,   138, 19863, 13335,
          4934,  1348,  2848, 14637,   113, 24955,   114,  1106,  3873, 12355,
          8745, 19438,   142, 27826, 11985,  1161,  1884,  2646,   148,  1580,
          1580,  2848,  4915,   170,  9705, 25937, 16868,   148,  1580,  1580,
           116,  3873, 12355,  8745, 19438,   142,   119,  1884,  2646, 21116,
           139, 25041,   113,   152,  1580,   131,   148, 13144,   132,   148,
          1580,  1580,   132,   143, 25892,   131,   145,   118,   114,  1105,
           139, 25892,   113,   152, 10424,  1475,   131,   148,  1580,  1580,
           132,   143, 25892,   131,   145,   118,   114,  4215,  1120,  3413,
          4842,   140,  1133,  1136,  1120,  

BertForTokenClassification 

https://huggingface.co/transformers/model_doc/bert.html#bertfortokenclassification

In [None]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),  # 67分类
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
model.resize_token_embeddings(len(tokenizer))  # add_token之后相应的resize一下embedding的维度
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28998, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [None]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [None]:
epochs = 15
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAINING
    model.train()
    tr_loss = 0
    tr_pred , tr_true_labels = [], []
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(training_loader):
        # add batch to gpu
        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['mask'].to(device)
        b_labels = batch['tags'].to(device)
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs['loss']
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
        # Move logits and labels to CPU
        tr_logits = outputs['logits'].detach().cpu().numpy()
        tr_label = b_labels.to('cpu').numpy()
        tr_pred.extend([list(p) for p in np.argmax(tr_logits, axis=2)])
        tr_true_labels.extend(tr_label)

    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / len(training_loader)))
    tr_pred_tags = [tags_vals[p_i] for p, l in zip(tr_pred, tr_true_labels) for p_i, l_i in zip(p, l) if tags_vals[l_i] != "PAD"]
    train_tags = [tags_vals[l_i] for l in tr_true_labels for l_i in l if tags_vals[l_i] != "PAD"]
    train_acc = accuracy_score(tr_pred_tags, train_tags)
    print("Train Accuracy: {}".format(train_acc))


    # VALIDATION
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in testing_loader:
        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['mask'].to(device)
        b_labels = batch['tags'].to(device)
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs['logits'].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_loss += outputs['loss'].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        
    eval_loss = eval_loss/len(testing_loader)

    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tags_vals[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tags_vals[l_i] != "PAD"]
    valid_tags = [tags_vals[l_i] for l in true_labels
                                  for l_i in l if tags_vals[l_i] != "PAD"]
    valid_acc = accuracy_score(pred_tags, valid_tags)
    print("Validation Accuracy: {}".format(valid_acc))

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Train loss: 0.7326920686294716
Train Accuracy: 0.8022816069538605
Validation loss: 0.5275163590908051


Epoch:   7%|▋         | 1/15 [08:40<2:01:32, 520.92s/it]

Validation Accuracy: 0.8478923971446044
Train loss: 0.4814035117545425
Train Accuracy: 0.8596164636330771
Validation loss: 0.45480387162517855


Epoch:  13%|█▎        | 2/15 [17:21<1:52:50, 520.83s/it]

Validation Accuracy: 0.8673142487380939
Train loss: 0.39775182815586646
Train Accuracy: 0.8821507032777565
Validation loss: 0.4271253922501126


Epoch:  20%|██        | 3/15 [26:04<1:44:18, 521.55s/it]

Validation Accuracy: 0.8758871681126299
Train loss: 0.3397463687915143
Train Accuracy: 0.8982986681784838
Validation loss: 0.41438167417371596


Epoch:  27%|██▋       | 4/15 [34:54<1:36:05, 524.11s/it]

Validation Accuracy: 0.8809329221424069
Train loss: 0.2941048635740267
Train Accuracy: 0.9109585791847358
Validation loss: 0.40953979556624953


Epoch:  33%|███▎      | 5/15 [43:41<1:27:28, 524.89s/it]

Validation Accuracy: 0.8844619576702755
Train loss: 0.2573375866261278
Train Accuracy: 0.9211067922640265
Validation loss: 0.40794235935082307


Epoch:  40%|████      | 6/15 [52:27<1:18:47, 525.27s/it]

Validation Accuracy: 0.8886081536243213
Train loss: 0.22628673028735932
Train Accuracy: 0.9300339761594344
Validation loss: 0.41440189666039234


Epoch:  47%|████▋     | 7/15 [1:01:12<1:09:59, 524.98s/it]

Validation Accuracy: 0.8904745963677304
Train loss: 0.1998931522049555
Train Accuracy: 0.9375195277035748
Validation loss: 0.4312961973048545


Epoch:  53%|█████▎    | 8/15 [1:10:02<1:01:26, 526.67s/it]

Validation Accuracy: 0.8876263074917665
Train loss: 0.17723059545202954
Train Accuracy: 0.9439125794516096
Validation loss: 0.4391350264484818


Epoch:  60%|██████    | 9/15 [1:18:50<52:41, 526.97s/it]  

Validation Accuracy: 0.8906522637631451
Train loss: 0.15757437568248772
Train Accuracy: 0.9496505987015728
Validation loss: 0.4444419260766055


Epoch:  67%|██████▋   | 10/15 [1:27:37<43:54, 526.89s/it]

Validation Accuracy: 0.8938633681623768
Train loss: 0.14120330273862777
Train Accuracy: 0.9545122812166918
Validation loss: 0.4624579313639048


Epoch:  73%|███████▎  | 11/15 [1:36:24<35:07, 526.98s/it]

Validation Accuracy: 0.8930554490590173
Train loss: 0.12599028873593018
Train Accuracy: 0.9589465450950031
Validation loss: 0.4583782102610614


Epoch:  80%|████████  | 12/15 [1:45:13<26:22, 527.54s/it]

Validation Accuracy: 0.8964236488394579
Train loss: 0.1120664231115725
Train Accuracy: 0.9633069629977914
Validation loss: 0.4787542662105045


Epoch:  87%|████████▋ | 13/15 [1:53:56<17:32, 526.39s/it]

Validation Accuracy: 0.8970295881669774
Train loss: 0.1008669943435163
Train Accuracy: 0.9667634687698126
Validation loss: 0.47672764497834286


Epoch:  93%|█████████▎| 14/15 [2:02:39<08:45, 525.22s/it]

Validation Accuracy: 0.8997376133097191
Train loss: 0.09146094284345949
Train Accuracy: 0.9697278238315452
Validation loss: 0.48220225814226514


Epoch: 100%|██████████| 15/15 [2:11:24<00:00, 525.64s/it]

Validation Accuracy: 0.9004782058211319





In [None]:
output_model = './model/model.pth'

In [None]:
def save(model, optimizer):
    # save
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)
    print('The model has been saved')

In [None]:
save(model, optimizer)

The best model has been saved


In [None]:
#  预测testset

model.eval()
total_labels , total_tokens = [], []
for i in trange(len(test_dict['word_seq'])):
    if i not in [528, 2485]:
      decoded_tokens, tag_predicted = [], []

      # push the text into GPU, get the output of the prediction, push it to CPU
      tokenized_index = tokenizer.encode(str(' '.join(word for word in test_dict['word_seq'][i] if word != '_w_pad_')),add_special_tokens=True,max_length=512,padding=False,truncation=True)
      input_text = torch.tensor([tokenized_index]).cuda()
      with torch.no_grad():
          output = model(input_text)
      tag_predicted = np.argmax(output[0].to('cpu').numpy(), axis=2)[0]
      decoded_tokens = tokenizer.convert_ids_to_tokens(input_text.to('cpu').numpy()[0])

      # Merge the divided tokens and tags together
      new_tokens, new_labels = [], []
      for token, tag in zip(decoded_tokens, tag_predicted):
          if token in ['[PAD]','[CLS]','[SEP]']:
              continue
          else:
              new_tokens.append(token)
              new_labels.append(tags_vals[tag])

      total_labels += new_labels
      total_tokens += new_tokens

100%|██████████| 2950/2950 [00:39<00:00, 75.54it/s]


In [None]:
len(total_labels)

499778

In [None]:
# for i in range(len(m)):
#   if m[i] != k[i]:
#     print(i)            528  2485

In [None]:
# test_dict['word_seq'][1012][80:84]  #'\ufeff\ufeff'
# w =  '\ufeff\ufeff'
# k = tokenizer.tokenize(w)
# len(k)

这里检查了一下total_labels的长度和要求的输出的长度对不上，然后逐行比较了一下每行的label数量和应该有的token数量，发现问题出现在测试集的第528行和第2485行，这两行Tokenize之后的长度直接超过512了..

原因是这两行压根不是英语，是两行俄语。

处理的方法是直接把这两行所有的tag都预测成O算了。

In [None]:
# 这里我是一行一行输出到dataframe里面的纯浪费时间,改成直接用total_labels给tag一列赋值就快了
# 但是要确认total_labels的长度和正常的输出长度一样(349105)

result = pd.DataFrame(columns=['id', 'count','tag'])
count = 0
for i in trange(len(test_dict['word_seq'])):
  if i not in [528, 2485]:
    for j in range(len(test_dict['word_seq'][i])):
      if test_dict['word_seq'][i][j] != "_w_pad_":
        tokenized_word = tokenizer.tokenize(test_dict['word_seq'][i][j])
        count_subwords = len(tokenized_word)
        id1 = str(i)+"_"+str(j)
        # 对于被分裂成多个subtoken的token直接取第一个subtoken的预测label值作为tag
        result = result.append({'id':id1, 'count':count_subwords, 'tag': total_labels[count]},ignore_index=True)
        count += count_subwords
  else:
    for j in range(len(test_dict['word_seq'][i])):
      id1 = str(i)+"_"+str(j)
      result = result.append({'id':id1, 'count':0, 'tag': 'O'},ignore_index=True)  # 对于line528&line2485直接全部预测成'O'


 18%|█▊        | 528/2950 [04:54<28:28,  1.42it/s]

528


 84%|████████▍ | 2485/2950 [1:00:42<22:34,  2.91s/it]

2485


100%|██████████| 2950/2950 [1:24:58<00:00,  1.73s/it]


In [None]:
result

Unnamed: 0,id,count,tags
0,0_0,4,O
1,0_1,2,O
2,0_2,1,IMMUNE_RESPONSE
3,0_3,1,IMMUNE_RESPONSE
4,0_4,1,O
...,...,...,...
349100,2949_123,1,O
349101,2949_124,1,THERAPEUTIC_OR_PREVENTIVE_PROCEDURE
349102,2949_125,1,O
349103,2949_126,1,O


In [None]:
pd.DataFrame(result, columns = ['id', 'tags']).to_csv("prediction.csv", index = False)

另外一种方法：对于被分裂成多个subtokens的token，取subtokens中出现最多的label(众数）作为预测值。

但是试了一次效果没有直接取第一个subtoken的label作为预测值的效果好。

不太确定

In [None]:
# result = pd.DataFrame(columns=['id', 'count','tag'])
# count = 0
# for i in trange(len(test_dict['word_seq'])):
#   if i not in [528, 2485]:
#     for j in range(len(test_dict['word_seq'][i])):
#       if test_dict['word_seq'][i][j] not in  ["_w_pad_","\ufeff\ufeff"]:  # 第1102行有个"\ufeff\ufeff"会导致len(tokenized_word)=0
#         tokenized_word = tokenizer.tokenize(test_dict['word_seq'][i][j])
#         count_subwords = len(tokenized_word) # if len(tokenized_word) != 0 else 1
#         id1 = str(i)+"_"+str(j)
#         count_range = count + count_subwords
#         label = max(set(total_labels[count:count_range]), key = total_labels[count:count_range].count)
#         result = result.append({'id':id1, 'count':count_subwords, 'tag': label},ignore_index=True)
#         count += count_subwords
#   else:
#     print(i)
#     for j in range(len(test_dict['word_seq'][i])):
#       id1 = str(i)+"_"+str(j)
#       result = result.append({'id':id1, 'count':0, 'tag': 'O'},ignore_index=True)

SyntaxError: ignored