From 2d72ee0f8c43ec7410d45d3d29da55d7698005de Mon Sep 17 00:00:00 2001 From: chenyuanzhao Date: Wed, 21 Oct 2020 10:45:50 +0800 Subject: [PATCH] refactor(nlp): fix some pylint problems of bert --- .github/workflows/ci.yml | 5 +- official/nlp/bert/config_args.py | 5 +- official/nlp/bert/model.py | 117 +++++++++++++++++------------- official/nlp/bert/mrpc_dataset.py | 11 ++- official/nlp/bert/test.py | 13 ++-- official/nlp/bert/tokenization.py | 16 ++-- official/nlp/bert/train.py | 20 +++-- setup.cfg | 4 +- 8 files changed, 104 insertions(+), 87 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f110d4e..2ddac79a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,9 +38,10 @@ jobs: run: | export PYTHONPATH=$PWD:$PYTHONPATH - CHECK_DIR=official/vision/ + CHECK_VISION=official/vision/ + CHECK_NLP=official/nlp/ pip install pylint==2.5.2 - pylint $CHECK_DIR --rcfile=.pylintrc || pylint_ret=$? + pylint $CHECK_VISION $CHECK_NLP --rcfile=.pylintrc || pylint_ret=$? echo test, and deploy your project. if [ "$pylint_ret" ]; then exit $pylint_ret diff --git a/official/nlp/bert/config_args.py b/official/nlp/bert/config_args.py index ca620775..16a15f4a 100644 --- a/official/nlp/bert/config_args.py +++ b/official/nlp/bert/config_args.py @@ -14,13 +14,14 @@ def get_args(): parser = argparse.ArgumentParser() - ## parameters + # parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + help="The input data dir. Should contain the .tsv files (or other data files)" + " for the task.", ) parser.add_argument( diff --git a/official/nlp/bert/model.py b/official/nlp/bert/model.py index 9eafacac..979ccb0d 100644 --- a/official/nlp/bert/model.py +++ b/official/nlp/bert/model.py @@ -16,9 +16,6 @@ """Megengine BERT model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) - import copy import json import math @@ -27,10 +24,11 @@ import urllib.request from io import open +import numpy as np + import megengine as mge import megengine.functional as F import megengine.hub as hub -import numpy as np from megengine import Parameter from megengine.functional.loss import cross_entropy from megengine.module import Dropout, Embedding, Linear, Module, Sequential @@ -45,7 +43,8 @@ def transpose(inp, a, b): def gelu(x): """Implementation of the gelu activation function. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + For information: OpenAI GPT's gelu is slightly different + (and gives slightly different results): x * 0.5 * (1.0 + F.tanh((F.sqrt(2 / math.pi) * (x + 0.044715 * (x ** 3))))) Also see https://arxiv.org/abs/1606.08415 """ @@ -98,7 +97,7 @@ def __init__( initializing all weight matrices. 
""" if isinstance(vocab_size_or_config_json_file, str): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): self.__dict__[key] = value @@ -158,7 +157,7 @@ class BertLayerNorm(Module): """ def __init__(self, hidden_size, eps=1e-12): - super(BertLayerNorm, self).__init__() + super().__init__() self.weight = Parameter(np.ones(hidden_size).astype(np.float32)) self.bias = Parameter(np.zeros(hidden_size).astype(np.float32)) self.variance_epsilon = eps @@ -175,7 +174,7 @@ class BertEmbeddings(Module): """ def __init__(self, config): - super(BertEmbeddings, self).__init__() + super().__init__() self.word_embeddings = Embedding(config.vocab_size, config.hidden_size) self.position_embeddings = Embedding( config.max_position_embeddings, config.hidden_size @@ -184,8 +183,8 @@ def __init__(self, config): config.type_vocab_size, config.hidden_size ) - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name + # and be able to load any TensorFlow checkpoint file self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = Dropout(config.hidden_dropout_prob) @@ -210,7 +209,7 @@ def forward(self, input_ids, token_type_ids=None): class BertSelfAttention(Module): def __init__(self, config): - super(BertSelfAttention, self).__init__() + super().__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " @@ -229,7 +228,9 @@ def __init__(self, config): def transpose_for_scores(self, x): # using symbolic shapes to make trace happy x_shape = mge.tensor(x.shape) - new_x_shape = F.concat([x_shape[:-1], (self.num_attention_heads, self.attention_head_size)]) + new_x_shape = F.concat( + [x_shape[:-1], (self.num_attention_heads, self.attention_head_size)] + ) x = x.reshape(new_x_shape) return x.transpose(0, 2, 1, 3) @@ -266,7 +267,7 @@ def forward(self, hidden_states, attention_mask): class BertSelfOutput(Module): def __init__(self, config): - super(BertSelfOutput, self).__init__() + super().__init__() self.dense = Linear(config.hidden_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = Dropout(config.hidden_dropout_prob) @@ -280,7 +281,7 @@ def forward(self, hidden_states, input_tensor): class BertAttention(Module): def __init__(self, config): - super(BertAttention, self).__init__() + super().__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) @@ -292,7 +293,7 @@ def forward(self, input_tensor, attention_mask): class BertIntermediate(Module): def __init__(self, config): - super(BertIntermediate, self).__init__() + super().__init__() self.dense = Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] @@ -307,7 +308,7 @@ def forward(self, hidden_states): class BertOutput(Module): def __init__(self, config): - super(BertOutput, self).__init__() + super().__init__() self.dense = Linear(config.intermediate_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = Dropout(config.hidden_dropout_prob) @@ -321,7 +322,7 @@ def forward(self, hidden_states, 
input_tensor): class BertLayer(Module): def __init__(self, config): - super(BertLayer, self).__init__() + super().__init__() self.attention = BertAttention(config) self.intermediate = BertIntermediate(config) self.output = BertOutput(config) @@ -335,7 +336,7 @@ def forward(self, hidden_states, attention_mask): class BertEncoder(Module): def __init__(self, config): - super(BertEncoder, self).__init__() + super().__init__() self.layer = Sequential( *[BertLayer(config) for _ in range(config.num_hidden_layers)] ) @@ -354,7 +355,7 @@ def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True) class BertPooler(Module): def __init__(self, config): - super(BertPooler, self).__init__() + super().__init__() self.dense = Linear(config.hidden_size, config.hidden_size) self.activation = F.tanh @@ -375,26 +376,34 @@ class BertModel(Module): Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + with the word token indices in the vocabulary + (see the tokens preprocessing logic in the scripts `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + `token_type_ids`: an optional torch.LongTensor of shape + [batch_size, sequence_length] with the token types indices selected in [0, 1]. + Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [0, 1]. It's a mask to be used if the input sequence length + is smaller than the max input sequence length in the current batch. + It's the mask that we typically use for attention when a batch has varying length sentences. - `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` + output as described below. Default: `True`. Outputs: Tuple of (encoded_layers, pooled_output) `encoded_layers`: controled by `output_all_encoded_layers` argument: - - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end - of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each - encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], - - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding - to the last attention block of shape [batch_size, sequence_length, hidden_size], - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a - classifier pretrained on top of the hidden state associated to the first character of the + - `output_all_encoded_layers=True`: outputs a list of the full sequences of + encoded-hidden-states at the end of each attention block + (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each
+            encoded-hidden-state is a torch.FloatTensor of size
+            [batch_size, sequence_length, hidden_size],
+        - `output_all_encoded_layers=False`: outputs only the full sequence of
+            hidden-states corresponding to the last attention block of shape
+            [batch_size, sequence_length, hidden_size],
+        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size]
+            which is the output of a classifier pretrained on top of the hidden state
+            associated to the first character of the
             input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
 
     Example usage:
 
@@ -474,15 +483,17 @@ class BertForSequenceClassification(Module):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-            with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
+            with the word token indices in the vocabulary.
+            Items in the batch should begin with the special "CLS" token.
+            (see the tokens preprocessing logic in the scripts
             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
-        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
-            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
-            a `sentence B` token (see BERT paper for more details).
-        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
-            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
-            input sequence length in the current batch. It's the mask that we typically use for attention when
-            a batch has varying length sentences.
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length]
+            with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
+            and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [0, 1]. It's a mask to be used if the input sequence length
+            is smaller than the max input sequence length in the current batch. It's the mask
+            that we typically use for attention when a batch has varying length sentences.
         `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
             with indices selected in [0, ..., num_labels].
@@ -580,7 +591,8 @@ def create_hub_bert(model_name, pretrained): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/uncased_L-12_H-768_A-12/bert_4f2157f7_uncased_L-12_H-768_A-12.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "uncased_L-12_H-768_A-12/bert_4f2157f7_uncased_L-12_H-768_A-12.pkl" ) def uncased_L_12_H_768_A_12(): config_dict = { @@ -601,7 +613,8 @@ def uncased_L_12_H_768_A_12(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/cased_L-12_H-768_A-12/bert_b9727c2f_cased_L-12_H-768_A-12.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "cased_L-12_H-768_A-12/bert_b9727c2f_cased_L-12_H-768_A-12.pkl" ) def cased_L_12_H_768_A_12(): config_dict = { @@ -622,7 +635,8 @@ def cased_L_12_H_768_A_12(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/uncased_L-24_H-1024_A-16/bert_222f5012_uncased_L-24_H-1024_A-16.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "uncased_L-24_H-1024_A-16/bert_222f5012_uncased_L-24_H-1024_A-16.pkl" ) def uncased_L_24_H_1024_A_16(): config_dict = { @@ -644,7 +658,8 @@ def uncased_L_24_H_1024_A_16(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/cased_L-24_H-1024_A-16/bert_01f2a65f_cased_L-24_H-1024_A-16.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "cased_L-24_H-1024_A-16/bert_01f2a65f_cased_L-24_H-1024_A-16.pkl" ) def cased_L_24_H_1024_A_16(): config_dict = { @@ -672,7 +687,8 @@ def cased_L_24_H_1024_A_16(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/chinese_L-12_H-768_A-12/bert_ee91be1a_chinese_L-12_H-768_A-12.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "chinese_L-12_H-768_A-12/bert_ee91be1a_chinese_L-12_H-768_A-12.pkl" ) def chinese_L_12_H_768_A_12(): config_dict = { @@ -699,7 +715,8 @@ def chinese_L_12_H_768_A_12(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/multi_cased_L-12_H-768_A-12/bert_283ceec5_multi_cased_L-12_H-768_A-12.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "multi_cased_L-12_H-768_A-12/bert_283ceec5_multi_cased_L-12_H-768_A-12.pkl" ) def multi_cased_L_12_H_768_A_12(): config_dict = { @@ -727,7 +744,8 @@ def multi_cased_L_12_H_768_A_12(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/wwm_uncased_L-24_H-1024_A-16/bert_e2780a6a_wwm_uncased_L-24_H-1024_A-16.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "wwm_uncased_L-24_H-1024_A-16/bert_e2780a6a_wwm_uncased_L-24_H-1024_A-16.pkl" ) def wwm_uncased_L_24_H_1024_A_16(): config_dict = { @@ -748,7 +766,8 @@ def wwm_uncased_L_24_H_1024_A_16(): @hub.pretrained( - "https://data.megengine.org.cn/models/weights/bert/wwm_cased_L-24_H-1024_A-16/bert_0a8f1389_wwm_cased_L-24_H-1024_A-16.pkl" + "https://data.megengine.org.cn/models/weights/bert/" + "wwm_cased_L-24_H-1024_A-16/bert_0a8f1389_wwm_cased_L-24_H-1024_A-16.pkl" ) def wwm_cased_L_24_H_1024_A_16(): config_dict = { diff --git a/official/nlp/bert/mrpc_dataset.py b/official/nlp/bert/mrpc_dataset.py index 48397812..8362ecef 100644 --- a/official/nlp/bert/mrpc_dataset.py +++ b/official/nlp/bert/mrpc_dataset.py @@ -9,14 +9,15 @@ import csv import os -import megengine as mge +from tokenization import BertTokenizer + import numpy as np + +import megengine as mge from megengine.data import DataLoader from megengine.data.dataset import ArrayDataset from megengine.data.sampler import RandomSampler, SequentialSampler -from tokenization import BertTokenizer - logger = mge.get_logger(__name__) @@ -199,7 
+200,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer logger.info("tokens: {}".format(" ".join([str(x) for x in tokens]))) logger.info("input_ids: {}".format(" ".join([str(x) for x in input_ids]))) logger.info("input_mask: {}".format(" ".join([str(x) for x in input_mask]))) - logger.info("segment_ids: {}".format(" ".join([str(x) for x in segment_ids]))) + logger.info( + "segment_ids: {}".format(" ".join([str(x) for x in segment_ids])) + ) logger.info("label: {} (id = {})".format(example.label, label_id)) features.append( diff --git a/official/nlp/bert/test.py b/official/nlp/bert/test.py index 2f6d3e64..78201938 100644 --- a/official/nlp/bert/test.py +++ b/official/nlp/bert/test.py @@ -7,20 +7,21 @@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -import megengine as mge -import megengine.functional as F -from megengine.jit import trace from tqdm import tqdm -from model import BertForSequenceClassification, create_hub_bert -from mrpc_dataset import MRPCDataset # pylint: disable=import-outside-toplevel import config_args +from mrpc_dataset import MRPCDataset + +import megengine as mge +import megengine.functional as F + +from official.nlp.bert.model import BertForSequenceClassification, create_hub_bert + args = config_args.get_args() logger = mge.get_logger(__name__) -# @trace(symbolic=True) def net_eval(input_ids, segment_ids, input_mask, label_ids, net=None): net.eval() results = net(input_ids, segment_ids, input_mask, label_ids) diff --git a/official/nlp/bert/tokenization.py b/official/nlp/bert/tokenization.py index 20b06005..b0f6d72b 100644 --- a/official/nlp/bert/tokenization.py +++ b/official/nlp/bert/tokenization.py @@ -14,9 +14,7 @@ # ---------------------------------------------------------------------- """Tokenization classes.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) - +from __future__ import absolute_import, division, print_function, unicode_literals import collections import os import unicodedata @@ -81,8 +79,9 @@ def __init__( """ if not os.path.isfile(vocab_file): raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + "Can't find a vocabulary file at path '{}'. " + "To load the vocabulary from a Google pretrained model use " + "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format( vocab_file ) ) @@ -363,12 +362,7 @@ def _is_punctuation(char): # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. - if ( - (33 <= cp <= 47) - or (58 <= cp <= 64) - or (91 <= cp <= 96) - or (123 <= cp <= 126) - ): + if (33 <= cp <= 47) or (58 <= cp <= 64) or (91 <= cp <= 96) or (123 <= cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): diff --git a/official/nlp/bert/train.py b/official/nlp/bert/train.py index 5fa9dff5..75b15084 100644 --- a/official/nlp/bert/train.py +++ b/official/nlp/bert/train.py @@ -7,22 +7,23 @@ # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+from tqdm import tqdm
+
+# pylint: disable=import-outside-toplevel
+import config_args
+from mrpc_dataset import MRPCDataset
+
 import megengine as mge
 import megengine.functional as F
 import megengine.optimizer as optim
 from megengine.autodiff import GradManager
-from megengine.jit import trace
-from tqdm import tqdm
-from model import BertForSequenceClassification, create_hub_bert
-from mrpc_dataset import MRPCDataset
-# pylint: disable=import-outside-toplevel
-import config_args
+from official.nlp.bert.model import BertForSequenceClassification, create_hub_bert
+
 
 args = config_args.get_args()
 logger = mge.get_logger(__name__)
 
-# @trace(symbolic=True)
 def net_eval(input_ids, segment_ids, input_mask, label_ids, net=None):
     net.eval()
     results = net(input_ids, segment_ids, input_mask, label_ids)
@@ -30,7 +31,6 @@ def net_eval(input_ids, segment_ids, input_mask, label_ids, net=None):
     return loss, logits
 
 
-# @trace(symbolic=True)
 def net_train(input_ids, segment_ids, input_mask, label_ids, gm=None, net=None):
     net.train()
     with gm:
@@ -53,9 +53,7 @@ def eval(dataloader, net):
         batch_size = input_ids.shape[0]
         if batch_size != args.eval_batch_size:
             break
-        loss, logits = net_eval(
-            input_ids, segment_ids, input_mask, label_ids, net=net
-        )
+        loss, logits = net_eval(input_ids, segment_ids, input_mask, label_ids, net=net)
         sum_loss += loss.mean().item()
         sum_accuracy += F.topk_accuracy(logits, label_ids) * batch_size
         total_examples += batch_size
diff --git a/setup.cfg b/setup.cfg
index e36eeb0d..aaded5e9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [isort]
 line_length = 100
-skip=official/quantization,official/vision/gan,official/vision/keypoints,official/nlp
+skip=official/quantization,official/vision/gan,official/vision/keypoints
 multi_line_output = 3
 balanced_wrapping = True
 known_standard_library = setuptools
@@ -17,6 +17,6 @@ ignore = W503
 max-line-length = 100
 max-complexity = 18
 select = B,C,E,F,W,T4,B9
-exclude = official/quantization,official/vision/gan,official/vision/keypoints,official/nlp
+exclude = official/quantization,official/vision/gan,official/vision/keypoints
 per-file-ignores = **/__init__.py:F401,F403
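
Note: the ci.yml hunk above widens the lint scope from official/vision/ alone to both official/vision/ and official/nlp/. A minimal sketch for reproducing that check locally before pushing, assuming the same repository layout and the pylint pin used in the CI step:

    # run from the repository root, mirroring the CI lint step
    export PYTHONPATH=$PWD:$PYTHONPATH
    pip install pylint==2.5.2
    pylint official/vision/ official/nlp/ --rcfile=.pylintrc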