Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 68 additions & 91 deletions BingBertGlue/nvidia/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
)
raise
tf_path = os.path.abspath(tf_checkpoint_path)
print("Converting TensorFlow checkpoint from {}".format(tf_path))
print(f"Converting TensorFlow checkpoint from {tf_path}")
# Load weights from TF model
init_vars = tf.train.list_variables(tf_path)
names = []
arrays = []
for name, shape in init_vars:
print("Loading TF weight {} with shape {}".format(name, shape))
print(f"Loading TF weight {name} with shape {shape}")
Comment on lines -84 to +90
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function load_tf_weights_in_bert refactored with the following changes:

array = tf.train.load_variable(tf_path, name)
names.append(name)
arrays.append(array)
Expand All @@ -97,20 +97,22 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
# adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
# which are not required for using a pretrained model
if any(n in ["adam_v", "adam_m"] for n in name):
print("Skipping {}".format("/".join(name)))
print(f'Skipping {"/".join(name)}')
continue
pointer = model
for m_name in name:
if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
l = re.split(r'_(\d+)', m_name)
else:
l = [m_name]
if l[0] == 'kernel' or l[0] == 'gamma':
if (
l[0] in ['kernel', 'gamma']
or l[0] not in ['output_bias', 'beta']
and l[0] == 'output_weights'
):
pointer = getattr(pointer, 'weight')
elif l[0] == 'output_bias' or l[0] == 'beta':
elif l[0] in ['output_bias', 'beta']:
pointer = getattr(pointer, 'bias')
elif l[0] == 'output_weights':
pointer = getattr(pointer, 'weight')
else:
pointer = getattr(pointer, l[0])
if len(l) >= 2:
Expand All @@ -125,7 +127,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
print("Initialize PyTorch weight {}".format(name))
print(f"Initialize PyTorch weight {name}")
pointer.data = torch.from_numpy(array)
return model

Expand Down Expand Up @@ -207,8 +209,7 @@ def forward(self, input):
return self.act_fn(F.linear(input, self.weight, self.bias))

def extra_repr(self):
return 'in_features={}, out_features={}, bias={}'.format(
self.in_features, self.out_features, self.bias is not None)
return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
Comment on lines -210 to +212
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function LinearActivation.extra_repr refactored with the following changes:



class BertConfig(object):
Expand Down Expand Up @@ -294,8 +295,7 @@ def __repr__(self):

def to_dict(self):
"""Serializes this instance to a Python dictionary."""
output = copy.deepcopy(self.__dict__)
return output
return copy.deepcopy(self.__dict__)
Comment on lines -297 to +298
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertConfig.to_dict refactored with the following changes:


def to_json_string(self):
"""Serializes this instance to a JSON string."""
Expand Down Expand Up @@ -450,8 +450,7 @@ def __init__(self, config):

def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output
return self.output(self_output, input_tensor)
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertAttention.forward refactored with the following changes:



class BertIntermediate(nn.Module):
Expand Down Expand Up @@ -490,8 +489,7 @@ def __init__(self, config):
def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
return self.output(intermediate_output, attention_output)
Comment on lines -493 to +492
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertLayer.forward refactored with the following changes:



class BertEncoder(nn.Module):
Expand Down Expand Up @@ -606,8 +604,7 @@ def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense_act(first_token_tensor)
return pooled_output
return self.dense_act(first_token_tensor)
Comment on lines -609 to +607
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertPooler.forward refactored with the following changes:



class BertPredictionHeadTransform(nn.Module):
Expand Down Expand Up @@ -641,8 +638,8 @@ def __init__(self, config, bert_model_embedding_weights):
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
torch.cuda.nvtx.range_push(
"decoder input.size() = {}, weight.size() = {}".format(
hidden_states.size(), self.decoder.weight.size()))
f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}"
)
Comment on lines -644 to +642
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertLMPredictionHead.forward refactored with the following changes:

hidden_states = self.decoder(hidden_states) + self.bias
torch.cuda.nvtx.range_pop()
return hidden_states
Expand All @@ -655,8 +652,7 @@ def __init__(self, config, bert_model_embedding_weights):
bert_model_embedding_weights)

def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
return self.predictions(sequence_output)
Comment on lines -658 to +655
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertOnlyMLMHead.forward refactored with the following changes:



class BertOnlyNSPHead(nn.Module):
Expand All @@ -665,8 +661,7 @@ def __init__(self, config):
self.seq_relationship = nn.Linear(config.hidden_size, 2)

def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
return self.seq_relationship(pooled_output)
Comment on lines -668 to +664
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertOnlyNSPHead.forward refactored with the following changes:



class BertPreTrainingHeads(nn.Module):
Expand All @@ -690,10 +685,8 @@ def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
if not isinstance(config, BertConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
"To create a model from a Google pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__))
f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
)
Comment on lines -693 to +689
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertPreTrainedModel.__init__ refactored with the following changes:

self.config = config

def init_bert_weights(self, module):
Expand Down Expand Up @@ -834,15 +827,15 @@ def load(module, prefix=''):
s.startswith('bert.') for s in state_dict.keys()):
start_prefix = 'bert.'
load(model, prefix=start_prefix)
if len(missing_keys) > 0:
if missing_keys:
logger.info(
"Weights of {} not initialized from pretrained model: {}".
format(model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
if unexpected_keys:
logger.info(
"Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
if error_msgs:
Comment on lines -837 to +838
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertPreTrainedModel.from_pretrained refactored with the following changes:

raise RuntimeError(
'Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
Expand Down Expand Up @@ -1016,20 +1009,15 @@ def forward(self, batch, log=True):
prediction_scores, seq_relationship_score = self.cls(
sequence_output, pooled_output)

if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(
prediction_scores.view(-1, self.config.vocab_size),
masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
next_sentence_label.view(-1))
#print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))
total_loss = masked_lm_loss + next_sentence_loss
# if log:
# self.log_summary_writer(logs={'train_loss': total_loss.item()})
return total_loss
else:
if masked_lm_labels is None or next_sentence_label is None:
return prediction_scores, seq_relationship_score
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(
prediction_scores.view(-1, self.config.vocab_size),
masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
next_sentence_label.view(-1))
return masked_lm_loss + next_sentence_loss
Comment on lines -1019 to +1020
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertForPreTraining.forward refactored with the following changes:

This removes the following comments (why?):

#                self.log_summary_writer(logs={'train_loss': total_loss.item()})
#            if log:
#print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))



class BertForMaskedLM(BertPreTrainedModel):
Expand Down Expand Up @@ -1089,10 +1077,10 @@ def forward(self,

if masked_lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(
return loss_fct(
prediction_scores.view(-1, self.config.vocab_size),
masked_lm_labels.view(-1))
return masked_lm_loss
masked_lm_labels.view(-1),
)
Comment on lines -1092 to +1083
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertForMaskedLM.forward refactored with the following changes:

else:
return prediction_scores

Expand Down Expand Up @@ -1152,13 +1140,12 @@ def forward(self,
output_all_encoded_layers=False)
seq_relationship_score = self.cls(pooled_output)

if next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
next_sentence_label.view(-1))
return next_sentence_loss
else:
if next_sentence_label is None:
return seq_relationship_score
loss_fct = CrossEntropyLoss(ignore_index=-1)
return loss_fct(
seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)
)
Comment on lines -1155 to +1148
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertForNextSentencePrediction.forward refactored with the following changes:



class BertForSequenceClassification(BertPreTrainedModel):
Expand Down Expand Up @@ -1222,8 +1209,7 @@ def forward(self,

if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
Comment on lines -1225 to +1212
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertForSequenceClassification.forward refactored with the following changes:

else:
return logits

Expand Down Expand Up @@ -1291,12 +1277,10 @@ def forward(self,
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, self.num_choices)

if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
return loss
else:
if labels is None:
return reshaped_logits
loss_fct = CrossEntropyLoss()
return loss_fct(reshaped_logits, labels)
Comment on lines -1294 to +1283
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BertForMultipleChoice.forward refactored with the following changes:



class BertForTokenClassification(BertPreTrainedModel):
Expand Down Expand Up @@ -1358,20 +1342,15 @@ def forward(self,
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)

if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels),
labels.view(-1))
return loss
else:
if labels is None:
return logits
loss_fct = CrossEntropyLoss()
if attention_mask is None:
return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
return loss_fct(active_logits, active_labels)


class BertForQuestionAnswering(BertPreTrainedModel):
Expand Down Expand Up @@ -1439,21 +1418,19 @@ def forward(self,
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)

if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, the split adds a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
return total_loss
else:
return start_logits, end_logits
if start_positions is None or end_positions is None:
return start_logits, end_logits
# If we are on multi-GPU, the split adds a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
return (start_loss + end_loss) / 2
Loading