Let0Pda · sourcery-ai · Aug 30, 2023 · sourcery-ai · Aug 30, 2023 · sourcery-ai
diff --git a/BingBertGlue/nvidia/modeling.py b/BingBertGlue/nvidia/modeling.py
@@ -81,13 +81,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    print(f"Converting TensorFlow checkpoint from {tf_path}")
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        print(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -97,20 +97,22 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            print(f'Skipping {"/".join(name)}')
             continue
         pointer = model
         for m_name in name:
             if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                 l = re.split(r'_(\d+)', m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if (
+                l[0] in ['kernel', 'gamma']
+                or l[0] not in ['output_bias', 'beta']
+                and l[0] == 'output_weights'
+            ):
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] in ['output_bias', 'beta']:
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -125,7 +127,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        print(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
 
@@ -207,8 +209,7 @@ def forward(self, input):
             return self.act_fn(F.linear(input, self.weight, self.bias))
 
     def extra_repr(self):
-        return 'in_features={}, out_features={}, bias={}'.format(
-            self.in_features, self.out_features, self.bias is not None)
+        return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
 
 
 class BertConfig(object):
@@ -294,8 +295,7 @@ def __repr__(self):
 
     def to_dict(self):
         """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+        return copy.deepcopy(self.__dict__)
 
     def to_json_string(self):
         """Serializes this instance to a JSON string."""
@@ -450,8 +450,7 @@ def __init__(self, config):
 
     def forward(self, input_tensor, attention_mask):
         self_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(self_output, input_tensor)
-        return attention_output
+        return self.output(self_output, input_tensor)
 
 
 class BertIntermediate(nn.Module):
@@ -490,8 +489,7 @@ def __init__(self, config):
     def forward(self, hidden_states, attention_mask):
         attention_output = self.attention(hidden_states, attention_mask)
         intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        return layer_output
+        return self.output(intermediate_output, attention_output)
 
 
 class BertEncoder(nn.Module):
@@ -606,8 +604,7 @@ def forward(self, hidden_states):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
         first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense_act(first_token_tensor)
-        return pooled_output
+        return self.dense_act(first_token_tensor)
 
 
 class BertPredictionHeadTransform(nn.Module):
@@ -641,8 +638,8 @@ def __init__(self, config, bert_model_embedding_weights):
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         torch.cuda.nvtx.range_push(
-            "decoder input.size() = {}, weight.size() = {}".format(
-                hidden_states.size(), self.decoder.weight.size()))
+            f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}"
+        )
         hidden_states = self.decoder(hidden_states) + self.bias
         torch.cuda.nvtx.range_pop()
         return hidden_states
@@ -655,8 +652,7 @@ def __init__(self, config, bert_model_embedding_weights):
                                                 bert_model_embedding_weights)
 
     def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
+        return self.predictions(sequence_output)
 
 
 class BertOnlyNSPHead(nn.Module):
@@ -665,8 +661,7 @@ def __init__(self, config):
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
+        return self.seq_relationship(pooled_output)
 
 
 class BertPreTrainingHeads(nn.Module):
@@ -690,10 +685,8 @@ def __init__(self, config, *inputs, **kwargs):
         super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__))
+                f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
         self.config = config
 
     def init_bert_weights(self, module):
@@ -834,15 +827,15 @@ def load(module, prefix=''):
                 s.startswith('bert.') for s in state_dict.keys()):
             start_prefix = 'bert.'
         load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
+        if missing_keys:
             logger.info(
                 "Weights of {} not initialized from pretrained model: {}".
                 format(model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
+        if unexpected_keys:
             logger.info(
                 "Weights from pretrained model not used in {}: {}".format(
                     model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
+        if error_msgs:
             raise RuntimeError(
                 'Error(s) in loading state_dict for {}:\n\t{}'.format(
                     model.__class__.__name__, "\n\t".join(error_msgs)))
@@ -1016,20 +1009,15 @@ def forward(self, batch, log=True):
         prediction_scores, seq_relationship_score = self.cls(
             sequence_output, pooled_output)
 
-        if masked_lm_labels is not None and next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
-                prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))
-            total_loss = masked_lm_loss + next_sentence_loss
-            #            if log:
-            #                self.log_summary_writer(logs={'train_loss': total_loss.item()})
-            return total_loss
-        else:
+        if masked_lm_labels is None or next_sentence_label is None:
             return prediction_scores, seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        masked_lm_loss = loss_fct(
+            prediction_scores.view(-1, self.config.vocab_size),
+            masked_lm_labels.view(-1))
+        next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
+                                      next_sentence_label.view(-1))
+        return masked_lm_loss + next_sentence_loss
 
 
 class BertForMaskedLM(BertPreTrainedModel):
@@ -1089,10 +1077,10 @@ def forward(self,
 
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
+            return loss_fct(
                 prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            return masked_lm_loss
+                masked_lm_labels.view(-1),
+            )
         else:
             return prediction_scores
 
@@ -1152,13 +1140,12 @@ def forward(self,
                                      output_all_encoded_layers=False)
         seq_relationship_score = self.cls(pooled_output)
 
-        if next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            return next_sentence_loss
-        else:
+        if next_sentence_label is None:
             return seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        return loss_fct(
+            seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)
+        )
 
 
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -1222,8 +1209,7 @@ def forward(self,
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         else:
             return logits
 
@@ -1291,12 +1277,10 @@ def forward(self,
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, self.num_choices)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            return loss
-        else:
+        if labels is None:
             return reshaped_logits
+        loss_fct = CrossEntropyLoss()
+        return loss_fct(reshaped_logits, labels)
 
 
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1358,20 +1342,15 @@ def forward(self,
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels),
-                                labels.view(-1))
-            return loss
-        else:
+        if labels is None:
             return logits
+        loss_fct = CrossEntropyLoss()
+        if attention_mask is None:
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        active_loss = attention_mask.view(-1) == 1
+        active_logits = logits.view(-1, self.num_labels)[active_loss]
+        active_labels = labels.view(-1)[active_loss]
+        return loss_fct(active_logits, active_labels)
 
 
 class BertForQuestionAnswering(BertPreTrainedModel):
@@ -1439,21 +1418,19 @@ def forward(self,
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        else:
-            return start_logits, end_logits
+        if start_positions is None or end_positions is None:
+            return start_logits, end_logits
+        # If we are on multi-GPU, split add a dimension
+        if len(start_positions.size()) > 1:
+            start_positions = start_positions.squeeze(-1)
+        if len(end_positions.size()) > 1:
+            end_positions = end_positions.squeeze(-1)
+        # sometimes the start/end positions are outside our model inputs, we ignore these terms
+        ignored_index = start_logits.size(1)
+        start_positions.clamp_(0, ignored_index)
+        end_positions.clamp_(0, ignored_index)
+
+        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+        start_loss = loss_fct(start_logits, start_positions)
+        end_loss = loss_fct(end_logits, end_positions)
+        return (start_loss + end_loss) / 2