diff --git a/BingBertGlue/nvidia/modeling.py b/BingBertGlue/nvidia/modeling.py
index d887246cd..88b828605 100755
--- a/BingBertGlue/nvidia/modeling.py
+++ b/BingBertGlue/nvidia/modeling.py
@@ -81,13 +81,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    print(f"Converting TensorFlow checkpoint from {tf_path}")
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        print(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -97,7 +97,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            print(f'Skipping {"/".join(name)}')
             continue
         pointer = model
         for m_name in name:
@@ -105,12 +105,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
                 l = re.split(r'_(\d+)', m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if (
+                l[0] in ['kernel', 'gamma']
+                or l[0] not in ['output_bias', 'beta']
+                and l[0] == 'output_weights'
+            ):
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] in ['output_bias', 'beta']:
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -125,7 +127,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        print(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
 
@@ -207,8 +209,7 @@ def forward(self, input):
         return self.act_fn(F.linear(input, self.weight, self.bias))
 
     def extra_repr(self):
-        return 'in_features={}, out_features={}, bias={}'.format(
-            self.in_features, self.out_features, self.bias is not None)
+        return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
 
 
 class BertConfig(object):
@@ -294,8 +295,7 @@ def __repr__(self):
 
     def to_dict(self):
         """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+        return copy.deepcopy(self.__dict__)
 
     def to_json_string(self):
         """Serializes this instance to a JSON string."""
@@ -450,8 +450,7 @@ def __init__(self, config):
 
     def forward(self, input_tensor, attention_mask):
         self_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(self_output, input_tensor)
-        return attention_output
+        return self.output(self_output, input_tensor)
 
 
 class BertIntermediate(nn.Module):
@@ -490,8 +489,7 @@ def __init__(self, config):
     def forward(self, hidden_states, attention_mask):
         attention_output = self.attention(hidden_states, attention_mask)
         intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        return layer_output
+        return self.output(intermediate_output, attention_output)
 
 
 class BertEncoder(nn.Module):
@@ -606,8 +604,7 @@ def forward(self, hidden_states):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
         first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense_act(first_token_tensor)
-        return pooled_output
+        return self.dense_act(first_token_tensor)
 
 
 class BertPredictionHeadTransform(nn.Module):
@@ -641,8 +638,8 @@ def __init__(self, config, bert_model_embedding_weights):
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         torch.cuda.nvtx.range_push(
-            "decoder input.size() = {}, weight.size() = {}".format(
-                hidden_states.size(), self.decoder.weight.size()))
+            f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}"
+        )
         hidden_states = self.decoder(hidden_states) + self.bias
         torch.cuda.nvtx.range_pop()
         return hidden_states
@@ -655,8 +652,7 @@ def __init__(self, config, bert_model_embedding_weights):
             bert_model_embedding_weights)
 
     def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
+        return self.predictions(sequence_output)
 
 
 class BertOnlyNSPHead(nn.Module):
@@ -665,8 +661,7 @@ def __init__(self, config):
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
+        return self.seq_relationship(pooled_output)
 
 
 class BertPreTrainingHeads(nn.Module):
@@ -690,10 +685,8 @@ def __init__(self, config, *inputs, **kwargs):
         super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__))
+                f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
         self.config = config
 
     def init_bert_weights(self, module):
@@ -834,15 +827,15 @@ def load(module, prefix=''):
                 s.startswith('bert.') for s in state_dict.keys()):
             start_prefix = 'bert.'
         load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
+        if missing_keys:
             logger.info(
                 "Weights of {} not initialized from pretrained model: {}".
                 format(model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
+        if unexpected_keys:
             logger.info(
                 "Weights from pretrained model not used in {}: {}".format(
                     model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
+        if error_msgs:
             raise RuntimeError(
                 'Error(s) in loading state_dict for {}:\n\t{}'.format(
                     model.__class__.__name__, "\n\t".join(error_msgs)))
@@ -1016,20 +1009,15 @@ def forward(self, batch, log=True):
         prediction_scores, seq_relationship_score = self.cls(
             sequence_output, pooled_output)
 
-        if masked_lm_labels is not None and next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
-                prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))
-            total_loss = masked_lm_loss + next_sentence_loss
-            # if log:
-            #     self.log_summary_writer(logs={'train_loss': total_loss.item()})
-            return total_loss
-        else:
+        if masked_lm_labels is None or next_sentence_label is None:
             return prediction_scores, seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        masked_lm_loss = loss_fct(
+            prediction_scores.view(-1, self.config.vocab_size),
+            masked_lm_labels.view(-1))
+        next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
+                                      next_sentence_label.view(-1))
+        return masked_lm_loss + next_sentence_loss
 
 
 class BertForMaskedLM(BertPreTrainedModel):
@@ -1089,10 +1077,10 @@ def forward(self,
 
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
+            return loss_fct(
                 prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            return masked_lm_loss
+                masked_lm_labels.view(-1),
+            )
         else:
             return prediction_scores
 
@@ -1152,13 +1140,12 @@ def forward(self,
                                      output_all_encoded_layers=False)
         seq_relationship_score = self.cls(pooled_output)
 
-        if next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            return next_sentence_loss
-        else:
+        if next_sentence_label is None:
             return seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        return loss_fct(
+            seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)
+        )
 
 
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -1222,8 +1209,7 @@ def forward(self,
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         else:
             return logits
 
@@ -1291,12 +1277,10 @@ def forward(self,
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, self.num_choices)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            return loss
-        else:
+        if labels is None:
             return reshaped_logits
+        loss_fct = CrossEntropyLoss()
+        return loss_fct(reshaped_logits, labels)
 
 
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1358,20 +1342,15 @@ def forward(self,
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels),
-                                labels.view(-1))
-            return loss
-        else:
+        if labels is None:
             return logits
+        loss_fct = CrossEntropyLoss()
+        if attention_mask is None:
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        active_loss = attention_mask.view(-1) == 1
+        active_logits = logits.view(-1, self.num_labels)[active_loss]
+        active_labels = labels.view(-1)[active_loss]
+        return loss_fct(active_logits, active_labels)
 
 
 class BertForQuestionAnswering(BertPreTrainedModel):
@@ -1439,21 +1418,19 @@ def forward(self,
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        else:
-            return start_logits, end_logits
\ No newline at end of file
+        if start_positions is None or end_positions is None:
+            return start_logits, end_logits
+        # If we are on multi-GPU, split add a dimension
+        if len(start_positions.size()) > 1:
+            start_positions = start_positions.squeeze(-1)
+        if len(end_positions.size()) > 1:
+            end_positions = end_positions.squeeze(-1)
+        # sometimes the start/end positions are outside our model inputs, we ignore these terms
+        ignored_index = start_logits.size(1)
+        start_positions.clamp_(0, ignored_index)
+        end_positions.clamp_(0, ignored_index)
+
+        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+        start_loss = loss_fct(start_logits, start_positions)
+        end_loss = loss_fct(end_logits, end_positions)
+        return (start_loss + end_loss) / 2
\ No newline at end of file
diff --git a/BingBertGlue/nvidia/modelingpreln.py b/BingBertGlue/nvidia/modelingpreln.py
index 69c650b56..351c6c7a0 100755
--- a/BingBertGlue/nvidia/modelingpreln.py
+++ b/BingBertGlue/nvidia/modelingpreln.py
@@ -76,36 +76,35 @@ def get_deepspeed_config(args):
 
 
 def get_sparse_attention_config(args, num_heads):
-    if args.deepspeed_sparse_attention:
-        ds_config = get_deepspeed_config(args)
-        if hasattr(ds_config,
-                   'sparse_attention') and ds_config.sparse_attention:
-            sa_config = ds_config.sparse_attention
-            sa_mode = sa_config.get('mode')
-            if (sa_mode == 'dense'):
-                from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig
-            elif (sa_mode == 'fixed'):
-                from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
-            elif (sa_mode == 'bigbird'):
-                from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig
-            elif (sa_mode == 'bslongformer'):
-                from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig
-            elif (sa_mode == 'variable'):
-                from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig
-            else:
-                raise NotImplementedError(
-                    f'Given sparsity mode, {sa_mode}, has not been implemented yet!'
-                )
-            del sa_config['mode']
-            return STConfig(num_heads=num_heads, **sa_config)
-        else:
+    if not args.deepspeed_sparse_attention:
+        return None
+    ds_config = get_deepspeed_config(args)
+    if hasattr(ds_config,
+               'sparse_attention') and ds_config.sparse_attention:
+        sa_config = ds_config.sparse_attention
+        sa_mode = sa_config.get('mode')
+        if (sa_mode == 'dense'):
+            from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig
+        elif (sa_mode == 'fixed'):
             from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
-            print(
-                'deepspeed sparse attention is not set; Fixed sparsity is used as default.'
+        elif (sa_mode == 'bigbird'):
+            from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig
+        elif (sa_mode == 'bslongformer'):
+            from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig
+        elif (sa_mode == 'variable'):
+            from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig
+        else:
+            raise NotImplementedError(
+                f'Given sparsity mode, {sa_mode}, has not been implemented yet!'
             )
-            return STConfig(num_heads=num_heads)
+        del sa_config['mode']
+        return STConfig(num_heads=num_heads, **sa_config)
     else:
-        return None
+        from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
+        print(
+            'deepspeed sparse attention is not set; Fixed sparsity is used as default.'
+        )
+        return STConfig(num_heads=num_heads)
 
 def get_sparse_attention_utils(sparse_attention_config):
     if sparse_attention_config is not None:
@@ -127,13 +126,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    print(f"Converting TensorFlow checkpoint from {tf_path}")
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        print(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -143,7 +142,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            print(f'Skipping {"/".join(name)}')
             continue
         pointer = model
         for m_name in name:
@@ -151,12 +150,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
                 l = re.split(r'_(\d+)', m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if (
+                l[0] in ['kernel', 'gamma']
+                or l[0] not in ['output_bias', 'beta']
+                and l[0] == 'output_weights'
+            ):
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] in ['output_bias', 'beta']:
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -171,7 +172,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        print(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
 
@@ -256,8 +257,7 @@ def forward(self, input):
         return self.act_fn(F.linear(input, self.weight, self.bias))
 
     def extra_repr(self):
-        return 'in_features={}, out_features={}, bias={}'.format(
-            self.in_features, self.out_features, self.bias is not None)
+        return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
 
 
 class BertConfig(object):
@@ -344,8 +344,7 @@ def __repr__(self):
 
     def to_dict(self):
         """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+        return copy.deepcopy(self.__dict__)
 
     def to_json_string(self):
         """Serializes this instance to a JSON string."""
@@ -502,8 +501,7 @@ def __init__(self, config):
 
     def forward(self, input_tensor, attention_mask):
         self_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(self_output, input_tensor)
-        return attention_output
+        return self.output(self_output, input_tensor)
 
 
 class BertIntermediate(nn.Module):
@@ -565,7 +563,7 @@ def __init__(self, config, args, sparse_attention_config=None):
 
         if args.deepspeed_transformer_kernel and args.deepspeed_sparse_attention:
             raise NotImplementedError(
-                f'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!'
+                'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!'
             )
 
         if args.deepspeed_transformer_kernel:
@@ -688,8 +686,7 @@ def forward(self, hidden_states):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
         first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense_act(first_token_tensor)
-        return pooled_output
+        return self.dense_act(first_token_tensor)
 
 
 class BertPredictionHeadTransform(nn.Module):
@@ -729,8 +726,8 @@ def forward(self, hidden_states, masked_token_indexes):
                                                masked_token_indexes)
 
         torch.cuda.nvtx.range_push(
-            "decoder input.size() = {}, weight.size() = {}".format(
-                hidden_states.size(), self.decoder.weight.size()))
+            f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}"
+        )
         hidden_states = self.decoder(hidden_states) + self.bias
         torch.cuda.nvtx.range_pop()
         return hidden_states
@@ -743,8 +740,7 @@ def __init__(self, config, bert_model_embedding_weights):
             bert_model_embedding_weights)
 
     def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
+        return self.predictions(sequence_output)
 
 
 class BertOnlyNSPHead(nn.Module):
@@ -753,8 +749,7 @@ def __init__(self, config):
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
+        return self.seq_relationship(pooled_output)
 
 
 class BertPreTrainingHeads(nn.Module):
@@ -782,10 +777,8 @@ def __init__(self, config, *inputs, **kwargs):
         super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__))
+                f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
         self.config = config
 
     def init_bert_weights(self, module):
@@ -933,15 +926,15 @@ def load(module, prefix=''):
                 s.startswith('bert.') for s in state_dict.keys()):
             start_prefix = 'bert.'
         load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
+        if missing_keys:
             logger.info(
                 "Weights of {} not initialized from pretrained model: {}".
                 format(model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
+        if unexpected_keys:
             logger.info(
                 "Weights from pretrained model not used in {}: {}".format(
                     model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
+        if error_msgs:
             raise RuntimeError(
                 'Error(s) in loading state_dict for {}:\n\t{}'.format(
                     model.__class__.__name__, "\n\t".join(error_msgs)))
@@ -1153,8 +1146,7 @@ def forward(self, batch, log=True):
                 prediction_scores.view(-1, self.config.vocab_size), target)
             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                           next_sentence_label.view(-1))
-            total_loss = masked_lm_loss + next_sentence_loss
-            return total_loss
+            return masked_lm_loss + next_sentence_loss
         else:
             prediction_scores, seq_relationship_score = self.cls(
                 sequence_output, pooled_output)
@@ -1224,10 +1216,10 @@ def forward(self,
 
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
+            return loss_fct(
                 prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            return masked_lm_loss
+                masked_lm_labels.view(-1),
+            )
         else:
             return prediction_scores
 
@@ -1293,13 +1285,12 @@ def forward(self,
                                      output_all_encoded_layers=False)
         seq_relationship_score = self.cls(pooled_output)
 
-        if next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            return next_sentence_loss
-        else:
+        if next_sentence_label is None:
             return seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        return loss_fct(
+            seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)
+        )
 
 
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -1370,8 +1361,7 @@ def forward(self,
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         else:
             return logits
 
@@ -1445,12 +1435,10 @@ def forward(self,
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, self.num_choices)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            return loss
-        else:
+        if labels is None:
             return reshaped_logits
+        loss_fct = CrossEntropyLoss()
+        return loss_fct(reshaped_logits, labels)
 
 
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1519,20 +1507,15 @@ def forward(self,
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels),
-                                labels.view(-1))
-            return loss
-        else:
+        if labels is None:
             return logits
+        loss_fct = CrossEntropyLoss()
+        if attention_mask is None:
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        active_loss = attention_mask.view(-1) == 1
+        active_logits = logits.view(-1, self.num_labels)[active_loss]
+        active_labels = labels.view(-1)[active_loss]
+        return loss_fct(active_logits, active_labels)
 
 
 class BertForQuestionAnswering(BertPreTrainedModel):
@@ -1606,21 +1589,19 @@ def forward(self,
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        else:
+        if start_positions is None or end_positions is None:
             return start_logits, end_logits
+        # If we are on multi-GPU, split add a dimension
+        if len(start_positions.size()) > 1:
+            start_positions = start_positions.squeeze(-1)
+        if len(end_positions.size()) > 1:
+            end_positions = end_positions.squeeze(-1)
+        # sometimes the start/end positions are outside our model inputs, we ignore these terms
+        ignored_index = start_logits.size(1)
+        start_positions.clamp_(0, ignored_index)
+        end_positions.clamp_(0, ignored_index)
+
+        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+        start_loss = loss_fct(start_logits, start_positions)
+        end_loss = loss_fct(end_logits, end_positions)
+        return (start_loss + end_loss) / 2
diff --git a/BingBertGlue/nvidia/modelingpreln_layerdrop.py b/BingBertGlue/nvidia/modelingpreln_layerdrop.py
index b5beb89af..dc365ee8a 100755
--- a/BingBertGlue/nvidia/modelingpreln_layerdrop.py
+++ b/BingBertGlue/nvidia/modelingpreln_layerdrop.py
@@ -77,36 +77,35 @@ def get_deepspeed_config(args):
 
 
 def get_sparse_attention_config(args, num_heads):
-    if args.deepspeed_sparse_attention:
-        ds_config = get_deepspeed_config(args)
-        if hasattr(ds_config,
-                   'sparse_attention') and ds_config.sparse_attention:
-            sa_config = ds_config.sparse_attention
-            sa_mode = sa_config.get('mode')
-            if (sa_mode == 'dense'):
-                from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig
-            elif (sa_mode == 'fixed'):
-                from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
-            elif (sa_mode == 'bigbird'):
-                from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig
-            elif (sa_mode == 'bslongformer'):
-                from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig
-            elif (sa_mode == 'variable'):
-                from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig
-            else:
-                raise NotImplementedError(
-                    f'Given sparsity mode, {sa_mode}, has not been implemented yet!'
-                )
-            del sa_config['mode']
-            return STConfig(num_heads=num_heads, **sa_config)
-        else:
+    if not args.deepspeed_sparse_attention:
+        return None
+    ds_config = get_deepspeed_config(args)
+    if hasattr(ds_config,
+               'sparse_attention') and ds_config.sparse_attention:
+        sa_config = ds_config.sparse_attention
+        sa_mode = sa_config.get('mode')
+        if (sa_mode == 'dense'):
+            from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig
+        elif (sa_mode == 'fixed'):
             from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
-            print(
-                'deepspeed sparse attention is not set; Fixed sparsity is used as default.'
+        elif (sa_mode == 'bigbird'):
+            from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig
+        elif (sa_mode == 'bslongformer'):
+            from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig
+        elif (sa_mode == 'variable'):
+            from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig
+        else:
+            raise NotImplementedError(
+                f'Given sparsity mode, {sa_mode}, has not been implemented yet!'
             )
-            return STConfig(num_heads=num_heads)
+        del sa_config['mode']
+        return STConfig(num_heads=num_heads, **sa_config)
     else:
-        return None
+        from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
+        print(
+            'deepspeed sparse attention is not set; Fixed sparsity is used as default.'
+        )
+        return STConfig(num_heads=num_heads)
 
 
 def get_sparse_attention_utils(sparse_attention_config):
@@ -130,13 +129,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         )
         raise
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    print(f"Converting TensorFlow checkpoint from {tf_path}")
     # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        print(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
@@ -146,7 +145,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            print(f'Skipping {"/".join(name)}')
             continue
         pointer = model
         for m_name in name:
@@ -154,12 +153,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
                 l = re.split(r'_(\d+)', m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if (
+                l[0] in ['kernel', 'gamma']
+                or l[0] not in ['output_bias', 'beta']
+                and l[0] == 'output_weights'
+            ):
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] in ['output_bias', 'beta']:
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -174,7 +175,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        print(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
     return model
 
@@ -259,8 +260,7 @@ def forward(self, input):
         return self.act_fn(F.linear(input, self.weight, self.bias))
 
     def extra_repr(self):
-        return 'in_features={}, out_features={}, bias={}'.format(
-            self.in_features, self.out_features, self.bias is not None)
+        return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
 
 
 class BertConfig(object):
@@ -347,8 +347,7 @@ def __repr__(self):
 
     def to_dict(self):
         """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+        return copy.deepcopy(self.__dict__)
 
     def to_json_string(self):
         """Serializes this instance to a JSON string."""
@@ -505,8 +504,7 @@ def __init__(self, config):
 
     def forward(self, input_tensor, attention_mask):
         self_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(self_output, input_tensor)
-        return attention_output
+        return self.output(self_output, input_tensor)
 
 
 class BertIntermediate(nn.Module):
@@ -576,7 +574,7 @@ def __init__(self, config, args, sparse_attention_config=None):
 
         if args.deepspeed_transformer_kernel and args.deepspeed_sparse_attention:
             raise NotImplementedError(
-                f'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!'
+                'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!'
             )
 
         if args.deepspeed_transformer_kernel:
@@ -713,8 +711,7 @@ def forward(self, hidden_states):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
         first_token_tensor = hidden_states[:, 0]
-        pooled_output = self.dense_act(first_token_tensor)
-        return pooled_output
+        return self.dense_act(first_token_tensor)
 
 
 class BertPredictionHeadTransform(nn.Module):
@@ -754,8 +751,8 @@ def forward(self, hidden_states, masked_token_indexes):
                                                masked_token_indexes)
 
         torch.cuda.nvtx.range_push(
-            "decoder input.size() = {}, weight.size() = {}".format(
-                hidden_states.size(), self.decoder.weight.size()))
+            f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}"
+        )
         hidden_states = self.decoder(hidden_states) + self.bias
         torch.cuda.nvtx.range_pop()
         return hidden_states
@@ -768,8 +765,7 @@ def __init__(self, config, bert_model_embedding_weights):
             bert_model_embedding_weights)
 
     def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
+        return self.predictions(sequence_output)
 
 
 class BertOnlyNSPHead(nn.Module):
@@ -778,8 +774,7 @@ def __init__(self, config):
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
+        return self.seq_relationship(pooled_output)
 
 
 class BertPreTrainingHeads(nn.Module):
@@ -807,10 +802,8 @@ def __init__(self, config, *inputs, **kwargs):
         super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__))
+                f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
         self.config = config
 
     def init_bert_weights(self, module):
@@ -958,15 +951,15 @@ def load(module, prefix=''):
                 s.startswith('bert.') for s in state_dict.keys()):
             start_prefix = 'bert.'
         load(model, prefix=start_prefix)
-        if len(missing_keys) > 0:
+        if missing_keys:
             logger.info(
                 "Weights of {} not initialized from pretrained model: {}".
                 format(model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
+        if unexpected_keys:
             logger.info(
                 "Weights from pretrained model not used in {}: {}".format(
                     model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
+        if error_msgs:
             raise RuntimeError(
                 'Error(s) in loading state_dict for {}:\n\t{}'.format(
                     model.__class__.__name__, "\n\t".join(error_msgs)))
@@ -1189,8 +1182,7 @@ def forward(self, batch, **kwargs):
                 prediction_scores.view(-1, self.config.vocab_size), target)
             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                           next_sentence_label.view(-1))
-            total_loss = masked_lm_loss + next_sentence_loss
-            return total_loss
+            return masked_lm_loss + next_sentence_loss
         else:
             prediction_scores, seq_relationship_score = self.cls(
                 sequence_output, pooled_output)
@@ -1260,10 +1252,10 @@ def forward(self,
 
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
+            return loss_fct(
                 prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            return masked_lm_loss
+                masked_lm_labels.view(-1),
+            )
         else:
             return prediction_scores
 
@@ -1329,13 +1321,12 @@ def forward(self,
                                      output_all_encoded_layers=False)
         seq_relationship_score = self.cls(pooled_output)
 
-        if next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            return next_sentence_loss
-        else:
+        if next_sentence_label is None:
             return seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        return loss_fct(
+            seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)
+        )
 
 
 class BertForSequenceClassification(BertPreTrainedModel):
@@ -1406,8 +1397,7 @@ def forward(self,
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         else:
             return logits
 
@@ -1481,12 +1471,10 @@ def forward(self,
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, self.num_choices)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            return loss
-        else:
+        if labels is None:
             return reshaped_logits
+        loss_fct = CrossEntropyLoss()
+        return loss_fct(reshaped_logits, labels)
 
 
 class BertForTokenClassification(BertPreTrainedModel):
@@ -1555,20 +1543,15 @@ def forward(self,
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels),
-                                labels.view(-1))
-            return loss
-        else:
+        if labels is None:
             return logits
+        loss_fct = CrossEntropyLoss()
+        if attention_mask is None:
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+        active_loss = attention_mask.view(-1) == 1
+        active_logits = logits.view(-1, self.num_labels)[active_loss]
+        active_labels = labels.view(-1)[active_loss]
+        return loss_fct(active_logits, active_labels)
 
 
 class BertForQuestionAnswering(BertPreTrainedModel):
@@ -1642,21 +1625,19 @@ def forward(self,
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        else:
+        if start_positions is None or end_positions is None:
             return start_logits, end_logits
+        # If we are on multi-GPU, split add a dimension
+        if len(start_positions.size()) > 1:
+            start_positions = start_positions.squeeze(-1)
+        if len(end_positions.size()) > 1:
+            end_positions = end_positions.squeeze(-1)
+        # sometimes the start/end positions are outside our model inputs, we ignore these terms
+        ignored_index = start_logits.size(1)
+        start_positions.clamp_(0, ignored_index)
+        end_positions.clamp_(0, ignored_index)
+
+        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+        start_loss = loss_fct(start_logits, start_positions)
+        end_loss = loss_fct(end_logits, end_positions)
+        return (start_loss + end_loss) / 2
diff --git a/BingBertGlue/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/BingBertGlue/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
index 7fa79ebb5..6374bcbd7 100755
--- a/BingBertGlue/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/BingBertGlue/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -32,21 +32,22 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                      pytorch_dump_path):
     config_path = os.path.abspath(bert_config_file)
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {} with config at {}".format(
-        tf_path, config_path))
+    print(
+        f"Converting TensorFlow checkpoint from {tf_path} with config at {config_path}"
+    )
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
     for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
+        print(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
 
     # Initialise PyTorch model
     config = BertConfig.from_json_file(bert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
+    print(f"Building PyTorch model from configuration: {str(config)}")
     model = BertForPreTraining(config)
 
     for name, array in zip(names, arrays):
@@ -54,7 +55,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
-            print("Skipping {}".format("/".join(name)))
+            print(f'Skipping {"/".join(name)}')
             continue
         pointer = model
         for m_name in name:
@@ -62,12 +63,14 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                 l = re.split(r'_(\d+)', m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if (
+                l[0] in ['kernel', 'gamma']
+                or l[0] not in ['output_bias', 'beta']
+                and l[0] == 'output_weights'
+            ):
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] in ['output_bias', 'beta']:
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
@@ -82,11 +85,11 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
         except AssertionError as e:
             e.args += (pointer.shape, array.shape)
             raise
-        print("Initialize PyTorch weight {}".format(name))
+        print(f"Initialize PyTorch weight {name}")
         pointer.data = torch.from_numpy(array)
 
     # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    print(f"Save PyTorch model to {pytorch_dump_path}")
     torch.save(model.state_dict(), pytorch_dump_path)
 
diff --git a/BingBertGlue/pytorch_pretrained_bert/file_utils.py b/BingBertGlue/pytorch_pretrained_bert/file_utils.py
index 3fb6f93a2..c4a474212 100755
--- a/BingBertGlue/pytorch_pretrained_bert/file_utils.py
+++ b/BingBertGlue/pytorch_pretrained_bert/file_utils.py
@@ -41,7 +41,7 @@ def url_to_filename(url: str, etag: str = None) -> str:
     if etag:
         etag_bytes = etag.encode('utf-8')
         etag_hash = sha256(etag_bytes)
-        filename += '.' + etag_hash.hexdigest()
+        filename += f'.{etag_hash.hexdigest()}'
 
     return filename
 
@@ -59,11 +59,11 @@ def filename_to_url(filename: str,
 
     cache_path = os.path.join(cache_dir, filename)
     if not os.path.exists(cache_path):
-        raise FileNotFoundError("file {} not found".format(cache_path))
+        raise FileNotFoundError(f"file {cache_path} not found")
 
-    meta_path = cache_path + '.json'
+    meta_path = f'{cache_path}.json'
     if not os.path.exists(meta_path):
-        raise FileNotFoundError("file {} not found".format(meta_path))
+        raise FileNotFoundError(f"file {meta_path} not found")
 
     with open(meta_path) as meta_file:
         metadata = json.load(meta_file)
@@ -98,19 +98,19 @@ def cached_path(url_or_filename: Union[str, Path],
         return url_or_filename
     elif parsed.scheme == '':
         # File, but it doesn't exist.
-        raise FileNotFoundError("file {} not found".format(url_or_filename))
+        raise FileNotFoundError(f"file {url_or_filename} not found")
     else:
         # Something unknown
         raise ValueError(
-            "unable to parse {} as a URL or as a local path".format(
-                url_or_filename))
+            f"unable to parse {url_or_filename} as a URL or as a local path"
+        )
 
 
 def split_s3_path(url: str) -> Tuple[str, str]:
     """Split a full s3 path into the bucket name and path."""
     parsed = urlparse(url)
     if not parsed.netloc or not parsed.path:
-        raise ValueError("bad s3 path {}".format(url))
+        raise ValueError(f"bad s3 path {url}")
     bucket_name = parsed.netloc
     s3_path = parsed.path
     # Remove '/' at beginning of path.
@@ -185,8 +185,8 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
     response = requests.head(url, allow_redirects=True)
     if response.status_code != 200:
         raise IOError(
-            "HEAD request failed for url {} with status code {}".format(
-                url, response.status_code))
+            f"HEAD request failed for url {url} with status code {response.status_code}"
+        )
     etag = response.headers.get("ETag")
 
     filename = url_to_filename(url, etag)
@@ -219,7 +219,7 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
 
             logger.info("creating metadata file for %s", cache_path)
             meta = {'url': url, 'etag': etag}
-            meta_path = cache_path + '.json'
+            meta_path = f'{cache_path}.json'
             with open(meta_path, 'w') as meta_file:
                 json.dump(meta, meta_file)
 
diff --git a/BingBertGlue/pytorch_pretrained_bert/modeling.py b/BingBertGlue/pytorch_pretrained_bert/modeling.py
index 33553499d..58071693a 100755
--- a/BingBertGlue/pytorch_pretrained_bert/modeling.py
+++ b/BingBertGlue/pytorch_pretrained_bert/modeling.py
@@ -158,8 +158,7 @@ def __repr__(self):
 
     def to_dict(self):
         """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+        return copy.deepcopy(self.__dict__)
 
     def to_json_string(self):
         """Serializes this instance to a JSON string."""
@@ -306,8 +305,7 @@ def __init__(self, config):
 
     def forward(self, input_tensor, attention_mask):
         self_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(self_output, input_tensor)
-        return attention_output
+        return self.output(self_output, input_tensor)
 
 
 class BertIntermediate(nn.Module):
@@ -447,8 +445,7 @@ def __init__(self, config, bert_model_embedding_weights):
             bert_model_embedding_weights)
 
     def forward(self, sequence_output):
-        prediction_scores = self.predictions(sequence_output)
-        return prediction_scores
+        return self.predictions(sequence_output)
 
 
 class BertOnlyNSPHead(nn.Module):
@@ -457,8 +454,7 @@ def __init__(self, config):
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 
     def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
+        return self.seq_relationship(pooled_output)
 
 
 class BertPreTrainingHeads(nn.Module):
@@ -482,10 +478,8 @@ def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedBertModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__))
+                f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
         self.config = config
 
     def init_bert_weights(self, module):
@@ -615,11 +609,11 @@ def load(module, prefix=''):
                     load(child, prefix + name + '.')
 
         load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
-        if len(missing_keys) > 0:
+        if missing_keys:
            logger.info(
                 "Weights of {} not initialized from pretrained model: {}".
                 format(model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
+        if unexpected_keys:
             logger.info(
                 "Weights from pretrained model not used in {}: {}".format(
                     model.__class__.__name__, unexpected_keys))
@@ -790,17 +784,15 @@ def forward(self,
         prediction_scores, seq_relationship_score = self.cls(
             sequence_output, pooled_output)
 
-        if masked_lm_labels is not None and next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
-                prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            total_loss = masked_lm_loss + next_sentence_loss
-            return total_loss
-        else:
+        if masked_lm_labels is None or next_sentence_label is None:
             return prediction_scores, seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        masked_lm_loss = loss_fct(
+            prediction_scores.view(-1, self.config.vocab_size),
+            masked_lm_labels.view(-1))
+        next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
+                                      next_sentence_label.view(-1))
+        return masked_lm_loss + next_sentence_loss
 
 
 class BertForMaskedLM(PreTrainedBertModel):
@@ -865,10 +857,10 @@ def forward(self,
 
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            masked_lm_loss = loss_fct(
+            return loss_fct(
                 prediction_scores.view(-1, self.config.vocab_size),
-                masked_lm_labels.view(-1))
-            return masked_lm_loss
+                masked_lm_labels.view(-1),
+            )
         else:
             return prediction_scores
 
@@ -933,13 +925,12 @@ def forward(self,
                                      output_all_encoded_layers=False)
         seq_relationship_score = self.cls(pooled_output)
 
-        if next_sentence_label is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
-                                          next_sentence_label.view(-1))
-            return next_sentence_loss
-        else:
+        if next_sentence_label is None:
             return seq_relationship_score
+        loss_fct = CrossEntropyLoss(ignore_index=-1)
+        return loss_fct(
+            seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)
+        )
 
 
 class BertForSequenceClassification(PreTrainedBertModel):
@@ -1009,8 +1000,7 @@ def forward(self,
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         else:
             return logits
 
@@ -1083,12 +1073,10 @@ def forward(self,
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, self.num_choices)
 
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(reshaped_logits, labels)
-            return loss
-        else:
+        if labels is None:
             return reshaped_logits
+        loss_fct = CrossEntropyLoss()
+        return loss_fct(reshaped_logits, labels)
 
 
 class BertForTokenClassification(PreTrainedBertModel):
@@ -1158,8 +1146,7 @@ def forward(self,
 
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            return loss
+            return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
         else:
             return logits
 
@@ -1234,21 +1221,19 @@ def forward(self,
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)
 
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            return total_loss
-        else:
+        if start_positions is None or end_positions is None:
             return start_logits, end_logits
+        # If we are on multi-GPU, split add a dimension
+        if len(start_positions.size()) > 1:
+            start_positions = start_positions.squeeze(-1)
+        if len(end_positions.size()) > 1:
+            end_positions = end_positions.squeeze(-1)
+        # sometimes the start/end positions are outside our model inputs, we ignore these terms
+        ignored_index = start_logits.size(1)
+        start_positions.clamp_(0, ignored_index)
+        end_positions.clamp_(0, ignored_index)
+
+        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+        start_loss = loss_fct(start_logits, start_positions)
+        end_loss = loss_fct(end_logits, end_positions)
+        return (start_loss + end_loss) / 2
diff --git a/BingBertGlue/pytorch_pretrained_bert/optimization.py b/BingBertGlue/pytorch_pretrained_bert/optimization.py
index cbf7e34df..b25ea610d 100755
--- a/BingBertGlue/pytorch_pretrained_bert/optimization.py
+++ b/BingBertGlue/pytorch_pretrained_bert/optimization.py
@@ -22,15 +22,11 @@
 
 
 def warmup_cosine(x, warmup=0.002):
-    if x < warmup:
-        return x / warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+    return x / warmup if x < warmup else 0.5 * (1.0 + torch.cos(math.pi * x))
 
 
 def warmup_constant(x, warmup=0.002):
-    if x < warmup:
-        return x / warmup
-    return 1.0
+    return x / warmup if x < warmup else 1.0
 
 
 def warmup_linear(x, warmup=0.002):
@@ -76,9 +72,7 @@ def warmup_exp_decay_poly(global_step,
                           warm_degree=1.5,
                           degree=2.0):
     x = global_step / total_steps
-    if x < warmup:
-        return (x / warmup)**warm_degree
-    return (1.0 - x)**degree
+    return (x / warmup)**warm_degree if x < warmup else (1.0 - x)**degree
 
 
 SCHEDULES = {
@@ -117,25 +111,17 @@ def __init__(self,
                  weight_decay=0.01,
                  max_grad_norm=1.0):
         if lr is not required and lr < 0.0:
-            raise ValueError(
-                "Invalid learning rate: {} - should be >= 0.0".format(lr))
+            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
         if schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError(
-                "Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(
-                    warmup))
+            raise ValueError(f"Invalid schedule parameter: {schedule}")
+        if not 0.0 <= warmup < 1.0 and warmup != -1:
+            raise ValueError(f"Invalid warmup: {warmup} - should be in [0.0, 1.0[ or -1")
         if not 0.0 <= b1 < 1.0:
-            raise ValueError(
-                "Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(
-                    b1))
+            raise ValueError(f"Invalid b1 parameter: {b1} - should be in [0.0, 1.0[")
         if not 0.0 <= b2 < 1.0:
-            raise ValueError(
-                "Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(
-                    b2))
-        if not e >= 0.0:
-            raise ValueError(
-                "Invalid epsilon value: {} - should be >= 0.0".format(e))
+            raise ValueError(f"Invalid b2 parameter: {b2} - should be in [0.0, 1.0[")
+        if e < 0.0:
+            raise ValueError(f"Invalid epsilon value: {e} - should be >= 0.0")
         defaults = dict(lr=lr,
                         schedule=schedule,
                         warmup=warmup,
@@ -170,10 +156,7 @@ def step(self, closure=None):
             closure (callable, optional): A closure that reevaluates the model
                 and returns the loss.
""" - loss = None - if closure is not None: - loss = closure() - + loss = closure() if closure is not None else None for group in self.param_groups: for p in group['params']: if p.grad is None: diff --git a/BingBertGlue/pytorch_pretrained_bert/tokenization.py b/BingBertGlue/pytorch_pretrained_bert/tokenization.py index cdd1c7cc2..ee38770bf 100755 --- a/BingBertGlue/pytorch_pretrained_bert/tokenization.py +++ b/BingBertGlue/pytorch_pretrained_bert/tokenization.py @@ -73,10 +73,7 @@ def load_vocab(vocab_file): def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a peice of text.""" text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens + return [] if not text else text.split() class BertTokenizer(object): @@ -88,9 +85,8 @@ def __init__(self, never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): if not os.path.isfile(vocab_file): raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - .format(vocab_file)) + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([ (ids, tok) for tok, ids in self.vocab.items() @@ -103,29 +99,21 @@ def __init__(self, def tokenize(self, text): split_tokens = [] for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) + split_tokens.extend(iter(self.wordpiece_tokenizer.tokenize(token))) return split_tokens def convert_tokens_to_ids(self, tokens): """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) + ids = [self.vocab[token] for token in tokens] if len(ids) > self.max_len: raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format( - len(ids), self.max_len)) + f"Token indices sequence length is longer than the specified maximum sequence length for this BERT model ({len(ids)} > {self.max_len}). Running this sequence through BERT will result in indexing errors" + ) return ids def convert_ids_to_tokens(self, ids): """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens + return [self.ids_to_tokens[i] for i in ids] @classmethod def from_pretrained(cls, @@ -148,27 +136,22 @@ def from_pretrained(cls, resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except FileNotFoundError: logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) + f"Model name '{pretrained_model_name}' was not found in model name list ({', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())}). We assumed '{vocab_file}' was a path or url but couldn't find any file associated to this path or url." 
+ ) return None if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info(f"loading vocabulary file {vocab_file}") else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) + logger.info( + f"loading vocabulary file {vocab_file} from cache at {resolved_vocab_file}" + ) if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: # if we're using a pretrained model, ensure the tokenizer wont index sequences longer # than the number of positional embeddings max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ pretrained_model_name] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer + return cls(resolved_vocab_file, *inputs, **kwargs) class BasicTokenizer(object): @@ -202,8 +185,7 @@ def tokenize(self, text): token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens + return whitespace_tokenize(" ".join(split_tokens)) def _run_strip_accents(self, text): """Strips accents from a piece of text.""" @@ -244,9 +226,7 @@ def _tokenize_chinese_chars(self, text): for char in text: cp = ord(char) if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") + output.extend((" ", char, " ")) else: output.append(char) return "".join(output) @@ -261,17 +241,16 @@ def _is_chinese_char(self, cp): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False + return ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ) def _clean_text(self, text): """Performs invalid character removal and whitespace cleanup on text.""" @@ -328,7 +307,7 @@ def tokenize(self, text): while start < end: substr = "".join(chars[start:end]) if start > 0: - substr = "##" + substr + substr = f"##{substr}" if substr in self.vocab: cur_substr = substr break @@ -350,24 +329,20 @@ def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically contorl characters but we treat them # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": + if char in [" ", "\t", "\n", "\r"]: return True cat = unicodedata.category(char) - if cat == "Zs": - return True - return False + return cat == "Zs" def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. 
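The f"##{substr}" change above sits at the heart of WordPiece: tokenize() greedily takes the longest vocabulary match, marking word-internal pieces with a ## prefix and falling back to [UNK] when nothing matches. A self-contained sketch of that loop, with a toy vocabulary (hypothetical; the real one is loaded from the vocab file):

    def wordpiece(token, vocab, unk="[UNK]"):
        # Greedy longest-match-first, as in WordpieceTokenizer.tokenize() above.
        chars = list(token)
        pieces, start = [], 0
        while start < len(chars):
            end, cur = len(chars), None
            while start < end:
                substr = "".join(chars[start:end])
                if start > 0:
                    substr = f"##{substr}"  # word-internal pieces carry the ## marker
                if substr in vocab:
                    cur = substr
                    break
                end -= 1
            if cur is None:
                return [unk]  # no prefix matched: the whole token becomes [UNK]
            pieces.append(cur)
            start = end
        return pieces

    vocab = {"un", "##aff", "##able"}
    print(wordpiece("unaffable", vocab))  # ['un', '##aff', '##able']
    print(wordpiece("xyzzy", vocab))      # ['[UNK]']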
- if char == "\t" or char == "\n" or char == "\r": + if char in ["\t", "\n", "\r"]: return False cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False + return bool(cat.startswith("C")) def _is_punctuation(char): @@ -381,6 +356,4 @@ def _is_punctuation(char): or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): return True cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False + return bool(cat.startswith("P")) diff --git a/BingBertGlue/run_glue_classifier_bert_base.py b/BingBertGlue/run_glue_classifier_bert_base.py index 08409ae60..e6d122f5b 100755 --- a/BingBertGlue/run_glue_classifier_bert_base.py +++ b/BingBertGlue/run_glue_classifier_bert_base.py @@ -101,7 +101,7 @@ def _read_tsv(cls, input_file, quotechar=None): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = [unicode(cell, 'utf-8') for cell in line] lines.append(line) return lines @@ -111,8 +111,7 @@ class MrpcProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - logger.info("LOOKING AT {}".format( - os.path.join(data_dir, "train.tsv"))) + logger.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') return self._create_examples( self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") @@ -131,7 +130,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, i) + guid = f"{set_type}-{i}" text_a = line[3] text_b = line[4] label = line[0] @@ -164,7 +163,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[8] text_b = line[9] label = line[-1] @@ -204,7 +203,7 @@ def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) + guid = f"{set_type}-{i}" text_a = line[3] label = line[1] examples.append( @@ -235,7 +234,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, i) + guid = f"{set_type}-{i}" text_a = line[0] label = line[1] examples.append( @@ -266,7 +265,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[7] text_b = line[8] label = line[-1] @@ -298,7 +297,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" try: text_a = line[3] text_b = line[4] @@ -334,7 +333,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[1] text_b = line[2] label = line[-1] @@ -366,7 +365,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[1] text_b = line[2] label = line[-1] @@ -398,7 +397,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[1] text_b = line[2] label = line[-1] @@ -427,10 +426,8 
@@ def convert_examples_to_features(examples, label_list, max_seq_length, # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] + elif len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: @@ -482,15 +479,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, if ex_index < 5: logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) + logger.info(f"guid: {example.guid}") logger.info("tokens: %s" % " ".join( [str(x) for x in tokens])) - logger.info("input_ids: %s" % - " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % - " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info(f'input_ids: {" ".join([str(x) for x in input_ids])}') + logger.info(f'input_mask: {" ".join([str(x) for x in input_mask])}') + logger.info(f'segment_ids: {" ".join([str(x) for x in segment_ids])}') logger.info("label: %s (id = %d)" % (example.label, label_id)) features.append( diff --git a/BingBertGlue/run_glue_classifier_bert_large.py b/BingBertGlue/run_glue_classifier_bert_large.py index 2f37d1c9c..427778c70 100755 --- a/BingBertGlue/run_glue_classifier_bert_large.py +++ b/BingBertGlue/run_glue_classifier_bert_large.py @@ -56,15 +56,12 @@ def checkpoint_model(PATH, ckpt_id, model, epoch, last_global_step, checkpoint_state_dict = { 'epoch': epoch, 'last_global_step': last_global_step, - 'last_global_data_samples': last_global_data_samples - } - # Add extra kwargs too - checkpoint_state_dict.update(kwargs) - + 'last_global_data_samples': last_global_data_samples, + **kwargs,  # merge the extra fields here; dict union (|) would require Python 3.9+ + } #success = model.network.save_checkpoint(PATH, ckpt_id, success = model.save_checkpoint(PATH, ckpt_id, checkpoint_state_dict) - status_msg = 'checkpointing: PATH={}, ckpt_id={}'.format(PATH, ckpt_id) + status_msg = f'checkpointing: PATH={PATH}, ckpt_id={ckpt_id}' if success: logging.info(f"Success {status_msg}") else: @@ -129,7 +126,7 @@ def _read_tsv(cls, input_file, quotechar=None): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = [unicode(cell, 'utf-8') for cell in line] lines.append(line) return lines @@ -158,7 +155,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, i) + guid = f"{set_type}-{i}" text_a = line[3] text_b = line[4] label = line[0] @@ -193,7 +190,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[8] text_b = line[9] label = line[-1] @@ -234,7 +231,7 @@ def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) + guid = f"{set_type}-{i}" text_a = line[3] label = line[1] examples.append( @@ -267,7 +264,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, i) + guid = f"{set_type}-{i}" text_a = line[0] label = line[1] examples.append( @@ -300,7 +297,7 @@ def _create_examples(self, lines, set_type): for
(i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[7] text_b = line[8] label = line[-1] @@ -334,7 +331,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" try: text_a = line[3] text_b = line[4] @@ -371,7 +368,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[1] text_b = line[2] label = line[-1] @@ -405,7 +402,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[1] text_b = line[2] label = line[-1] @@ -439,7 +436,7 @@ def _create_examples(self, lines, set_type): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % (set_type, line[0]) + guid = f"{set_type}-{line[0]}" text_a = line[1] text_b = line[2] label = line[-1] @@ -471,10 +468,8 @@ def convert_examples_to_features(examples, label_list, max_seq_length, # length is less than the specified length. # Account for [CLS], [SEP], [SEP] with "- 3" _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[:(max_seq_length - 2)] + elif len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: @@ -526,14 +521,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, if ex_index < 5: logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) + logger.info(f"guid: {example.guid}") + logger.info(f'tokens: {" ".join([str(x) for x in tokens])}') logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % - " ".join([str(x) for x in input_mask])) - logger.info("segment_ids: %s" % - " ".join([str(x) for x in segment_ids])) + logger.info(f'input_mask: {" ".join([str(x) for x in input_mask])}') + logger.info(f'segment_ids: {" ".join([str(x) for x in segment_ids])}') logger.info("label: %s (id = %d)" % (example.label, label_id)) features.append( diff --git a/BingBertGlue/turing/dataset.py b/BingBertGlue/turing/dataset.py index c1758c54e..2a699111c 100755 --- a/BingBertGlue/turing/dataset.py +++ b/BingBertGlue/turing/dataset.py @@ -153,21 +153,23 @@ def __getitem__(self, index): passage_tokens = self.tokenizer.tokenize(passage) if (len(query_tokens) > self.max_seq_len // 2): - query_tokens = query_tokens[0:self.max_seq_len // 2] + query_tokens = query_tokens[:self.max_seq_len // 2] max_passage_tokens = self.max_seq_len - \ - len(query_tokens) - 3 # Removing 3 for SEP and CLS + len(query_tokens) - 3 # Removing 3 for SEP and CLS if (len(passage_tokens) > max_passage_tokens): - passage_tokens = passage_tokens[0:max_passage_tokens] + passage_tokens = passage_tokens[:max_passage_tokens] input_ids, input_mask, sequence_ids = encode_sequence( query_tokens, passage_tokens, self.max_seq_len, self.tokenizer) - return tuple([ - map_to_torch([BatchType.QP_BATCH]), input_ids, input_mask, + return ( + map_to_torch([BatchType.QP_BATCH]), + input_ids, + input_mask, sequence_ids, - map_to_torch_float([label]) - ]) + map_to_torch_float([label]), + 
) # return QABatch(input_ids=input_ids, input_mask=input_mask, sequence_ids=sequence_ids, label=map_to_torch([label])) @@ -232,21 +234,23 @@ def __getitem__(self, index): # instance_tokens = self.tokenizer.tokenize(instance) if (len(query_tokens) > self.max_seq_len // 2): - query_tokens = query_tokens[0:self.max_seq_len // 2] + query_tokens = query_tokens[:self.max_seq_len // 2] max_instance_tokens = self.max_seq_len - \ - len(query_tokens) - 3 # Removing 3 for SEP and CLS + len(query_tokens) - 3 # Removing 3 for SEP and CLS if (len(instance_tokens) > max_instance_tokens): - instance_tokens = instance_tokens[0:max_instance_tokens] + instance_tokens = instance_tokens[:max_instance_tokens] input_ids, input_mask, sequence_ids = encode_sequence( query_tokens, instance_tokens, self.max_seq_len, self.tokenizer) - return tuple([ - map_to_torch([BatchType.RANKING_BATCH]), input_ids, input_mask, + return ( + map_to_torch([BatchType.RANKING_BATCH]), + input_ids, + input_mask, sequence_ids, - map_to_torch_float([label]) - ]) + map_to_torch_float([label]), + ) class PreTrainingDataset(Dataset): @@ -300,12 +304,8 @@ def __getitem__(self, index): def create_training_instance(self, instance: TokenInstance): tokens_a, tokens_b, is_next = instance.get_values() - # print(f'is_next label:{is_next}') - # Create mapper - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) + tokens = ["[CLS]"] + segment_ids = [0] for token in tokens_a: tokens.append(token) segment_ids.append(0) @@ -342,12 +342,9 @@ def create_training_instance(self, instance: TokenInstance): ]) def create_masked_lm_predictions(self, tokens): - cand_indexes = [] - for i, token in enumerate(tokens): - if token == "[CLS]" or token == "[SEP]": - continue - cand_indexes.append(i) - + cand_indexes = [ + i for i, token in enumerate(tokens) if token not in ["[CLS]", "[SEP]"] + ] random.shuffle(cand_indexes) output_tokens = list(tokens) @@ -368,15 +365,12 @@ def create_masked_lm_predictions(self, tokens): # 80% mask if random.random() < 0.8: masked_token = "[MASK]" + elif random.random() < 0.5: + masked_token = tokens[index] else: - # 10% Keep Original - if random.random() < 0.5: - masked_token = tokens[index] - # 10% replace w/ random word - else: - masked_token = self.vocab_words[random.randint( - 0, - len(self.vocab_words) - 1)] + masked_token = self.vocab_words[random.randint( + 0, + len(self.vocab_words) - 1)] output_tokens[index] = masked_token masked_lms.append( diff --git a/BingBertGlue/turing/file_utils.py b/BingBertGlue/turing/file_utils.py index a1396decf..4e8ab0000 100755 --- a/BingBertGlue/turing/file_utils.py +++ b/BingBertGlue/turing/file_utils.py @@ -52,7 +52,7 @@ def url_to_filename(url, etag=None): if etag: etag_bytes = etag.encode('utf-8') etag_hash = sha256(etag_bytes) - filename += '.' 
+ etag_hash.hexdigest() + filename += f'.{etag_hash.hexdigest()}' return filename @@ -69,11 +69,11 @@ def filename_to_url(filename, cache_dir=None): cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) + raise EnvironmentError(f"file {cache_path} not found") - meta_path = cache_path + '.json' + meta_path = f'{cache_path}.json' if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) + raise EnvironmentError(f"file {meta_path} not found") with open(meta_path, encoding="utf-8") as meta_file: metadata = json.load(meta_file) @@ -107,19 +107,19 @@ def cached_path(url_or_filename, cache_dir=None): return url_or_filename elif parsed.scheme == '': # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) + raise EnvironmentError(f"file {url_or_filename} not found") else: # Something unknown raise ValueError( - "unable to parse {} as a URL or as a local path".format( - url_or_filename)) + f"unable to parse {url_or_filename} as a URL or as a local path" + ) def split_s3_path(url): """Split a full s3 path into the bucket name and path.""" parsed = urlparse(url) if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) + raise ValueError(f"bad s3 path {url}") bucket_name = parsed.netloc s3_path = parsed.path # Remove '/' at beginning of path. @@ -195,8 +195,8 @@ def get_from_cache(url, cache_dir=None): response = requests.head(url, allow_redirects=True) if response.status_code != 200: raise IOError( - "HEAD request failed for url {} with status code {}".format( - url, response.status_code)) + f"HEAD request failed for url {url} with status code {response.status_code}" + ) etag = response.headers.get("ETag") filename = url_to_filename(url, etag) @@ -229,7 +229,7 @@ def get_from_cache(url, cache_dir=None): logger.info("creating metadata file for %s", cache_path) meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' + meta_path = f'{cache_path}.json' with open(meta_path, 'w', encoding="utf-8") as meta_file: json.dump(meta, meta_file) diff --git a/BingBertGlue/turing/loss.py b/BingBertGlue/turing/loss.py index c6bfe8b15..d711ccc11 100755 --- a/BingBertGlue/turing/loss.py +++ b/BingBertGlue/turing/loss.py @@ -25,10 +25,7 @@ def __init__(self, class_num, alpha=None, gamma=2, size_average=True): if alpha is None: self.alpha = torch.ones(class_num, 1) else: - if isinstance(alpha, Variable): - self.alpha = alpha - else: - self.alpha = Variable(alpha) + self.alpha = alpha if isinstance(alpha, Variable) else Variable(alpha) self.gamma = gamma self.class_num = class_num self.size_average = size_average @@ -53,8 +50,4 @@ def forward(self, inputs, targets): batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p - if self.size_average: - loss = batch_loss.mean() - else: - loss = batch_loss.sum() - return loss + return batch_loss.mean() if self.size_average else batch_loss.sum() diff --git a/BingBertGlue/turing/models.py b/BingBertGlue/turing/models.py index 35a8d202f..bedd42d2f 100755 --- a/BingBertGlue/turing/models.py +++ b/BingBertGlue/turing/models.py @@ -30,17 +30,15 @@ def forward(self, prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - 
next_sentence_label.view(-1)) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - return total_loss - else: + if masked_lm_labels is None or next_sentence_label is None: return prediction_scores, seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + return masked_lm_loss + next_sentence_loss class BertClassificationLoss(PreTrainedBertModel): @@ -65,9 +63,7 @@ def forward(self, scores = self.classifier(pooled_output) if labels is not None: loss_fct = nn.BCEWithLogitsLoss() - loss = loss_fct(scores.view(-1, self.num_labels), - labels.view(-1, 1)) - return loss + return loss_fct(scores.view(-1, self.num_labels), labels.view(-1, 1)) else: return scores @@ -92,12 +88,10 @@ def forward(self, pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) - if labels is not None: - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1, 1), labels.view(-1, 1)) - return loss - else: + if labels is None: return logits + loss_fct = MSELoss() + return loss_fct(logits.view(-1, 1), labels.view(-1, 1)) class BertMultiTask: @@ -121,12 +115,14 @@ def __init__(self, args): print("VOCAB SIZE:", bert_config.vocab_size) self.network = BertForPreTrainingPreLN(bert_config, args) - # Use pretrained bert weights else: self.bert_encoder = BertModel.from_pretrained( self.config['bert_model_file'], - cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / - 'distributed_{}'.format(args.local_rank)) + cache_dir=( + PYTORCH_PRETRAINED_BERT_CACHE + / f'distributed_{args.local_rank}' + ), + ) bert_config = self.bert_encoder.config self.device = None diff --git a/BingBertGlue/turing/sources.py b/BingBertGlue/turing/sources.py index 7413336d4..ba4befd80 100755 --- a/BingBertGlue/turing/sources.py +++ b/BingBertGlue/turing/sources.py @@ -117,7 +117,7 @@ def __init__(self, documents = [] instances = [] with open(path, encoding='utf-8') as fd: - for i, line in enumerate(tqdm(fd)): + for line in tqdm(fd): line = line.replace('\n', '') # Expected format (Q,T,U,S,D) # query, title, url, snippet, document = line.split('\t') @@ -126,9 +126,7 @@ def __init__(self, if len(document.split("")) <= 3: continue lines = document.split("") - document = [] - for seq in lines: - document.append(tokenizer.tokenize(seq)) + document = [tokenizer.tokenize(seq) for seq in lines] # document = list(map(tokenizer.tokenize, lines)) documents.append(document) @@ -149,8 +147,7 @@ def __len__(self): return self.len def __getstate__(self): - state = self.__dict__.copy() - return state + return self.__dict__.copy() def __setstate__(self, state): self.__dict__.update(state) @@ -241,12 +238,12 @@ def create_training_instance(self, index): truncate_input_sequence(tokens_a, tokens_b, max_num_tokens) - assert len(tokens_a) >= 1 - assert len(tokens_b) >= 1 + assert tokens_a + assert tokens_b instances.append( TokenInstance(tokens_a, tokens_b, int(is_random_next))) - # print(instances[-1]) + # print(instances[-1]) current_chunk = [] current_length = 0 i += 1 @@ -269,7 +266,7 @@ def __init__(self, documents = [] instances = [] with open(path, encoding='utf-8') as fd: - for i, line in enumerate(tqdm(fd)): + for line in tqdm(fd): line = line.replace('\n', '') url, cleanbody, rand_int = line.rstrip("\n").split("\t") 
cleanbody = cleanbody.replace("#TAB#", " ").replace( @@ -317,7 +314,7 @@ def __init__(self, instances = [] with open(path, encoding='utf-8') as fd: document = [] - for i, line in enumerate(tqdm(fd)): + for line in tqdm(fd): line = line.replace('\n', '') # document = line # if len(document.split("")) <= 3: @@ -361,7 +358,7 @@ def __init__(self, instances = [] with open(path, encoding='utf-8') as fd: document = [] - for i, line in enumerate(tqdm(fd)): + for line in tqdm(fd): line = line.replace('\n', '') # document = line # if len(document.split("")) <= 3: diff --git a/BingBertGlue/turing/utils.py b/BingBertGlue/turing/utils.py index 6eb2004d9..85d1314b7 100755 --- a/BingBertGlue/turing/utils.py +++ b/BingBertGlue/turing/utils.py @@ -149,7 +149,7 @@ def namedtorchbatch(typename: str, # Execute the template string in a temporary namespace and support # tracing utilities by setting a value for frame.f_globals['__name__'] - namespace = dict(__name__='namedtuple_%s' % typename) + namespace = dict(__name__=f'namedtuple_{typename}') exec(class_definition, namespace) result = namespace[typename] result._source = class_definition # type: ignore diff --git a/BingBertSquad/convert_bert_ckpt_to_deepspeed.py b/BingBertSquad/convert_bert_ckpt_to_deepspeed.py index 3f11fab7f..4d38f63fa 100755 --- a/BingBertSquad/convert_bert_ckpt_to_deepspeed.py +++ b/BingBertSquad/convert_bert_ckpt_to_deepspeed.py @@ -36,13 +36,13 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): ) raise tf_path = os.path.abspath(ckpt_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) + logger.info(f"Loading TF weight {name} with shape {shape}") array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) @@ -56,7 +56,7 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): - logger.info("Skipping {}".format("/".join(name))) + logger.info(f'Skipping {"/".join(name)}') continue pointer = model key = None @@ -67,15 +67,14 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): else: scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": + if scope_names[0] in ["kernel", "gamma"]: pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + elif scope_names[0] in ["output_bias", "beta"]: pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": pointer = getattr(pointer, "classifier") - # Special in deepspeed. 
elif name_str.find("bert/pooler/dense") >= 0 and scope_names[0] == "dense": pointer = getattr(pointer, "dense_act") elif name_str.find("bert/embeddings/LayerNorm/gamma") >= 0 and scope_names[0] == "gamma": @@ -86,7 +85,7 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): try: pointer = getattr(pointer, scope_names[0]) except AttributeError: - logger.info("Skipping {}".format("/".join(name))) + logger.info(f'Skipping {"/".join(name)}') skipping = True break @@ -166,7 +165,7 @@ def load_tf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): array = np.concatenate((array, z), axis=0) set_data(pointer, array) - logger.info("Initialize DeepSpeed weight {}".format(name)) + logger.info(f"Initialize DeepSpeed weight {name}") return model @@ -174,14 +173,14 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): """ Load huggingface checkpoints and convert to a deepspeed model. """ hf_path = os.path.abspath(ckpt_path) - logger.info("Converting Huggingface checkpoint from {}".format(hf_path)) + logger.info(f"Converting Huggingface checkpoint from {hf_path}") # Load weights from Huggingface model ckpt = torch.load(hf_path, map_location=torch.device("cpu")) qkv = {} for name_str in ckpt.keys(): array = ckpt[name_str].numpy() - logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape)) + logger.info(f"Loading Huggingface weight {name_str} with shape {array.shape}") name = name_str.split(".") pointer = model key = None @@ -191,13 +190,11 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): # Special in deepspeed. if name_str.find("bert.pooler.dense") >= 0 and m_name == "dense": pointer = getattr(pointer, "dense_act") - elif is_layer: - pass - else: + elif not is_layer: try: pointer = getattr(pointer, m_name) except AttributeError: - logger.info("Skipping {}".format(".".join(name))) + logger.info(f'Skipping {".".join(name)}') skipping = True break @@ -275,7 +272,7 @@ def load_hf_weights_in_bert_kernel(model, ckpt_path, voc_size_diff): array = np.concatenate((array, z), axis=0) set_data(pointer, array) - logger.info("Initialize DeepSpeed weight {}".format(name)) + logger.info(f"Initialize DeepSpeed weight {name}") return model @@ -283,18 +280,18 @@ def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff): """ Load huggingface checkpoints and convert to a deepspeed model. """ hf_path = os.path.abspath(ckpt_path) - logger.info("Converting Huggingface checkpoint from {}".format(hf_path)) + logger.info(f"Converting Huggingface checkpoint from {hf_path}") # Load weights from Huggingface model ckpt = torch.load(hf_path, map_location=torch.device("cpu")) qkv = {} + key = None + is_layer = False for name_str in ckpt.keys(): array = ckpt[name_str].numpy() - logger.info("Loading Huggingface weight {} with shape {}".format(name_str, array.shape)) + logger.info(f"Loading Huggingface weight {name_str} with shape {array.shape}") name = name_str.split(".") pointer = model - key = None - is_layer = False skipping = False for m_name in name: # Special in deepspeed. 
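Every loader in this converter follows the same pattern: split the checkpoint variable name, then walk the module tree with getattr, remapping TF-era names (kernel/gamma to weight, output_bias/beta to bias) on the way down. A reduced sketch of that walk on a toy module (the real loaders also split trailing _<digit> indices, transpose kernel arrays, and handle the DeepSpeed-specific dense_act rename, all omitted here):

    import torch.nn as nn

    def resolve(model, name, sep="/"):
        # Walk a checkpoint variable name down to the matching PyTorch parameter,
        # translating TF naming conventions as the loaders above do.
        pointer = model
        for part in name.split(sep):
            if part in ("kernel", "gamma"):
                pointer = getattr(pointer, "weight")
            elif part in ("output_bias", "beta"):
                pointer = getattr(pointer, "bias")
            else:
                pointer = getattr(pointer, part)
        return pointer

    model = nn.Sequential()
    model.add_module("dense", nn.Linear(4, 2))
    print(resolve(model, "dense/kernel").shape)  # torch.Size([2, 4])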
@@ -306,7 +303,7 @@ def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff): try: pointer = getattr(pointer, m_name) except AttributeError: - logger.info("Skipping {}".format(".".join(name))) + logger.info(f'Skipping {".".join(name)}') skipping = True break @@ -319,7 +316,7 @@ def load_hf_weights_in_bert_torch(model, ckpt_path, voc_size_diff): array = np.concatenate((array, z), axis=0) set_data(pointer, array) - logger.info("Initialize DeepSpeed weight {}".format(name)) + logger.info(f"Initialize DeepSpeed weight {name}") return model @@ -337,4 +334,4 @@ def convert_ckpt_to_deepspeed(model, ckpt_type, ckpt_path, vocab_diff, kernel_en else: raise ValueError("--deepspeed_transformer_kernel is required for loading TF checkpoint.") else: - raise ValueError(f"Invalid ckpt_type.") + raise ValueError("Invalid ckpt_type.") diff --git a/BingBertSquad/evaluate.py b/BingBertSquad/evaluate.py index c43a5ab0d..61e1f9dc9 100755 --- a/BingBertSquad/evaluate.py +++ b/BingBertSquad/evaluate.py @@ -35,8 +35,7 @@ def f1_score(prediction, ground_truth): return 0 precision = 1.0 * num_same / len(prediction_tokens) recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 + return (2 * precision * recall) / (precision + recall) def exact_match_score(prediction, ground_truth): @@ -55,9 +54,13 @@ def evaluate(expected_version, ds_file, pred_file): with open(ds_file) as dataset_file: dataset_json = json.load(dataset_file) if (dataset_json['version'] != expected_version): - print('Evaluation expects v-' + expected_version + - ', but got dataset with v-' + dataset_json['version'], - file=sys.stderr) + print( + ( + f'Evaluation expects v-{expected_version}, but got dataset with v-' + + dataset_json['version'] + ), + file=sys.stderr, + ) dataset = dataset_json['data'] with open(pred_file) as prediction_file: predictions = json.load(prediction_file) diff --git a/BingBertSquad/nvidia_run_squad_baseline.py b/BingBertSquad/nvidia_run_squad_baseline.py index 188ba7838..71657f7e1 100755 --- a/BingBertSquad/nvidia_run_squad_baseline.py +++ b/BingBertSquad/nvidia_run_squad_baseline.py @@ -72,12 +72,11 @@ def __str__(self): def __repr__(self): s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % (self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + s += f"qas_id: {self.qas_id}" + s += f", question_text: {self.question_text}" + s += f', doc_tokens: [{" ".join(self.doc_tokens)}]' if self.start_position: s += ", start_position: %d" % (self.start_position) - if self.start_position: s += ", end_position: %d" % (self.end_position) return s @@ -115,9 +114,7 @@ def read_squad_examples(input_file, is_training): input_data = json.load(reader)["data"] def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False + return c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F examples = [] for entry in input_data: @@ -165,7 +162,7 @@ def is_whitespace(c): doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: + if cleaned_answer_text not in actual_text: logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) continue @@ -184,14 +181,13 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): """Loads a data file into a list of `InputBatch`s.""" - unique_id = 1000000000 - features = [] - for (example_index, example) in enumerate(examples): + unique_id = 1000000000 + for example_index, example in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] + query_tokens = query_tokens[:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] @@ -228,20 +224,17 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc + length = min(length, max_tokens_for_doc) doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] + for doc_span_index, doc_span in enumerate(doc_spans): token_to_orig_map = {} token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) + tokens = ["[CLS]"] + segment_ids = [0] for token in query_tokens: tokens.append(token) segment_ids.append(0) @@ -297,29 +290,25 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, if example_index < 20: logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join( - ["%d:%d" % (x, y) - for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) - for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % - " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % - " ".join([str(x) for x in input_mask])) - logger.info("segment_ids: %s" % - " ".join([str(x) for x in segment_ids])) + logger.info(f"unique_id: {unique_id}") + logger.info(f"example_index: {example_index}") + logger.info(f"doc_span_index: {doc_span_index}") + logger.info(f'tokens: {" ".join(tokens)}') + logger.info( + f'token_to_orig_map: {" ".join(["%d:%d" % (x, y) for x, y in token_to_orig_map.items()])}' + ) + logger.info( + f'token_is_max_context: {" ".join(["%d:%s" % (x, y) for x, y in token_is_max_context.items()])}' + ) + logger.info(f'input_ids: {" ".join([str(x) for x in input_ids])}') + logger.info(f'input_mask: {" ".join([str(x) for x in input_mask])}') + logger.info(f'segment_ids: {" ".join([str(x) for x in segment_ids])}') if is_training: answer_text = " ".join( tokens[start_position:(end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) - logger.info("answer: %s" % (answer_text)) + logger.info(f"answer: {answer_text}") features.append( InputFeatures(unique_id=unique_id, diff --git a/BingBertSquad/nvidia_run_squad_deepspeed.py b/BingBertSquad/nvidia_run_squad_deepspeed.py index 558eafcfe..29f4de660 100755 --- a/BingBertSquad/nvidia_run_squad_deepspeed.py +++ b/BingBertSquad/nvidia_run_squad_deepspeed.py @@ -76,12 +76,11 @@ def __str__(self): def __repr__(self): s = "" - s += "qas_id: %s" 
% (self.qas_id) - s += ", question_text: %s" % (self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + s += f"qas_id: {self.qas_id}" + s += f", question_text: {self.question_text}" + s += f', doc_tokens: [{" ".join(self.doc_tokens)}]' if self.start_position: s += ", start_position: %d" % (self.start_position) - if self.start_position: s += ", end_position: %d" % (self.end_position) return s @@ -119,9 +118,7 @@ def read_squad_examples(input_file, is_training): input_data = json.load(reader)["data"] def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False + return c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F examples = [] for entry in input_data: @@ -169,7 +166,7 @@ def is_whitespace(c): doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: + if cleaned_answer_text not in actual_text: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue @@ -188,14 +185,13 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): """Loads a data file into a list of `InputBatch`s.""" - unique_id = 1000000000 - features = [] - for (example_index, example) in enumerate(examples): + unique_id = 1000000000 + features = [] + for example_index, example in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] + query_tokens = query_tokens[:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] @@ -232,20 +228,17 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc + length = min(length, max_tokens_for_doc) doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] + for doc_span_index, doc_span in enumerate(doc_spans): token_to_orig_map = {} token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) + tokens = ["[CLS]"] + segment_ids = [0] for token in query_tokens: tokens.append(token) segment_ids.append(0) @@ -301,29 +294,25 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, if example_index < 20: logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join( - ["%d:%d" % (x, y) - for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) - for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % - " ".join([str(x) for x in input_ids])) - logger.info("input_mask: %s" % - " ".join([str(x) for x in input_mask])) - logger.info("segment_ids: %s" % - " ".join([str(x) for x in segment_ids])) + logger.info(f"unique_id: {unique_id}") + logger.info(f"example_index: {example_index}") + logger.info(f"doc_span_index: {doc_span_index}") + logger.info(f'tokens: {" ".join(tokens)}') + logger.info(
+ f'token_to_orig_map: {" ".join(["%d:%d" % (x, y) for x, y in token_to_orig_map.items()])}' + ) + logger.info( + f'token_is_max_context: {" ".join(["%d:%s" % (x, y) for x, y in token_is_max_context.items()])}' + ) + logger.info(f'input_ids: {" ".join([str(x) for x in input_ids])}') + logger.info(f'input_mask: {" ".join([str(x) for x in input_mask])}') + logger.info(f'segment_ids: {" ".join([str(x) for x in segment_ids])}') if is_training: answer_text = " ".join( tokens[start_position:(end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) - logger.info("answer: %s" % (answer_text)) + logger.info(f"answer: {answer_text}") features.append( InputFeatures(unique_id=unique_id, diff --git a/BingBertSquad/pytorch_pretrained_bert/file_utils.py b/BingBertSquad/pytorch_pretrained_bert/file_utils.py index 3fb6f93a2..c4a474212 100644 --- a/BingBertSquad/pytorch_pretrained_bert/file_utils.py +++ b/BingBertSquad/pytorch_pretrained_bert/file_utils.py @@ -41,7 +41,7 @@ def url_to_filename(url: str, etag: str = None) -> str: if etag: etag_bytes = etag.encode('utf-8') etag_hash = sha256(etag_bytes) - filename += '.' + etag_hash.hexdigest() + filename += f'.{etag_hash.hexdigest()}' return filename @@ -59,11 +59,11 @@ def filename_to_url(filename: str, cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): - raise FileNotFoundError("file {} not found".format(cache_path)) + raise FileNotFoundError(f"file {cache_path} not found") - meta_path = cache_path + '.json' + meta_path = f'{cache_path}.json' if not os.path.exists(meta_path): - raise FileNotFoundError("file {} not found".format(meta_path)) + raise FileNotFoundError(f"file {meta_path} not found") with open(meta_path) as meta_file: metadata = json.load(meta_file) @@ -98,19 +98,19 @@ def cached_path(url_or_filename: Union[str, Path], return url_or_filename elif parsed.scheme == '': # File, but it doesn't exist. - raise FileNotFoundError("file {} not found".format(url_or_filename)) + raise FileNotFoundError(f"file {url_or_filename} not found") else: # Something unknown raise ValueError( - "unable to parse {} as a URL or as a local path".format( - url_or_filename)) + f"unable to parse {url_or_filename} as a URL or as a local path" + ) def split_s3_path(url: str) -> Tuple[str, str]: """Split a full s3 path into the bucket name and path.""" parsed = urlparse(url) if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) + raise ValueError(f"bad s3 path {url}") bucket_name = parsed.netloc s3_path = parsed.path # Remove '/' at beginning of path. 
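The caching helpers being reformatted here key everything on hashes: the cache filename is the sha256 hex digest of the URL (that half sits outside these hunks and is assumed below), with the ETag's digest appended after a dot, and get_from_cache writes a .json sidecar holding both values so filename_to_url can recover them. A sketch of the resulting on-disk layout (example URL and ETag are hypothetical):

    import json, os, tempfile
    from hashlib import sha256

    def url_to_filename(url, etag=None):
        # Cache key: sha256(url), plus sha256(etag) after a dot when an ETag exists.
        filename = sha256(url.encode("utf-8")).hexdigest()
        if etag:
            filename += f'.{sha256(etag.encode("utf-8")).hexdigest()}'
        return filename

    cache_dir = tempfile.mkdtemp()
    url, etag = "https://example.com/vocab.txt", "abc123"
    name = url_to_filename(url, etag)
    with open(os.path.join(cache_dir, name), "w") as f:
        f.write("payload")  # get_from_cache stores the downloaded body here
    with open(os.path.join(cache_dir, f"{name}.json"), "w") as f:
        json.dump({"url": url, "etag": etag}, f)  # sidecar read back by filename_to_url
    with open(os.path.join(cache_dir, f"{name}.json")) as f:
        print(json.load(f))  # {'url': 'https://example.com/vocab.txt', 'etag': 'abc123'}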
@@ -185,8 +185,8 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str: response = requests.head(url, allow_redirects=True) if response.status_code != 200: raise IOError( - "HEAD request failed for url {} with status code {}".format( - url, response.status_code)) + f"HEAD request failed for url {url} with status code {response.status_code}" + ) etag = response.headers.get("ETag") filename = url_to_filename(url, etag) @@ -219,7 +219,7 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str: logger.info("creating metadata file for %s", cache_path) meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' + meta_path = f'{cache_path}.json' with open(meta_path, 'w') as meta_file: json.dump(meta, meta_file) diff --git a/BingBertSquad/pytorch_pretrained_bert/modeling.py b/BingBertSquad/pytorch_pretrained_bert/modeling.py index e667d372c..7dbdc7887 100644 --- a/BingBertSquad/pytorch_pretrained_bert/modeling.py +++ b/BingBertSquad/pytorch_pretrained_bert/modeling.py @@ -154,8 +154,7 @@ def __repr__(self): def to_dict(self): """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output + return copy.deepcopy(self.__dict__) def to_json_string(self): """Serializes this instance to a JSON string.""" @@ -300,8 +299,7 @@ def __init__(self, config): def forward(self, input_tensor, attention_mask): self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output + return self.output(self_output, input_tensor) class BertIntermediate(nn.Module): @@ -341,8 +339,7 @@ def __init__(self, config): def forward(self, hidden_states, attention_mask): attention_output = self.attention(hidden_states, attention_mask) intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output + return self.output(intermediate_output, attention_output) class BertEncoder(nn.Module): @@ -423,8 +420,7 @@ def __init__(self, config, bert_model_embedding_weights): bert_model_embedding_weights) def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores + return self.predictions(sequence_output) class BertOnlyNSPHead(nn.Module): @@ -433,8 +429,7 @@ def __init__(self, config): self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score + return self.seq_relationship(pooled_output) class BertPreTrainingHeads(nn.Module): @@ -458,10 +453,8 @@ def __init__(self, config, *inputs, **kwargs): super(PreTrainedBertModel, self).__init__() if not isinstance(config, BertConfig): raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__)) + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. 
To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) self.config = config def init_bert_weights(self, module): @@ -584,11 +577,11 @@ def load(module, prefix=''): load(child, prefix + name + '.') load(model, prefix='' if hasattr(model, 'bert') else 'bert.') - if len(missing_keys) > 0: + if missing_keys: logger.info( "Weights of {} not initialized from pretrained model: {}". format(model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: + if unexpected_keys: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) @@ -758,17 +751,15 @@ def forward(self, prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - return total_loss - else: + if masked_lm_labels is None or next_sentence_label is None: return prediction_scores, seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + return masked_lm_loss + next_sentence_loss class BertForMaskedLM(PreTrainedBertModel): @@ -833,10 +824,10 @@ def forward(self, if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( + return loss_fct( prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - return masked_lm_loss + masked_lm_labels.view(-1), + ) else: return prediction_scores @@ -901,13 +892,12 @@ def forward(self, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - return next_sentence_loss - else: + if next_sentence_label is None: return seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + return loss_fct( + seq_relationship_score.view(-1, 2), next_sentence_label.view(-1) + ) class BertForSequenceClassification(PreTrainedBertModel): @@ -977,9 +967,7 @@ def forward(self, if labels is not None: loss_fct = CrossEntropyLoss() - # loss_fct = FocalLoss(class_num=self.num_labels, gamma=0.5) - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) else: return logits @@ -1052,12 +1040,10 @@ def forward(self, logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return loss - else: + if labels is None: return reshaped_logits + loss_fct = CrossEntropyLoss() + return loss_fct(reshaped_logits, labels) class BertForTokenClassification(PreTrainedBertModel): @@ -1127,8 +1113,7 @@ def forward(self, if labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss + return 
loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) else: return logits @@ -1203,21 +1188,19 @@ def forward(self, start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss - else: + if start_positions is None or end_positions is None: return start_logits, end_logits + # If we are on multi-GPU, the positions may carry an extra trailing dimension; squeeze it away + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # Start/end positions outside the model inputs are clamped to ignored_index, which the loss then ignores + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + return (start_loss + end_loss) / 2 diff --git a/BingBertSquad/pytorch_pretrained_bert/optimization.py b/BingBertSquad/pytorch_pretrained_bert/optimization.py index dd85bb54c..c09f5b52d 100644 --- a/BingBertSquad/pytorch_pretrained_bert/optimization.py +++ b/BingBertSquad/pytorch_pretrained_bert/optimization.py @@ -22,15 +22,11 @@ def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x / warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) + return x / warmup if x < warmup else 0.5 * (1.0 + torch.cos(math.pi * x)) def warmup_constant(x, warmup=0.002): - if x < warmup: - return x / warmup - return 1.0 + return x / warmup if x < warmup else 1.0 def warmup_linear(x, warmup=0.002): @@ -89,25 +85,17 @@ def __init__(self, weight_decay=0.01, max_grad_norm=1.0): if lr is not required and lr < 0.0: - raise ValueError( - "Invalid learning rate: {} - should be >= 0.0".format(lr)) + raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0") if schedule not in SCHEDULES: - raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError( - "Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format( - warmup)) + raise ValueError(f"Invalid schedule parameter: {schedule}") + if not 0.0 <= warmup < 1.0 and warmup != -1: + raise ValueError(f"Invalid warmup: {warmup} - should be in [0.0, 1.0[ or -1") if not 0.0 <= b1 < 1.0: - raise ValueError( - "Invalid b1 parameter: {} - should be in [0.0, 1.0[".format( - b1)) + raise ValueError(f"Invalid b1 parameter: {b1} - should be in [0.0, 1.0[") if not 0.0 <= b2 < 1.0: - raise ValueError( - "Invalid b2 parameter: {} - should be in [0.0, 1.0[".format( - b2)) - if not e >= 0.0: - raise ValueError( - "Invalid epsilon value: {} - should be >= 0.0".format(e)) + raise ValueError(f"Invalid b2 parameter: {b2} - should be in [0.0, 1.0[") + if e < 0.0: + raise ValueError(f"Invalid epsilon value: {e} - should be
>= 0.0") defaults = dict(lr=lr, schedule=schedule, warmup=warmup, @@ -142,10 +130,7 @@ def step(self, closure=None): closure (callable, optional): A closure that reevaluates the model and returns the loss. """ - loss = None - if closure is not None: - loss = closure() - + loss = closure() if closure is not None else None for group in self.param_groups: for p in group['params']: if p.grad is None: diff --git a/BingBertSquad/pytorch_pretrained_bert/tokenization.py b/BingBertSquad/pytorch_pretrained_bert/tokenization.py index cdd1c7cc2..ee38770bf 100644 --- a/BingBertSquad/pytorch_pretrained_bert/tokenization.py +++ b/BingBertSquad/pytorch_pretrained_bert/tokenization.py @@ -73,10 +73,7 @@ def load_vocab(vocab_file): def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a peice of text.""" text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens + return [] if not text else text.split() class BertTokenizer(object): @@ -88,9 +85,8 @@ def __init__(self, never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): if not os.path.isfile(vocab_file): raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - .format(vocab_file)) + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([ (ids, tok) for tok, ids in self.vocab.items() @@ -103,29 +99,21 @@ def __init__(self, def tokenize(self, text): split_tokens = [] for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) + split_tokens.extend(iter(self.wordpiece_tokenizer.tokenize(token))) return split_tokens def convert_tokens_to_ids(self, tokens): """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) + ids = [self.vocab[token] for token in tokens] if len(ids) > self.max_len: raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format( - len(ids), self.max_len)) + f"Token indices sequence length is longer than the specified maximum sequence length for this BERT model ({len(ids)} > {self.max_len}). Running this sequence through BERT will result in indexing errors" + ) return ids def convert_ids_to_tokens(self, ids): """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens + return [self.ids_to_tokens[i] for i in ids] @classmethod def from_pretrained(cls, @@ -148,27 +136,22 @@ def from_pretrained(cls, resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except FileNotFoundError: logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) + f"Model name '{pretrained_model_name}' was not found in model name list ({', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())}). 
We assumed '{vocab_file}' was a path or url but couldn't find any file associated to this path or url." + ) return None if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) + logger.info(f"loading vocabulary file {vocab_file}") else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) + logger.info( + f"loading vocabulary file {vocab_file} from cache at {resolved_vocab_file}" + ) if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: # if we're using a pretrained model, ensure the tokenizer wont index sequences longer # than the number of positional embeddings max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[ pretrained_model_name] kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer + return cls(resolved_vocab_file, *inputs, **kwargs) class BasicTokenizer(object): @@ -202,8 +185,7 @@ def tokenize(self, text): token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens + return whitespace_tokenize(" ".join(split_tokens)) def _run_strip_accents(self, text): """Strips accents from a piece of text.""" @@ -244,9 +226,7 @@ def _tokenize_chinese_chars(self, text): for char in text: cp = ord(char) if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") + output.extend((" ", char, " ")) else: output.append(char) return "".join(output) @@ -261,17 +241,16 @@ def _is_chinese_char(self, cp): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False + return ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ) def _clean_text(self, text): """Performs invalid character removal and whitespace cleanup on text.""" @@ -328,7 +307,7 @@ def tokenize(self, text): while start < end: substr = "".join(chars[start:end]) if start > 0: - substr = "##" + substr + substr = f"##{substr}" if substr in self.vocab: cur_substr = substr break @@ -350,24 +329,20 @@ def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically contorl characters but we treat them # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": + if char in [" ", "\t", "\n", "\r"]: return True cat = unicodedata.category(char) - if cat == "Zs": - return True - return False + return cat == "Zs" def _is_control(char): """Checks whether `chars` is a control character.""" # These are technically control characters but we count them as whitespace # characters. 
- if char == "\t" or char == "\n" or char == "\r": + if char in ["\t", "\n", "\r"]: return False cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False + return bool(cat.startswith("C")) def _is_punctuation(char): @@ -381,6 +356,4 @@ def _is_punctuation(char): or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): return True cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False + return bool(cat.startswith("P")) diff --git a/BingBertSquad/turing/file_utils.py b/BingBertSquad/turing/file_utils.py index a1396decf..4e8ab0000 100755 --- a/BingBertSquad/turing/file_utils.py +++ b/BingBertSquad/turing/file_utils.py @@ -52,7 +52,7 @@ def url_to_filename(url, etag=None): if etag: etag_bytes = etag.encode('utf-8') etag_hash = sha256(etag_bytes) - filename += '.' + etag_hash.hexdigest() + filename += f'.{etag_hash.hexdigest()}' return filename @@ -69,11 +69,11 @@ def filename_to_url(filename, cache_dir=None): cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) + raise EnvironmentError(f"file {cache_path} not found") - meta_path = cache_path + '.json' + meta_path = f'{cache_path}.json' if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) + raise EnvironmentError(f"file {meta_path} not found") with open(meta_path, encoding="utf-8") as meta_file: metadata = json.load(meta_file) @@ -107,19 +107,19 @@ def cached_path(url_or_filename, cache_dir=None): return url_or_filename elif parsed.scheme == '': # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) + raise EnvironmentError(f"file {url_or_filename} not found") else: # Something unknown raise ValueError( - "unable to parse {} as a URL or as a local path".format( - url_or_filename)) + f"unable to parse {url_or_filename} as a URL or as a local path" + ) def split_s3_path(url): """Split a full s3 path into the bucket name and path.""" parsed = urlparse(url) if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) + raise ValueError(f"bad s3 path {url}") bucket_name = parsed.netloc s3_path = parsed.path # Remove '/' at beginning of path. 
@@ -195,8 +195,8 @@ def get_from_cache(url, cache_dir=None): response = requests.head(url, allow_redirects=True) if response.status_code != 200: raise IOError( - "HEAD request failed for url {} with status code {}".format( - url, response.status_code)) + f"HEAD request failed for url {url} with status code {response.status_code}" + ) etag = response.headers.get("ETag") filename = url_to_filename(url, etag) @@ -229,7 +229,7 @@ def get_from_cache(url, cache_dir=None): logger.info("creating metadata file for %s", cache_path) meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' + meta_path = f'{cache_path}.json' with open(meta_path, 'w', encoding="utf-8") as meta_file: json.dump(meta, meta_file) diff --git a/BingBertSquad/turing/loss.py b/BingBertSquad/turing/loss.py index c6bfe8b15..d711ccc11 100644 --- a/BingBertSquad/turing/loss.py +++ b/BingBertSquad/turing/loss.py @@ -25,10 +25,7 @@ def __init__(self, class_num, alpha=None, gamma=2, size_average=True): if alpha is None: self.alpha = torch.ones(class_num, 1) else: - if isinstance(alpha, Variable): - self.alpha = alpha - else: - self.alpha = Variable(alpha) + self.alpha = alpha if isinstance(alpha, Variable) else Variable(alpha) self.gamma = gamma self.class_num = class_num self.size_average = size_average @@ -53,8 +50,4 @@ def forward(self, inputs, targets): batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p - if self.size_average: - loss = batch_loss.mean() - else: - loss = batch_loss.sum() - return loss + return batch_loss.mean() if self.size_average else batch_loss.sum() diff --git a/BingBertSquad/turing/modelingpreln_layerdrop.py b/BingBertSquad/turing/modelingpreln_layerdrop.py index 4224cf208..8c8991e29 100755 --- a/BingBertSquad/turing/modelingpreln_layerdrop.py +++ b/BingBertSquad/turing/modelingpreln_layerdrop.py @@ -77,36 +77,35 @@ def get_deepspeed_config(args): def get_sparse_attention_config(args, num_heads): - if args.deepspeed_sparse_attention: - ds_config = get_deepspeed_config(args) - if hasattr(ds_config, - 'sparse_attention') and ds_config.sparse_attention: - sa_config = ds_config.sparse_attention - sa_mode = sa_config.get('mode') - if (sa_mode == 'dense'): - from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig - elif (sa_mode == 'fixed'): - from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig - elif (sa_mode == 'bigbird'): - from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig - elif (sa_mode == 'bslongformer'): - from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig - elif (sa_mode == 'variable'): - from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig - else: - raise NotImplementedError( - f'Given sparsity mode, {sa_mode}, has not been implemented yet!' - ) - del sa_config['mode'] - return STConfig(num_heads=num_heads, **sa_config) - else: + if not args.deepspeed_sparse_attention: + return None + ds_config = get_deepspeed_config(args) + if hasattr(ds_config, + 'sparse_attention') and ds_config.sparse_attention: + sa_config = ds_config.sparse_attention + sa_mode = sa_config.get('mode') + if (sa_mode == 'dense'): + from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig + elif (sa_mode == 'fixed'): from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig - print( - 'deepspeed sparse attention is not set; Fixed sparsity is used as default.' 
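
# [Editor's note] A recurring rewrite above (the optimizer's closure handling,
# FocalLoss's alpha wrapping and its mean-vs-sum reduction) folds a four-line
# if/else assignment into one conditional expression. This is
# behavior-preserving whenever both branches do nothing but compute the
# assigned value. A minimal sketch with a stand-in tensor:

import torch

batch_loss = torch.tensor([1.0, 2.0, 3.0])
size_average = True
# before: if size_average: loss = batch_loss.mean() else: loss = batch_loss.sum()
loss = batch_loss.mean() if size_average else batch_loss.sum()
assert loss.item() == 2.0
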
+ elif (sa_mode == 'bigbird'): + from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig + elif (sa_mode == 'bslongformer'): + from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig + elif (sa_mode == 'variable'): + from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig + else: + raise NotImplementedError( + f'Given sparsity mode, {sa_mode}, has not been implemented yet!' ) - return STConfig(num_heads=num_heads) + del sa_config['mode'] + return STConfig(num_heads=num_heads, **sa_config) else: - return None + from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig + print( + 'deepspeed sparse attention is not set; Fixed sparsity is used as default.' + ) + return STConfig(num_heads=num_heads) def get_sparse_attention_utils(sparse_attention_config): if sparse_attention_config is not None: @@ -128,13 +127,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): ) raise tf_path = os.path.abspath(tf_checkpoint_path) - print("Converting TensorFlow checkpoint from {}".format(tf_path)) + print(f"Converting TensorFlow checkpoint from {tf_path}") # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: - print("Loading TF weight {} with shape {}".format(name, shape)) + print(f"Loading TF weight {name} with shape {shape}") array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) @@ -144,7 +143,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m"] for n in name): - print("Skipping {}".format("/".join(name))) + print(f'Skipping {"/".join(name)}') continue pointer = model for m_name in name: @@ -152,12 +151,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): l = re.split(r'_(\d+)', m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': + if ( + l[0] in ['kernel', 'gamma'] + or l[0] not in ['output_bias', 'beta'] + and l[0] == 'output_weights' + ): pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': + elif l[0] in ['output_bias', 'beta']: pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -172,7 +173,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): except AssertionError as e: e.args += (pointer.shape, array.shape) raise - print("Initialize PyTorch weight {}".format(name)) + print(f"Initialize PyTorch weight {name}") pointer.data = torch.from_numpy(array) return model @@ -257,8 +258,7 @@ def forward(self, input): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, self.out_features, self.bias is not None) + return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}' class BertConfig(object): @@ -345,8 +345,7 @@ def __repr__(self): def to_dict(self): """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output + return copy.deepcopy(self.__dict__) def to_json_string(self): """Serializes this instance to a JSON string.""" @@ -503,8 +502,7 @@ def __init__(self, config): def forward(self, input_tensor, attention_mask): self_output = self.self(input_tensor, 
attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output + return self.output(self_output, input_tensor) class BertIntermediate(nn.Module): @@ -573,7 +571,7 @@ def __init__(self, config, args, sparse_attention_config=None): if args.deepspeed_transformer_kernel and args.deepspeed_sparse_attention: raise NotImplementedError( - f'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!' + 'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!' ) if args.deepspeed_transformer_kernel: @@ -707,8 +705,7 @@ def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense_act(first_token_tensor) - return pooled_output + return self.dense_act(first_token_tensor) class BertPredictionHeadTransform(nn.Module): @@ -748,8 +745,8 @@ def forward(self, hidden_states, masked_token_indexes): masked_token_indexes) torch.cuda.nvtx.range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), self.decoder.weight.size())) + f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}" + ) hidden_states = self.decoder(hidden_states) + self.bias torch.cuda.nvtx.range_pop() return hidden_states @@ -762,8 +759,7 @@ def __init__(self, config, bert_model_embedding_weights): bert_model_embedding_weights) def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores + return self.predictions(sequence_output) class BertOnlyNSPHead(nn.Module): @@ -772,8 +768,7 @@ def __init__(self, config): self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score + return self.seq_relationship(pooled_output) class BertPreTrainingHeads(nn.Module): @@ -801,10 +796,8 @@ def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__)) + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) self.config = config def init_bert_weights(self, module): @@ -952,15 +945,15 @@ def load(module, prefix=''): s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) - if len(missing_keys) > 0: + if missing_keys: logger.info( "Weights of {} not initialized from pretrained model: {}". 
format(model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: + if unexpected_keys: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) - if len(error_msgs) > 0: + if error_msgs: raise RuntimeError( 'Error(s) in loading state_dict for {}:\n\t{}'.format( model.__class__.__name__, "\n\t".join(error_msgs))) @@ -1179,8 +1172,7 @@ def forward(self, batch, **kwargs): prediction_scores.view(-1, self.config.vocab_size), target) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - return total_loss + return masked_lm_loss + next_sentence_loss else: prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) @@ -1250,10 +1242,10 @@ def forward(self, if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( + return loss_fct( prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - return masked_lm_loss + masked_lm_labels.view(-1), + ) else: return prediction_scores @@ -1319,13 +1311,12 @@ def forward(self, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - return next_sentence_loss - else: + if next_sentence_label is None: return seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + return loss_fct( + seq_relationship_score.view(-1, 2), next_sentence_label.view(-1) + ) class BertForSequenceClassification(BertPreTrainedModel): @@ -1396,8 +1387,7 @@ def forward(self, if labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) else: return logits @@ -1471,12 +1461,10 @@ def forward(self, logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return loss - else: + if labels is None: return reshaped_logits + loss_fct = CrossEntropyLoss() + return loss_fct(reshaped_logits, labels) class BertForTokenClassification(BertPreTrainedModel): @@ -1545,20 +1533,15 @@ def forward(self, sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), - labels.view(-1)) - return loss - else: + if labels is None: return logits + loss_fct = CrossEntropyLoss() + if attention_mask is None: + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + return loss_fct(active_logits, active_labels) class BertForQuestionAnswering(BertPreTrainedModel): @@ -1632,21 +1615,19 @@ def forward(self, start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - 
if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss - else: + if start_positions is None or end_positions is None: return start_logits, end_logits + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + return (start_loss + end_loss) / 2 diff --git a/BingBertSquad/turing/nvidia_modeling.py b/BingBertSquad/turing/nvidia_modeling.py index de0306169..511de83da 100755 --- a/BingBertSquad/turing/nvidia_modeling.py +++ b/BingBertSquad/turing/nvidia_modeling.py @@ -81,13 +81,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): ) raise tf_path = os.path.abspath(tf_checkpoint_path) - print("Converting TensorFlow checkpoint from {}".format(tf_path)) + print(f"Converting TensorFlow checkpoint from {tf_path}") # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: - print("Loading TF weight {} with shape {}".format(name, shape)) + print(f"Loading TF weight {name} with shape {shape}") array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) @@ -97,7 +97,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m"] for n in name): - print("Skipping {}".format("/".join(name))) + print(f'Skipping {"/".join(name)}') continue pointer = model for m_name in name: @@ -105,12 +105,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): l = re.split(r'_(\d+)', m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': + if ( + l[0] in ['kernel', 'gamma'] + or l[0] not in ['output_bias', 'beta'] + and l[0] == 'output_weights' + ): pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': + elif l[0] in ['output_bias', 'beta']: pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -125,7 +127,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): except AssertionError as e: e.args += (pointer.shape, array.shape) raise - print("Initialize PyTorch weight {}".format(name)) + print(f"Initialize PyTorch weight {name}") pointer.data = torch.from_numpy(array) return model @@ -207,8 +209,7 @@ def 
forward(self, input): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, self.out_features, self.bias is not None) + return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}' class BertConfig(object): @@ -295,8 +296,7 @@ def __repr__(self): def to_dict(self): """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output + return copy.deepcopy(self.__dict__) def to_json_string(self): """Serializes this instance to a JSON string.""" @@ -451,8 +451,7 @@ def __init__(self, config): def forward(self, input_tensor, attention_mask): self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output + return self.output(self_output, input_tensor) class BertIntermediate(nn.Module): @@ -491,8 +490,7 @@ def __init__(self, config): def forward(self, hidden_states, attention_mask): attention_output = self.attention(hidden_states, attention_mask) intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output + return self.output(intermediate_output, attention_output) class BertEncoder(nn.Module): @@ -607,8 +605,7 @@ def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense_act(first_token_tensor) - return pooled_output + return self.dense_act(first_token_tensor) class BertPredictionHeadTransform(nn.Module): @@ -642,8 +639,8 @@ def __init__(self, config, bert_model_embedding_weights): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) torch.cuda.nvtx.range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), self.decoder.weight.size())) + f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}" + ) hidden_states = self.decoder(hidden_states) + self.bias torch.cuda.nvtx.range_pop() return hidden_states @@ -656,8 +653,7 @@ def __init__(self, config, bert_model_embedding_weights): bert_model_embedding_weights) def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores + return self.predictions(sequence_output) class BertOnlyNSPHead(nn.Module): @@ -666,8 +662,7 @@ def __init__(self, config): self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score + return self.seq_relationship(pooled_output) class BertPreTrainingHeads(nn.Module): @@ -691,10 +686,8 @@ def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__)) + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. 
To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) self.config = config def init_bert_weights(self, module): @@ -836,15 +829,15 @@ def load(module, prefix=''): s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) - if len(missing_keys) > 0: + if missing_keys: logger.info( "Weights of {} not initialized from pretrained model: {}". format(model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: + if unexpected_keys: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) - if len(error_msgs) > 0: + if error_msgs: raise RuntimeError( 'Error(s) in loading state_dict for {}:\n\t{}'.format( model.__class__.__name__, "\n\t".join(error_msgs))) @@ -1030,20 +1023,15 @@ def forward(self, batch, log=True): prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) - total_loss = masked_lm_loss + next_sentence_loss - # if log: - # self.log_summary_writer(logs={'train_loss': total_loss.item()}) - return total_loss - else: + if masked_lm_labels is None or next_sentence_label is None: return prediction_scores, seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + return masked_lm_loss + next_sentence_loss class BertForMaskedLM(BertPreTrainedModel): @@ -1109,10 +1097,10 @@ def forward(self, if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( + return loss_fct( prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - return masked_lm_loss + masked_lm_labels.view(-1), + ) else: return prediction_scores @@ -1178,13 +1166,12 @@ def forward(self, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - return next_sentence_loss - else: + if next_sentence_label is None: return seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + return loss_fct( + seq_relationship_score.view(-1, 2), next_sentence_label.view(-1) + ) class BertForSequenceClassification(BertPreTrainedModel): @@ -1255,8 +1242,7 @@ def forward(self, if labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) else: return logits @@ -1330,12 +1316,10 @@ def forward(self, logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return loss - else: + if labels is None: return reshaped_logits + loss_fct = 
CrossEntropyLoss() + return loss_fct(reshaped_logits, labels) class BertForTokenClassification(BertPreTrainedModel): @@ -1404,20 +1388,15 @@ def forward(self, sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), - labels.view(-1)) - return loss - else: + if labels is None: return logits + loss_fct = CrossEntropyLoss() + if attention_mask is None: + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + return loss_fct(active_logits, active_labels) class BertForQuestionAnswering(BertPreTrainedModel): @@ -1491,21 +1470,19 @@ def forward(self, start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss - else: + if start_positions is None or end_positions is None: return start_logits, end_logits + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + return (start_loss + end_loss) / 2 diff --git a/BingBertSquad/turing/nvidia_modelingpreln.py b/BingBertSquad/turing/nvidia_modelingpreln.py index 1f1e88de1..9bf3d9e7d 100755 --- a/BingBertSquad/turing/nvidia_modelingpreln.py +++ b/BingBertSquad/turing/nvidia_modelingpreln.py @@ -81,13 +81,13 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): ) raise tf_path = os.path.abspath(tf_checkpoint_path) - print("Converting TensorFlow checkpoint from {}".format(tf_path)) + print(f"Converting TensorFlow checkpoint from {tf_path}") # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] for name, shape in init_vars: - print("Loading TF weight {} with shape {}".format(name, shape)) + print(f"Loading TF weight {name} with shape {shape}") array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) @@ -97,7 +97,7 @@ def load_tf_weights_in_bert(model, 
tf_checkpoint_path): # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m"] for n in name): - print("Skipping {}".format("/".join(name))) + print(f'Skipping {"/".join(name)}') continue pointer = model for m_name in name: @@ -105,12 +105,14 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): l = re.split(r'_(\d+)', m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': + if ( + l[0] in ['kernel', 'gamma'] + or l[0] not in ['output_bias', 'beta'] + and l[0] == 'output_weights' + ): pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': + elif l[0] in ['output_bias', 'beta']: pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -125,7 +127,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): except AssertionError as e: e.args += (pointer.shape, array.shape) raise - print("Initialize PyTorch weight {}".format(name)) + print(f"Initialize PyTorch weight {name}") pointer.data = torch.from_numpy(array) return model @@ -210,8 +212,7 @@ def forward(self, input): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, self.out_features, self.bias is not None) + return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}' class BertConfig(object): @@ -300,8 +301,7 @@ def __repr__(self): def to_dict(self): """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output + return copy.deepcopy(self.__dict__) def to_json_string(self): """Serializes this instance to a JSON string.""" @@ -458,8 +458,7 @@ def __init__(self, config): def forward(self, input_tensor, attention_mask): self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output + return self.output(self_output, input_tensor) class BertIntermediate(nn.Module): @@ -628,8 +627,7 @@ def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding # to the first token. 
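
# [Editor's note] The rewritten branch in load_tf_weights_in_bert, repeated in
# modelingpreln_layerdrop.py, nvidia_modeling.py and nvidia_modelingpreln.py
# above, merges the 'kernel'/'gamma' and 'output_weights' cases into:
#     l[0] in ['kernel', 'gamma']
#     or l[0] not in ['output_bias', 'beta'] and l[0] == 'output_weights'
# Since "and" binds tighter than "or", this is equivalent to the original
# if/elif chain, but the "not in" clause is dead weight: a string equal to
# 'output_weights' can never be in ['output_bias', 'beta']. The test reduces
# to one membership check, verified exhaustively over the relevant keys:

def merged(key):
    return (key in ['kernel', 'gamma']
            or key not in ['output_bias', 'beta'] and key == 'output_weights')

def simple(key):
    return key in ('kernel', 'gamma', 'output_weights')

for key in ('kernel', 'gamma', 'output_weights', 'output_bias', 'beta', 'dense'):
    assert merged(key) == simple(key)
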
first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense_act(first_token_tensor) - return pooled_output + return self.dense_act(first_token_tensor) class BertPredictionHeadTransform(nn.Module): @@ -663,8 +661,8 @@ def __init__(self, config, bert_model_embedding_weights): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) torch.cuda.nvtx.range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), self.decoder.weight.size())) + f"decoder input.size() = {hidden_states.size()}, weight.size() = {self.decoder.weight.size()}" + ) hidden_states = self.decoder(hidden_states) + self.bias torch.cuda.nvtx.range_pop() return hidden_states @@ -677,8 +675,7 @@ def __init__(self, config, bert_model_embedding_weights): bert_model_embedding_weights) def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores + return self.predictions(sequence_output) class BertOnlyNSPHead(nn.Module): @@ -687,8 +684,7 @@ def __init__(self, config): self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score + return self.seq_relationship(pooled_output) class BertPreTrainingHeads(nn.Module): @@ -712,10 +708,8 @@ def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__)) + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) self.config = config def init_bert_weights(self, module): @@ -863,15 +857,15 @@ def load(module, prefix=''): s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) - if len(missing_keys) > 0: + if missing_keys: logger.info( "Weights of {} not initialized from pretrained model: {}". 
format(model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: + if unexpected_keys: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) - if len(error_msgs) > 0: + if error_msgs: raise RuntimeError( 'Error(s) in loading state_dict for {}:\n\t{}'.format( model.__class__.__name__, "\n\t".join(error_msgs))) @@ -1058,20 +1052,15 @@ def forward(self, batch, log=True): prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) - total_loss = masked_lm_loss + next_sentence_loss - # if log: - # self.log_summary_writer(logs={'train_loss': total_loss.item()}) - return total_loss - else: + if masked_lm_labels is None or next_sentence_label is None: return prediction_scores, seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + return masked_lm_loss + next_sentence_loss class BertForMaskedLM(BertPreTrainedModel): @@ -1137,10 +1126,10 @@ def forward(self, if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( + return loss_fct( prediction_scores.view(-1, self.config.vocab_size), - masked_lm_labels.view(-1)) - return masked_lm_loss + masked_lm_labels.view(-1), + ) else: return prediction_scores @@ -1206,13 +1195,12 @@ def forward(self, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), - next_sentence_label.view(-1)) - return next_sentence_loss - else: + if next_sentence_label is None: return seq_relationship_score + loss_fct = CrossEntropyLoss(ignore_index=-1) + return loss_fct( + seq_relationship_score.view(-1, 2), next_sentence_label.view(-1) + ) class BertForSequenceClassification(BertPreTrainedModel): @@ -1283,8 +1271,7 @@ def forward(self, if labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) else: return logits @@ -1358,12 +1345,10 @@ def forward(self, logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return loss - else: + if labels is None: return reshaped_logits + loss_fct = CrossEntropyLoss() + return loss_fct(reshaped_logits, labels) class BertForTokenClassification(BertPreTrainedModel): @@ -1432,20 +1417,15 @@ def forward(self, sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, 
self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), - labels.view(-1)) - return loss - else: + if labels is None: return logits + loss_fct = CrossEntropyLoss() + if attention_mask is None: + return loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + return loss_fct(active_logits, active_labels) class BertForQuestionAnswering(BertPreTrainedModel): @@ -1519,21 +1499,19 @@ def forward(self, start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss - else: + if start_positions is None or end_positions is None: return start_logits, end_logits + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + return (start_loss + end_loss) / 2 diff --git a/BingBertSquad/utils.py b/BingBertSquad/utils.py index ce9ee235c..9ab814f76 100755 --- a/BingBertSquad/utils.py +++ b/BingBertSquad/utils.py @@ -246,9 +246,10 @@ def check_early_exit_warning(args): # Issue warning if early exit from epoch is configured if args.max_steps < sys.maxsize: logging.warning( - 'Early training exit is set after {} global steps'.format( - args.max_steps)) + f'Early training exit is set after {args.max_steps} global steps' + ) if args.max_steps_per_epoch < sys.maxsize: - logging.warning('Early epoch exit is set after {} global steps'.format( - args.max_steps_per_epoch)) + logging.warning( + f'Early epoch exit is set after {args.max_steps_per_epoch} global steps' + ) diff --git a/HelloDeepSpeed/train_bert.py b/HelloDeepSpeed/train_bert.py index 88417623f..80d7ef4ef 100644 --- a/HelloDeepSpeed/train_bert.py +++ b/HelloDeepSpeed/train_bert.py @@ -348,9 +348,7 @@ def forward( loss_fct = nn.CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), target) - return masked_lm_loss + return loss_fct(prediction_scores.view(-1, self.config.vocab_size), target) def create_model(num_layers: int, num_heads: int, ff_dim: int, h_dim: int, @@ -401,8 +399,7 @@ def create_model(num_layers: int, num_heads: int, ff_dim: int, h_dim: int, } 
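
# [Editor's note] The BertForQuestionAnswering rewrite (repeated in all three
# modeling variants above) flattens the loss branch but keeps the same span
# recipe: squeeze any extra multi-GPU dimension, clamp start/end positions to
# [0, seq_len] so out-of-range answers land on the ignore_index, then average
# the two cross-entropy terms. A minimal sketch with random stand-in tensors:

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len = 4, 16
start_logits = torch.randn(batch, seq_len)
end_logits = torch.randn(batch, seq_len)
start_positions = torch.tensor([0, 5, 99, 3])    # 99 lies outside the inputs
end_positions = torch.tensor([2, 7, 99, 3])

ignored_index = start_logits.size(1)             # one past the last position
start_positions.clamp_(0, ignored_index)         # 99 -> 16, which is ignored
end_positions.clamp_(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions)
              + loss_fct(end_logits, end_positions)) / 2
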
roberta_config = RobertaConfig.from_dict(roberta_config_dict) roberta_encoder = RobertaModel(roberta_config) - roberta_model = RobertaMLMModel(roberta_config, roberta_encoder) - return roberta_model + return RobertaMLMModel(roberta_config, roberta_encoder) ###################################################################### @@ -415,9 +412,9 @@ def get_unique_identifier(length: int = 8) -> str: random characters from list of ascii characters and numbers """ alphabet = string.ascii_lowercase + string.digits - uuid = "".join(alphabet[ix] - for ix in np.random.choice(len(alphabet), length)) - return uuid + return "".join( + alphabet[ix] for ix in np.random.choice(len(alphabet), length) + ) def create_experiment_dir(checkpoint_dir: pathlib.Path, @@ -523,7 +520,7 @@ def load_model_checkpoint( not None, load_checkpoint_dir.glob("*.pt"), )) - assert len(checkpoint_files) > 0, "No checkpoints found in directory" + assert checkpoint_files, "No checkpoints found in directory" checkpoint_files = sorted( checkpoint_files, key=lambda path: int( @@ -738,7 +735,7 @@ def train( ####### The Training Loop ###### ################################ logger.info( - f"Total number of model parameters: {sum([p.numel() for p in model.parameters()]):,d}" + f"Total number of model parameters: {sum(p.numel() for p in model.parameters()):,d}" ) model.train() losses = [] @@ -758,7 +755,7 @@ def train( losses.append(loss.item()) if step % log_every == 0: logger.info("Loss: {0:.4f}".format(np.mean(losses))) - summary_writer.add_scalar(f"Train/loss", np.mean(losses), step) + summary_writer.add_scalar("Train/loss", np.mean(losses), step) if step % checkpoint_every == 0: state_dict = { "model": model.state_dict(), diff --git a/HelloDeepSpeed/train_bert_ds.py b/HelloDeepSpeed/train_bert_ds.py index 98f43fcd4..f7b331de9 100644 --- a/HelloDeepSpeed/train_bert_ds.py +++ b/HelloDeepSpeed/train_bert_ds.py @@ -379,9 +379,7 @@ def forward( loss_fct = nn.CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct( - prediction_scores.view(-1, self.config.vocab_size), target) - return masked_lm_loss + return loss_fct(prediction_scores.view(-1, self.config.vocab_size), target) def create_model(num_layers: int, num_heads: int, ff_dim: int, h_dim: int, @@ -432,8 +430,7 @@ def create_model(num_layers: int, num_heads: int, ff_dim: int, h_dim: int, } roberta_config = RobertaConfig.from_dict(roberta_config_dict) roberta_encoder = RobertaModel(roberta_config) - roberta_model = RobertaMLMModel(roberta_config, roberta_encoder) - return roberta_model + return RobertaMLMModel(roberta_config, roberta_encoder) ###################################################################### @@ -446,9 +443,9 @@ def get_unique_identifier(length: int = 8) -> str: random characters from list of ascii characters and numbers """ alphabet = string.ascii_lowercase + string.digits - uuid = "".join(alphabet[ix] - for ix in np.random.choice(len(alphabet), length)) - return uuid + return "".join( + alphabet[ix] for ix in np.random.choice(len(alphabet), length) + ) def create_experiment_dir(checkpoint_dir: pathlib.Path, @@ -564,7 +561,7 @@ def load_model_checkpoint( not None, load_checkpoint_dir.glob("*.pt"), )) - assert len(checkpoint_files) > 0, "No checkpoints found in directory" + assert checkpoint_files, "No checkpoints found in directory" checkpoint_files = sorted( checkpoint_files, key=lambda path: int( @@ -812,9 +809,10 @@ def train( ####### The Training Loop ###### ################################ log_dist( - f"Total number of model parameters: {sum([p.numel() 
for p in model.parameters()]):,d}", + f"Total number of model parameters: {sum(p.numel() for p in model.parameters()):,d}", ranks=[0], - level=logging.INFO) + level=logging.INFO, + ) model.train() losses = [] for step, batch in enumerate(data_iterator, start=start_step): @@ -835,7 +833,7 @@ def train( ranks=[0], level=logging.INFO) if is_rank_0(): - summary_writer.add_scalar(f"Train/loss", np.mean(losses), step) + summary_writer.add_scalar("Train/loss", np.mean(losses), step) if step % checkpoint_every == 0: model.save_checkpoint(save_dir=exp_dir, client_state={'checkpoint_step': step}) diff --git a/MoQ/huggingface-transformers/examples/benchmarking/plot_csv_file.py b/MoQ/huggingface-transformers/examples/benchmarking/plot_csv_file.py index 58dc50bb8..87c68d6a0 100644 --- a/MoQ/huggingface-transformers/examples/benchmarking/plot_csv_file.py +++ b/MoQ/huggingface-transformers/examples/benchmarking/plot_csv_file.py @@ -105,7 +105,11 @@ def __init__(self, args): def plot(self): fig, ax = plt.subplots() title_str = "Time usage" if self.args.is_time else "Memory usage" - title_str = title_str + " for training" if self.args.is_train else title_str + " for inference" + title_str = ( + f"{title_str} for training" + if self.args.is_train + else f"{title_str} for inference" + ) if not self.args.no_log_scale: # set logarithm scales diff --git a/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark.py b/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark.py index e2e7d4c5e..61abe482c 100755 --- a/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark.py +++ b/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark.py @@ -36,7 +36,7 @@ def main(): full_error_msg += arg_error_msg.format(arg[5:]) else: wrong_args.append(arg) - if len(wrong_args) > 0: + if wrong_args: full_error_msg = full_error_msg + begin_error_msg + str(wrong_args) raise ValueError(full_error_msg) diff --git a/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark_tf.py b/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark_tf.py index 25aabc5f5..c4c8e78c5 100755 --- a/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark_tf.py +++ b/MoQ/huggingface-transformers/examples/benchmarking/run_benchmark_tf.py @@ -38,7 +38,7 @@ def main(): full_error_msg += arg_error_msg.format(arg[5:]) else: wrong_args.append(arg) - if len(wrong_args) > 0: + if wrong_args: full_error_msg = full_error_msg + begin_error_msg + str(wrong_args) raise ValueError(full_error_msg) benchmark.run() diff --git a/MoQ/huggingface-transformers/examples/conftest.py b/MoQ/huggingface-transformers/examples/conftest.py index 2415ae8db..921319ecf 100644 --- a/MoQ/huggingface-transformers/examples/conftest.py +++ b/MoQ/huggingface-transformers/examples/conftest.py @@ -39,6 +39,5 @@ def pytest_addoption(parser): def pytest_terminal_summary(terminalreporter): from transformers.testing_utils import pytest_terminal_summary_main - make_reports = terminalreporter.config.getoption("--make-reports") - if make_reports: + if make_reports := terminalreporter.config.getoption("--make-reports"): pytest_terminal_summary_main(terminalreporter, id=make_reports) diff --git a/MoQ/huggingface-transformers/examples/language-modeling/run_clm.py b/MoQ/huggingface-transformers/examples/language-modeling/run_clm.py index c5a5bdd0c..7802cb2b1 100755 --- a/MoQ/huggingface-transformers/examples/language-modeling/run_clm.py +++ b/MoQ/huggingface-transformers/examples/language-modeling/run_clm.py @@ -139,13 +139,12 @@ class 
DataTrainingArguments: def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." def main(): @@ -186,8 +185,10 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}" + ) ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): @@ -277,7 +278,7 @@ def main(): if model_args.model_name_or_path: model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), + from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, diff --git a/MoQ/huggingface-transformers/examples/language-modeling/run_mlm.py b/MoQ/huggingface-transformers/examples/language-modeling/run_mlm.py index 437fb356b..2fcf378e5 100755 --- a/MoQ/huggingface-transformers/examples/language-modeling/run_mlm.py +++ b/MoQ/huggingface-transformers/examples/language-modeling/run_mlm.py @@ -150,13 +150,12 @@ class DataTrainingArguments: def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 
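
# [Editor's note] One thing the run_clm.py rewrite above preserves rather than
# fixes: the warning message is two concatenated f-strings with no separator,
# so it renders as "... n_gpu: 1distributed training: ...". The same pattern
# recurs in run_mlm.py and run_plm.py below. Dropping bool() around
# "local_rank != -1" and around '".ckpt" in model_name_or_path' is safe, since
# comparisons and "in" already return bool. If a readable one-liner is the
# intent, a single f-string with explicit separators would do it (stand-in
# values below):

local_rank, device, n_gpu, fp16 = -1, "cpu", 0, False
msg = (
    f"Process rank: {local_rank}, device: {device}, n_gpu: {n_gpu}, "
    f"distributed training: {local_rank != -1}, 16-bits training: {fp16}"
)
assert "0distributed" not in msg    # the separator is back
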
def main(): @@ -197,8 +196,10 @@ def main(): # Log on each process the small summary: logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}" + ) ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): @@ -283,7 +284,7 @@ def main(): if model_args.model_name_or_path: model = AutoModelForMaskedLM.from_pretrained( model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), + from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, diff --git a/MoQ/huggingface-transformers/examples/language-modeling/run_mlm_flax.py b/MoQ/huggingface-transformers/examples/language-modeling/run_mlm_flax.py index c2883118f..4e7dd76b0 100755 --- a/MoQ/huggingface-transformers/examples/language-modeling/run_mlm_flax.py +++ b/MoQ/huggingface-transformers/examples/language-modeling/run_mlm_flax.py @@ -20,6 +20,7 @@ Here is the full list of checkpoints on the hub that can be fine-tuned by this script: https://huggingface.co/models?filter=masked-lm """ + import logging import os import sys @@ -55,9 +56,7 @@ ) -# Cache the result -has_tensorboard = is_tensorboard_available() -if has_tensorboard: +if has_tensorboard := is_tensorboard_available(): try: from flax.metrics.tensorboard import SummaryWriter except ImportError as ie: @@ -166,13 +165,12 @@ class DataTrainingArguments: def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." # Adapted from transformers/data/data_collator.py @@ -307,7 +305,7 @@ def step_fn(step): progress = jnp.maximum(0.0, (step - warmup_steps) / float(steps_per_cycle)) ret *= jnp.maximum(0.0, 0.5 * (1.0 + jnp.cos(jnp.pi * (progress % 1.0)))) else: - raise ValueError("Unknown factor %s." % name) + raise ValueError(f"Unknown factor {name}.") return jnp.asarray(ret, dtype=jnp.float32) return step_fn @@ -333,7 +331,7 @@ def accuracy(logits, targets, weights=None): """ if logits.ndim != targets.ndim + 1: raise ValueError( - "Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape)) + f"Incorrect shapes. 
Got shape {str(logits.shape)} logits and {str(targets.shape)} targets" ) loss = jnp.equal(jnp.argmax(logits, axis=-1), targets) @@ -354,7 +352,7 @@ def cross_entropy(logits, targets, weights=None, label_smoothing=0.0): """ if logits.ndim != targets.ndim + 1: raise ValueError( - "Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape)) + f"Incorrect shapes. Got shape {str(logits.shape)} logits and {str(targets.shape)} targets" ) vocab_size = logits.shape[-1] @@ -420,8 +418,7 @@ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndar if samples_to_remove != 0: samples_idx = samples_idx[:-samples_to_remove] sections_split = nb_samples // batch_size - batch_idx = np.split(samples_idx, sections_split) - return batch_idx + return np.split(samples_idx, sections_split) if __name__ == "__main__": diff --git a/MoQ/huggingface-transformers/examples/language-modeling/run_plm.py b/MoQ/huggingface-transformers/examples/language-modeling/run_plm.py index b44748c1d..5337710fe 100755 --- a/MoQ/huggingface-transformers/examples/language-modeling/run_plm.py +++ b/MoQ/huggingface-transformers/examples/language-modeling/run_plm.py @@ -147,13 +147,12 @@ class DataTrainingArguments: def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_file is not None: - extension = self.train_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_file is not None: - extension = self.validation_file.split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 
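
# [Editor's note] Two more patterns from the example scripts above: conftest.py
# and run_mlm_flax.py fold an assignment plus truthiness test into one
# statement with the walrus operator (Python 3.8+), and the
# DataTrainingArguments validators in run_clm/run_mlm/run_plm drop the "else"
# after an unconditional raise, since control only reaches the next statement
# when the exception did not fire. A minimal sketch with hypothetical names:

def get_report_flag():
    return "report-id"                  # stand-in for a config lookup

if flag := get_report_flag():           # assign and test in one expression
    print(f"reports enabled: {flag}")

def validate(dataset_name, train_file):
    if dataset_name is None and train_file is None:
        raise ValueError("Need either a dataset name or a training file.")
    # no "else" needed: the raise above already guards this path
    if train_file is not None:
        assert train_file.split(".")[-1] in ["csv", "json", "txt"]

validate("wikitext", None)
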
def main(): @@ -194,8 +193,10 @@
     # Log on each process the small summary:
     logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+        (
+            f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
+            + f"distributed training: {training_args.local_rank != -1}, 16-bits training: {training_args.fp16}"
+        )
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
     if is_main_process(training_args.local_rank):
@@ -280,7 +281,7 @@ def main():
     if model_args.model_name_or_path:
         model = XLNetLMHeadModel.from_pretrained(
             model_args.model_name_or_path,
-            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            from_tf=".ckpt" in model_args.model_name_or_path,
             config=config,
             cache_dir=model_args.cache_dir,
             revision=model_args.model_revision,
diff --git a/MoQ/huggingface-transformers/examples/legacy/multiple_choice/run_multiple_choice.py b/MoQ/huggingface-transformers/examples/legacy/multiple_choice/run_multiple_choice.py
index bf79f2ac7..fd8d0ffb5 100644
--- a/MoQ/huggingface-transformers/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/MoQ/huggingface-transformers/examples/legacy/multiple_choice/run_multiple_choice.py
@@ -116,7 +116,7 @@ def main():
         training_args.local_rank,
         training_args.device,
         training_args.n_gpu,
-        bool(training_args.local_rank != -1),
+        training_args.local_rank != -1,
         training_args.fp16,
     )
     # Set the verbosity to info of the Transformers logger (on main process only):
@@ -134,7 +134,7 @@ def main():
         label_list = processor.get_labels()
         num_labels = len(label_list)
     except KeyError:
-        raise ValueError("Task not found: %s" % (data_args.task_name))
+        raise ValueError(f"Task not found: {data_args.task_name}")

     # Load pretrained model and tokenizer
     #
@@ -154,7 +154,7 @@ def main():
     )
     model = AutoModelForMultipleChoice.from_pretrained(
         model_args.model_name_or_path,
-        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        from_tf=".ckpt" in model_args.model_name_or_path,
         config=config,
         cache_dir=model_args.cache_dir,
     )
diff --git a/MoQ/huggingface-transformers/examples/legacy/multiple_choice/utils_multiple_choice.py b/MoQ/huggingface-transformers/examples/legacy/multiple_choice/utils_multiple_choice.py
index 784a7578d..f68094a94 100644
--- a/MoQ/huggingface-transformers/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/MoQ/huggingface-transformers/examples/legacy/multiple_choice/utils_multiple_choice.py
@@ -100,17 +100,12 @@ def __init__(
         cached_features_file = os.path.join(
             data_dir,
-            "cached_{}_{}_{}_{}".format(
-                mode.value,
-                tokenizer.__class__.__name__,
-                str(max_seq_length),
-                task,
-            ),
+            f"cached_{mode.value}_{tokenizer.__class__.__name__}_{str(max_seq_length)}_{task}",
         )

         # Make sure only the first process in distributed training processes the dataset,
         # and the others will use the cache.
-        lock_path = cached_features_file + ".lock"
+        lock_path = f"{cached_features_file}.lock"
         with FileLock(lock_path):
             if os.path.exists(cached_features_file) and not overwrite_cache:
@@ -255,7 +250,7 @@ class RaceProcessor(DataProcessor):
     def get_train_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
         high = os.path.join(data_dir, "train/high")
         middle = os.path.join(data_dir, "train/middle")
         high = self._read_txt(high)
@@ -264,7 +259,7 @@ def get_dev_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
         high = os.path.join(data_dir, "dev/high")
         middle = os.path.join(data_dir, "dev/middle")
         high = self._read_txt(high)
@@ -273,7 +268,7 @@ def get_test_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} test".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
         high = os.path.join(data_dir, "test/high")
         middle = os.path.join(data_dir, "test/middle")
         high = self._read_txt(high)
@@ -286,7 +281,7 @@ def get_labels(self):
     def _read_txt(self, input_dir):
         lines = []
-        files = glob.glob(input_dir + "/*txt")
+        files = glob.glob(f"{input_dir}/*txt")
         for file in tqdm.tqdm(files, desc="read files"):
             with open(file, "r", encoding="utf-8") as fin:
                 data_raw = json.load(fin)
@@ -297,8 +292,8 @@ def _create_examples(self, lines, set_type):
         """Creates examples for the training and dev sets."""
         examples = []
-        for (_, data_raw) in enumerate(lines):
-            race_id = "%s-%s" % (set_type, data_raw["race_id"])
+        for data_raw in lines:
+            race_id = f'{set_type}-{data_raw["race_id"]}'
             article = data_raw["article"]
             for i in range(len(data_raw["answers"])):
                 truth = str(ord(data_raw["answers"][i]) - ord("A"))
@@ -322,17 +317,17 @@ class SynonymProcessor(DataProcessor):
     def get_train_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
         return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")

     def get_dev_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
         return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")

     def get_test_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
         return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
@@ -347,7 +342,7 @@ def _read_csv(self, input_file):
     def _create_examples(self, lines: List[List[str]], type: str):
         """Creates examples for the training and dev sets."""
-        examples = [
+        return [
             InputExample(
                 example_id=line[0],
                 question="",  # in the swag dataset, the
@@ -360,30 +355,27 @@
             for line in lines  # we skip the line with the column names
         ]

-        return examples
-
 class SwagProcessor(DataProcessor):
     """Processor for the SWAG data set."""

     def get_train_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
         return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

     def get_dev_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
         return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

     def get_test_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
         raise ValueError(
             "For swag testing, the input file does not contain a label column. It can not be tested in current code"
             "setting!"
         )
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")

     def get_labels(self):
         """See base class."""
@@ -398,7 +390,7 @@ def _create_examples(self, lines: List[List[str]], type: str):
         if type == "train" and lines[0][-1] != "label":
             raise ValueError("For training, the input file must contain a label column.")

-        examples = [
+        return [
             InputExample(
                 example_id=line[2],
                 question=line[5],  # in the swag dataset, the
@@ -411,24 +403,22 @@
             for line in lines[1:]  # we skip the line with the column names
         ]

-        return examples
-
 class ArcProcessor(DataProcessor):
     """Processor for the ARC data set (request from allennlp)."""

     def get_train_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
         return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

     def get_dev_examples(self, data_dir):
         """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
         return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

     def get_test_examples(self, data_dir):
-        logger.info("LOOKING AT {} test".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
         return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

     def get_labels(self):
@@ -437,8 +427,7 @@ def get_labels(self):
     def _read_json(self, input_file):
         with open(input_file, "r", encoding="utf-8") as fin:
-            lines = fin.readlines()
-            return lines
+            return fin.readlines()

     def _create_examples(self, lines, type):
         """Creates examples for the training and dev sets."""
@@ -528,7 +517,7 @@ def convert_examples_to_features(
                 # this is for cloze question
                 text_b = example.question.replace("_", ending)
             else:
-                text_b = example.question + " " + ending
+                text_b = f"{example.question} {ending}"

             inputs = tokenizer(
                 text_a,
@@ -570,10 +559,10 @@ def convert_examples_to_features(
     for f in features[:2]:
         logger.info("*** Example ***")
-        logger.info("feature: %s" % f)
+        logger.info(f"feature: {f}")

     return features

processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5}
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4, "syn": 5}
diff --git a/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/lightning_base.py b/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/lightning_base.py
index a9a05fbf9..461a5e469 100644
--- a/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/lightning_base.py
+++ b/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/lightning_base.py
@@ -108,7 +108,7 @@ def __init__(
         if model is None:
             self.model = self.model_type.from_pretrained(
                 self.hparams.model_name_or_path,
-                from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
+                from_tf=".ckpt" in
self.hparams.model_name_or_path, config=self.config, cache_dir=cache_dir, ) @@ -132,11 +132,19 @@ def configure_optimizers(self): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in model.named_parameters() + if all(nd not in n for nd in no_decay) + ], "weight_decay": self.hparams.weight_decay, }, { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], "weight_decay": 0.0, }, ] @@ -189,11 +197,7 @@ def test_dataloader(self): def _feature_file(self, mode): return os.path.join( self.hparams.data_dir, - "cached_{}_{}_{}".format( - mode, - list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(), - str(self.hparams.max_seq_length), - ), + f'cached_{mode}_{list(filter(None, self.hparams.model_name_or_path.split("/"))).pop()}_{str(self.hparams.max_seq_length)}', ) @pl.utilities.rank_zero_only @@ -278,7 +282,7 @@ def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): # Log results for key in sorted(metrics): if key not in ["log", "progress_bar"]: - rank_zero_info("{} = {}\n".format(key, str(metrics[key]))) + rank_zero_info(f"{key} = {str(metrics[key])}\n") def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): rank_zero_info("***** Test results *****") @@ -288,8 +292,8 @@ def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule): with open(output_test_results_file, "w") as writer: for key in sorted(metrics): if key not in ["log", "progress_bar"]: - rank_zero_info("{} = {}\n".format(key, str(metrics[key]))) - writer.write("{} = {}\n".format(key, str(metrics[key]))) + rank_zero_info(f"{key} = {str(metrics[key])}\n") + writer.write(f"{key} = {str(metrics[key])}\n") def add_generic_args(parser, root_dir) -> None: diff --git a/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_glue.py b/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_glue.py index abb06bf52..30b6afb79 100644 --- a/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_glue.py +++ b/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_glue.py @@ -126,7 +126,7 @@ def _eval_end(self, outputs) -> tuple: results = {**{"val_loss": val_loss_mean}, **compute_metrics(self.hparams.task, preds, out_label_ids)} - ret = {k: v for k, v in results.items()} + ret = dict(results) ret["log"] = results return ret, preds_list, out_label_list diff --git a/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_ner.py b/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_ner.py index 1066c6fed..7ebd93e9d 100644 --- a/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_ner.py +++ b/MoQ/huggingface-transformers/examples/legacy/pytorch-lightning/run_ner.py @@ -68,20 +68,24 @@ def prepare_data(self): else: logger.info("Creating features from dataset file at %s", args.data_dir) examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode) - features = self.token_classification_task.convert_examples_to_features( - examples, - self.labels, - args.max_seq_length, - self.tokenizer, - cls_token_at_end=bool(self.config.model_type in ["xlnet"]), - cls_token=self.tokenizer.cls_token, - cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0, - sep_token=self.tokenizer.sep_token, - 
sep_token_extra=False, - pad_on_left=bool(self.config.model_type in ["xlnet"]), - pad_token=self.tokenizer.pad_token_id, - pad_token_segment_id=self.tokenizer.pad_token_type_id, - pad_token_label_id=self.pad_token_label_id, + features = ( + self.token_classification_task.convert_examples_to_features( + examples, + self.labels, + args.max_seq_length, + self.tokenizer, + cls_token_at_end=self.config.model_type in ["xlnet"], + cls_token=self.tokenizer.cls_token, + cls_token_segment_id=2 + if self.config.model_type in ["xlnet"] + else 0, + sep_token=self.tokenizer.sep_token, + sep_token_extra=False, + pad_on_left=self.config.model_type in ["xlnet"], + pad_token=self.tokenizer.pad_token_id, + pad_token_segment_id=self.tokenizer.pad_token_type_id, + pad_token_label_id=self.pad_token_label_id, + ) ) logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -123,7 +127,7 @@ def _eval_end(self, outputs): preds = np.argmax(preds, axis=2) out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0) - label_map = {i: label for i, label in enumerate(self.labels)} + label_map = dict(enumerate(self.labels)) out_label_list = [[] for _ in range(out_label_ids.shape[0])] preds_list = [[] for _ in range(out_label_ids.shape[0])] @@ -141,7 +145,7 @@ def _eval_end(self, outputs): "f1": f1_score(out_label_list, preds_list), } - ret = {k: v for k, v in results.items()} + ret = dict(results) ret["log"] = results return ret, preds_list, out_label_list diff --git a/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad.py b/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad.py index ff693ad24..de9c73623 100644 --- a/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad.py +++ b/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad.py @@ -92,10 +92,21 @@ def train(args, train_dataset, model, tokenizer): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in model.named_parameters() + if all(nd not in n for nd in no_decay) + ], "weight_decay": args.weight_decay, }, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( @@ -193,13 +204,14 @@ def train(args, train_dataset, model, tokenizer): del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: - inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) + inputs |= {"cls_index": batch[5], "p_mask": batch[6]} if args.version_2_with_negative: - inputs.update({"is_impossible": batch[7]}) + inputs["is_impossible"] = batch[7] if hasattr(model, "config") and hasattr(model.config, "lang2id"): - inputs.update( - {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} - ) + inputs["langs"] = ( + torch.ones(batch[0].shape, dtype=torch.int64) + * args.lang_id + ).to(args.device) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) @@ -234,14 +246,14 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, 
model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar(f"eval_{key}", value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) + output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}") # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) @@ -284,7 +296,7 @@ def evaluate(args, model, tokenizer, prefix=""): model = torch.nn.DataParallel(model) # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(f"***** Running evaluation {prefix} *****") logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) @@ -309,12 +321,13 @@ def evaluate(args, model, tokenizer, prefix=""): # XLNet and XLM use more arguments for their predictions if args.model_type in ["xlnet", "xlm"]: - inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) + inputs |= {"cls_index": batch[4], "p_mask": batch[5]} # for lang_id-sensitive xlm models if hasattr(model, "config") and hasattr(model.config, "lang2id"): - inputs.update( - {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} - ) + inputs["langs"] = ( + torch.ones(batch[0].shape, dtype=torch.int64) + * args.lang_id + ).to(args.device) outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): @@ -351,11 +364,17 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions - output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) - output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) + output_prediction_file = os.path.join( + args.output_dir, f"predictions_{prefix}.json" + ) + output_nbest_file = os.path.join( + args.output_dir, f"nbest_predictions_{prefix}.json" + ) if args.version_2_with_negative: - output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) + output_null_log_odds_file = os.path.join( + args.output_dir, f"null_odds_{prefix}.json" + ) else: output_null_log_odds_file = None @@ -396,9 +415,7 @@ def evaluate(args, model, tokenizer, prefix=""): tokenizer, ) - # Compute the F1 and exact scores. - results = squad_evaluate(examples, predictions) - return results + return squad_evaluate(examples, predictions) def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): @@ -410,11 +427,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal input_dir = args.data_dir if args.data_dir else "." 
cached_features_file = os.path.join( input_dir, - "cached_{}_{}_{}".format( - "dev" if evaluate else "train", - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length), - ), + f'cached_{"dev" if evaluate else "train"}_{list(filter(None, args.model_name_or_path.split("/"))).pop()}_{str(args.max_seq_length)}', ) # Init features and dataset from cache if it exists @@ -466,9 +479,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Make sure only the first process in distributed training process the dataset, and the others will use the cache torch.distributed.barrier() - if output_examples: - return dataset, examples, features - return dataset + return (dataset, examples, features) if output_examples else dataset def main(): diff --git a/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad_trainer.py b/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad_trainer.py index 1b1d6e6fe..0cbb09dc3 100644 --- a/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad_trainer.py +++ b/MoQ/huggingface-transformers/examples/legacy/question-answering/run_squad_trainer.py @@ -98,7 +98,7 @@ def main(): training_args.local_rank, training_args.device, training_args.n_gpu, - bool(training_args.local_rank != -1), + training_args.local_rank != -1, training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): @@ -126,7 +126,7 @@ def main(): ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), + from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) diff --git a/MoQ/huggingface-transformers/examples/legacy/run_chinese_ref.py b/MoQ/huggingface-transformers/examples/legacy/run_chinese_ref.py index f7c09e37f..df0386959 100755 --- a/MoQ/huggingface-transformers/examples/legacy/run_chinese_ref.py +++ b/MoQ/huggingface-transformers/examples/legacy/run_chinese_ref.py @@ -17,19 +17,16 @@ def _is_chinese_char(cp): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. 
- if ( + return ( (cp >= 0x4E00 and cp <= 0x9FFF) - or (cp >= 0x3400 and cp <= 0x4DBF) # - or (cp >= 0x20000 and cp <= 0x2A6DF) # - or (cp >= 0x2A700 and cp <= 0x2B73F) # - or (cp >= 0x2B740 and cp <= 0x2B81F) # - or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F) # - ): # - return True - - return False + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ) def is_chinese(word: str): @@ -45,17 +42,15 @@ def get_chinese_word(tokens: List[str]): word_set = set() for token in tokens: - chinese_word = len(token) > 1 and is_chinese(token) - if chinese_word: + if chinese_word := len(token) > 1 and is_chinese(token): word_set.add(token) - word_list = list(word_set) - return word_list + return list(word_set) def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()): if not chinese_word_set: return bert_tokens - max_word_len = max([len(w) for w in chinese_word_set]) + max_word_len = max(len(w) for w in chinese_word_set) bert_word = bert_tokens start, end = 0, len(bert_word) @@ -67,7 +62,7 @@ def add_sub_symbol(bert_tokens: List[str], chinese_word_set: set()): whole_word = "".join(bert_word[start : start + i]) if whole_word in chinese_word_set: for j in range(start + 1, start + i): - bert_word[j] = "##" + bert_word[j] + bert_word[j] = f"##{bert_word[j]}" start = start + i single_word = False break diff --git a/MoQ/huggingface-transformers/examples/legacy/run_language_modeling.py b/MoQ/huggingface-transformers/examples/legacy/run_language_modeling.py index 20995f1bf..be4ebe396 100755 --- a/MoQ/huggingface-transformers/examples/legacy/run_language_modeling.py +++ b/MoQ/huggingface-transformers/examples/legacy/run_language_modeling.py @@ -220,7 +220,7 @@ def main(): training_args.local_rank, training_args.device, training_args.n_gpu, - bool(training_args.local_rank != -1), + training_args.local_rank != -1, training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): @@ -260,7 +260,7 @@ def main(): if model_args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), + from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) @@ -298,15 +298,14 @@ def main(): plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) + elif data_args.mlm and data_args.whole_word_mask: + data_collator = DataCollatorForWholeWordMask( + tokenizer=tokenizer, mlm_probability=data_args.mlm_probability + ) else: - if data_args.mlm and data_args.whole_word_mask: - data_collator = DataCollatorForWholeWordMask( - tokenizer=tokenizer, mlm_probability=data_args.mlm_probability - ) - else: - data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability - ) + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability + ) # Initialize our Trainer trainer = Trainer( @@ -350,7 +349,7 @@ def main(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) - results.update(result) + results |= result return results diff --git a/MoQ/huggingface-transformers/examples/legacy/run_openai_gpt.py 
b/MoQ/huggingface-transformers/examples/legacy/run_openai_gpt.py index 72314b5ed..7a43b4192 100755 --- a/MoQ/huggingface-transformers/examples/legacy/run_openai_gpt.py +++ b/MoQ/huggingface-transformers/examples/legacy/run_openai_gpt.py @@ -64,10 +64,11 @@ def load_rocstories_dataset(dataset_path): """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """ with open(dataset_path, encoding="utf_8") as f: f = csv.reader(f) - output = [] next(f) # skip the first line - for line in tqdm(f): - output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1)) + output = [ + (" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1) + for line in tqdm(f) + ] return output @@ -189,7 +190,7 @@ def tokenize_and_encode(obj): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj - return list(tokenize_and_encode(o) for o in obj) + return [tokenize_and_encode(o) for o in obj] logger.info("Encoding dataset...") train_dataset = load_rocstories_dataset(args.train_dataset) @@ -230,10 +231,21 @@ def tokenize_and_encode(obj): no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in param_optimizer + if all(nd not in n for nd in no_decay) + ], "weight_decay": args.weight_decay, }, - {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + { + "params": [ + p + for n, p in param_optimizer + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( diff --git a/MoQ/huggingface-transformers/examples/legacy/run_swag.py b/MoQ/huggingface-transformers/examples/legacy/run_swag.py index ddce4d20e..31d70c4fe 100755 --- a/MoQ/huggingface-transformers/examples/legacy/run_swag.py +++ b/MoQ/huggingface-transformers/examples/legacy/run_swag.py @@ -73,17 +73,17 @@ def __str__(self): def __repr__(self): attributes = [ - "swag_id: {}".format(self.swag_id), - "context_sentence: {}".format(self.context_sentence), - "start_ending: {}".format(self.start_ending), - "ending_0: {}".format(self.endings[0]), - "ending_1: {}".format(self.endings[1]), - "ending_2: {}".format(self.endings[2]), - "ending_3: {}".format(self.endings[3]), + f"swag_id: {self.swag_id}", + f"context_sentence: {self.context_sentence}", + f"start_ending: {self.start_ending}", + f"ending_0: {self.endings[0]}", + f"ending_1: {self.endings[1]}", + f"ending_2: {self.endings[2]}", + f"ending_3: {self.endings[3]}", ] if self.label is not None: - attributes.append("label: {}".format(self.label)) + attributes.append(f"label: {self.label}") return ", ".join(attributes) @@ -105,7 +105,7 @@ def read_swag_examples(input_file, is_training=True): if is_training and lines[0][-1] != "label": raise ValueError("For training, the input file must contain a label column.") - examples = [ + return [ SwagExample( swag_id=line[2], context_sentence=line[4], @@ -121,8 +121,6 @@ def read_swag_examples(input_file, is_training=True): for line in lines[1:] # we skip the line with the column names ] - return examples - def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training): """Loads a data file into a list of `InputBatch`s.""" @@ -149,7 +147,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, is_trainin start_ending_tokens 
= tokenizer.tokenize(example.start_ending) choices_features = [] - for ending_index, ending in enumerate(example.endings): + for ending in example.endings: # We create a copy of the context tokens in order to be # able to shrink it according to ending_tokens context_tokens_choice = context_tokens[:] @@ -181,15 +179,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, is_trainin label = example.label if example_index < 5: logger.info("*** Example ***") - logger.info("swag_id: {}".format(example.swag_id)) + logger.info(f"swag_id: {example.swag_id}") for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): - logger.info("choice: {}".format(choice_idx)) - logger.info("tokens: {}".format(" ".join(tokens))) - logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) - logger.info("input_mask: {}".format(" ".join(map(str, input_mask)))) - logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids)))) + logger.info(f"choice: {choice_idx}") + logger.info(f'tokens: {" ".join(tokens)}') + logger.info(f'input_ids: {" ".join(map(str, input_ids))}') + logger.info(f'input_mask: {" ".join(map(str, input_mask))}') + logger.info(f'segment_ids: {" ".join(map(str, segment_ids))}') if is_training: - logger.info("label: {}".format(label)) + logger.info(f"label: {label}") features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label)) @@ -238,11 +236,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal input_file = args.predict_file if evaluate else args.train_file cached_features_file = os.path.join( os.path.dirname(input_file), - "cached_{}_{}_{}".format( - "dev" if evaluate else "train", - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length), - ), + f'cached_{"dev" if evaluate else "train"}_{list(filter(None, args.model_name_or_path.split("/"))).pop()}_{str(args.max_seq_length)}', ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) @@ -265,14 +259,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long) all_label = torch.tensor([f.label for f in features], dtype=torch.long) - if evaluate: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) - else: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) - - if output_examples: - return dataset, examples, features - return dataset + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) + return (dataset, examples, features) if output_examples else dataset def train(args, train_dataset, model, tokenizer): @@ -294,10 +282,21 @@ def train(args, train_dataset, model, tokenizer): no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "params": [ + p + for n, p in model.named_parameters() + if all(nd not in n for nd in no_decay) + ], "weight_decay": args.weight_decay, }, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, ] optimizer = 
AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( @@ -384,14 +383,14 @@ def train(args, train_dataset, model, tokenizer): ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar(f"eval_{key}", value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) + output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}") model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training @@ -425,7 +424,7 @@ def evaluate(args, model, tokenizer, prefix=""): eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # Eval! - logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(f"***** Running evaluation {prefix} *****") logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) diff --git a/MoQ/huggingface-transformers/examples/legacy/seq2seq/download_wmt.py b/MoQ/huggingface-transformers/examples/legacy/seq2seq/download_wmt.py index c52c0c7b4..ac9109010 100755 --- a/MoQ/huggingface-transformers/examples/legacy/seq2seq/download_wmt.py +++ b/MoQ/huggingface-transformers/examples/legacy/seq2seq/download_wmt.py @@ -34,7 +34,7 @@ def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir """ try: import datasets - except (ModuleNotFoundError, ImportError): + except ImportError: raise ImportError("run pip install datasets") pair = f"{src_lang}-{tgt_lang}" print(f"Converting {dataset}-{pair}") diff --git a/MoQ/huggingface-transformers/examples/legacy/seq2seq/finetune_trainer.py b/MoQ/huggingface-transformers/examples/legacy/seq2seq/finetune_trainer.py index 37573e50b..19e8896c4 100755 --- a/MoQ/huggingface-transformers/examples/legacy/seq2seq/finetune_trainer.py +++ b/MoQ/huggingface-transformers/examples/legacy/seq2seq/finetune_trainer.py @@ -172,7 +172,7 @@ def main(): training_args.local_rank, training_args.device, training_args.n_gpu, - bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), + training_args.parallel_mode == ParallelMode.DISTRIBUTED, training_args.fp16, ) transformers.utils.logging.enable_default_handler() @@ -311,7 +311,7 @@ def main(): if trainer.is_world_process_zero(): handle_metrics("train", metrics, training_args.output_dir) - all_metrics.update(metrics) + all_metrics |= metrics # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) diff --git a/MoQ/huggingface-transformers/setup.py b/MoQ/huggingface-transformers/setup.py index 87c18390f..cd0e97e21 100644 --- a/MoQ/huggingface-transformers/setup.py +++ b/MoQ/huggingface-transformers/setup.py @@ -60,6 +60,7 @@ 10. Update the version in __init__.py, setup.py to the new version "-dev" and push to master. 
""" + import os import re import shutil @@ -73,14 +74,7 @@ stale_egg_info = Path(__file__).parent / "transformers.egg-info" if stale_egg_info.exists(): print( - ( - "Warning: {} exists.\n\n" - "If you recently updated transformers to 3.0 or later, this is expected,\n" - "but it may prevent transformers from installing in editable mode.\n\n" - "This directory is automatically generated by Python's packaging tools.\n" - "I will remove it now.\n\n" - "See https://github.com/pypa/pip/issues/5466 for details.\n" - ).format(stale_egg_info) + f"Warning: {stale_egg_info} exists.\n\nIf you recently updated transformers to 3.0 or later, this is expected,\nbut it may prevent transformers from installing in editable mode.\n\nThis directory is automatically generated by Python's packaging tools.\nI will remove it now.\n\nSee https://github.com/pypa/pip/issues/5466 for details.\n" ) shutil.rmtree(stale_egg_info) @@ -205,9 +199,8 @@ def run(self): f.write("\n".join(content)) -extras = {} +extras = {"ja": deps_list("fugashi", "ipadic", "unidic_lite", "unidic")} -extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic") extras["sklearn"] = deps_list("scikit-learn") extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "keras2onnx")