In [None]:
# Show which GPU the runtime provides, plus driver/CUDA version and current memory usage.
!nvidia-smi

Wed Jul  6 02:00:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    44W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used later in the notebook live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers==3.5.1
!pip install torch==1.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizer
import torch
import torch.nn as nn
from transformers.data.metrics.squad_metrics import compute_predictions_log_probs, compute_predictions_logits, squad_evaluate
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor

In [None]:
from transformers import RobertaModel, XLMRobertaConfig
from torch.nn import CrossEntropyLoss
import math

In [None]:
from transformers.modeling_outputs import QuestionAnsweringModelOutput

# BLANC Model for XLM-R

In [None]:
class BLANC(XLMRobertaForQuestionAnswering):
  """XLM-RoBERTa extractive-QA model with an auxiliary per-token "block" head.

  Besides the usual start/end span classifier (``qa_outputs``), a second linear
  head (``block_outputs``) produces two logits per token. Their softmaxes are
  turned into a per-token weight in [0, 1] (``attention``) that

  * re-scales the encoder output before span classification, and
  * is trained against the soft label built by ``generate_soft_label``.
  """
  config_class = XLMRobertaConfig

  def __init__(self, config):
    """Build the encoder, the span head and the block head from ``config``."""
    super().__init__(config)
    self.num_labels = config.num_labels

    self.roberta = RobertaModel(config, add_pooling_layer=False)
    self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
    self.block_outputs = nn.Linear(config.hidden_size, 2)
    self.init_weights()

  def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, geometric_p=0.3, window_size=5, lmb=0.5):
    """Run the model; return losses when labels are given, logits otherwise.

    Args:
      input_ids, token_type_ids, attention_mask: forwarded to the encoder.
      start_positions, end_positions: gold answer token indices. Both must be
        given to obtain a loss; the tensors passed in are not modified.
      geometric_p: base of the geometric decay used for the soft label around
        the answer span (see ``generate_soft_label``).
      window_size: number of tokens on each side of the span that receive a
        decayed soft label.
      lmb: weight of the block loss; the span loss is weighted ``1 - lmb``.

    Returns:
      With both label tensors: the tuple ``(total_loss, dist_total_loss)``.
      Otherwise: a ``QuestionAnsweringModelOutput`` whose ``loss`` is None and
      whose ``attentions`` field carries the per-token block weight.
    """
    return_dict = self.config.use_return_dict

    outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=return_dict)
    sequence_output = outputs[0]
    batch_size, seq_len = sequence_output.size(0), sequence_output.size(1)

    # Two logits per token from the block head, each normalised over the sequence.
    sr_logits, er_logits = self.block_outputs(sequence_output).split(1, dim=-1)
    spred = torch.softmax(sr_logits.squeeze(-1), dim=-1)
    epred = torch.softmax(er_logits.squeeze(-1), dim=-1)

    # attention_s[t]: cumulative sum of spred over positions 1..t (position 0 keeps its own value).
    attention_s = torch.cumsum(spred[:, 1:], -1)
    attention_s = torch.cat((spred[:, 0:1], attention_s), dim=1)
    # attention_e[t]: cumulative sum of epred over positions t..end (position 0 keeps its own value).
    attention_e = torch.flip(torch.cumsum(torch.flip(epred[:, 1:], dims=[1]), -1), dims=[1])
    attention_e = torch.cat((epred[:, 0:1], attention_e), dim=1)

    attention = attention_s * attention_e

    # Scale every token representation by (1 + attention) so no token is zeroed out.
    sequence_output = sequence_output * (attention + 1.0).view(batch_size, seq_len, 1)

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    if start_positions is None or end_positions is None:
      return QuestionAnsweringModelOutput(
            loss=None,
            start_logits=start_logits,
            end_logits=end_logits,
            attentions=attention,
      )

    # If we are on multi-GPU, split adds a dimension.
    if len(start_positions.size()) > 1:
      start_positions = start_positions.squeeze(-1)
    if len(end_positions.size()) > 1:
      end_positions = end_positions.squeeze(-1)
    # Positions outside the model input are clamped to `ignored_index`, which the
    # cross-entropy loss skips. Non in-place so the caller's tensors stay intact.
    ignored_index = start_logits.size(1)
    start_positions = start_positions.clamp(0, ignored_index)
    end_positions = end_positions.clamp(0, ignored_index)

    loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
    dist = self.generate_soft_label(start_positions, end_positions, geometric_p, ignored_index, window_size)

    # Binary cross-entropy between the soft label and the block weight. The weight
    # is kept strictly inside (0, 1) for the logs only: an underflowed softmax or
    # cumsum round-off would otherwise yield log(0) / log(<0) and a NaN loss.
    eps = torch.finfo(attention.dtype).eps
    safe_attention = attention.clamp(min=eps, max=1.0 - eps)
    dist_total_loss = torch.mean(dist * torch.log(safe_attention) + (1.0 - dist) * torch.log(1.0 - safe_attention))
    dist_total_loss = - 2.0 * dist_total_loss

    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    f_loss = (start_loss + end_loss) / 2.0
    total_loss = (1.0 - lmb) * f_loss + lmb * dist_total_loss
    return (total_loss, dist_total_loss)

  def generate_soft_label(self, starts, ends, p, r, window_size):
    """Build the per-token soft label the block weight is trained against.

    Args:
      starts, ends: 1-D tensors of start/end token indices, one per example.
      p: base of the geometric decay outside the span.
      r: number of positions per row of the returned label.
      window_size: how many tokens on each side of the span get a decayed label.

    Returns:
      A ``(batch, r)`` float tensor on the device of ``starts``: 1.0 on
      ``[start, end]``, ``p ** k`` on the token ``k`` steps outside the span
      (up to ``window_size`` steps, never touching position 0 on the left),
      and 0 elsewhere. The left window is skipped when ``start == 0`` and the
      right window when ``end == 0``.
    """
    start_list = starts.detach().cpu().tolist()
    end_list = ends.detach().cpu().tolist()
    context_dist = torch.zeros(len(start_list), r, device=starts.device)

    for row, (start, end) in enumerate(zip(start_list, end_list)):
      context_dist[row][start:end + 1] = 1.0

      if start != 0:
        left_edge = max(1, start - window_size)
        for j in range(start - 1, left_edge - 1, -1):
          context_dist[row][j] = math.pow(p, start - j)

      if end != 0:
        right_edge = min(end + window_size, r - 1)
        for j in range(end + 1, right_edge + 1):
          context_dist[row][j] = math.pow(p, j - end)

    return context_dist

In [None]:
# Initialise BLANC from the xlm-roberta-large checkpoint. Per the warning printed below,
# qa_outputs and block_outputs are not in that checkpoint and start from fresh weights.
model = BLANC.from_pretrained('xlm-roberta-large')

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing BLANC: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing BLANC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BLANC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BLANC were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias', 'block_outputs.weight', 'block_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference

In [None]:
# Tokenizer loaded from the same "xlm-roberta-large" checkpoint name as the model.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")

# Reading SQuAD v1-format datasets and converting them to features

In [None]:
# Processor used below to read the train/dev JSON files into example objects.
processor = SquadV1Processor()

In [None]:
# Training examples are read from the XQuAD folder and dev examples from the ViQuADv1.1 folder,
# both on the mounted Google Drive (arguments are: directory, file name).
train_examples = processor.get_train_examples('/content/drive/MyDrive/Colab Notebooks/MRC - VLSP/Dataset/XQuAD', 'train_xquad.json')
dev_examples = processor.get_dev_examples('/content/drive/MyDrive/Colab Notebooks/MRC - VLSP/Dataset/ViQuADv1.1','dev_ViQuAD.json')

100%|██████████| 87187/87187 [00:41<00:00, 2088.88it/s]
100%|██████████| 18/18 [00:01<00:00, 17.74it/s]


In [None]:
from transformers.data.processors.squad import squad_convert_examples_to_features

## Reading train data

In [None]:
# Convert the training examples into model-ready features plus a PyTorch dataset.
# is_training=True so that answer start/end positions are included for the loss.
train_features, train_dataset = squad_convert_examples_to_features(
    train_examples,
    tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset='pt',
    threads=10,
)

convert squad examples to features:  11%|█▏        | 9825/87187 [00:38<04:49, 267.24it/s]Could not find answer: '750.000 và' vs. '£ 750.000'
convert squad examples to features:  24%|██▍       | 20801/87187 [01:20<04:15, 260.06it/s]Could not find answer: 'góp phần tái sinh cấu trúc xã hội Ottoman' vs. 'phần tái sinh cấu trúc xã hội Ottoman .'
convert squad examples to features:  30%|██▉       | 25953/87187 [01:42<03:54, 260.97it/s]Could not find answer: 'tranh Cách mạng. Là' vs. 'Cách mạng. Là N'
convert squad examples to features:  33%|███▎      | 28777/87187 [01:53<03:20, 291.28it/s]Could not find answer: 'tháng 4,' vs. '7 tháng 4'
convert squad examples to features:  34%|███▍      | 29569/87187 [01:56<03:41, 259.67it/s]Could not find answer: 'tháng 4,' vs. '7 tháng 4'
convert squad examples to features:  41%|████      | 35329/87187 [02:18<03:27, 249.45it/s]Could not find answer: 'tháng 4 ngọn' vs. '5 tháng 4'
convert squad examples to features:  49%|████▉     | 42723/87187 [02:45<02:

In [None]:
# Drop the raw training examples now that they have been converted to features
# (presumably to free memory — they are not referenced again in the visible cells).
del train_examples

## Reading dev data

In [None]:
# Convert the dev examples with the same windowing settings as the training set.
# is_training=False: no answer positions are attached, the features are for prediction only.
dev_features, dev_dataset = squad_convert_examples_to_features(
    dev_examples,
    tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset='pt',
    threads=10,
)

convert squad examples to features: 100%|██████████| 2285/2285 [00:09<00:00, 242.35it/s]
add example index and unique id: 100%|██████████| 2285/2285 [00:00<00:00, 534671.39it/s]


# Train

### Original Evaluate Function From https://github.com/yeonsw/BLANC

In [None]:
# from transformers import BasicTokenizer

In [None]:
# import time
# import re 
# import string
# import collections

In [None]:
# RawResult = collections.namedtuple("RawResult",
#                                    ["unique_id", "start_logits", "end_logits"])


# def normalize_answer(s):

#     def remove_articles(text):
#         regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
#         return re.sub(regex, ' ', text)

#     def white_space_fix(text):
#         return ' '.join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return ''.join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()
#     return white_space_fix(remove_articles(remove_punc(lower(s))))

# def get_tokens(s):
#     if not s:
#         return []
#     return normalize_answer(s).split()


# def compute_exact(a_gold, a_pred):
#     return int(normalize_answer(a_gold) == normalize_answer(a_pred))


# def compute_f1(a_gold, a_pred):
#     gold_toks = get_tokens(a_gold)
#     pred_toks = get_tokens(a_pred)
#     common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
#     num_same = sum(common.values())
#     if len(gold_toks) == 0 or len(pred_toks) == 0:
#         return [int(gold_toks == pred_toks)] * 3
#     if num_same == 0:
#         return [0, 0, 0]
#     precision = 1.0 * num_same / len(pred_toks)
#     recall = 1.0 * num_same / len(gold_toks)
#     f1 = (2 * precision * recall) / (precision + recall)
#     return [precision, recall, f1]


# def _compute_softmax(scores):
#     """Compute softmax probability over raw logits."""
#     if not scores:
#         return []

#     max_score = None
#     for score in scores:
#         if max_score is None or score > max_score:
#             max_score = score

#     exp_scores = []
#     total_sum = 0.0
#     for score in scores:
#         x = math.exp(score - max_score)
#         exp_scores.append(x)
#         total_sum += x

#     probs = []
#     for score in exp_scores:
#         probs.append(score / total_sum)
#     return probs

# def _get_best_indexes(logits, n_best_size):
#     """Get the n-best logits from a list."""
#     index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

#     best_indexes = []
#     for i in range(len(index_and_score)):
#         if i >= n_best_size:
#             break
#         best_indexes.append(index_and_score[i][0])
#     return best_indexes


# def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
#     """Project the tokenized prediction back to the original text."""

#     def _strip_spaces(text):
#         ns_chars = []
#         ns_to_s_map = collections.OrderedDict()
#         for (i, c) in enumerate(text):
#             if c == " ":
#                 continue
#             ns_to_s_map[len(ns_chars)] = i
#             ns_chars.append(c)
#         ns_text = "".join(ns_chars)
#         return (ns_text, ns_to_s_map)

#     tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
#     tok_text = " ".join(tokenizer.tokenize(orig_text))
#     start_position = tok_text.find(pred_text)
#     if start_position == -1:
#         if verbose_logging:
#             logger.info(
#                 "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
#         return orig_text
#     end_position = start_position + len(pred_text) - 1

#     (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
#     (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

#     if len(orig_ns_text) != len(tok_ns_text):
#         if verbose_logging:
#             logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
#                         orig_ns_text, tok_ns_text)
#         return orig_text

#     tok_s_to_ns_map = {}
#     for (i, tok_index) in tok_ns_to_s_map.items():
#         tok_s_to_ns_map[tok_index] = i

#     orig_start_position = None
#     if start_position in tok_s_to_ns_map:
#         ns_start_position = tok_s_to_ns_map[start_position]
#         if ns_start_position in orig_ns_to_s_map:
#             orig_start_position = orig_ns_to_s_map[ns_start_position]

#     if orig_start_position is None:
#         if verbose_logging:
#             logger.info("Couldn't map start position")
#         return orig_text

#     orig_end_position = None
#     if end_position in tok_s_to_ns_map:
#         ns_end_position = tok_s_to_ns_map[end_position]
#         if ns_end_position in orig_ns_to_s_map:
#             orig_end_position = orig_ns_to_s_map[ns_end_position]

#     if orig_end_position is None:
#         if verbose_logging:
#             logger.info("Couldn't map end position")
#         return orig_text

#     output_text = orig_text[orig_start_position:(orig_end_position + 1)]
#     return output_text


# def make_predictions(all_examples, all_features, all_results, n_best_size,
#                      max_answer_length, do_lower_case, verbose_logging):
#     example_index_to_features = collections.defaultdict(list)
#     for feature in all_features:
#         example_index_to_features[feature.example_index].append(feature)
#     unique_id_to_result = {}
#     for result in all_results:
#         unique_id_to_result[result.unique_id] = result
#     _PrelimPrediction = collections.namedtuple(
#         "PrelimPrediction",
#         ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

#     all_predictions = collections.OrderedDict()
#     all_nbest_json = collections.OrderedDict()
#     scores_diff_json = collections.OrderedDict()

#     for (example_index, example) in enumerate(all_examples):
#         features = example_index_to_features[example_index]
#         prelim_predictions = []
#         score_null = 1000000
#         min_null_feature_index = 0
#         null_start_logit = 0
#         null_end_logit = 0
#         for (feature_index, feature) in enumerate(features):
#             result = unique_id_to_result[feature.unique_id]
#             start_indexes = _get_best_indexes(result.start_logits, n_best_size)
#             end_indexes = _get_best_indexes(result.end_logits, n_best_size)
#             for start_index in start_indexes:
#                 for end_index in end_indexes:
#                     if start_index >= len(feature.tokens):
#                         continue
#                     if end_index >= len(feature.tokens):
#                         continue
#                     if start_index not in feature.token_to_orig_map:
#                         continue
#                     if end_index not in feature.token_to_orig_map:
#                         continue
#                     if not feature.token_is_max_context.get(start_index, False):
#                         continue
#                     if end_index < start_index:
#                         continue
#                     length = end_index - start_index + 1
#                     if length > max_answer_length:
#                         continue
#                     prelim_predictions.append(
#                         _PrelimPrediction(
#                             feature_index=feature_index,
#                             start_index=start_index,
#                             end_index=end_index,
#                             start_logit=result.start_logits[start_index],
#                             end_logit=result.end_logits[end_index]))
#         prelim_predictions = sorted(
#             prelim_predictions,
#             key=lambda x: (x.start_logit + x.end_logit),
#             reverse=True)

#         _NbestPrediction = collections.namedtuple(
#             "NbestPrediction", ["text", "start_logit", "end_logit", "start_index", "end_index"])
#         seen_predictions = {}
#         nbest = []
#         for pred in prelim_predictions:
#             if len(nbest) >= n_best_size:
#                 break
#             feature = features[pred.feature_index]
#             orig_doc_start = None
#             orig_doc_end = None
#             if pred.start_index > 0:
#                 tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
#                 orig_doc_start = feature.token_to_orig_map[pred.start_index]
#                 orig_doc_end = feature.token_to_orig_map[pred.end_index]
#                 orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
#                 tok_text = " ".join(tok_tokens)
#                 tok_text = tok_text.replace(" ##", "")
#                 tok_text = tok_text.replace("##", "")
#                 tok_text = tok_text.strip()
#                 tok_text = " ".join(tok_text.split())
#                 orig_text = " ".join(orig_tokens)
#                 final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
#                 if final_text in seen_predictions:
#                     continue
#                 seen_predictions[final_text] = True
#             else:
#                 final_text = ""
#                 seen_predictions[final_text] = True

#             nbest.append(
#                 _NbestPrediction(
#                     text=final_text,
#                     start_logit=pred.start_logit,
#                     end_logit=pred.end_logit,
#                     start_index=orig_doc_start,
#                     end_index=orig_doc_end))

#         if not nbest:
#             nbest.append(
#                 _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0, start_index=None, end_index=None))
#         assert len(nbest) >= 1

#         total_scores = []
#         best_non_null_entry = None
#         for entry in nbest:
#             total_scores.append(entry.start_logit + entry.end_logit)
#             if not best_non_null_entry:
#                 if entry.text:
#                     best_non_null_entry = entry
#         target_entry = {}
#         target_entry["text"] = best_non_null_entry.text
#         target_entry["start_logit"] = best_non_null_entry.start_logit
#         target_entry["end_logit"] = best_non_null_entry.end_logit
#         target_entry["start_index"] = best_non_null_entry.start_index
#         target_entry["end_index"] = best_non_null_entry.end_index
        
#         probs = _compute_softmax(total_scores)
#         nbest_json = []
#         for (i, entry) in enumerate(nbest):
#             output = collections.OrderedDict()
#             output["text"] = entry.text
#             output["probability"] = probs[i]
#             output["start_logit"] = entry.start_logit
#             output["end_logit"] = entry.end_logit
#             output["start_index"] = entry.start_index
#             output["end_index"] = entry.end_index
#             nbest_json.append(output)

#         assert len(nbest_json) >= 1
#         all_predictions[example.qas_id] = target_entry

#         all_nbest_json[example.qas_id] = nbest_json

#     return all_predictions, all_nbest_json, scores_diff_json

# def make_eval_dict(exact_scores, f1_scores, p_scores={}, r_scores={}, span_exact={}, span_f1={}, span_p={}, span_r={}, qid_list=None):
#     if not qid_list:
#         total = len(exact_scores)
#         return collections.OrderedDict([
#             ('exact', 100.0 * sum(exact_scores.values()) / total),
#             ('f1', 100.0 * sum(f1_scores.values()) / total),
#             ('precision', 100.0 * sum(p_scores.values()) / total),
#             ('recall', 100.0 * sum(r_scores.values()) / total),
#             ('span_exact', 100.0 * sum(span_exact.values()) / total),
#             ('span_f1', 100.0 * sum(span_f1.values()) / total),
#             ('span_precision', 100.0 * sum(span_p.values()) / total),
#             ('span_recall', 100.0 * sum(span_r.values()) / total),
#             ('total', total),
#         ])
#     else:
#         total = len(qid_list)
#         return collections.OrderedDict([
#             ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
#             ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
#             ('precision', 100.0 * sum(p_scores.values()) / total),
#             ('recall', 100.0 * sum(r_scores.values()) / total),
#             ('span_exact', 100.0 * sum(span_exact.values()) / total),
#             ('span_f1', 100.0 * sum(span_f1.values()) / total),
#             ('span_precision', 100.0 * sum(span_p.values()) / total),
#             ('span_recall', 100.0 * sum(span_r.values()) / total),
#             ('total', total),
#         ])

# def get_raw_scores(dataset, preds, examples):
#     exact_scores = {}
#     f1_scores = {}
#     scores = {}
#     precision_scores = {}
#     recall_scores = {}
#     for article in dataset:
#         for p in article['paragraphs']:
#             for qa in p['qas']:
#                 qid = qa['id']
#                 gold_answers = [a['text'] for a in qa['answers'] if normalize_answer(a['text'])]
#                 if not gold_answers:
#                     gold_answers = ['']
#                 if qid not in preds:
#                     print('Missing prediction for %s' % qid)
#                     continue
#                 a_pred = preds[qid]["text"]
#                 exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
#                 scores[qid] = [compute_f1(a, a_pred) for a in gold_answers]
#                 f1_scores[qid] = max([s[2] for s in scores[qid]])
#                 recall_scores[qid] = max([s[1] for s in scores[qid]])
#                 precision_scores[qid] = max([s[0] for s in scores[qid]])
    
#     def get_precision(sp, ep, sr, er):
#         p_span = set(list(range(sp, ep + 1))) # TP + FP
#         r_span = set(list(range(sr, er + 1))) # TP + FN
#         # TP_set = intersect p_span and r_span = p_span & r_span
#         # precision = TP/(TP+FP) = len(p_span & r_span) / len(p_span)

#         if len(p_span & r_span)==0 and len(p_span)==0 and len(r_span)==0:
#             return 1
#         elif len(p_span & r_span)==0 and (len(p_span)>0 or len(r_span)>0):
#             return 0
        
#         return 1.0 * len(p_span & r_span) / len(p_span)
    
#     def get_recall(sp, ep, sr, er):
#         p_span = set(list(range(sp, ep + 1))) # TP + FP
#         r_span = set(list(range(sr, er + 1))) # TP + FN
#         # TP_set = intersect p_span and r_span = p_span & r_span
#         # recall = TP/(TP+FN) = len(p_span & r_span) / len(r_span)
        
#         if len(p_span & r_span)==0 and len(p_span)==0 and len(r_span)==0:
#             return 1
#         elif len(p_span & r_span)==0 and (len(p_span)>0 or len(r_span)>0):
#             return 0

#         return 1.0 * len(p_span & r_span) / len(r_span)
        
#     def get_f1(sp, ep, sr, er):
#         p = get_precision(sp, ep, sr, er)
#         r = get_recall(sp, ep, sr, er)
#         if p < 1e-10 or r < 1e-10:
#             return 0.0
#         else:
#             return 2.0 * p * r / (p + r)
    
#     def select_g(sgs, egs):
#         n = len(sgs)
#         si = min([i for i in sgs])
#         ei = max([i for i in egs])
#         i2n = [0] * (ei + 1)
#         for i in range(si, ei + 1):
#             for j in range(n):
#                 i2n[i] += 1 if sgs[j] <= i and i <= egs[j] else 0
#         m = max(i2n)
#         st = 0; et = 0
#         for i in range(si, ei + 1):
#             if i2n[i] == m:
#                 st = i
#                 break
#         for i in range(ei, si - 1, -1):
#             if i2n[i] == m:
#                 et = i
#                 break
#         return st, et

#     span_f1 = {}
#     span_exact = {}
#     span_precision = {}
#     span_recall = {}
#     for example in examples:
#         qid = example.qas_id
#         sgs = example.start_positions
#         egs = example.end_positions
        
#         sf = preds[qid]["start_index"]
#         ef = preds[qid]["end_index"]
#         if sf == None:
#             sf = -1
#         if ef == None:
#             ef = -1
        
#         n_can = len(sgs)
#         span_exact[qid] = 0.0
#         for j in range(n_can):
#             if sf == sgs[j] and ef == egs[j]:
#                 span_exact[qid] = 1.0
#                 break
#         span_f1[qid] = max([get_f1(sf, ef, sgs[i], egs[i]) for i in range(n_can)])
#         span_precision[qid] = max([get_precision(sf, ef, sgs[i], egs[i]) for i in range(n_can)])
#         span_recall[qid] = max([get_recall(sf, ef, sgs[i], egs[i]) for i in range(n_can)])
            
#     return exact_scores, f1_scores, precision_scores, recall_scores, span_exact, span_f1, span_precision, span_recall


In [None]:
# def evaluate_v2(model, device, eval_dataset, eval_dataloader,
#              eval_examples, eval_features,geometric_p, window_size, lmb, na_prob_thresh=1.0, pred_only=False):
#     all_results = []
#     model.eval()
#     eval_time_s = time.time()
#     for idx, (input_ids, input_mask, segment_ids, example_indices) in enumerate(eval_dataloader):
#         if idx % 10 == 0:
#             logger.info("Running test: %d / %d" % (idx, len(eval_dataloader)))
#         input_ids = input_ids.to(device)
#         input_mask = input_mask.to(device)
#         segment_ids = segment_ids.to(device)
#         with torch.no_grad():
#             batch_start_logits, batch_end_logits, _ = model(input_ids, segment_ids, input_mask, geometric_p=geometric_p, window_size=window_size, lmb=lmb)
#         for i, example_index in enumerate(example_indices):
#             start_logits = batch_start_logits[i].detach().cpu().tolist()
#             end_logits = batch_end_logits[i].detach().cpu().tolist()
#             eval_feature = eval_features[example_index.item()]
#             unique_id = int(eval_feature.unique_id)
#             all_results.append(RawResult(unique_id=unique_id,
#                                          start_logits=start_logits,
#                                          end_logits=end_logits))
#     eval_time_e = time.time()

#     preds, nbest_preds, na_probs = \
#         make_predictions(eval_examples, eval_features, all_results,
#                          n_best_size=20, max_answer_length=500,
#                          do_lower_case=False, verbose_logging=False)
    
#     if pred_only:
#       return {}, preds, nbest_preds


#     # V1 squad like dataset
#     exact_raw, f1_raw, p_raw, r_raw, span_exact, span_f1, span_p, span_r = get_raw_scores(eval_dataset, preds, eval_examples)
#     result = make_eval_dict(exact_raw, f1_raw, p_scores=p_raw, r_scores=r_raw, span_exact=span_exact, span_f1=span_f1, span_p=span_p, span_r=span_r)
    
#     logger.info("***** Eval results *****")
#     for key in sorted(result.keys()):
#         logger.info("  %s = %s", key, str(result[key]))
#     logger.info("Eval time: {:.06f}".format(eval_time_e - eval_time_s))
#     return result, preds, nbest_preds

### Main Thread

In [None]:
def to_list(tensor):
    """Return the values of ``tensor`` as plain (possibly nested) Python lists.

    The tensor is detached from the autograd graph and moved to the CPU first.
    """
    cpu_tensor = tensor.detach().cpu()
    return cpu_tensor.tolist()

In [None]:
import os

def evaluate(model, tokenizer, dev_dataset, dev_examples, dev_features, geometric_p, window_size, lmb, batch_size=12):
    """Predict answers for the dev set and score them with the SQuAD metrics.

    Args:
        model: QA model called as ``model(input_ids, token_type_ids, attention_mask, ...)``.
        tokenizer: tokenizer handed to ``compute_predictions_logits``.
        dev_dataset: tensor dataset whose items are
            ``(input_ids, attention_mask, token_type_ids, feature_index, ...)``.
        dev_examples: examples the dataset was built from.
        dev_features: features the dataset was built from, indexed by ``feature_index``.
        geometric_p, window_size, lmb: forwarded unchanged to the model.
        batch_size: evaluation batch size (defaults to the previously hard-coded 12).

    Returns:
        Whatever ``squad_evaluate(dev_examples, predictions)`` returns.

    Side effects:
        ``./predictions_.json``, ``./nbest_predictions_.json`` and ``./null_odds_.json``
        are passed to ``compute_predictions_logits`` as its output paths.
    """
    eval_sampler = SequentialSampler(dev_dataset)
    eval_dataloader = DataLoader(dev_dataset, sampler=eval_sampler, batch_size=batch_size)

    # Take the device from the model itself rather than from a notebook-level global,
    # so the function works regardless of which cells have been run.
    device = next(model.parameters()).device

    # Evaluation mode only needs to be set once, not once per batch.
    model.eval()

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, example_indices = batch[0], batch[1], batch[2], batch[3]
        with torch.no_grad():
            outputs = model(input_ids, segment_ids, input_mask, geometric_p=geometric_p, window_size=window_size, lmb=lmb)

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            # Slice out this example's row from every tensor the model returned.
            output = [to_list(outputs[idx][i]) for idx in range(len(outputs))]

            if len(output) >= 5:
                # Five or more outputs: the model also returns top-k indices and a cls logit.
                result = SquadResult(
                    unique_id,
                    output[0],
                    output[2],
                    start_top_index=output[1],
                    end_top_index=output[3],
                    cls_logits=output[4],
                )
            else:
                # Only the first two outputs (start/end logits) are needed; any extra
                # output, such as BLANC's block attention, is ignored.
                result = SquadResult(unique_id, output[0], output[1])
            all_results.append(result)

    output_prediction_file = os.path.join("./", "predictions_.json")
    output_nbest_file = os.path.join("./", "nbest_predictions_.json")
    output_null_log_odds_file = os.path.join("./", "null_odds_.json")
    predictions = compute_predictions_logits(
            dev_examples,
            dev_features,
            all_results,
            20,     # n_best_size
            300,    # max_answer_length
            False,  # do_lower_case
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            True,   # verbose_logging
            False,  # version_2_with_negative
            0.0,    # null_score_diff_threshold
            tokenizer,
        )
    results = squad_evaluate(dev_examples, predictions)
    return results

In [None]:
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import trange, tqdm
# Device the training cell moves the model and every batch onto (CUDA only; there is no CPU fallback).
device = torch.device('cuda')

In [None]:
from functools import partial

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# These four values are passed through unchanged to `model(...)` during
# training and to `evaluate(...)` afterwards.
num_epochs = 1
geometric_p = 0.7
window_size = 2
lmb = 0.4

# Optimisation / logging knobs (previously inline magic numbers; values unchanged).
TRAIN_BATCH_SIZE = 4
LEARNING_RATE = 2e-5
ADAM_EPSILON = 1e-8
WEIGHT_DECAY = 0.0   # decay stays switched off for every parameter, as before
WARMUP_STEPS = 814
MAX_GRAD_NORM = 1
LOGGING_STEPS = 1000

# ---------------------------------------------------------------------------
# Data, optimizer and LR schedule
# ---------------------------------------------------------------------------
tb_writer = SummaryWriter()
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)
# One optimizer step per batch, so the schedule length is batches * epochs.
t_total = len(train_dataloader) * num_epochs

# Parameters whose name contains one of these substrings go into the second
# group, which never receives weight decay; raise WEIGHT_DECAY to enable decay
# for the first group only.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=ADAM_EPSILON)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total
)

model.to(device)

# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
# `global_step` counts *completed* optimizer steps. It starts at 0 so that
# `tr_loss / global_step` is a true mean (starting at 1 divided by steps + 1
# and fired the periodic log one step early).
global_step = 0
epochs_trained = 0
tr_loss = 0.0

model.zero_grad()
train_iterator = trange(epochs_trained, int(num_epochs), desc="Epoch", disable=False)

# Pin progress bars to a single row and keep them on screen when finished.
# The module-level name `tqdm` is rebound on purpose so that any other cell
# calling `tqdm(...)` after this one gets the same settings; re-running this
# cell simply re-applies the same keyword defaults.
tqdm = partial(tqdm, position=0, leave=True)

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
    # Nothing inside the step loop changes the train/eval mode, so set it once per epoch.
    model.train()
    for batch in epoch_iterator:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch[:5]

        # Positional order matters: segment_ids is passed before input_mask.
        outputs = model(input_ids, segment_ids, input_mask, start_positions, end_positions, geometric_p, window_size, lmb)

        # The first element of the model output is used as the training loss.
        loss = outputs[0]
        loss.backward()

        tr_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        global_step += 1

        if global_step % LOGGING_STEPS == 0:
            average_loss = tr_loss / global_step
            tb_writer.add_scalar("train/average_loss", average_loss, global_step)
            print(" global_step = %s, average loss = %s" % (global_step, average_loss))

# Flush pending TensorBoard events and release the writer's file handle.
tb_writer.close()

# ---------------------------------------------------------------------------
# Save the fine-tuned model and tokenizer
# ---------------------------------------------------------------------------
output_dir = os.path.join('/content/drive/MyDrive/Colab Notebooks/BLANC for Language Models/model', 'xlmr-blanc-xquad-pretrained_July06')
# Unwrap the model if it exposes a `.module` attribute before saving.
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# `max(..., 1)` avoids a ZeroDivisionError if the dataloader yielded no batches.
print(" global_step = %s, average loss = %s" % (global_step, tr_loss / max(global_step, 1)))

# ---------------------------------------------------------------------------
# Evaluate on the dev set
# ---------------------------------------------------------------------------
results = evaluate(model, tokenizer, dev_dataset, dev_examples, dev_features, geometric_p, window_size, lmb)
for key, value in results.items():
    print(key, value)

Iteration:   4%|▍         | 999/22953 [10:57<4:00:38,  1.52it/s]

 global_step = 1000, average loss = 1.9666236989274621


Iteration:   9%|▊         | 1999/22953 [21:56<3:50:10,  1.52it/s]

 global_step = 2000, average loss = 1.5280657810159028


Iteration:  13%|█▎        | 2999/22953 [32:54<3:39:33,  1.51it/s]

 global_step = 3000, average loss = 1.3402779516937833


Iteration:  17%|█▋        | 3999/22953 [43:53<3:28:36,  1.51it/s]

 global_step = 4000, average loss = 1.2308539595436305


Iteration:  22%|██▏       | 4999/22953 [54:51<3:17:14,  1.52it/s]

 global_step = 5000, average loss = 1.1567522909604013


Iteration:  26%|██▌       | 5999/22953 [1:05:50<3:05:56,  1.52it/s]

 global_step = 6000, average loss = 1.1009978011405717


Iteration:  30%|███       | 6999/22953 [1:16:52<2:56:06,  1.51it/s]

 global_step = 7000, average loss = 1.0569850729219616


Iteration:  35%|███▍      | 7999/22953 [1:27:52<2:45:23,  1.51it/s]

 global_step = 8000, average loss = 1.0207785929697566


Iteration:  39%|███▉      | 8999/22953 [1:38:55<2:34:02,  1.51it/s]

 global_step = 9000, average loss = 0.9891404113440464


Iteration:  44%|████▎     | 9999/22953 [1:49:56<2:22:51,  1.51it/s]

 global_step = 10000, average loss = 0.9642282651416026


Iteration:  48%|████▊     | 10999/22953 [2:00:58<2:11:47,  1.51it/s]

 global_step = 11000, average loss = 0.9427220532769676


Iteration:  52%|█████▏    | 11999/22953 [2:12:01<2:01:08,  1.51it/s]

 global_step = 12000, average loss = 0.9226131804941687


Iteration:  57%|█████▋    | 12999/22953 [2:23:03<1:49:43,  1.51it/s]

 global_step = 13000, average loss = 0.9050170085339162


Iteration:  61%|██████    | 13999/22953 [2:34:05<1:38:13,  1.52it/s]

 global_step = 14000, average loss = 0.8911577817843561


Iteration:  65%|██████▌   | 14999/22953 [2:45:07<1:27:35,  1.51it/s]

 global_step = 15000, average loss = 0.8772127624187619


Iteration:  70%|██████▉   | 15999/22953 [2:56:08<1:16:40,  1.51it/s]

 global_step = 16000, average loss = 0.8642748431223445


Iteration:  74%|███████▍  | 16999/22953 [3:07:10<1:05:31,  1.51it/s]

 global_step = 17000, average loss = 0.8515496675216538


Iteration:  78%|███████▊  | 17999/22953 [3:18:10<54:28,  1.52it/s]

 global_step = 18000, average loss = 0.8392818899904895


Iteration:  83%|████████▎ | 18999/22953 [3:29:10<43:26,  1.52it/s]

 global_step = 19000, average loss = 0.8266922273107952


Iteration:  87%|████████▋ | 19999/22953 [3:40:10<32:28,  1.52it/s]

 global_step = 20000, average loss = 0.8157374974071048


Iteration:  91%|█████████▏| 20999/22953 [3:51:09<21:27,  1.52it/s]

 global_step = 21000, average loss = 0.8053493634480096


Iteration:  96%|█████████▌| 21999/22953 [4:02:09<10:28,  1.52it/s]

 global_step = 22000, average loss = 0.795863690612626


Iteration: 100%|██████████| 22953/22953 [4:12:37<00:00,  1.51it/s]
Epoch: 100%|██████████| 1/1 [4:12:37<00:00, 15157.93s/it]


 global_step = 22954, average loss = 0.787211597111773


Evaluating: 100%|██████████| 199/199 [01:38<00:00,  2.03it/s]


exact 57.85776997366111
f1 81.14498686912013
total 2278
HasAns_exact 57.85776997366111
HasAns_f1 81.14498686912013
HasAns_total 2278
best_exact 57.85776997366111
best_exact_thresh 0.0
best_f1 81.14498686912013
best_f1_thresh 0.0
