In [None]:
#version merge datasets v1
!pip install transformers==3.5.1

In [None]:
from transformers import BertTokenizer, BertForPreTraining, BertForQuestionAnswering, BertModel, BertConfig
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizer, XLMRobertaConfig
import torch
import torch.nn as nn
from transformers.data.metrics.squad_metrics import compute_predictions_log_probs, compute_predictions_logits, squad_evaluate
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor

In [None]:
# class mBert(nn.Module):
#     def __init__(self):
#         super(mBert, self).__init__()
#         self.bert_block = BertModel.from_pretrained("bert-base-multilingual-cased")
#         self.question = nn.Linear(768, 2, bias=True)
#     def forward(self, inputs):
#         pooled_output, sequence_output = self.bert_block(inputs)
#         return self.question(sequence_output)

In [None]:
# Instantiate the question-answering model from the 'xlm-roberta-large' pretrained checkpoint.
model = XLMRobertaForQuestionAnswering.from_pretrained('xlm-roberta-large')

In [None]:
# Show the loaded model's configuration as this cell's output.
model.config

In [None]:
# model = mBert()
# Tokenizer loaded from the same 'xlm-roberta-large' name used for the model.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")

In [None]:
# !mkdir dataset \
# && cd dataset \
# && wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json \
# && wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

In [None]:
# SquadV1Processor instance used by later cells to read the train/dev/test JSON files.
processor = SquadV1Processor()

In [None]:
# Read the merged-dataset train and dev splits into example objects.
train_examples = processor.get_train_examples(
    data_dir='../input/merge-dataset',
    filename='merge_dataset_v1_train.json',
)
dev_examples = processor.get_dev_examples(
    data_dir='../input/merge-dataset',
    filename='merge_dataset_v1_dev.json',
)

In [None]:
from transformers.data.processors.squad import squad_convert_examples_to_features

In [None]:
# Convert the training examples into (features, dataset); is_training=True
# and return_dataset='pt' are the only settings that differ in intent from
# the evaluation conversions elsewhere in this notebook.
train_features, train_dataset = squad_convert_examples_to_features(
    examples=train_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset='pt',
    threads=10,
)

In [None]:
# Drop the reference to the raw training examples; the visible cells below
# only read train_dataset from here on.
del train_examples

In [None]:
# Convert the dev examples into (features, dataset) for evaluation
# (is_training=False).
dev_features, dev_dataset = squad_convert_examples_to_features(
    examples=dev_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset='pt',
    threads=10,
)

In [None]:
def to_list(tensor):
    """Return the contents of ``tensor`` as a plain (possibly nested) Python list.

    The tensor is first detached and copied to the CPU, then converted with
    ``tolist()``.
    """
    host_tensor = tensor.detach().cpu()
    return host_tensor.tolist()

In [None]:
import os
def evaluate(model, tokenizer, dev_dataset, dev_examples, dev_features):
    """Run ``model`` over ``dev_dataset`` and score the decoded answers.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; its output is either a tuple of tensors or an
            object exposing ``to_tuple()``.
        tokenizer: Forwarded to ``compute_predictions_logits``.
        dev_dataset: Dataset whose items are tuples; positions 0-3 are read as
            input ids, attention mask, token type ids and the feature index.
        dev_examples: Examples passed to ``compute_predictions_logits`` and
            ``squad_evaluate``.
        dev_features: Sequence indexed by the feature index found in each
            batch; every item must expose ``unique_id``.

    Returns:
        The value returned by ``squad_evaluate(dev_examples, predictions)``.

    Side effects:
        Leaves ``model`` in eval mode. The paths ``./predictions_.json``,
        ``./nbest_predictions_.json`` and ``./null_odds_.json`` are handed to
        ``compute_predictions_logits`` as output files.
    """
    eval_sampler = SequentialSampler(dev_dataset)
    eval_dataloader = DataLoader(dev_dataset, sampler=eval_sampler, batch_size=12)

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level ``device`` variable defined in a later cell.
    eval_device = next(model.parameters()).device

    # Eval mode only needs to be set once; nothing inside the loop changes it.
    model.eval()

    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(eval_device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)

        # With return_dict=True the model returns a ModelOutput; iterating it
        # directly would not yield the tensors, so normalise to a tuple first.
        if hasattr(outputs, "to_tuple"):
            outputs = outputs.to_tuple()

        for i, example_index in enumerate(example_indices):
            eval_feature = dev_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            # Slice out sample ``i`` from every output tensor.
            output = [to_list(output[i]) for output in outputs]
            if len(output) >= 5:
                # Five-or-more outputs: start/end logits plus top indices and
                # a cls logit.
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )
            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    # Empty prefix, so the files are named "predictions_.json" etc.
    prefix = ""
    output_prediction_file = os.path.join("./", "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join("./", "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join("./", "null_odds_{}.json".format(prefix))
    predictions = compute_predictions_logits(
        dev_examples,
        dev_features,
        all_results,
        20,     # n_best_size
        300,    # max_answer_length
        False,  # do_lower_case
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        True,   # verbose_logging
        False,  # version_2_with_negative
        0.0,    # null_score_diff_threshold
        tokenizer,
    )
    results = squad_evaluate(dev_examples, predictions)
    return results

In [None]:
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import trange, tqdm
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# for param in model.bert.parameters():
#     param.requires_grad = False

In [None]:
import numpy as np

# ---------------------------------------------------------------------------
# Fine-tuning cell: build the dataloader/optimizer/scheduler, train for
# ``num_epochs`` epochs, save the final model, then evaluate on the dev split.
# ---------------------------------------------------------------------------
num_epochs = 2
num_warmup_steps = 814   # linear warmup length handed to the scheduler
logging_steps = 5000     # print the running average loss every N optimizer steps

# NOTE(review): the writer is created but nothing in this cell writes to it.
tb_writer = SummaryWriter()
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=4)
# One optimizer step per batch (no gradient accumulation).
t_total = len(train_dataloader) * num_epochs


no_decay = ["bias", "LayerNorm.weight"]

# Both groups currently use a weight decay of 0, so the split has no effect
# until the first group's value is changed.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total
)

# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

# Counts completed optimizer steps; starting at 0 keeps
# ``tr_loss / global_step`` a true per-step average.
global_step = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0
tr_loss, logging_loss = 0.0, 0.0

model.zero_grad()
train_iterator = trange(epochs_trained, int(num_epochs), desc="Epoch", disable=False)

from functools import partial
# Pin progress bars to a single line; this rebinding also affects any later
# use of the notebook-level ``tqdm`` name.
tqdm = partial(tqdm, position=0, leave=True)

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(device) for t in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
            "start_positions": batch[3],
            "end_positions": batch[4],
        }
        outputs = model(**inputs)
        # The loss is the first element of the model output when start/end
        # positions are supplied.
        loss = outputs[0]
        loss.backward()
        tr_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        global_step += 1

        if global_step % logging_steps == 0:
#             output_dir = os.path.join('./', "checkpoint-{}".format(global_step))
#             model_to_save = model.module if hasattr(model, "module") else model
#             model_to_save.save_pretrained(output_dir)
#             tokenizer.save_pretrained(output_dir)
#             torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
#             torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            print(" global_step = %s, average loss = %s" % (global_step, tr_loss / global_step))


output_dir = os.path.join('./', 'final_model')
# Unwrap a wrapper that exposes the real model as ``.module`` before saving.
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# max(..., 1) avoids a ZeroDivisionError when the dataloader was empty.
print(" global_step = %s, average loss = %s" % (global_step, tr_loss / max(global_step, 1)))

results = evaluate(model, tokenizer, dev_dataset, dev_examples, dev_features)
for key, value in results.items():
    print(key, value)

In [None]:
# Second tokenizer instance loaded from the same "xlm-roberta-large" name as `tokenizer`;
# it is the one passed to evaluate() for the test split.
tokenizer_1 = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")

In [None]:
# Read the test split through the processor's dev-example reader, then
# convert it to (features, dataset) with is_training=False.
test_examples = processor.get_dev_examples(
    data_dir='../input/merge-dataset',
    filename='merge_dataset_v1_test.json',
)
test_features, test_dataset = squad_convert_examples_to_features(
    examples=test_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    return_dataset='pt',
    threads=10,
)

In [None]:
# Score the model on the test split and print one "<metric> <value>" line per entry.
results = evaluate(
    model=model,
    tokenizer=tokenizer_1,
    dev_dataset=test_dataset,
    dev_examples=test_examples,
    dev_features=test_features,
)
for metric_name, metric_value in results.items():
    print(metric_name, metric_value)