In [1]:
import torch.utils.data as Data
from transformers import AutoTokenizer

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torchcrf import CRF
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from tqdm import tqdm_notebook as tqdm
from tqdm import notebook
from sklearn.metrics import precision_recall_fscore_support
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import heapq

In [2]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    np.random.seed(seed)

enforce_reproducibility()

In [3]:
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

# Data Processing

In [4]:
dataset = load_dataset("copenlu/answerable_tydiqa")

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def getLanguageDataSet(data, language):
    return data.filter(lambda x: x['language'] == language)

def getEnglishDataSet(data):
    return getLanguageDataSet(data, "english").remove_columns(["language", "document_url"])


def getJapaneseDataSet(data):
    return getLanguageDataSet(data, "japanese").remove_columns(["language", "document_url"])


def getFinnishDataSet(data):
    return getLanguageDataSet(data, "finnish").remove_columns(["language", "document_url"])


# keep only english data
english_set = getEnglishDataSet(dataset)
japanese_set = getJapaneseDataSet(dataset)
finnish_set = getFinnishDataSet(dataset)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-1103e6c04ff44af3.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dabfcfd450c9224c.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dc9fb4fd79187984.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-cee1771b5f371cc3.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/co

In [6]:
english_set

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'annotations', 'document_plaintext'],
        num_rows: 7389
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'annotations', 'document_plaintext'],
        num_rows: 990
    })
})

In [7]:
japanese_set

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'annotations', 'document_plaintext'],
        num_rows: 8778
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'annotations', 'document_plaintext'],
        num_rows: 1036
    })
})

In [8]:
finnish_set

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'annotations', 'document_plaintext'],
        num_rows: 13701
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'annotations', 'document_plaintext'],
        num_rows: 1686
    })
})

# data preprocessing for transformer 

Following data processing method can not be used in the LSTM below, becuase the LSTM needs pretrained vectors

In [9]:
max_length = 512  # 输入feature的最大长度，question和context拼接之后
doc_stride = 64  # 2个切片之间的重合token数量。
pad_on_right = True

In [10]:
squad_v2 = False
# distilbert-base-uncased can only be used in English
# model_checkpoint = "distilbert-base-uncased"
model_checkpoint = "bert-base-multilingual-uncased"
batch_size = 32

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
def get_answer_index(start_index, offset_mapping):
    res = -1
    for i, t in enumerate(offset_mapping):
        if t[0] <= start_index and t[1] >= start_index:
            res = i
    return res


def get_start_and_end_answer_index(start_index, end_index, offset_mapping):
    start_res = -1
    end_res = -1
    for i, t in enumerate(offset_mapping):
        if t[0] <= start_index and t[1] >= start_index:
            start_res = i
        if t[0] <= end_index and t[1] >= end_index:
            end_res = i
    return (start_res, end_res)


def prepare_train_features(examples):
    # 既要对examples进行truncation（截断）和padding（补全）还要还要保留所有信息，所以要用的切片的方法。
    # 每一个一个超长文本example会被切片成多个输入，相邻两个输入之间会有交集。
    tokenized_examples = tokenizer(
        examples["question_text" if pad_on_right else "document_plaintext"],
        examples["document_plaintext" if pad_on_right else "question_text"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # 我们使用overflow_to_sample_mapping参数来映射切片片ID到原始ID。
    # 比如有2个expamples被切成4片，那么对应是[0, 0, 1, 1]，前两片对应原来的第一个example。
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # offset_mapping也对应4片
    # offset_mapping参数帮助我们映射到原始输入，由于答案标注在原始输入上，所以有助于我们找到答案的起始和结束位置。
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # 重新标注数据
    tokenized_examples["labels"] = []
    input_ids = tokenized_examples["input_ids"]
    tokenized_examples["length"] =  [input_id.index(0) if 0 in input_id else max_length 
                                     for input_id in input_ids]
    for i, offsets in enumerate(offset_mapping):
        # 对每一片进行处理
        # 将无答案的样本标注到CLS上
        input_ids = tokenized_examples["input_ids"][i]

        # 区分question和context
        sequence_ids = tokenized_examples.sequence_ids(i)

        # 拿到原始的example 下标.
        sample_index = sample_mapping[i]
        answers = examples["annotations"][sample_index]
        # 如果没有答案，则使用CLS所在的位置为答案.
        if len(answers["answer_start"]) == [-1]:
            tokenized_examples["labels"].append([0] * max_length)
        else:
            # 答案的character级别Start/end位置.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["answer_text"][0])

            # 找到token级别的index start.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # 找到token级别的index end.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
            # 检测答案是否超出文本长度，超出的话也适用CLS index作为标注.
            temp = [0] * max_length
            if offsets[token_start_index][0] > start_char or offsets[token_end_index][1] < end_char:
                tokenized_examples["labels"].append(temp)
            elif offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char:
                s, e = get_start_and_end_answer_index(start_char, end_char, offsets)
                temp[s] = 1
                temp[s + 1:e + 1] = [2] * (e - s)
                tokenized_examples["labels"].append(temp)
            elif offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] < end_char:
                s = get_answer_index(start_char, offsets)
                temp[s] = 1
                length = len(temp) - s
                temp[s:] = [2] * length
                tokenized_examples["labels"].append(temp)
            elif offsets[token_start_index][0] > start_char and offsets[token_end_index][1] >= end_char:
                e = get_answer_index(end_char, offsets)
                temp[:e + 1] = [2] * e
                tokenized_examples["labels"].append(temp)
    return tokenized_examples


In [18]:
english_tokenized_datasets = english_set.map(prepare_train_features, batched=True,
                                     remove_columns=english_set["train"].column_names)


japanese_tokenized_datasets = japanese_set.map(prepare_train_features, batched=True,
                                     remove_columns=japanese_set["train"].column_names)


finnish_tokenized_datasets = finnish_set.map(prepare_train_features, batched=True,
                                     remove_columns=finnish_set["train"].column_names)


  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [20]:
print(english_tokenized_datasets)
print(japanese_tokenized_datasets)
print(finnish_tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'length'],
        num_rows: 7477
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'length'],
        num_rows: 996
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'length'],
        num_rows: 9534
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'length'],
        num_rows: 1092
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'length'],
        num_rows: 13838
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'length'],
        num_rows: 1707
    })
})


In [21]:
print(english_tokenized_datasets["train"][0])
print(japanese_tokenized_datasets["train"][0])
print(finnish_tokenized_datasets["train"][0])

{'input_ids': [101, 10704, 10140, 32950, 12270, 14676, 14906, 136, 102, 32950, 12270, 14676, 71070, 12167, 10171, 10103, 13498, 10108, 51900, 10380, 50418, 11183, 52898, 117, 10146, 10103, 51900, 10380, 50418, 11183, 12270, 10140, 10103, 10902, 11197, 20429, 12270, 10146, 10108, 10103, 43416, 119, 138, 129, 140, 131, 122, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [23]:
def collate_batch_bilstm(input_data):
    input_ids = [tokenizer.encode(i['tokens']) for i in input_data]
    seq_lens = [i["length"] for i in input_data]
    labels = [i['labels'] for i in input_data]

    max_length = max([len(i) for i in input_ids])
    
    input_ids = [(i + [0] * (max_length - len(i))) for i in input_ids]
    labels = [(i + [0] * (max_length - len(i))) for i in labels] # 0 is the id of the O tag

    assert (all(len(i) == max_length for i in input_ids))
    assert (all(len(i) == max_length for i in labels))

    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [10]:

# tokenized_datasets = tokenized_datasets.remove_columns("token_type_ids")

train_data = tokenized_datasets['train']
test_data = tokenized_datasets['validation']

train_loader = Data.DataLoader(dataset=train_data,
                               batch_size=32,
                               shuffle=False,
                               collate_fn=collate_batch_bilstm)

test_loader = Data.DataLoader(dataset=test_data,
                              batch_size=32,
                              shuffle=False, 
                              collate_fn=collate_batch_bilstm)

Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-83eccc48c30b6714.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8300ccd77c297487.arrow


NameError: name 'collate_batch_bilstm' is not defined

# Transformer Model

In [24]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-b

In [25]:
args = TrainingArguments(
    f"test-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5, #学习率
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3, # 训练的论次
    weight_decay=0.01,
)

In [26]:
from transformers import default_data_collator

data_collator = default_data_collator

In [27]:
english_trainer = Trainer(
    model,
    args,
    train_dataset=english_tokenized_datasets["train"],
    eval_dataset=english_tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [29]:
english_trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: labels, length. If labels, length are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7477
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 702


RuntimeError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 23.70 GiB total capacity; 8.28 GiB already allocated; 195.81 MiB free; 8.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.save_model("test-squad-trained")