In [1]:
from itertools import chain

import numpy as np
import torch
from bpemb import BPEmb
from datasets import load_dataset
from torch.utils.data import Dataset

In [2]:
# Run on the GPU whenever CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Load the "copenlu/answerable_tydiqa" dataset and keep handles on its two splits.
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set, validation_set = dataset["train"], dataset["validation"]

Using custom data configuration copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6
Reusing dataset parquet (/home/lyk/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-9ffd3d37cf2899c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Columns dropped from both splits before tokenization.
UNUSED_COLUMNS = ["question_text", "document_title", "language", "annotations", "document_url"]

# Only ask for columns that are still present: `train_set` / `validation_set` are
# rebound here, so re-running this cell would otherwise request columns that were
# already removed by the previous run.
train_set = train_set.remove_columns(
    [column for column in UNUSED_COLUMNS if column in train_set.column_names]
)
validation_set = validation_set.remove_columns(
    [column for column in UNUSED_COLUMNS if column in validation_set.column_names]
)

In [5]:
# We'll use again the pretrained BP Embeddings and the corresponding tokenizer.

bpemb_en = BPEmb(lang='en', dim=100, vs=25000)
# Extract the embeddings and append one extra row for our [PAD] token. The row is
# all zeros (not random), and its width is read from the pretrained matrix so it
# always matches the `dim` requested above.
pretrained_embeddings = np.concatenate(
    [bpemb_en.emb.vectors, np.zeros(shape=(1, bpemb_en.emb.vectors.shape[1]))],
    axis=0,
)
# Extract the vocab and add the extra [PAD] token; it takes the last index,
# i.e. len(vocabulary) - 1, matching the row appended above.
vocabulary = bpemb_en.emb.index_to_key + ['[PAD]']

In [6]:
# Number of token ids per chunk produced by `group_texts`.
block_size = 128


def tokenizer(text):
    """
    Encode text with the module-level BPEmb model
    :param text: The input forwarded unchanged to `bpemb_en.encode_ids_with_eos`
    :return: A dict holding the encoder's result under the key 'input_ids'
    """
    token_ids = bpemb_en.encode_ids_with_eos(text)
    return {'input_ids': token_ids}

def tokenize_function(examples):
    """
    Tokenize the raw document text of a batch
    :param examples: A mapping that holds the text(s) under 'document_plaintext'
    :return: Whatever `tokenizer` returns for that column
    """
    documents = examples['document_plaintext']
    return tokenizer(documents)

def group_texts(examples):
    """
    Concatenate every column of a batch and re-split it into fixed-size chunks
    :param examples: A mapping from column name to a list of sequences
        (e.g. the token-id lists produced by `tokenize_function`)
    :return: A dict with the same keys whose values are lists of chunks, each
        exactly `block_size` items long. The trailing remainder that does not
        fill a whole chunk is dropped. An empty mapping yields an empty dict.
    """
    # Flatten each column in linear time. `sum(lists, [])` copies the growing
    # accumulator once per sequence, which is quadratic in the number of tokens.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    if not concatenated_examples:
        return {}
    # The usable length is measured on the first column and applied to all columns.
    total_length = len(next(iter(concatenated_examples.values())))
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of size block_size.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

In [7]:
# Step 1: tokenize the training documents in batches, dropping the raw text column.
train_tokenized_datasets = train_set.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["document_plaintext"],
)
# Step 2: re-pack the token ids into fixed-size chunks, 1000 rows per batch.
train_datasets = train_tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

        

#0:   0%|          | 0/30 [00:00<?, ?ba/s]

#2:   0%|          | 0/30 [00:00<?, ?ba/s]

#1:   0%|          | 0/30 [00:00<?, ?ba/s]

#3:   0%|          | 0/30 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/30 [00:00<?, ?ba/s]

#2:   0%|          | 0/30 [00:00<?, ?ba/s]

#1:   0%|          | 0/30 [00:00<?, ?ba/s]

#3:   0%|          | 0/30 [00:00<?, ?ba/s]

In [8]:
# Step 1: tokenize the validation documents in batches, dropping the raw text column.
val_tokenized_datasets = validation_set.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["document_plaintext"],
)
# Step 2: re-pack the token ids into fixed-size chunks, 1000 rows per batch.
val_datasets = val_tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

       

#0:   0%|          | 0/4 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/4 [00:00<?, ?ba/s]

#1:   0%|          | 0/4 [00:00<?, ?ba/s]

#3:   0%|          | 0/4 [00:00<?, ?ba/s]

        

#0:   0%|          | 0/4 [00:00<?, ?ba/s]

#1:   0%|          | 0/4 [00:00<?, ?ba/s]

#2:   0%|          | 0/4 [00:00<?, ?ba/s]

#3:   0%|          | 0/4 [00:00<?, ?ba/s]

In [9]:
def get_document_dataset(dataset, block_size = 128):
    """
    Turn a collection of documents into fixed-size chunks of token ids
    :param dataset: An iterable of mappings that hold the document text under
        'document_plaintext'
    :param block_size: Number of token ids per returned chunk
    :return: A list of lists, each exactly `block_size` ids long. Documents are
        lower-cased, encoded with `bpemb_en.encode_ids_with_eos`, concatenated
        in order, and the trailing remainder that does not fill a chunk is dropped.
    """
    # Build one flat token stream in a single linear pass. The previous
    # `sum(list_of_lists, [])` copied the accumulator once per document, which is
    # quadratic in the corpus size.
    token_stream = list(chain.from_iterable(
        bpemb_en.encode_ids_with_eos(element["document_plaintext"].lower())
        for element in dataset
    ))
    # Keep only as many ids as fill whole chunks.
    usable_length = (len(token_stream) // block_size) * block_size
    return [
        token_stream[start : start + block_size]
        for start in range(0, usable_length, block_size)
    ]

In [10]:
# Every sequence is truncated or padded to this many token ids before batching.
seq_len = 128


def collate_batch_bilstm(dataset):
    """
    Combines multiple data samples into a single next-token-prediction batch
    :param dataset: An iterable of samples, each holding a list of token ids
        under 'input_ids'
    :return: A tuple of tensors (input_ids, input_lengths, targets):
        input_ids has shape (batch, seq_len - 1), input_lengths has shape
        (batch,) and stores len(truncated sample) - 1 for each sample, and
        targets is flattened to shape (batch * (seq_len - 1),)
    """
    lengths, rows = [], []
    for sample in dataset:
        tokens = sample['input_ids'][:seq_len]
        # one less than the token count, because the last token is never an input
        lengths.append(len(tokens) - 1)
        # right-pad with id 0 up to seq_len
        # NOTE(review): the vocabulary cell appends a dedicated [PAD] token at the
        # last index, yet padding here uses id 0 - confirm this is intended.
        rows.append(tokens + [0] * (seq_len - len(tokens)))

    batch = torch.tensor(rows)

    # the final position is dropped from the inputs: nothing follows it to predict
    model_inputs = batch[:, :-1]

    # each target is the token one step ahead of the corresponding input position
    next_tokens = batch[:, 1:].reshape(-1)

    return model_inputs, torch.tensor(lengths), next_tokens

In [11]:
# Batch the chunked splits 32 samples at a time; `collate_batch_bilstm` turns each
# list of samples into the (input_ids, input_lengths, targets) tensors.
train_dl = torch.utils.data.DataLoader(
    train_datasets,
    batch_size=32,
    collate_fn=collate_batch_bilstm,
)
valid_dl = torch.utils.data.DataLoader(
    val_datasets,
    batch_size=32,
    collate_fn=collate_batch_bilstm,
)