In [1]:
import torch
import datasets as nlp
from transformers import LongformerTokenizerFast
from transformers import XLMRobertaTokenizerFast, AutoTokenizer
from transformers import XLMRobertaForQuestionAnswering
from transformers import RobertaForQuestionAnswering

from transformers.utils import logging as hf_logging


# Send transformers' log records through the library's default handler at INFO
# level, using the explicit "[LEVEL|file:line] time >> message" layout.
hf_logging.enable_default_handler()
hf_logging.set_verbosity_info()
hf_logging.enable_explicit_format()


# Load the tokenizer for the 'xlm-roberta-base' checkpoint, asking for the fast
# implementation. This module-level `tokenizer` is read by convert_to_features.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', use_fast=True)
# Explicit-class alternative, kept for reference:
#tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')

[INFO|configuration_utils.py:395] 2021-01-21 17:44:28,536 >> loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json from cache at /.cache/torch/transformers/762ddd751172e9d3229e5da17a459eee6c0dfdc237c718944d0b1a85f06c7e1e.2b0f807393c56e8861a31cd67d2fc0b45d71d9735dd47dd66afb650f90b6d2a8
[INFO|configuration_utils.py:431] 2021-01-21 17:44:28,539 >> Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 250002
}

[INFO|tokenization_ut

In [2]:
def get_correct_alignement(context: str, answer):
    """Return the character span ``(start, end)`` of the gold answer in ``context``.

    Some original examples in SQuAD have indices wrong by 1 or 2 characters.
    The annotated start is tried first, then the start shifted left by one
    and by two characters.

    Args:
        context: The passage text.
        answer: Mapping with parallel lists under ``"text"`` and
            ``"answer_start"``; only the first entry of each list is used.

    Returns:
        ``(start_idx, end_idx)`` such that ``context[start_idx:end_idx]``
        equals the gold answer text.

    Raises:
        ValueError: If the gold text is not found at the annotated start nor
            at the start shifted left by 1 or 2 characters.
    """
    gold_text = answer["text"][0]
    annotated_start = answer["answer_start"][0]

    for shift in (0, 1, 2):
        start_idx = annotated_start - shift
        if start_idx < 0:
            # A negative start would make the slice wrap around to the end of
            # the context instead of looking to the left of position 0.
            break
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            return start_idx, end_idx

    raise ValueError(
        f"Answer text {gold_text!r} not found in context at annotated start "
        f"{annotated_start} (also tried 1 and 2 characters to the left)"
    )


# Tokenize our training dataset
def convert_to_features(example, max_length=512):
    """Tokenize one QA example and attach token-level answer positions.

    The question/context pair is encoded with the module-level ``tokenizer``
    (padded and truncated to ``max_length``), and the answer's character span
    is mapped to token indices inside that pair encoding.

    Args:
        example: Mapping with ``"question"``, ``"context"`` and ``"answers"``
            (the latter in the form expected by ``get_correct_alignement``).
        max_length: Length every encoded pair is padded/truncated to.

    Returns:
        The pair encoding, extended with ``"start_positions"`` and
        ``"end_positions"``:

        * the answer's token indices when it lies inside the encoded pair;
        * ``0, 0`` when the answer was cut off by truncation;
        * ``None, None`` when the tokenizer could not map the answer
          characters to tokens (the dataset pipeline filters these out).

    Raises:
        ValueError: Propagated from ``get_correct_alignement`` when the answer
            text cannot be located in the context.
    """
    # Tokenize contexts and questions (as pairs of inputs)
    encodings = tokenizer.encode_plus(
        example["question"],
        example["context"],
        padding="max_length",  # replaces the deprecated pad_to_max_length=True
        max_length=max_length,
        truncation=True,
    )
    context_encodings = tokenizer.encode_plus(example["context"])

    # Map the answer's character span to token indices in the context-only
    # encoding, using the fast tokenizer's alignment methods.
    start_idx, end_idx = get_correct_alignement(example["context"], example["answers"])
    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx - 1)

    # FIXME: UGLY HACK because of XLM-R tokenization, works fine with monolingual
    # 2 training examples returns incorrect positions
    sep_idx = encodings["input_ids"].index(tokenizer.sep_token_id)

    if start_positions_context is None or end_positions_context is None:
        # char_to_token found no token for one of the answer characters
        start_positions, end_positions = None, None
    else:
        # The pair is encoded as <s> question</s></s> context</s>, while the
        # context-only encoding is <s> context</s>. A context token at index i
        # of the context-only encoding therefore sits at i + sep_idx + 1 in
        # the pair (sep_idx is the first </s>; +1 because there are two sep
        # tokens between question and context).
        start_positions = start_positions_context + sep_idx + 1
        end_positions = end_positions_context + sep_idx + 1

        # Valid token indices are 0..max_length-1 and the last slot is always
        # the closing </s>, so an answer ending at max_length-1 or beyond was
        # truncated away; such examples are labelled with position 0.
        if end_positions >= max_length - 1:
            start_positions, end_positions = 0, 0

    encodings.update(
        {
            "start_positions": start_positions,
            "end_positions": end_positions,
        }
    )
    return encodings

In [3]:
# XQuAD ships validation data only; load the English, Russian and Arabic
# configurations in that order.
xquad_en, xquad_ru, xquad_ar = (
    nlp.load_dataset('xquad', config_name, split="validation")
    for config_name in ('xquad.en', 'xquad.ru', 'xquad.ar')
)
xquad_en

Reusing dataset xquad (/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/9e12114a409c05777407a840606169d55d5ffb5ca8003000da5325a25fd55cd3)
Reusing dataset xquad (/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/9e12114a409c05777407a840606169d55d5ffb5ca8003000da5325a25fd55cd3)
Reusing dataset xquad (/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/9e12114a409c05777407a840606169d55d5ffb5ca8003000da5325a25fd55cd3)


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 1190
})

In [4]:
squad_train, squad_valid = nlp.load_dataset('squad', split=['train', 'validation'])

# Featurize each split, then drop the examples for which convert_to_features
# could not resolve both answer positions.
train_dataset = squad_train.map(convert_to_features).filter(
    lambda example: not (example['start_positions'] is None or example['end_positions'] is None)
)
valid_dataset = squad_valid.map(convert_to_features).filter(
    lambda example: not (example['start_positions'] is None or example['end_positions'] is None)
)


# Return torch tensors, restricted to the columns listed below
columns = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']
train_dataset.set_format(columns=columns, type='torch')
valid_dataset.set_format(columns=columns, type='torch')

valid_dataset

Reusing dataset squad (/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


HBox(children=(FloatProgress(value=0.0, max=87599.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=88.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10570.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




Dataset({
    features: ['answers', 'attention_mask', 'context', 'end_positions', 'id', 'input_ids', 'question', 'start_positions', 'title'],
    num_rows: 10551
})

In [5]:
def convert_dataset_to_torch_format(data):
    """Featurize a QA dataset and format it to return torch tensors.

    Every example is passed through ``convert_to_features``; examples whose
    start or end position came back as ``None`` are dropped. The result is
    formatted as ``'torch'`` and restricted to the input ids, attention mask
    and the two answer-position columns.

    Args:
        data: Dataset with the fields ``convert_to_features`` reads.

    Returns:
        The mapped, filtered and formatted dataset.
    """
    def _has_both_positions(example):
        # Keep the example only when neither position is missing
        return not (example['start_positions'] is None or example['end_positions'] is None)

    featurized = data.map(convert_to_features)
    data = featurized.filter(_has_both_positions)

    data.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    )
    return data

In [6]:
# Show the English XQuAD dataset summary before it is converted
xquad_en

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 1190
})

In [7]:
# Rebinds `xquad_en` to its featurized, torch-formatted version; re-running this
# cell without reloading the raw dataset would featurize the converted data again.
xquad_en = convert_dataset_to_torch_format(xquad_en)

HBox(children=(FloatProgress(value=0.0, max=1190.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [8]:
# Show the dataset summary after conversion (new columns, rows without positions removed)
xquad_en

Dataset({
    features: ['answers', 'attention_mask', 'context', 'end_positions', 'id', 'input_ids', 'question', 'start_positions'],
    num_rows: 1187
})

## MLQA

In [25]:
# Load three German MLQA configurations: translate-train, translate-test and
# the de/de configuration. No `split` is passed, so each call returns every
# split the configuration provides.
mlqa_train_de = nlp.load_dataset('mlqa', 'mlqa-translate-train.de')
mlqa_test_de = nlp.load_dataset('mlqa', 'mlqa-translate-test.de')
mlqa_valid_de = nlp.load_dataset('mlqa', 'mlqa.de.de')


Reusing dataset mlqa (/.cache/huggingface/datasets/mlqa/mlqa-translate-train.de/1.0.0/2b5eaa00f1bd38db2d350b549e6b98c12822a0a3f00ad9fff89743970d6b671a)
Reusing dataset mlqa (/.cache/huggingface/datasets/mlqa/mlqa-translate-test.de/1.0.0/2b5eaa00f1bd38db2d350b549e6b98c12822a0a3f00ad9fff89743970d6b671a)
Reusing dataset mlqa (/.cache/huggingface/datasets/mlqa/mlqa.de.de/1.0.0/2b5eaa00f1bd38db2d350b549e6b98c12822a0a3f00ad9fff89743970d6b671a)


In [26]:
print(mlqa_train_de) # Use this one: it provides the train and validation splits
print(mlqa_test_de)  # Don't use
print(mlqa_valid_de) # Don't use

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 80069
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 9927
    })
})
DatasetDict({
    test: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 4517
    })
})
DatasetDict({
    test: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 4517
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 512
    })
})


DatasetDict({
    test: Dataset({
        features: ['context', 'question', 'answers', 'id'],
        num_rows: 4517
    })
})

# Test: concatenating context text

In [16]:
# Load the SQuAD training split; `train` is the source for the
# context-concatenation experiments in the cells below.
train = nlp.load_dataset('squad', split='train')
train

Reusing dataset squad (/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [None]:
N = 2  # number of leading training samples whose contexts are concatenated

# Combine the contexts of the first N samples. The answer being relocated is the
# one of the last combined sample (row N - 1), so the text "above" it is the
# contexts of rows 0 .. N - 2.
# NOTE(review): the original read the answer from row N, which is not part of
# the combined text -- confirm that row N - 1 is the intended target.
text_lenght_above = len(''.join(train['context'][0:N-1]))
prev_start = train['answers'][N-1]['answer_start'][0]
start_pos = text_lenght_above + prev_start

context = ''.join(train['context'][0:N])

# Preview 50 characters of the combined context starting at the shifted answer
# position; the text shown should begin with the answer.
context[start_pos:start_pos + 50]

In [None]:
# Inspect the answer record of the training example at index 2
train['answers'][2]

In [132]:
def get_span(index, span=5):
    """
    Return the half-open range ``[lower, upper)`` of width ``span`` that contains ``index``.

    Ranges start at multiples of ``span`` and ``index`` is 0-based, so index 0
    falls in ``(0, span)`` and an exact multiple of ``span`` opens a new range.

    Ex: index=4, span=5
        lower=0, upper=5

        index=8, span=5
        lower=5, upper=10

        index=5, span=5
        lower=5, upper=10

    Args:
        index: Whole-number position to locate.
        span: Width of each range.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    # Floor to the nearest multiple of `span`. The earlier `(index - 1) // span`
    # form returned (-span, 0) for index 0 and put multiples of `span` in the
    # previous range.
    lower_bound = (index // span) * span
    upper_bound = lower_bound + span
    return lower_bound, upper_bound


index=11
get_span(index, span=5)

(10, 15)

In [17]:
def get_span(index, span=5):
    """
    Find the ``span``-wide window of whole numbers that ``index`` falls into.

    Windows begin at multiples of ``span``; the lower edge is inclusive and
    the upper edge is exclusive.

    Ex: index=4, span=5
        lower=0, upper=5

        index=8, span=5
        lower=5, upper=10
    """
    window_start = span * (index // span)
    return window_start, window_start + span

#low, high = get_span(index, span=5)


In [12]:
# Keep every 5th training example (row indices 0, 5, 10, ...)
data = train.filter(lambda example, indice: indice % 5 == 0, with_indices=True)

HBox(children=(FloatProgress(value=0.0, max=88.0), HTML(value='')))




In [19]:
def a(index, example):
    """Replace an example's context with the concatenation of its 5-example group.

    ``index`` is the example's row in the module-level ``data`` dataset. Rows
    are grouped in consecutive runs of 5 (see ``get_span``); the new context is
    the group's contexts joined in order, and the first answer start is shifted
    by the length of the contexts that precede this example inside its group.

    Args:
        index: Row index of ``example`` within ``data``.
        example: The row as a dict with ``'context'`` and ``'answers'``.

    Returns:
        The same ``example`` dict with ``'context'`` replaced and, unless the
        example is first in its group, ``'answers'['answer_start']`` replaced
        by a one-element list holding the shifted start of the first answer.
    """
    low, high = get_span(index, span=5)

    # Read only this group's rows. The contexts must come from `data`, the
    # dataset `index` refers to: `train` uses a different row numbering, so
    # joining its rows would not line up with the shifted answer start.
    group_contexts = data[low:high]['context']

    # Get new starting position
    if index != low:
        prev_start = len(''.join(group_contexts[:index - low]))
        start_pos = example['answers']['answer_start'][0]
        example['answers']['answer_start'] = [prev_start + start_pos]

    # Get new context
    example['context'] = ''.join(group_contexts)
    return example



# Not idempotent: `data` is rebound to the concatenated version, so running this
# cell a second time would concatenate and shift the already-processed rows.
data = data.map(lambda example, indice: a(indice, example), with_indices=True)

HBox(children=(FloatProgress(value=0.0, max=17520.0), HTML(value='')))




In [24]:
nlp.load_dataset('xquad', 'xquad.ar', split="validation")  # Arabic XQuAD validation split; displayed only, not stored

Reusing dataset xquad (/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/9e12114a409c05777407a840606169d55d5ffb5ca8003000da5325a25fd55cd3)


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 1190
})

In [58]:
N = 5  # number of consecutive contexts combined per group

# Read the column once: indexing train['context'] inside the loop would rebuild
# the full list of contexts on every iteration.
contexts = train['context']

# NOTE(review): `lst` is not defined in this cell; it must already exist in the
# kernel (it is assigned in another cell of this notebook).
for i in range(0, len(lst), N):
    # Total character length of the N contexts starting at row i
    text_lenght_above = len(''.join(contexts[i:i+N]))
    print(text_lenght_above)

3475
7025
3520
3620
4179
6323
2979
3038
5098
6033
4648
3326
5632
3634
6075
7260
6155
5881
4676
3992


In [167]:
# Displays the entire 'answers' column (one dict per training example), so the output is very long
train['answers']

[{'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 {'answer_start': [188], 'text': ['a copper statue of Christ']},
 {'answer_start': [279], 'text': ['the Main Building']},
 {'answer_start': [381], 'text': ['a Marian place of prayer and reflection']},
 {'answer_start': [92], 'text': ['a golden statue of the Virgin Mary']},
 {'answer_start': [248], 'text': ['September 1876']},
 {'answer_start': [441], 'text': ['twice']},
 {'answer_start': [598], 'text': ['The Observer']},
 {'answer_start': [126], 'text': ['three']},
 {'answer_start': [908], 'text': ['1987']},
 {'answer_start': [119], 'text': ['Rome']},
 {'answer_start': [145], 'text': ['Moreau Seminary']},
 {'answer_start': [234], 'text': ['Old College']},
 {'answer_start': [356], 'text': ['Retired priests and brothers']},
 {'answer_start': [675], 'text': ['Buechner Prize for Preaching']},
 {'answer_start': [487], 'text': ['eight']},
 {'answer_start': [46], 'text': ['1920']},
 {'answer_start': [126], 'text': ['the College

In [162]:
def get_span(index, span=5):
    """
    Give the bounds of the block of ``span`` whole numbers that holds ``index``.

    Blocks are aligned to multiples of ``span``; the returned lower bound is
    inclusive and the upper bound exclusive.

    Ex: index=4, span=5
        lower=0, upper=5

        index=8, span=5
        lower=5, upper=10
    """
    block_number, _ = divmod(index, span)
    lower_bound = block_number * span
    return lower_bound, lower_bound + span

# Toy stand-in for a dataset: 20 items whose values equal their positions
lst = list(range(20))

# For every position, print the items that precede it inside its 5-item group
# (skipped for the first item of a group), then the whole group.
for index in range(20):
    low, high = get_span(index, span=5)
    if index != low:
        start = lst[low:index] # + example['start'][index]
        print(index, start)
        # example['start'][index] = sum(lst[low:index])  + example['start'][index]
    print(index, lst[low:high])

0 [0, 1, 2, 3, 4]
1 [0]
1 [0, 1, 2, 3, 4]
2 [0, 1]
2 [0, 1, 2, 3, 4]
3 [0, 1, 2]
3 [0, 1, 2, 3, 4]
4 [0, 1, 2, 3]
4 [0, 1, 2, 3, 4]
5 [5, 6, 7, 8, 9]
6 [5]
6 [5, 6, 7, 8, 9]
7 [5, 6]
7 [5, 6, 7, 8, 9]
8 [5, 6, 7]
8 [5, 6, 7, 8, 9]
9 [5, 6, 7, 8]
9 [5, 6, 7, 8, 9]
10 [10, 11, 12, 13, 14]
11 [10]
11 [10, 11, 12, 13, 14]
12 [10, 11]
12 [10, 11, 12, 13, 14]
13 [10, 11, 12]
13 [10, 11, 12, 13, 14]
14 [10, 11, 12, 13]
14 [10, 11, 12, 13, 14]
15 [15, 16, 17, 18, 19]
16 [15]
16 [15, 16, 17, 18, 19]
17 [15, 16]
17 [15, 16, 17, 18, 19]
18 [15, 16, 17]
18 [15, 16, 17, 18, 19]
19 [15, 16, 17, 18]
19 [15, 16, 17, 18, 19]


TypeError: 'int' object is not iterable

In [21]:
# Rebind `data` to the dataset stored in the local "squad_concat" directory
# (the step that saved it is not visible in this part of the notebook).
data = nlp.load_from_disk("squad_concat")

In [22]:
# Show the summary of the dataset loaded from disk
data

Dataset({
    features: ['answers', 'context', 'id', 'question', 'title'],
    num_rows: 17520
})

In [23]:
# Displays the entire 'answers' column of the loaded dataset; the output is very long
data['answers']

[{'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 {'answer_start': [943], 'text': ['September 1876']},
 {'answer_start': [2219], 'text': ['Rome']},
 {'answer_start': [3291], 'text': ['eight']},
 {'answer_start': [4024], 'text': ['Learning Resource Center']},
 {'answer_start': [624], 'text': ['Master of Divinity']},
 {'answer_start': [1893],
  'text': ['President Emeritus of the University of Notre Dame']},
 {'answer_start': [2187], 'text': ['Theodore M. Hesburgh Library']},
 {'answer_start': [2699], 'text': ['19.7%']},
 {'answer_start': [3885], 'text': ['8th']},
 {'answer_start': [353], 'text': ['1851–1921']},
 {'answer_start': [1169], 'text': ['Professor Jerome Green']},
 {'answer_start': [2512], 'text': ['the 1940s']},
 {'answer_start': [2879], 'text': ['German Catholic journals']},
 {'answer_start': [3822], 'text': ['International Peace studies']},
 {'answer_start': [1446], 'text': ['over 700']},
 {'answer_start': [1976], 'text': ['15']},
 {'answer_start': [3148], '

In [31]:
# List the contents of the models directory (absolute path specific to this machine)
!ls /workspace/models

Longformer-4094-squad_seed_42
RoBERTa_Long_seed_1337
longformer-base-seed-42-squad-finetuned
roberta-base-4096-seed-42-fastest-lm-complete
roberta-long-seed-1337-squad-finetuned
roberta-long-seed-42-fine-tuned-squad
roberta-long-seed-42-squad
xlm-roberta-base-4096-seed-42-fast-lm
xlm-roberta-base-4096-seed-42-fastest-lm-complete
xlm-roberta-base-seed-1337-xquad-long-finetuned
xlm-roberta-base-seed-165-xquad-finetuned
xlm-roberta-base-seed-165-xquad-long-finetuned
xlm-roberta-base-seed-1729-xquad-long-finetuned
xlm-roberta-base-seed-42-xquad-long-finetuned
xlm-roberta-base-seed-758241-xquad-long-finetuned


In [32]:
# Rebinds the module-level `tokenizer` to the one stored with the local
# checkpoint; convert_to_features reads this global, so features built after
# this cell use this tokenizer instead of the 'xlm-roberta-base' one.
tokenizer = AutoTokenizer.from_pretrained('/workspace/models/xlm-roberta-base-4096-seed-42-fastest-lm-complete', use_fast=True)


[INFO|configuration_utils.py:393] 2021-01-22 20:36:01,372 >> loading configuration file /workspace/models/xlm-roberta-base-4096-seed-42-fastest-lm-complete/config.json
[INFO|configuration_utils.py:431] 2021-01-22 20:36:01,375 >> Model config XLMRobertaConfig {
  "_name_or_path": "/workspace/models/xlm-roberta-base-4096-seed-42-fastest-lm-complete",
  "architectures": [
    "LongModelForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_siz