In [1]:
# default_exp datasets

In [2]:
#export
from fastai.text import *
from transformers import AutoTokenizer

from tse.squad_utils import *
from tse.tokenizers import *

### Dataset & DataAugmentor

> Dataset and Data Augmentations

In [3]:
#export
# Directory (relative to the working directory) holding the train/valid/test JSON files loaded by `get_fold_ds`
SQUAD_DATA_PATH = Path("../squad_data/")
# Directory (relative to the working directory) passed to `AutoTokenizer.from_pretrained` to load the tokenizer
PRETRAINED_TOK_PATH = Path("../tokenizers/roberta-base/")

In [4]:
# Notebook-level tokenizer used by the demo cells below; loaded from the local directory in PRETRAINED_TOK_PATH
tokenizer = AutoTokenizer.from_pretrained(str(PRETRAINED_TOK_PATH))

In [7]:
# old version - don't export
class SQUAD_Dataset(Dataset):
    """
    Dataset over pre-computed tensors that yields `(xb, yb)` pairs.

    dataset_tensors: indexable of tensors; positions 0, 1, 2 are read as input_ids,
                     attention_mask and token_type_ids. When `is_training` is True,
                     positions 3 and 4 are read as answer start and end positions.
    examples, features: stored as-is for later inspection; not used by `__getitem__`.
    is_training: whether answer positions are available and should be returned as `yb`.
    """
    def __init__(self, dataset_tensors, examples, features, is_training=True):
        self.dataset_tensors = dataset_tensors
        self.examples = examples
        self.features = features
        self.is_training = is_training


    def __getitem__(self, idx):
        'fastai requires (xb, yb) to return'

        input_ids = self.dataset_tensors[0][idx]
        attention_mask = self.dataset_tensors[1][idx]
        token_type_ids = self.dataset_tensors[2][idx]
        xb = (input_ids, attention_mask, token_type_ids)

        if self.is_training:
            start_positions = self.dataset_tensors[3][idx]
            end_positions = self.dataset_tensors[4][idx]
            yb = (start_positions, end_positions)
        else:
            # no labels available (e.g. test set): return a dummy target instead of
            # referencing start/end positions that were never assigned
            yb = 0

        return xb, yb

    def __len__(self): return len(self.dataset_tensors[0])

In [8]:
# old version - don't export
def get_fold_ds(foldnum, tokenizer):
    """
    Build train, valid and test `SQUAD_Dataset`s for fold `foldnum`.

    Reads `train_squad_data_{foldnum}.json` and `valid_squad_data_{foldnum}.json`
    plus `test_squad_data.json` from `SQUAD_DATA_PATH`, converts them to features
    and tensors with `get_squad_dataset`, and wraps each split in a `SQUAD_Dataset`.
    Returns `(train_ds, valid_ds, test_ds)`; the test dataset is built with
    `is_training=False`.
    """
    # per-fold file names (previously computed but ignored, so fold 0 was always loaded)
    train_filename = f"train_squad_data_{foldnum}.json"
    valid_filename = f"valid_squad_data_{foldnum}.json"
    test_filename = "test_squad_data.json"

    # examples
    train_examples = squad_processor.get_train_examples(SQUAD_DATA_PATH, train_filename)
    valid_examples = squad_processor.get_train_examples(SQUAD_DATA_PATH, valid_filename)
    test_examples = squad_processor.get_dev_examples(SQUAD_DATA_PATH, test_filename)

    # features and tensors
    train_features, train_dataset = get_squad_dataset(train_examples, tokenizer, True)
    valid_features, valid_dataset = get_squad_dataset(valid_examples, tokenizer, True)
    test_features, test_dataset = get_squad_dataset(test_examples, tokenizer, False)
    train_dataset_tensors = train_dataset.tensors
    valid_dataset_tensors = valid_dataset.tensors
    test_dataset_tensors = test_dataset.tensors

    # create pytorch dataset
    train_ds = SQUAD_Dataset(train_dataset_tensors, train_examples, train_features)
    valid_ds = SQUAD_Dataset(valid_dataset_tensors, valid_examples, valid_features)
    test_ds = SQUAD_Dataset(test_dataset_tensors, test_examples, test_features, False)

    return train_ds, valid_ds, test_ds

### Data Augmentation

Augmentations to be done on training set

- 1a) Random split - always keep full answer
- 1b) Random split - can split from anywhere
- 2) Randomly mask tokens 
- 3) Randomly mask tokens within the answer context
- 4) Left - Right Flip 

In [9]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [10]:
train_ds, valid_ds, test_ds = get_fold_ds(0, tokenizer)

100%|██████████| 21984/21984 [00:00<00:00, 24434.08it/s]
100%|██████████| 5496/5496 [00:00<00:00, 24844.50it/s]
100%|██████████| 3534/3534 [00:00<00:00, 26853.10it/s]
convert squad examples to features: 100%|██████████| 21984/21984 [00:08<00:00, 2743.49it/s]
add example index and unique id: 100%|██████████| 21984/21984 [00:00<00:00, 788665.19it/s]
convert squad examples to features: 100%|██████████| 5496/5496 [00:02<00:00, 2502.20it/s]
add example index and unique id: 100%|██████████| 5496/5496 [00:00<00:00, 753502.26it/s]
convert squad examples to features: 100%|██████████| 3534/3534 [00:01<00:00, 2192.57it/s]
add example index and unique id: 100%|██████████| 3534/3534 [00:00<00:00, 608234.32it/s]


In [11]:
i = 0

In [12]:
answer_text = train_ds.examples[i].answer_text
context_text = train_ds.examples[i].context_text

In [13]:
example0 = train_ds.examples[i]
features0 = train_ds.features[i]

In [14]:
# a dataset item is ((input_ids, attention_mask, token_type_ids), (start, end)): unpack it in one go
(input_ids, attention_mask, token_type_ids), (start_position, end_position) = train_ds[i]

In [15]:
token_to_orig_map = train_ds.features[i].token_to_orig_map

### TSEDataAugmentor

#### 1) Random Left Right Truncate

```
-> tok3 anstok anstok anstok tok7 (rand left and right idxs)
-> tok3 anstok anstok anstok tok7 tok8 (rand left idx)
-> Tok1 tok2 tok3 anstok anstok anstok tok7 (rand right idx)
```


#### 2) Random Mask

```
-> Tok1 tok2 <MASK> anstok anstok anstok tok7 <MASK>
-> Tok1 tok2 <UNK> anstok anstok anstok tok7 <UNK>
```

#### 3) TODO: Random Left Right Flip


- Should be at word level not token: `" ".join(sent.split()[::-1])`


In [26]:
#export
class TSEDataAugmentor():
    """
    Token-level augmentations for one tokenized example.

    tokenizer: object exposing `eos_token_id`, `pad_token_id` and `mask_token_id`
    input_ids: 1d tensor of token ids (padded)
    attention_mask: 1d tensor, non-zero where `input_ids` holds a real token
    start_position, end_position: 0-d tensors with the answer's first and last token index
    token_to_orig_map: its `min` and `max` are taken as the first and last context token index

    Each augmentation works on a clone of `input_ids` and then updates `self.input_ids`,
    `self.attention_mask`, the answer positions and `self.context_end_pos` as needed, so
    augmentations can be chained. `self.token_to_orig_map` is never modified.
    Note: token_type_ids NotImplemented
    """

    def __init__(self, tokenizer, input_ids, attention_mask, start_position, end_position, token_to_orig_map):

        self.tokenizer = tokenizer
        self.input_ids = input_ids
        self.attention_mask = attention_mask

        # initial answer start and end positions
        self.ans_start_pos, self.ans_end_pos = start_position.item(), end_position.item()

        # initial context start and end positions
        self.token_to_orig_map = token_to_orig_map
        self.context_start_pos, self.context_end_pos = min(token_to_orig_map), max(token_to_orig_map)


    # left and right indexes excluding answer tokens and eos token
    @property
    def left_idxs(self):
        "Context token indexes located before the answer"
        return np.arange(self.context_start_pos, self.ans_start_pos)

    @property
    def right_idxs(self):
        "Context token indexes located after the answer"
        return np.arange(self.ans_end_pos+1, self.context_end_pos+1)

    @property
    def left_right_idxs(self):
        "All context token indexes outside of the answer"
        return np.concatenate([self.left_idxs, self.right_idxs])

    @property
    def rand_left_idx(self):
        "Random index from `left_idxs`, or None when there is nothing left of the answer"
        idxs = self.left_idxs
        return np.random.choice(idxs) if idxs.size > 0 else None

    @property
    def rand_right_idx(self):
        "Random index from `right_idxs`, or None when there is nothing right of the answer"
        idxs = self.right_idxs
        return np.random.choice(idxs) if idxs.size > 0 else None


    def _update_state(self, new_input_ids):
        "Store `new_input_ids`, rebuild the attention mask from non-pad tokens and update the context end"
        # use the tokenizer's pad id rather than a hard-coded value
        not_pad = new_input_ids != self.tokenizer.pad_token_id

        # the last non-pad token is the eos token, context ends right before it
        # Note: context start doesn't change since we don't manipulate question
        self.context_end_pos = torch.where(not_pad)[0][-1].item() - 1

        self.input_ids = new_input_ids
        self.attention_mask = not_pad.long()


    def right_truncate(self, right_idx):
        """
        Keep tokens up to and including `right_idx`, append eos and pad the rest;
        answer pos doesn't change.
        Raises if `right_idx` is None. Returns (input_ids, attention_mask, (start, end)).
        Note: token_type_ids NotImplemented
        """
        # explicit None check: a truthiness test would also reject a valid index 0
        if right_idx is None: raise ValueError("Right index can't be None")

        # clone so the tensor passed by the caller is never modified in place
        new_input_ids = self.input_ids.clone()
        nopad_input_ids = new_input_ids[self.attention_mask.bool()]

        # truncate from right idx to beginning - add eos_token_id to end
        truncated = torch.cat([nopad_input_ids[:right_idx+1], tensor([self.tokenizer.eos_token_id])])

        # pad new context until size are equal
        # replace original non-pad span with new
        n_pad = len(nopad_input_ids) - len(truncated)
        new_context = F.pad(truncated, (0,n_pad), value=self.tokenizer.pad_token_id)
        new_input_ids[:len(nopad_input_ids)] = new_context

        self._update_state(new_input_ids)

        return self.input_ids, self.attention_mask, (tensor(self.ans_start_pos), tensor(self.ans_end_pos))

    def random_right_truncate(self):
        "Right truncate at a random index after the answer; no-op when there is none"
        right_idx = self.rand_right_idx
        if right_idx is not None: self.right_truncate(right_idx)


    def left_truncate(self, left_idx):
        """
        Drop context tokens before `left_idx` by shifting the remainder to the context
        start and padding the end; answer pos is shifted by the same amount.
        Raises if `left_idx` is None. Returns (input_ids, attention_mask, (start, end)).
        Note: token_type_ids NotImplemented
        """
        # explicit None check: a truthiness test would also reject a valid index 0
        if left_idx is None: raise ValueError("Left index can't be None")
        left_idx = int(left_idx)

        # clone so the tensor passed by the caller is never modified in place
        new_input_ids = self.input_ids.clone()

        # pad new context until size are equal
        # replace original input context with new
        kept = new_input_ids[left_idx:]
        n_pad = len(new_input_ids[self.context_start_pos:]) - len(kept)
        new_context = F.pad(kept, (0,n_pad), value=self.tokenizer.pad_token_id)
        new_input_ids[self.context_start_pos:] = new_context

        # everything moved `ans_shift` positions to the left, including the answer
        ans_shift = left_idx - self.context_start_pos
        self.ans_start_pos, self.ans_end_pos = self.ans_start_pos-ans_shift, self.ans_end_pos-ans_shift

        self._update_state(new_input_ids)

        return self.input_ids, self.attention_mask, (tensor(self.ans_start_pos), tensor(self.ans_end_pos))

    def random_left_truncate(self):
        "Left truncate at a random index before the answer; no-op when there is none"
        left_idx = self.rand_left_idx
        if left_idx is not None: self.left_truncate(left_idx)


    def replace_with_mask(self, idxs_to_mask):
        """
        Replace given input ids with tokenizer.mask_token_id
        """
        # clone so the tensor passed by the caller is never modified in place
        new_input_ids = self.input_ids.clone()
        # use the instance tokenizer: a module-level `tokenizer` doesn't exist once exported
        new_input_ids[idxs_to_mask] = self.tokenizer.mask_token_id
        self.input_ids = new_input_ids


    def random_replace_with_mask(self, mask_p=0.2):
        """
        mask_p: Proportion of non-answer context tokens to replace with mask token id
        """
        candidates = self.left_right_idxs
        n_mask = min(int(len(candidates)*mask_p), len(candidates))
        if n_mask <= 0: return
        # sample without replacement so exactly `n_mask` distinct tokens get masked
        idxs_to_mask = np.random.choice(candidates, n_mask, replace=False)
        self.replace_with_mask(idxs_to_mask)
        
                

In [17]:
input_ids[attention_mask.bool()]

tensor([    0, 33407,     2,     2,  2527,  3036,    29,   625,   118,  6677,
        17745,  6968, 10859,   179, 14832, 18554,  2977, 16506,     2])

In [18]:
start_position, end_position

(tensor(4), tensor(7))

In [19]:
answer_text, context_text, start_position.item(), end_position.item()

('Sooo SAD', ' Sooo SAD I will miss you here in San Diego!!!', 4, 7)

In [20]:
" ".join(tokenizer.convert_ids_to_tokens(input_ids[attention_mask.bool()]))

'<s> negative </s> </s> so oo s ad i will miss you here in san die go !!! </s>'

In [21]:
" ".join(tokenizer.convert_ids_to_tokens(input_ids[start_position.item(): end_position.item()+1]))

'so oo s ad'

### demo right truncate

In [27]:
# fresh augmentor on the example above, then cut the context at a random index right of the answer
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position, token_to_orig_map)
da.random_right_truncate()
# attended tokens after augmentation, followed by the answer span (answer positions don't change here)
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> negative </s> </s> so oo s ad i will miss you </s>

so oo s ad


### demo left truncate

In [28]:
# fresh augmentor on the example above, then cut the context at a random index left of the answer
# NOTE: for this example the answer starts at the first context token (start_position == 4), so there
# is nothing to the left to cut and the printed tokens are unchanged
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position, token_to_orig_map)
da.random_left_truncate()
# attended tokens after augmentation, followed by the answer span at the (possibly shifted) answer positions
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> negative </s> </s> so oo s ad i will miss you here in san die go !!! </s>

so oo s ad


### demo replace with mask

In [29]:
# fresh augmentor on the example above, then mask a 0.2 proportion of the context tokens outside the answer
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position, token_to_orig_map)
da.random_replace_with_mask(0.2)
# attended tokens after augmentation, followed by the answer span (answer tokens are never masked)
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> negative </s> </s> so oo s ad i <mask> miss you here in san die <mask> !!! </s>

so oo s ad


In [30]:
da.left_idxs, da.right_idxs

(array([], dtype=int64), array([ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17]))

### demo all

In [32]:
# fresh augmentor on the example above; the augmentations below are chained on the same instance
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position, token_to_orig_map)

# each call updates the augmentor's input_ids / attention_mask / positions, which the next call builds on
da.random_left_truncate()
da.random_right_truncate()
da.random_replace_with_mask(0.3)
# attended tokens after all augmentations, followed by the answer span
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> negative </s> </s> so oo s ad i <mask> miss you here </s>

so oo s ad


### SQUAD_Dataset

In [33]:
# Augmentation config consumed by `SQUAD_Dataset`: one entry per transform, where
# "p" is the per-item probability of applying the transform and
# "mask_p" is the proportion of tokens handed to `random_replace_with_mask`
do_tfms = {}
do_tfms["random_left_truncate"] = {"p":0.3}
do_tfms["random_right_truncate"] = {"p":0.3}
do_tfms["random_replace_with_mask"] = {"p":0.3, "mask_p":0.2}
do_tfms

{'random_left_truncate': {'p': 0.3},
 'random_right_truncate': {'p': 0.3},
 'random_replace_with_mask': {'p': 0.3, 'mask_p': 0.2}}

In [34]:
#export
class SQUAD_Dataset(Dataset):
    """
    Dataset over pre-computed tensors that yields `(xb, yb)` pairs, optionally augmenting
    training items on the fly with `TSEDataAugmentor`.

    do_tfms: None/empty for no augmentation, otherwise a dict with the keys
             "random_left_truncate", "random_right_truncate" and "random_replace_with_mask",
             each holding a "p" (probability of applying it); the last one also holds "mask_p".
    """
    def __init__(self, tokenizer, dataset_tensors, examples, features, is_training=True, do_tfms=None):
        self.tokenizer = tokenizer
        self.dataset_tensors = dataset_tensors
        self.examples, self.features = examples, features
        self.is_training = is_training
        self.do_tfms = do_tfms


    def _augment(self, idx, input_ids, attention_mask, start_position, end_position):
        "Randomly apply the configured transforms to one item, return new inputs, mask and positions"
        cfg = self.do_tfms
        aug = TSEDataAugmentor(self.tokenizer, input_ids, attention_mask,
                               start_position, end_position,
                               self.features[idx].token_to_orig_map)

        # one independent draw per transform, always in the same order
        if np.random.uniform() < cfg["random_left_truncate"]["p"]: aug.random_left_truncate()
        if np.random.uniform() < cfg["random_right_truncate"]["p"]: aug.random_right_truncate()
        if np.random.uniform() < cfg["random_replace_with_mask"]["p"]:
            aug.random_replace_with_mask(cfg["random_replace_with_mask"]["mask_p"])

        return aug.input_ids, aug.attention_mask, tensor(aug.ans_start_pos), tensor(aug.ans_end_pos)


    def __getitem__(self, idx):
        'fastai requires (xb, yb) to return'
        input_ids, attention_mask, token_type_ids = [self.dataset_tensors[k][idx] for k in range(3)]

        # no labels outside of training: dummy target
        if not self.is_training:
            return (input_ids, attention_mask, token_type_ids), 0

        start_position, end_position = self.dataset_tensors[3][idx], self.dataset_tensors[4][idx]
        if self.do_tfms:
            input_ids, attention_mask, start_position, end_position = self._augment(
                idx, input_ids, attention_mask, start_position, end_position)

        return (input_ids, attention_mask, token_type_ids), (start_position, end_position)

    def __len__(self): return len(self.dataset_tensors[0])

In [35]:
#export
def get_fold_ds(foldnum, tokenizer):
    """
    Build train, valid and test `SQUAD_Dataset`s for fold `foldnum`.

    Reads `train_squad_data_{foldnum}.json` and `valid_squad_data_{foldnum}.json`
    plus `test_squad_data.json` from `SQUAD_DATA_PATH`, converts them to features
    and tensors with `get_squad_dataset`, and wraps each split in a `SQUAD_Dataset`.
    Only the train dataset gets augmentations; the test dataset is built with
    `is_training=False`. Returns `(train_ds, valid_ds, test_ds)`.
    """
    # per-fold file names (previously computed but ignored, so fold 0 was always loaded)
    train_filename = f"train_squad_data_{foldnum}.json"
    valid_filename = f"valid_squad_data_{foldnum}.json"
    test_filename = "test_squad_data.json"

    # examples
    train_examples = squad_processor.get_train_examples(SQUAD_DATA_PATH, train_filename)
    valid_examples = squad_processor.get_train_examples(SQUAD_DATA_PATH, valid_filename)
    test_examples = squad_processor.get_dev_examples(SQUAD_DATA_PATH, test_filename)

    # features and tensors
    train_features, train_dataset = get_squad_dataset(train_examples, tokenizer, True)
    valid_features, valid_dataset = get_squad_dataset(valid_examples, tokenizer, True)
    test_features, test_dataset = get_squad_dataset(test_examples, tokenizer, False)
    train_dataset_tensors = train_dataset.tensors
    valid_dataset_tensors = valid_dataset.tensors
    test_dataset_tensors = test_dataset.tensors

    # augmentations, applied to the training set only
    # "p": probability of applying the transform to an item, "mask_p": proportion of tokens to mask
    do_tfms = {}
    do_tfms["random_left_truncate"] = {"p":0.3}
    do_tfms["random_right_truncate"] = {"p":0.3}
    do_tfms["random_replace_with_mask"] = {"p":0.3, "mask_p":0.3}

    # create pytorch dataset
    train_ds = SQUAD_Dataset(tokenizer, train_dataset_tensors, train_examples, train_features, True, do_tfms)
    valid_ds = SQUAD_Dataset(tokenizer, valid_dataset_tensors, valid_examples, valid_features, True)
    test_ds = SQUAD_Dataset(tokenizer, test_dataset_tensors, test_examples, test_features, False)

    return train_ds, valid_ds, test_ds

In [36]:
train_ds, valid_ds, test_ds = get_fold_ds(0, tokenizer)

100%|██████████| 21984/21984 [00:01<00:00, 19703.62it/s]
100%|██████████| 5496/5496 [00:00<00:00, 27330.81it/s]
100%|██████████| 3534/3534 [00:00<00:00, 27846.25it/s]
convert squad examples to features: 100%|██████████| 21984/21984 [00:08<00:00, 2731.55it/s]
add example index and unique id: 100%|██████████| 21984/21984 [00:00<00:00, 907528.12it/s]
convert squad examples to features: 100%|██████████| 5496/5496 [00:02<00:00, 2698.70it/s]
add example index and unique id: 100%|██████████| 5496/5496 [00:00<00:00, 708156.02it/s]
convert squad examples to features: 100%|██████████| 3534/3534 [00:01<00:00, 3105.58it/s]
add example index and unique id: 100%|██████████| 3534/3534 [00:00<00:00, 729246.79it/s]


In [41]:
i = 7

In [42]:
train_ds.examples[i].answer_text

'Wow... u just became cooler.'

In [51]:
(input_ids, attention_mask, token_type_ids), (start_positions, end_positions) = train_ds[i]

" ".join(train_ds.tokenizer.convert_ids_to_tokens(input_ids[attention_mask.bool()]))

'<s> positive </s> </s> j ourney !? wow ... u just bec ame cool er . he he ... ( is that p ossible ! ?) </s>'

### `predict_answer_text`

TODO: Migrate to proper notebook

In [1146]:
#export
def predict_answer_text(start_logits, end_logits, attention_mask,
                        context_text, char_to_word_offset, token_to_orig_map):
    "Pick the best start/end pair among the context logits and return the matching answer text"
    # restrict logits to attended tokens, then to the context span
    attended = attention_mask.bool()
    context_start = min(token_to_orig_map)
    context_end = max(token_to_orig_map)
    context_span = slice(context_start, context_end+1)
    context_start_logits = start_logits[attended][context_span]
    context_end_logits = end_logits[attended][context_span]

    # best idxs are relative to the context span, hence the `context_start` offset below
    best_start_idx, best_end_idx = find_best_start_end_idxs(context_start_logits, context_end_logits)
    orig_start = token_to_orig_map[best_start_idx+context_start]
    orig_end = token_to_orig_map[best_end_idx+context_start]

    # generate answer
    return answer_from_orig_context(context_text, char_to_word_offset, orig_start, orig_end)

In [1150]:
# NOTE(review): `start_logits`, `end_logits` and `char_to_word_offset` are not defined in any visible cell of
# this notebook, so this cell presumably relies on leftover kernel state - verify it survives Restart & Run All
predict_answer_text(start_logits, end_logits, attention_mask, 
                   context_text, char_to_word_offset, token_to_orig_map)

'edings for the baby are fun when he is a'

### export

In [53]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01-squad-utils.ipynb.
Converted 02-tokenizers.ipynb.
Converted 03-datasets.ipynb.
