In [1]:
# default_exp datasets

In [2]:
#export
from fastai.text import *
from tse.preprocessing import *
from tse.tokenizers import *

tokenizers: 0.7.0
fastai: 1.0.60


### Prepare Data Inputs for Q/A


The following fields are needed for each training input:

`input_ids`, `attention_mask`, `token_type_ids`, `offsets`, `answer_text`, `start_tok_idx`, `end_tok_idx`

Preprocess

In [3]:
train_df = pd.read_csv("../data/train.csv").dropna().reset_index(drop=True)
test_df = pd.read_csv("../data/test.csv")

In [4]:
strip_text(train_df, "text")
strip_text(train_df, "selected_text")
strip_text(test_df, "text")

In [5]:
replace_whitespace(train_df, "text")
replace_whitespace(train_df, "selected_text")
replace_whitespace(test_df, "text")

In [6]:
replace_URLs(train_df, "text")
replace_URLs(train_df, "selected_text")
replace_URLs(test_df, "text")

In [7]:
replace_user(train_df, "text")
replace_user(train_df, "selected_text")
replace_user(test_df, "text")

In [8]:
is_wrong = train_df.apply(lambda o: is_wrong_selection(o['text'], o['selected_text']), 1)
train_df = train_df[~is_wrong].reset_index(drop=True)

In [9]:
list(train_df['text'])

['I`d have responded, if I were going',
 'Sooo SAD I will miss you here in San Diego!!!',
 'my boss is bullying me...',
 'what interview! leave me alone',
 'Sons of ****, why couldn`t they put them on the releases we already bought',
 'URL - some shameless plugging for the best Rangers forum on earth',
 '2am feedings for the baby are fun when he is all smiles and coos',
 'Soooo high',
 'Both of you',
 'Journey!? Wow... u just became cooler. hehe... (is that possible!?)',
 'as much as i love to be hopeful, i reckon the chances are minimal =P i`m never gonna get my cake and stuff',
 'I really really like the song Love Story by Taylor Swift',
 'My Sharpie is running DANGERously low on ink',
 'i want to go to music tonight but i lost my voice.',
 'test test from the LG enV2',
 'Uh oh, I am sunburned',
 'S`ok, trying to plot alternatives as we speak *sigh*',
 'i`ve been sick for the past few days and thus, my hair looks wierd. if i didnt have a hat on it would look... URL',
 'is back home n

In [10]:
train_df.shape

(27459, 4)

Tokenizer

In [45]:
tokenizer = init_roberta_tokenizer("../roberta-base/vocab.json", "../roberta-base/merges.txt", max_length=192)

In [46]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on th...","Sons of ****,",negative


In [47]:
#export
def get_start_end_idxs(context, answer):
    """Find the first occurrence of `answer` inside `context`.

    Returns `(start_idx, end_idx)` as character positions in `context`, with
    `end_idx` inclusive, i.e. `context[start_idx:end_idx+1] == answer`.
    An empty `answer` matches at position 0 and yields `(0, -1)`.

    Raises `Exception` when `answer` does not occur in `context` (an empty
    `context` never matches, not even an empty `answer`).
    """
    # `str.find` performs the same left-to-right search as comparing a slice at
    # every position, without building one substring per character.
    start_idx = context.find(answer) if len(context) > 0 else -1
    if start_idx == -1: raise Exception("No overlapping segment found")
    return start_idx, start_idx + len(answer) - 1

In [48]:
#export
def get_start_end_tok_idxs(offsets, start_idx, end_idx):
    "Map an answer's char span to token positions - the first 4 tokens belong to the question and are skipped"
    first_context_tok = 4
    start_tok_idx = end_tok_idx = None
    for pos in range(first_context_tok, len(offsets)):
        tok_begin, tok_end = offsets[pos][0], offsets[pos][1]
        # a token covers a char index when begin <= index < end;
        # if several tokens qualify, the last one seen is kept
        if tok_begin <= start_idx and start_idx < tok_end: start_tok_idx = pos
        if tok_begin <= end_idx and end_idx < tok_end: end_tok_idx = pos
    return (start_tok_idx, end_tok_idx)

In [49]:
trn_stxt, trn_txt, trn_sent = train_df.selected_text.values, train_df.text.values, train_df.sentiment
test_txt, test_sent = test_df.text.values, test_df.sentiment.values

In [50]:
# pair each sentiment (question) with its text (context) in the order the tokenizer encodes them
train_tok_input = list(zip(trn_sent, trn_txt))
test_tok_input = list(zip(test_sent, test_txt))

In [51]:
# encode batch
train_outputs = tokenizer.encode_batch(train_tok_input)
test_outputs = tokenizer.encode_batch(test_tok_input)

In [52]:
start_end_idxs = [get_start_end_idxs(s1,s2) for (s1,s2) in zip(trn_txt, trn_stxt)]

In [53]:
#export
class QAInputGenerator:
    """Tokenize (question, context) pairs and serve one dict of model inputs per example.

    `contexts`, `questions`, `text_ids` and `answers` are parallel sequences.
    `tokenizer` must provide `encode_batch`; each encoding it returns is read
    through its `ids`, `attention_mask`, `offsets` and `tokens` attributes.
    When `answers` is given, the character span of every answer inside its
    context is computed up front with `get_start_end_idxs`.
    """
    def __init__(self, contexts, questions, text_ids=None, answers=None, tokenizer=None):
        self.contexts, self.questions, self.answers = contexts, questions, answers
        # Always define `text_ids` (possibly None): `__getitem__` reads it unconditionally,
        # so leaving it unset made every lookup fail when no ids were supplied.
        self.text_ids = text_ids
        self.outputs = tokenizer.encode_batch(list(zip(questions, contexts)))
        if self.answers is not None:
            self.start_end_idxs = [get_start_end_idxs(ctx, ans) for (ctx, ans) in zip(self.contexts, self.answers)]


    @classmethod
    def from_df(cls, df,
                ctx_col='text', q_col='sentiment', id_col='textID', ans_col='selected_text',
                is_test=False, tokenizer=None):
        """Build a generator from dataframe columns.

        Pass `id_col=None` to skip text ids; with `is_test=True` the answer
        column is not read and no targets are produced.
        """
        contexts = df[ctx_col].values
        questions = df[q_col].values
        text_ids = None if id_col is None else df[id_col].values
        answers = None if is_test else df[ans_col].values
        return cls(contexts, questions, text_ids, answers, tokenizer)


    def __getitem__(self, i):
        """Return the inputs of example `i` as a dict.

        Always present: `input_ids`, `attention_mask`, `offsets`, `tokens`, `context_text`.
        Added when answers exist: `answer_text`, `start_end_tok_idxs`.
        Added when text ids exist: `text_id`.
        """
        encoding = self.outputs[i]
        offsets = array(encoding.offsets)
        res = {"input_ids": array(encoding.ids), "attention_mask": array(encoding.attention_mask),
               "offsets": offsets, "tokens": array(encoding.tokens), "context_text": self.contexts[i]}

        if self.answers is not None:
            # convert the answer's char span into token positions using the token offsets
            res["answer_text"] = self.answers[i]
            res["start_end_tok_idxs"] = get_start_end_tok_idxs(offsets, *self.start_end_idxs[i])

        if self.text_ids is not None:
            res["text_id"] = self.text_ids[i]

        return res

    def __len__(self): return len(self.contexts)

In [54]:
train_inputs = QAInputGenerator.from_df(train_df, tokenizer=tokenizer)

In [55]:
test_inputs = QAInputGenerator.from_df(test_df, is_test=True, tokenizer=tokenizer)

In [56]:
i = np.random.choice(range(len(train_inputs)))
print(train_inputs[i].keys())
print(train_inputs[i]['tokens'][train_inputs[i]['start_end_tok_idxs'][0]:train_inputs[i]['start_end_tok_idxs'][1]+1])
print(train_inputs[i]['answer_text'])

dict_keys(['input_ids', 'attention_mask', 'offsets', 'tokens', 'context_text', 'answer_text', 'start_end_tok_idxs', 'text_id'])
['Ġterrible']
terrible


In [57]:
i = np.random.choice(range(len(test_inputs)))
print(test_inputs[i].keys())
print(test_inputs[i]['tokens'][test_inputs[i]['attention_mask'].astype(bool)])

dict_keys(['input_ids', 'attention_mask', 'offsets', 'tokens', 'context_text', 'text_id'])
['<s>' 'Ġnegative' '</s>' '</s>' 'Ġand' 'Ġim' 'Ġjust' 'Ġgoing' 'Ġinto' 'Ġwork' '...' 'Ġif' 'Ġwe' 'Ġwere' 'Ġmarried' ','
 'Ġwe' 'Ġw' 'ud' 'Ġnever' 'Ġsee' 'Ġeach' 'Ġother' '</s>']


In [58]:
train_inputs = list(train_inputs)
test_inputs = list(test_inputs)

In [59]:
len(train_inputs), len(test_inputs)

(27459, 3534)

### TSEDataAugmentor

#### 1) Random Left - Right Truncate

```
-> tok3 anstok anstok anstok tok7 (rand left and right idxs)
-> tok3 anstok anstok anstok tok7 tok8 (rand left idx)
-> Tok1 tok2 tok3 anstok anstok anstok tok7 (rand right idx)
```


#### 2) Random Mask

```
-> Tok1 tok2 <MASK> anstok anstok anstok tok7 <MASK>
-> Tok1 tok2 <UNK> anstok anstok anstok tok7 <UNK>
```

#### 3) Replace with pseudolabel


In [60]:
#export
class TSEDataAugmentor:
    """Augment one tokenized (question, context) example in place on this object.

    `input_ids` and `attention_mask` are 1-d tensors for a single example;
    `start_position` / `end_position` are tensors holding the answer's first and
    last token position (read with `.item()`). `tokenizer` must expose
    `eos_token_id`, `pad_token_id` and `mask_token_id`.

    After any transform the current state is available as `input_ids`,
    `attention_mask`, `ans_start_pos` and `ans_end_pos`. The tensors passed in
    are never modified: every transform works on a clone.
    """
    def __init__(self, tokenizer, input_ids, attention_mask, start_position, end_position):

        self.tokenizer = tokenizer
        self.input_ids = input_ids
        self.attention_mask = attention_mask

        # initial answer start and end positions
        self.ans_start_pos, self.ans_end_pos = start_position.item(), end_position.item()

        # context token start and end excluding bos - eos tokens
        # (the first 4 positions hold the question: bos, sentiment, eos, eos)
        self.context_start_pos = 4
        self.context_end_pos = self._last_context_pos(attention_mask)


    @staticmethod
    def _last_context_pos(attention_mask):
        "Position of the last context token: the final attended position is eos, so step back by one"
        return torch.where(attention_mask.bool())[0][-1].item() - 1

    def _attention_mask_for(self, input_ids):
        "Attention mask (int64) that is 0 on the tokenizer's pad token and 1 everywhere else"
        return (input_ids != self.tokenizer.pad_token_id).long()


    # left and right indexes excluding answer tokens and eos token
    @property
    def left_idxs(self): return np.arange(self.context_start_pos, self.ans_start_pos)

    @property
    def right_idxs(self): return np.arange(self.ans_end_pos+1, self.context_end_pos+1)

    @property
    def left_right_idxs(self): return np.concatenate([self.left_idxs, self.right_idxs])

    @property
    def rand_left_idx(self):
        "Random context position before the answer, or None when the answer starts the context"
        left_idxs = self.left_idxs
        return np.random.choice(left_idxs) if left_idxs.size > 0 else None

    @property
    def rand_right_idx(self):
        "Random context position after the answer, or None when the answer ends the context"
        right_idxs = self.right_idxs
        return np.random.choice(right_idxs) if right_idxs.size > 0 else None


    def right_truncate(self, right_idx):
        """
        Drop every context token after `right_idx` (kept, inclusive); answer pos doesn't change.
        Returns `(input_ids, attention_mask, (start, end))` with the answer positions as tensors.
        Raises `Exception` when `right_idx` is None.
        Note: token_type_ids NotImplemented
        """
        if right_idx is None: raise Exception("Right index can't be None")

        # clone so the tensor handed to the constructor is left untouched
        new_input_ids = self.input_ids.clone()
        nopad_input_ids = new_input_ids[self.attention_mask.bool()]

        # keep everything up to right_idx and close the sequence with eos again
        truncated = torch.cat([nopad_input_ids[:right_idx+1], tensor([self.tokenizer.eos_token_id])])

        # pad back to the previous unpadded length and write over the old context
        # (context_end_pos+2 is the unpadded length: last context token + eos + 1)
        n_pad = len(nopad_input_ids) - len(truncated)
        new_context = F.pad(truncated, (0,n_pad), value=self.tokenizer.pad_token_id)
        new_input_ids[:self.context_end_pos+2] = new_context

        # rebuild the attention mask from the tokenizer's pad id and refresh the
        # context end (exclude eos token)
        # Note: context start doesn't change since we don't manipulate question
        new_attention_mask = self._attention_mask_for(new_input_ids)
        self.context_end_pos = self._last_context_pos(new_attention_mask)

        # update input_ids and attention_masks
        self.input_ids = new_input_ids
        self.attention_mask = new_attention_mask

        return self.input_ids, self.attention_mask, (tensor(self.ans_start_pos), tensor(self.ans_end_pos))

    def random_right_truncate(self):
        "Right-truncate at a random position after the answer; no-op when nothing follows the answer"
        right_idx = self.rand_right_idx
        if right_idx is not None: self.right_truncate(right_idx)


    def left_truncate(self, left_idx):
        """
        Drop every context token before `left_idx` (kept, inclusive); answer pos shifts left accordingly.
        Returns `(input_ids, attention_mask, (start, end))` with the answer positions as tensors.
        Raises `Exception` when `left_idx` is None.
        Note: token_type_ids NotImplemented
        """
        if left_idx is None: raise Exception("Left index can't be None")

        # clone so the tensor handed to the constructor is left untouched
        new_input_ids = self.input_ids.clone()

        # slide everything from left_idx down to the context start and pad the
        # freed tail so the overall length is unchanged
        n_pad = len(new_input_ids[self.context_start_pos:]) - len(new_input_ids[left_idx:])
        new_context = F.pad(new_input_ids[left_idx:], (0,n_pad), value=self.tokenizer.pad_token_id)
        new_input_ids[self.context_start_pos:] = new_context

        # rebuild the attention mask from the tokenizer's pad id and refresh the
        # context end (exclude eos token)
        # Note: context start doesn't change since we don't manipulate question
        new_attention_mask = self._attention_mask_for(new_input_ids)
        self.context_end_pos = self._last_context_pos(new_attention_mask)

        # the answer moved left by the number of dropped tokens
        ans_shift = left_idx - self.context_start_pos
        self.ans_start_pos, self.ans_end_pos = self.ans_start_pos-ans_shift, self.ans_end_pos-ans_shift

        # update input_ids and attention_masks
        self.input_ids = new_input_ids
        self.attention_mask = new_attention_mask

        return self.input_ids, self.attention_mask, (tensor(self.ans_start_pos), tensor(self.ans_end_pos))

    def random_left_truncate(self):
        "Left-truncate at a random position before the answer; no-op when nothing precedes the answer"
        left_idx = self.rand_left_idx
        if left_idx is not None: self.left_truncate(left_idx)


    def replace_with_mask(self, idxs_to_mask):
        """
        Replace given input ids with tokenizer.mask_token_id
        """
        # clone so the tensor handed to the constructor is left untouched
        new_input_ids = self.input_ids.clone()
        new_input_ids[torch.as_tensor(idxs_to_mask, dtype=torch.long)] = self.tokenizer.mask_token_id
        self.input_ids = new_input_ids


    def random_replace_with_mask(self, mask_p=0.2):
        """
        mask_p: Proportion of tokens to replace with mask token id

        Only context tokens outside the answer are candidates. Positions are drawn
        without replacement so the requested proportion is actually masked; the
        count is rounded down and capped at the number of candidates.
        """
        candidates = self.left_right_idxs
        n_mask = min(int(len(candidates)*mask_p), len(candidates))
        if n_mask <= 0: return
        self.replace_with_mask(np.random.choice(candidates, n_mask, replace=False))
                

In [61]:
i = np.random.choice(range(len(train_inputs)))

input_ids = tensor(train_inputs[i]['input_ids'])
attention_mask = tensor(train_inputs[i]['attention_mask'])
start_position, end_position = train_inputs[i]['start_end_tok_idxs']
start_position, end_position = tensor(start_position), tensor(end_position)

answer_text = train_inputs[i]['answer_text']
context_text = train_inputs[i]['context_text']
offsets = train_inputs[i]['offsets']

In [62]:
input_ids[attention_mask.bool()]

tensor([    0,  7974,     2,     2,  2129,    47,   114,   939,    21,    19,
           47,   235,   122,   131,   939,    74,  1153,   492,    47,    10,
        16531, 25606,   417,     2])

In [63]:
start_position, end_position

(tensor(4), tensor(20))

In [64]:
answer_text, context_text, start_position.item(), end_position.item()

('Poor you If I was with you right now; I would probably give you a hug',
 'Poor you If I was with you right now; I would probably give you a hug ;D',
 4,
 20)

In [65]:
" ".join([tokenizer.id_to_token(o) for o in input_ids[attention_mask.bool()]])

'<s> Ġneutral </s> </s> Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug Ġ; d </s>'

In [66]:
" ".join([tokenizer.id_to_token(o) for o in input_ids[start_position.item(): end_position.item()+1]])

'Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug'

In [67]:
char_start = min(np.concatenate([offsets[start_position.item()], offsets[end_position.item()]]))
char_end = max(np.concatenate([offsets[start_position.item()], offsets[end_position.item()]]))

In [68]:
context_text[char_start:char_end]

'Poor you If I was with you right now; I would probably give you a hug'

In [69]:
def convert_ids_to_tokens(toks):
    "Look up the token string of every id in `toks` with the notebook-level `tokenizer`"
    return list(map(tokenizer.id_to_token, toks))

# attach the helper to the tokenizer object so the demo cells can call it like a method
tokenizer.convert_ids_to_tokens = convert_ids_to_tokens

### demo right truncate

In [70]:
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position)
da.random_right_truncate()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> Ġneutral </s> </s> Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug Ġ; d </s>

Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug


### demo left truncate

In [71]:
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position)
da.random_left_truncate()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> Ġneutral </s> </s> Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug Ġ; d </s>

Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug


In [72]:
da.ans_start_pos, da.ans_end_pos

(4, 20)

### demo replace with mask

In [73]:
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position)
da.random_replace_with_mask(0.2)
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> Ġneutral </s> </s> Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug Ġ; d </s>

Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug


In [74]:
da.left_idxs, da.right_idxs

(array([], dtype=int64), array([21, 22]))

### demo all

In [75]:
da = TSEDataAugmentor(tokenizer, input_ids, attention_mask, start_position, end_position)

da.random_left_truncate()
da.random_right_truncate()
da.random_replace_with_mask(0.3)
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.attention_mask.bool()])))
print()
print(" ".join(tokenizer.convert_ids_to_tokens(da.input_ids[da.ans_start_pos :da.ans_end_pos+1])))

<s> Ġneutral </s> </s> Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug Ġ; d </s>

Ġpoor Ġyou Ġif Ġi Ġwas Ġwith Ġyou Ġright Ġnow ; Ġi Ġwould Ġprobably Ġgive Ġyou Ġa Ġhug


### TSEDataset

In [76]:
#export
# Augmentation settings keyed by transform name: "p" is the chance the transform is
# applied to a sample, "mask_p" the share of non-answer context tokens to mask.
do_tfms = {
    "random_left_truncate": {"p": .3},
    "random_right_truncate": {"p": .3},
    "random_replace_with_mask": {"p": .3, "mask_p": 0.2},
    "random_replace_with_pseudo": {"p": .3},
}
do_tfms

{'random_left_truncate': {'p': 0.3},
 'random_right_truncate': {'p': 0.3},
 'random_replace_with_mask': {'p': 0.3, 'mask_p': 0.2},
 'random_replace_with_pseudo': {'p': 0.3}}

In [77]:
pseudo_df = pd.read_csv("../data/pseudo_labels/pseudo_labelled_sample.csv")
pseudo_df = pseudo_df[['ids', 'text', 'target', 'predicted_answer']]
pseudo_df.head()

Unnamed: 0,ids,text,target,predicted_answer
0,1467810369,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,bummer.
1,1467810672,is upset that he can't update his Facebook by ...,negative,is upset
2,1467810917,@Kenichan I dived many times for the ball. Man...,negative,The rest go out of bounds
3,1467811184,my whole body feels itchy and like its on fire,negative,my whole body feels itchy
4,1467811193,"@nationwideclass no, it's not behaving at all....",negative,i'm mad.


In [78]:
pseudo_df.shape

(200000, 4)

In [79]:
#export
class TSEDataset(Dataset):
    """Dataset yielding `((input_ids, attention_mask), (start_position, end_position))`.

    `inputs` is an indexable collection of dicts with `input_ids`, `attention_mask`
    and, unless `is_test`, `start_end_tok_idxs`. With `is_test=True` the target is
    the placeholder `(0, 0)` and no augmentation is applied.

    `do_tfms` maps a transform name to its settings (`"p"` = probability of applying
    it, plus `"mask_p"` for masking); a transform missing from the dict is treated as
    disabled. `pseudo_inputs` is an optional collection shaped like `inputs` from
    which a sample may be swapped in with probability `random_replace_with_pseudo`.
    `tokenizer` is only handed to `TSEDataAugmentor`.
    """
    def __init__(self, inputs, tokenizer=None, is_test=False, do_tfms:Dict=None, pseudo_inputs=None):

        # eval
        self.inputs = inputs

        # augmentation
        self.is_test = is_test
        self.tokenizer = tokenizer
        self.do_tfms = do_tfms
        self.pseudo_inputs = pseudo_inputs
        # stays empty when there are no pseudo samples, which disables the swap below
        self.pseudo_idxs = list(range(len(self.pseudo_inputs))) if self.pseudo_inputs else []

    def _tfm_p(self, name):
        "Probability configured for transform `name`; 0 when the transform is not configured"
        return self.do_tfms.get(name, {}).get("p", 0.)

    def __getitem__(self, i):
        "Return `(xb, yb)` for sample `i`, as fastai expects"

        item = self.inputs[i]
        input_ids = tensor(item['input_ids'])
        attention_mask = tensor(item['attention_mask'])

        if self.is_test: return (input_ids, attention_mask), (0,0)

        start_position, end_position = item['start_end_tok_idxs']
        start_position, end_position = tensor(start_position), tensor(end_position)

        if self.do_tfms:
            if self.pseudo_idxs and (np.random.uniform() < self._tfm_p("random_replace_with_pseudo")):
                # swap in a random pseudo-labelled sample; inputs AND targets must come
                # from the same pseudo item, otherwise the labels point at the wrong text
                pseudo_item = self.pseudo_inputs[np.random.choice(self.pseudo_idxs)]
                input_ids = tensor(pseudo_item['input_ids'])
                attention_mask = tensor(pseudo_item['attention_mask'])
                start_position, end_position = pseudo_item['start_end_tok_idxs']
                start_position, end_position = tensor(start_position), tensor(end_position)

            else:
                augmentor = TSEDataAugmentor(self.tokenizer,
                                             input_ids,
                                             attention_mask,
                                             start_position, end_position)

                # each transform is drawn independently, in a fixed order
                if np.random.uniform() < self._tfm_p("random_left_truncate"):
                    augmentor.random_left_truncate()
                if np.random.uniform() < self._tfm_p("random_right_truncate"):
                    augmentor.random_right_truncate()
                if np.random.uniform() < self._tfm_p("random_replace_with_mask"):
                    augmentor.random_replace_with_mask(self.do_tfms["random_replace_with_mask"].get("mask_p", 0.2))

                input_ids = augmentor.input_ids
                attention_mask = augmentor.attention_mask
                start_position, end_position = tensor(augmentor.ans_start_pos), tensor(augmentor.ans_end_pos)

        return (input_ids, attention_mask), (start_position, end_position)

    def __len__(self): return len(self.inputs)

In [80]:
pseudo_inputs = QAInputGenerator.from_df(pseudo_df, 
                                         tokenizer=tokenizer,
                                         q_col='target', id_col='ids', ans_col='predicted_answer')

In [81]:
len(pseudo_inputs)

200000

In [82]:
train_ds = TSEDataset(train_inputs, tokenizer, is_test=False, do_tfms=do_tfms, pseudo_inputs=pseudo_inputs)
test_ds = TSEDataset(test_inputs, tokenizer, is_test=True, do_tfms=None)

In [83]:
do_tfms

{'random_left_truncate': {'p': 0.3},
 'random_right_truncate': {'p': 0.3},
 'random_replace_with_mask': {'p': 0.3, 'mask_p': 0.2},
 'random_replace_with_pseudo': {'p': 0.3}}

In [129]:
train_ds[0]

((tensor([    0,  1313,     2,     2, 20162,   329,  4202,  3807, 23904,  9773,
           1437,   359,  7984,   131,   246,   359,  7984,   131,   246,   359,
           7984,   131,   246, 20162,    62,  3055, 19577,   326, 28897,   328,
              2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,   

In [130]:
train_ds.inputs[0]

{'input_ids': array([   0, 7974,    2,    2, ...,    1,    1,    1,    1]),
 'attention_mask': array([1, 1, 1, 1, ..., 0, 0, 0, 0]),
 'offsets': array([[0, 0],
        [0, 7],
        [0, 0],
        [0, 0],
        ...,
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]]),
 'tokens': array(['<s>', 'Ġneutral', '</s>', '</s>', ..., '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U10'),
 'context_text': 'I`d have responded, if I were going',
 'answer_text': 'I`d have responded, if I were going',
 'start_end_tok_idxs': (4, 13),
 'text_id': 'cb774db0d1'}

In [131]:
train_df.iloc[0]

textID                                    cb774db0d1
text             I`d have responded, if I were going
selected_text    I`d have responded, if I were going
sentiment                                    neutral
Name: 0, dtype: object

In [132]:
# ### `predict_answer_text`

# TODO: Migrate to proper notebook

# #export
# def predict_answer_text(start_logits, end_logits, attention_mask,
#                         context_text, char_to_word_offset, token_to_orig_map): 
#     "Find best answer from context"
#     # find best start and end
#     context_start, context_end = min(token_to_orig_map), max(token_to_orig_map)
#     truncated_start_logits = start_logits[attention_mask.bool()][context_start:context_end+1]
#     truncated_end_logits = end_logits[attention_mask.bool()][context_start:context_end+1]
#     best_start_idx, best_end_idx = find_best_start_end_idxs(truncated_start_logits, truncated_end_logits)
    
#     # generate answer
#     tok_orig_char_start = token_to_orig_map[best_start_idx+context_start] 
#     tok_orig_char_end = token_to_orig_map[best_end_idx+context_start]
#     return answer_from_orig_context(context_text, char_to_word_offset, tok_orig_char_start, tok_orig_char_end)

# predict_answer_text(start_logits, end_logits, attention_mask, 
#                    context_text, char_to_word_offset, token_to_orig_map)

### export

In [133]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01-preprocessing.ipynb.
Converted 01-squad-utils.ipynb.
Converted 02-tokenizers.ipynb.
Converted 03-datasets.ipynb.
Converted 04-models.ipynb.
Converted post-process.ipynb.
