In [None]:
# default_exp models

In [None]:
# export
from fastai.text import *
from transformers import RobertaModel, RobertaConfig
from tse.tokenizers import *
from tse.datasets import *

tokenizers: 0.7.0
fastai: 1.0.60


In [None]:
#export
def get_roberta_model(path_to_dir="../roberta-base/"):
    "Load a pretrained `RobertaModel` from `path_to_dir` with `output_hidden_states` switched on."
    config = RobertaConfig.from_pretrained(path_to_dir)
    # Needed so that the forward pass also returns the per-layer hidden states.
    config.output_hidden_states = True
    # outputs: (final_hidden, pooled_final_hidden, (embedding + 12 hidden))
    return RobertaModel.from_pretrained(path_to_dir, config=config)

In [None]:
# Load the RoBERTa backbone from the default local directory ("../roberta-base/").
model = get_roberta_model()

### Create databunch

In [None]:
# read and strip data
# Training rows with any missing field are dropped, then the index is rebuilt so it is 0..n-1.
train_df = pd.read_csv("../data/train.csv").dropna().reset_index(drop=True)
test_df = pd.read_csv("../data/test.csv")
# Vectorised `.str.strip()` instead of `.apply(lambda s: s.strip())`: same result on strings,
# and a missing value (NaN) is passed through instead of raising AttributeError.
train_df["selected_text"] = train_df["selected_text"].str.strip()
train_df["text"] = train_df["text"].str.strip()
test_df["text"] = test_df["text"].str.strip()

In [None]:
from sklearn.model_selection import KFold
# 5 shuffled folds with a fixed seed so the split is the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_idxs = list(kfold.split(train_df))
# Tag each row with the number of the fold in which it is held out for validation.
# The positional indices from KFold are used as labels in `.loc`, which relies on
# `train_df` having a 0..n-1 index.
for i, (trn_idx, val_idx) in enumerate(fold_idxs):
    train_df.loc[val_idx, "val_fold"] = i

In [None]:
# Build the RoBERTa tokenizer from the local vocab / merges files.
# NOTE(review): 192 is presumably the maximum sequence length — confirm against `init_roberta_tokenizer`.
tokenizer = init_roberta_tokenizer("../roberta-base/vocab.json", "../roberta-base/merges.txt", 192)

In [None]:
# get fold dfs
# Fold 0 is held out for validation; every other fold is used for training.
trn_df = train_df.loc[train_df["val_fold"] != 0]
val_df = train_df.loc[train_df["val_fold"] == 0]

In [None]:
# get fold inputs
# Build the model inputs for the training and the validation split with the shared tokenizer.
train_inputs = QAInputGenerator.from_df(trn_df, tokenizer=tokenizer)
valid_inputs = QAInputGenerator.from_df(val_df, tokenizer=tokenizer)

In [None]:
#export
# Augmentation config passed to `TSEDataset(..., do_tfms=...)`:
# transform name -> keyword arguments for that transform.
do_tfms = {
    "random_left_truncate": {"p": .3},
    "random_right_truncate": {"p": .3},
    "random_replace_with_mask": {"p": .3, "mask_p": 0.2},
}
do_tfms

{'random_left_truncate': {'p': 0.3},
 'random_right_truncate': {'p': 0.3},
 'random_replace_with_mask': {'p': 0.3, 'mask_p': 0.2}}

In [None]:
# fold ds
# Training dataset: receives the augmentation config in `do_tfms`.
train_ds = TSEDataset(train_inputs, tokenizer, True, do_tfms=do_tfms)
# Validation dataset: must wrap the held-out `valid_inputs` (it previously wrapped
# `train_inputs`, so validation and `JaccardScore` were computed on training rows).
# NOTE(review): the third positional argument is kept as in the original — confirm its meaning in `TSEDataset`.
valid_ds = TSEDataset(valid_inputs, tokenizer, True)

In [None]:
# Inspect a single item of the training dataset.
train_ds[1]

((tensor([   0, 2430,    2,    2,   98, 3036, 5074,  939,   40, 2649,   47,  259,
             2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1, 

In [None]:
# Wrap both datasets in a fastai DataBunch: batch size 32 for training, 64 for validation.
data = DataBunch.create(train_ds, valid_ds, bs=32, val_bs=64)

In [None]:
# Grab one batch; it is reused below to smoke-test the model forward pass and the losses.
xb,yb = data.one_batch()

In [None]:
# Inspect the batch: `xb` is fed to the model as (input_ids, attention_mask),
# `yb` is fed to the loss as (start_targets, end_targets).
xb, yb

([tensor([[   0, 1313,    2,  ...,    1,    1,    1],
          [   0, 1313,    2,  ...,    1,    1,    1],
          [   0, 1313,    2,  ...,    1,    1,    1],
          ...,
          [   0, 2430,    2,  ...,    1,    1,    1],
          [   0, 7974,    2,  ...,    1,    1,    1],
          [   0, 2430,    2,  ...,    1,    1,    1]]),
  tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          ...,
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]])],
 [tensor([ 4,  4,  9,  4,  4,  4, 10,  5, 19, 31,  7,  4,  4, 12,  6,  4,  4,  4,
           4, 11,  4,  4,  4,  6,  5,  4,  4,  4,  6,  8,  4,  5]),
  tensor([13, 26,  9,  5, 47, 10, 10, 30, 23, 31,  9,  4,  6, 12,  7, 35,  9, 25,
          24, 30,  9,  7, 10,  6,  5, 41, 29, 27,  6, 10, 10,  5])])

### TSEModel

In [None]:
#export
class QAHead(Module):
    "Dropout followed by a linear layer mapping each token's features to 2 logits."
    def __init__(self, p=0.5, in_features=768*2):
        # `p`: dropout probability.
        # `in_features`: size of the last dimension of the input; the default (768*2)
        # matches two concatenated 768-wide hidden states, as the original hard-coded value did.
        # NOTE(review): like the original, this relies on the star-imported `Module`
        # not requiring an explicit `super().__init__()` call.
        self.d0 = nn.Dropout(p)
        self.l0 = nn.Linear(in_features, 2)
    def forward(self, x):
        # Output keeps every leading dimension of `x`; only the last one becomes 2.
        return self.l0(self.d0(x))
    
class TSEModel(Module):
    "Sequence model + `QAHead` producing per-token start and end logits."
    def __init__(self, model): 
        # `model` must return its per-layer hidden states as the third output
        # (see `get_roberta_model`, which sets `output_hidden_states = True`).
        self.sequence_model = model
        self.head = QAHead()
        
    def forward(self, *xargs):
        "`xargs` is `(input_ids, attention_mask)`; returns `(start_logits, end_logits)` with the trailing size-1 dim squeezed."
        outputs = self.sequence_model(input_ids=xargs[0], attention_mask=xargs[1])
        # Read the hidden states by position instead of unpacking exactly three values,
        # so an output object with extra trailing entries is still accepted.
        hidden_states = outputs[2]
        # Concatenate the features of the last TWO layers. The original concatenated
        # `hidden_states[-1]` with itself, which only duplicates the same features.
        x = torch.cat([hidden_states[-1], hidden_states[-2]], dim=-1)
        # The head emits 2 values per token; split them into the start and the end logit.
        start_logits, end_logits = self.head(x).split(1, dim=-1)
        return (start_logits.squeeze(-1), end_logits.squeeze(-1))

### loss

In [None]:
# Wrap the RoBERTa backbone loaded above with the start/end head.
tse_model = TSEModel(model)

In [None]:
# Forward the sample batch; `out` is the (start_logits, end_logits) tuple returned by `TSEModel.forward`.
out = tse_model(*xb)

In [None]:
#export
class CELoss(Module):
    "Cross-entropy on start and end logits concatenated along dim 0 with their matching targets, so one backward pass covers both."
    def __init__(self):
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, inputs, start_targets, end_targets):
        # `inputs` is the (start_logits, end_logits) pair produced by the model.
        start_logits, end_logits = inputs
        all_targets = torch.cat((start_targets, end_targets)).contiguous()
        all_logits = torch.cat((start_logits, end_logits)).contiguous()
        return self.loss_fn(all_logits, all_targets)

In [None]:
# Sanity check: cross-entropy loss of the untrained head on the sample batch.
loss_fn = CELoss()
loss_fn(out, *yb)

tensor(5.2267, grad_fn=<NllLossBackward>)

In [None]:
#export
class LSLoss(Module):
    "Label-smoothing cross-entropy on start and end logits concatenated along dim 0 with their matching targets, so one backward pass covers both."
    def __init__(self, eps=0.1):
        self.loss_fn = LabelSmoothingCrossEntropy(eps=eps)

    def forward(self, inputs, start_targets, end_targets):
        # `inputs` is the (start_logits, end_logits) pair produced by the model.
        start_logits, end_logits = inputs
        all_targets = torch.cat((start_targets, end_targets)).contiguous()
        all_logits = torch.cat((start_logits, end_logits)).contiguous()
        return self.loss_fn(all_logits, all_targets)

In [None]:
# Sanity check: label-smoothing loss (default eps=0.1) on the same sample batch.
loss_fn = LSLoss()
loss_fn(out, *yb)

tensor(5.2330, grad_fn=<AddBackward0>)

### metric

In [None]:
#export
def jaccard(str1, str2): 
    """Word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A ∩ B| / |A ∪ B| over the resulting word sets. When neither string
    contains a word the sets are identical (both empty) and 1.0 is returned
    instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if not a and not b:
        # Union is empty: the ratio below would divide by zero.
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
#export
def get_best_start_end_idxs(_start_logits, _end_logits):
    """Find the span `(start_idx, end_idx)` with `end_idx >= start_idx` whose
    start logit + end logit is largest.

    Both arguments are 1-d sequences whose elements expose `.item()`
    (e.g. a tensor). On ties the first pair in (start, end) order wins.
    Returns `None` when no pair scores above `-inf` (e.g. empty input).
    """
    # Convert to Python floats once, instead of one tensor add + `.item()` per candidate pair.
    start_scores = [logit.item() for logit in _start_logits]
    end_scores = [logit.item() for logit in _end_logits]
    # Start from -inf: the previous sentinel of -1000 silently rejected every span
    # (and returned None) whenever all logit sums were <= -1000.
    best_logit = float("-inf")
    best_idxs = None
    for start_idx, start_logit in enumerate(start_scores):
        # The end token may not precede the start token.
        for end_idx in range(start_idx, len(end_scores)):
            logit_sum = start_logit + end_scores[end_idx]
            if logit_sum > best_logit:
                best_logit = logit_sum
                best_idxs = (start_idx, end_idx)
    return best_idxs

In [None]:
# Inspect one raw input record; `JaccardScore` reads its 'offsets', 'context_text' and 'answer_text'.
valid_ds.inputs[0]

{'input_ids': array([   0, 7974,    2,    2, ...,    1,    1,    1,    1]),
 'attention_mask': array([1, 1, 1, 1, ..., 0, 0, 0, 0]),
 'offsets': array([[0, 0],
        [0, 7],
        [0, 0],
        [0, 0],
        ...,
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]]),
 'tokens': array(['<s>', 'Ġneutral', '</s>', '</s>', ..., '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U10'),
 'context_text': 'I`d have responded, if I were going',
 'answer_text': 'I`d have responded, if I were going',
 'start_end_tok_idxs': (4, 13)}

In [None]:
# First element of the batch inputs (passed to the model as `input_ids`).
xb[0]

tensor([[   0, 1313,    2,  ...,    1,    1,    1],
        [   0, 7974,    2,  ...,    1,    1,    1],
        [   0, 1313,    2,  ...,    1,    1,    1],
        ...,
        [   0, 7974,    2,  ...,    1,    1,    1],
        [   0, 2430,    2,  ...,    1,    1,    1],
        [   0, 7974,    2,  ...,    1,    1,    1]])

In [None]:
# Same record as displayed a few cells above, shown again for comparison with the batch tensor.
valid_ds.inputs[0]

{'input_ids': array([   0, 7974,    2,    2, ...,    1,    1,    1,    1]),
 'attention_mask': array([1, 1, 1, 1, ..., 0, 0, 0, 0]),
 'offsets': array([[0, 0],
        [0, 7],
        [0, 0],
        [0, 0],
        ...,
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]]),
 'tokens': array(['<s>', 'Ġneutral', '</s>', '</s>', ..., '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U10'),
 'context_text': 'I`d have responded, if I were going',
 'answer_text': 'I`d have responded, if I were going',
 'start_end_tok_idxs': (4, 13)}

In [None]:
#export
class JaccardScore(Callback):
    "Metric callback: mean word-level Jaccard between predicted and reference answers, accumulated per batch and reported on epoch end."
    def __init__(self, valid_ds): 
        # Dataset whose `inputs[i]` records provide 'context_text', 'offsets' and 'answer_text'.
        self.valid_ds = valid_ds
        # Number of leading tokens before the context starts (`<s>`, sentiment, `</s>`, `</s>`).
        self.offset_shift = 4
        
        
    def on_epoch_begin(self, **kwargs):
        self.jaccard_scores = []  
        # Running pointer into `valid_ds.inputs`; assumes batches arrive in dataset order.
        self.valid_ds_idx = 0
        
        
    def on_batch_end(self, last_input:Tensor, last_output:Tensor, last_target:Tensor, **kwargs):
        "Decode the best span for every item of the batch and store its Jaccard score."
        input_ids, attention_masks = last_input[0], last_input[1].bool()
        start_logits, end_logits = last_output
        shift = self.offset_shift
        
        for i in range(len(input_ids)):
            # Keep only non-padded positions, then drop the `shift` prefix tokens and the
            # final token, so that only the context part remains.
            mask = attention_masks[i]
            _start_logits = start_logits[i].masked_select(mask)[shift:-1] 
            _end_logits = end_logits[i].masked_select(mask)[shift:-1] 
            
            record = self.valid_ds.inputs[self.valid_ds_idx]
            context_text, answer_text = record['context_text'], record['answer_text']
            
            best_idxs = get_best_start_end_idxs(_start_logits, _end_logits)
            if best_idxs is None:
                # No usable span (e.g. no context tokens): fall back to the whole text
                # instead of failing on tuple unpacking.
                answer = context_text
            else:
                # Map indices relative to the context back to positions in the full sequence.
                start_idx, end_idx = best_idxs[0] + shift, best_idxs[1] + shift
                offsets = record['offsets']
                start_offs, end_offs = offsets[start_idx], offsets[end_idx]
                # From the first character of the start token to the last character of the end token.
                answer = context_text[start_offs[0]:end_offs[1]]            
            
            self.jaccard_scores.append(jaccard(answer, answer_text))
            self.valid_ds_idx += 1
            
    def on_epoch_end(self, last_metrics, **kwargs):        
        "Report the mean of the scores collected since `on_epoch_begin`."
        res = np.mean(self.jaccard_scores)
        return add_metrics(last_metrics, res)

### Training

In [None]:
#export
def model_split_func(m): 
    "Split `m` into 4 layer groups: embeddings, first two thirds of the encoder layers, remaining encoder layers, head."
    backbone = m.sequence_model
    encoder_layers = backbone.encoder.layer
    # Index of the first layer that goes into the second encoder group.
    cut = (2*len(encoder_layers))//3
    lower, upper = encoder_layers[:cut], encoder_layers[cut:]
    return (backbone.embeddings, lower, upper, m.head)

In [None]:
# Learner trained with cross-entropy on start/end positions; `JaccardScore` is given
# `valid_ds` so it can look up texts and offsets when scoring predicted spans.
learner = Learner(data, tse_model, loss_func=CELoss(), metrics=[JaccardScore(valid_ds)])

In [None]:
# Partition the model into the 4 layer groups returned by `model_split_func`.
learner = learner.split(model_split_func)

In [None]:
# Freeze every layer group except the last one (the head, per `model_split_func`).
learner.freeze_to(-1)

In [None]:
# early_stop_cb = EarlyStoppingCallback(learner, monitor='jaccard_score',mode='max',patience=2)
# save_model_cb = SaveModelCallback(learner,every='improvement',monitor='jaccard_score',name=f'{MODEL_TYPE}-qa-finetune')
# csv_logger_cb = CSVLogger(learner, f"training_logs_{foldnum}", True)

In [None]:
# learner.to_fp16();
# learner.to_fp32();

In [None]:
# learner.validate()

### export

In [None]:
# Export the cells flagged with `#export` from the project's notebooks into the Python package.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01-squad-utils.ipynb.
Converted 02-tokenizers.ipynb.
Converted 03-datasets.ipynb.
Converted 04-models.ipynb.
Converted post-process.ipynb.
