In [1]:
# default_exp models

In [2]:
# export
from fastai.text import *
from transformers import RobertaModel, RobertaConfig
from tse.preprocessing import *
from tse.tokenizers import *
from tse.datasets import *

tokenizers: 0.7.0
fastai: 1.0.60


In [3]:
#export
def get_roberta_model(path_to_dir="../roberta-base/"):
    conf = RobertaConfig.from_pretrained(path_to_dir)
    conf.output_hidden_states = True
    model = RobertaModel.from_pretrained(path_to_dir, config=conf)
    # outputs: (final_hidden, pooled_final_hidden, (embedding + 12 hidden))
    return model

In [4]:
model = get_roberta_model()

### Create databunch

Preprocess

In [5]:
train_df = pd.read_csv("../data/train.csv").dropna().reset_index(drop=True)
test_df = pd.read_csv("../data/test.csv")

In [6]:
strip_text(train_df, "text")
strip_text(train_df, "selected_text")
strip_text(test_df, "text")

In [7]:
replace_whitespace(train_df, "text")
replace_whitespace(train_df, "selected_text")
replace_whitespace(test_df, "text")

In [8]:
replace_URLs(train_df, "text")
replace_URLs(train_df, "selected_text")
replace_URLs(test_df, "text")

In [9]:
replace_user(train_df, "text")
replace_user(train_df, "selected_text")
replace_user(test_df, "text")

In [10]:
is_wrong = train_df.apply(lambda o: is_wrong_selection(o['text'], o['selected_text']), 1)
train_df = train_df[~is_wrong].reset_index(drop=True)

In [11]:
from sklearn.model_selection import KFold
kfold = KFold(5, shuffle=True, random_state=42)
fold_idxs = list(kfold.split(train_df))
for i, (trn_idx, val_idx) in enumerate(fold_idxs): train_df.loc[val_idx, "val_fold"] = int(i)

In [12]:
tokenizer = init_roberta_tokenizer("../roberta-base/vocab.json", "../roberta-base/merges.txt", 192)

In [13]:
train_df.val_fold.value_counts()

2.0    5492
0.0    5492
3.0    5492
1.0    5492
4.0    5491
Name: val_fold, dtype: int64

In [14]:
# get fold dfs
trn_df = train_df[train_df['val_fold'] != 0]
val_df = train_df[train_df['val_fold'] == 0]

In [15]:
# get fold inputs
train_inputs = QAInputGenerator.from_df(trn_df, tokenizer=tokenizer)
valid_inputs = QAInputGenerator.from_df(val_df, tokenizer=tokenizer)

In [16]:
#export
do_tfms = {}
do_tfms["random_left_truncate"] = {"p":.3}
do_tfms["random_right_truncate"] = {"p":.3}
do_tfms["random_replace_with_mask"] = {"p":.3, "mask_p":0.2}
do_tfms

{'random_left_truncate': {'p': 0.3},
 'random_right_truncate': {'p': 0.3},
 'random_replace_with_mask': {'p': 0.3, 'mask_p': 0.2}}

In [17]:
# fold ds
train_ds = TSEDataset(train_inputs, tokenizer, is_test=False, do_tfms=do_tfms)
valid_ds = TSEDataset(train_inputs, tokenizer, is_test=False)

In [18]:
train_ds[1]

((tensor([    0,  2430,     2,     2,    98,  3036,  5074,   939,    40,  2649,
             47,   259,    11, 15610,  1597,  2977, 16506,     2,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,   

In [19]:
data = DataBunch.create(train_ds, valid_ds, bs=32, val_bs=64)

In [20]:
xb,yb = data.one_batch()

In [21]:
xb, yb

([tensor([[   0, 7974,    2,  ...,    1,    1,    1],
          [   0, 2430,    2,  ...,    1,    1,    1],
          [   0, 1313,    2,  ...,    1,    1,    1],
          ...,
          [   0, 7974,    2,  ...,    1,    1,    1],
          [   0, 1313,    2,  ...,    1,    1,    1],
          [   0, 1313,    2,  ...,    1,    1,    1]]),
  tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          ...,
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]])],
 [tensor([ 4, 13, 28,  4, 33,  4,  5,  4, 14,  4,  8,  8,  6,  4, 12,  7,  4,  7,
          14, 17,  9,  6,  5,  4,  4, 12, 11, 10, 13,  4,  4, 32]),
  tensor([22, 26, 28, 39, 34,  7,  5,  4, 18, 40,  8,  8,  7, 30, 12, 18, 10,  7,
          34, 26, 12,  7, 11,  7,  8, 13, 14, 12, 14, 20,  5, 32])])

### TSEModel

In [22]:
#export
noop_layer = Lambda(lambda x: x)

class QAHead(Module): 
    def __init__(self, p=0.5, hidden_size=768, num_hidden_states=2, use_ln=False):  
        self.ln0 = nn.LayerNorm(hidden_size*num_hidden_states) if use_ln else noop_layer
        self.d0 = nn.Dropout(p)
        self.l0 = nn.Linear(hidden_size*num_hidden_states, 2)
    def forward(self, x):
        return self.l0(self.d0(self.ln0(x)))
    
class TSEModel(Module):
    def __init__(self, pretrained_model, head=QAHead(), num_hidden_states=2): 
        self.sequence_model = pretrained_model
        self.head = head
        self.num_hidden_states = num_hidden_states
        
    def forward(self, *xargs):
        inp = {}
        inp["input_ids"] = xargs[0]
        inp["attention_mask"] = xargs[1]    
        _, _, hidden_states = self.sequence_model(**inp)
        x = torch.cat(hidden_states[-self.num_hidden_states:], dim=-1)
        start_logits, end_logits = self.head(x).split(1, dim=-1)
        return (start_logits.squeeze(-1), end_logits.squeeze(-1))

### loss

In [23]:
tse_model = TSEModel(model, QAHead(use_ln=True, num_hidden_states=2), num_hidden_states=2)

In [24]:
out = tse_model(*xb)

In [25]:
out

(tensor([[ 0.8597,  0.5541,  0.8427,  ..., -0.2491, -0.1137, -0.1327],
         [ 0.6116, -1.3524,  0.0248,  ...,  0.6115,  0.8474, -0.9773],
         [ 0.7890,  0.1800,  0.9404,  ...,  0.1278,  0.0720,  0.8742],
         ...,
         [ 0.6648, -0.7379,  0.6890,  ..., -0.5549,  0.1579,  0.1812],
         [ 0.7905,  0.2779,  0.0317,  ...,  1.1267,  0.9727, -0.0054],
         [-0.0068, -0.1116,  0.7793,  ...,  0.2527, -0.1828, -0.8484]],
        grad_fn=<SqueezeBackward1>),
 tensor([[ 0.6134,  1.2737, -0.0136,  ..., -0.6596, -0.1256, -0.0377],
         [ 0.6160,  0.4515, -0.0087,  ...,  0.4548,  0.3897,  0.3446],
         [ 0.6111,  0.6305,  0.1408,  ...,  1.0373,  0.4215,  0.3342],
         ...,
         [ 0.5712,  0.1099,  0.6075,  ...,  0.7023,  0.2611,  0.2541],
         [ 0.6227,  0.8035,  0.4469,  ...,  0.9531,  1.4756,  0.9023],
         [-0.1501,  1.2173,  0.6362,  ...,  0.0563,  0.8055,  1.0089]],
        grad_fn=<SqueezeBackward1>))

In [26]:
#export
class CELoss(Module):
    "single backward by concatenating both start and logits with correct targets"
    def __init__(self): self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs
        logits = torch.cat([start_logits, end_logits]).contiguous()
        targets = torch.cat([start_targets, end_targets]).contiguous()
        return self.loss_fn(logits, targets)

In [27]:
loss_fn = CELoss()
loss_fn(out, *yb)

tensor(5.3878, grad_fn=<NllLossBackward>)

In [28]:
#export
class LSLoss(Module):
    "single backward by concatenating both start and logits with correct targets"
    def __init__(self, eps=0.1): self.loss_fn = LabelSmoothingCrossEntropy(eps=eps)
    def forward(self, inputs, start_targets, end_targets):
        start_logits, end_logits = inputs
        logits = torch.cat([start_logits, end_logits]).contiguous()
        targets = torch.cat([start_targets, end_targets]).contiguous()
        return self.loss_fn(logits, targets)

In [29]:
loss_fn = LSLoss()
loss_fn(out, *yb)

tensor(5.3878, grad_fn=<AddBackward0>)

### metric

In [30]:
#export
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [31]:
#export
def get_best_start_end_idxs(_start_logits, _end_logits):
    best_logit = -1000
    best_idxs = None
    for start_idx, start_logit in enumerate(_start_logits):
        for end_idx, end_logit in enumerate(_end_logits[start_idx:]):
            logit_sum = (start_logit + end_logit).item()
            if logit_sum > best_logit:
                best_logit = logit_sum
                best_idxs = (start_idx, start_idx+end_idx)
    return best_idxs

In [32]:
valid_ds.inputs[0]

{'input_ids': array([   0, 7974,    2,    2, ...,    1,    1,    1,    1]),
 'attention_mask': array([1, 1, 1, 1, ..., 0, 0, 0, 0]),
 'offsets': array([[0, 0],
        [0, 7],
        [0, 0],
        [0, 0],
        ...,
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]]),
 'tokens': array(['<s>', 'Ġneutral', '</s>', '</s>', ..., '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U10'),
 'context_text': 'I`d have responded, if I were going',
 'answer_text': 'I`d have responded, if I were going',
 'start_end_tok_idxs': (4, 13),
 'text_id': 'cb774db0d1'}

In [33]:
xb[0]

tensor([[   0, 7974,    2,  ...,    1,    1,    1],
        [   0, 2430,    2,  ...,    1,    1,    1],
        [   0, 1313,    2,  ...,    1,    1,    1],
        ...,
        [   0, 7974,    2,  ...,    1,    1,    1],
        [   0, 1313,    2,  ...,    1,    1,    1],
        [   0, 1313,    2,  ...,    1,    1,    1]])

In [34]:
valid_ds.inputs[0]

{'input_ids': array([   0, 7974,    2,    2, ...,    1,    1,    1,    1]),
 'attention_mask': array([1, 1, 1, 1, ..., 0, 0, 0, 0]),
 'offsets': array([[0, 0],
        [0, 7],
        [0, 0],
        [0, 0],
        ...,
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]]),
 'tokens': array(['<s>', 'Ġneutral', '</s>', '</s>', ..., '<pad>', '<pad>', '<pad>', '<pad>'], dtype='<U10'),
 'context_text': 'I`d have responded, if I were going',
 'answer_text': 'I`d have responded, if I were going',
 'start_end_tok_idxs': (4, 13),
 'text_id': 'cb774db0d1'}

In [35]:
#export
class JaccardScore(Callback):
    "Stores predictions and targets to perform calculations on epoch end."
    def __init__(self, valid_ds): 
        self.valid_ds = valid_ds
        self.offset_shift = 4
        
        
    def on_epoch_begin(self, **kwargs):
        self.jaccard_scores = []  
        self.valid_ds_idx = 0
        
        
    def on_batch_end(self, last_input:Tensor, last_output:Tensor, last_target:Tensor, **kwargs):
                
        input_ids, attention_masks = last_input[0], last_input[1].bool()
        start_logits, end_logits = last_output
        
        
        # mask select only context part
        for i in range(len(input_ids)):

            _input_ids = input_ids[i].masked_select(attention_masks[i])
            _start_logits = start_logits[i].masked_select(attention_masks[i])[4:-1] 
            _end_logits = end_logits[i].masked_select(attention_masks[i])[4:-1] 
            start_idx, end_idx = get_best_start_end_idxs(_start_logits, _end_logits)
            start_idx, end_idx = start_idx + self.offset_shift, end_idx + self.offset_shift
            
            context_text = self.valid_ds.inputs[self.valid_ds_idx]['context_text']
            offsets = self.valid_ds.inputs[self.valid_ds_idx]['offsets']
            answer_text = self.valid_ds.inputs[self.valid_ds_idx]['answer_text']
            
            start_offs, end_offs = offsets[start_idx], offsets[end_idx]
            answer = context_text[start_offs[0]:end_offs[1]]            
            
            self.jaccard_scores.append(jaccard(answer, answer_text))
            self.valid_ds_idx += 1
            
    def on_epoch_end(self, last_metrics, **kwargs):        
        res = np.mean(self.jaccard_scores)
        return add_metrics(last_metrics, res)

### Training

In [47]:
#export
def model_split_func(m, num_hidden_states): 
    "4 layer groups"
    return (m.sequence_model.embeddings, 
            m.sequence_model.encoder.layer[:-num_hidden_states],
            m.sequence_model.encoder.layer[-num_hidden_states:],
            m.head)

In [48]:
learner = Learner(data, tse_model, loss_func=CELoss(), metrics=[JaccardScore(valid_ds)])

In [49]:
learner.model

TSEModel(
  (sequence_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-0

In [51]:
split_fn = partial(model_split_func, num_hidden_states=2)
learner = learner.split(split_fn)

In [52]:
learner.freeze_to(-1)

In [53]:
# early_stop_cb = EarlyStoppingCallback(learner, monitor='jaccard_score',mode='max',patience=2)
# save_model_cb = SaveModelCallback(learner,every='improvement',monitor='jaccard_score',name=f'{MODEL_TYPE}-qa-finetune')
# csv_logger_cb = CSVLogger(learner, f"training_logs_{foldnum}", True)

In [54]:
# learner.to_fp16();
# learner.to_fp32();

In [55]:
# learner.validate()

### export

In [56]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01-preprocessing.ipynb.
Converted 01-squad-utils.ipynb.
Converted 02-tokenizers.ipynb.
Converted 03-datasets.ipynb.
Converted 04-models.ipynb.
Converted post-process.ipynb.
