In [1]:
import transformers

In [93]:
# Fast BERT tokenizer for the 'bert-base-uncased' checkpoint. Later cells call
# it with return_offsets_mapping=True to get per-token character offsets.
tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466062.0), HTML(value='')))




In [5]:
# Sample text in which one term is wrapped in <b>...</b> tags.
context = 'Create and share like never before at <b>Polaroid</b>.com. Find instant film and   cameras reinvented for the digital age. Plus, digital cameras, digital camcorders,   LCD ...'

In [74]:
import re

def get_match_positions(match):
    """Return the four boundary offsets of a match that has a capture group.

    Args:
        match: a regex match object with at least one capture group.

    Returns:
        A tuple ``(outer_start, inner_start, inner_end, outer_end)``: the
        outer pair delimits the whole match (group 0), the inner pair
        delimits capture group 1.
    """
    whole_start, whole_end = match.span(0)
    group_start, group_end = match.span(1)
    return whole_start, group_start, group_end, whole_end

def get_word_spans_positions(context):
    """Locate every ``<b>...</b>`` fragment in ``context``.

    Args:
        context: text that may contain ``<b>``-wrapped fragments.

    Returns:
        A list with one ``(outer_start, inner_start, inner_end, outer_end)``
        tuple per match, as produced by ``get_match_positions``.
    """
    # The group is non-greedy so that each closing tag ends its own fragment
    # instead of one match swallowing everything up to the last </b>.
    bold_matches = re.finditer(r'<b>(.+?)</b>', context)
    return list(map(get_match_positions, bold_matches))

def clear_tags_and_get_positions(context):
    """Strip ``<b>``/``</b>`` markers and report where the marked text lands.

    Args:
        context: text with zero or more ``<b>...</b>`` fragments.

    Returns:
        A pair ``(cleared, positions)``. ``cleared`` is ``context`` with the
        tags removed and their inner text kept; ``positions`` is a list of
        ``(start, end)`` character offsets into ``cleared``, one per fragment.
    """
    pieces = []
    spans = []
    cursor = 0       # index in ``context`` just past the last consumed tag pair
    cleared_len = 0  # length of the tag-free text assembled so far
    for outer_start, inner_start, inner_end, outer_end in get_word_spans_positions(context):
        leading = context[cursor:outer_start]
        marked = context[inner_start:inner_end]
        pieces += [leading, marked]

        # The fragment begins right after the untagged text that precedes it
        # and ends once its own characters have been emitted.
        span_start = cleared_len + len(leading)
        cleared_len = span_start + len(marked)
        spans.append((span_start, cleared_len))
        cursor = outer_end
    pieces.append(context[cursor:])
    return ''.join(pieces), spans

In [80]:
# Strip the tags from an example with two marked fragments; `positions` holds
# each fragment's (start, end) offsets in the tag-free text.
cleared_ctx, positions = clear_tags_and_get_positions('context, <b>one</b> and then <b>And yet another </b> so the end')

In [81]:
# Display the tag-free text.
cleared_ctx

'context, one and then And yet another  so the end'

In [82]:
# Sanity check: slicing the tag-free text with each span should give back the
# originally marked fragment (trailing whitespace included).
for start, end in positions:
    print(cleared_ctx[start:end])

one
And yet another 


In [163]:
def get_word_token_ids(all_token_positions, word_positions):
    """Collect the indices of tokens that fall inside the given character spans.

    Tokens and spans are consumed together in a single forward pass, so both
    sequences are assumed to be ordered by position (in this notebook they
    come from a tokenizer offset mapping and a left-to-right regex scan).

    Args:
        all_token_positions: sequence of ``(start, end)`` character offsets,
            one per token. The first entry is always skipped, and an entry
            whose ``end`` is 0 is never collected (the offset mapping shown
            in this notebook uses ``(0, 0)`` for its first and last entries).
        word_positions: iterable of ``(start, end)`` character spans.

    Returns:
        A list of indices into ``all_token_positions`` for tokens that start
        at or after a span's start and end at or before its end. If the
        tokens run out before every span has been matched (for example the
        offsets cover only a prefix of the text), the indices found so far
        are returned instead of raising.
    """
    exhausted = (None, None)
    token_iter = enumerate(all_token_positions)
    next(token_iter, None)  # the leading entry is skipped unconditionally
    i, current_pos = next(token_iter, exhausted)

    result = []
    for word_start, word_end in word_positions:
        # Skip tokens that begin before the span. The None guard and the
        # default on next() keep this from raising once tokens are used up.
        while current_pos is not None and current_pos[0] < word_start:
            i, current_pos = next(token_iter, exhausted)
        # Take tokens that end inside the span; an end offset of 0 stops the
        # run without being collected.
        while current_pos is not None and 0 < current_pos[1] <= word_end:
            result.append(i)
            i, current_pos = next(token_iter, exhausted)
        if current_pos is None:
            # No tokens left, so later spans cannot match anything.
            break
    return result

In [164]:
# Tokenize the tag-free text and keep the per-token (start, end) character
# offsets; the last expression displays them.
enc = tokenizer.encode_plus(cleared_ctx, return_offsets_mapping=True)
all_token_positions = enc['offset_mapping']
all_token_positions

[(0, 0),
 (0, 7),
 (7, 8),
 (9, 12),
 (13, 16),
 (17, 21),
 (22, 25),
 (26, 29),
 (30, 37),
 (39, 41),
 (42, 45),
 (46, 49),
 (0, 0)]

In [165]:
# Map the marked character spans to indices into the token offset list.
word_token_ids = get_word_token_ids(all_token_positions, positions)
word_token_ids

[3, 6, 7, 8]

In [166]:
# Decode only the selected tokens to check they spell out the marked text.
tokenizer.decode([enc['input_ids'][i] for i in word_token_ids])

'one and yet another'

In [167]:
def test_ctx(context, expected):
    """Assert that the tokens picked for the marked fragments decode as expected.

    The tags are stripped from ``context``, the result is tokenized with
    offsets, the tokens lying inside the marked spans are selected, and their
    decoded text is compared with ``expected``.

    Args:
        context: text containing ``<b>...</b>`` fragments.
        expected: the string the selected tokens should decode to.

    Raises:
        AssertionError: if the decoded text differs from ``expected``.
    """
    plain_text, spans = clear_tags_and_get_positions(context)
    encoding = tokenizer.encode_plus(plain_text, return_offsets_mapping=True)
    token_indices = get_word_token_ids(encoding['offset_mapping'], spans)
    selected_ids = [encoding['input_ids'][idx] for idx in token_indices]
    actual = tokenizer.decode(selected_ids)
    assert actual == expected, f'Actual is "{actual}"'
    

In [168]:
# Two marked fragments, with plain text before, between and after them.
test_ctx('context, <b>one</b> and then <b>And yet another </b> so the end', 'one and yet another')
# First fragment at the very start of the text.
test_ctx('<b>one</b> and then <b>And yet another </b> so the end', 'one and yet another')
# Last fragment at the very end of the text.
test_ctx('context, <b>one</b> and then <b>And yet another </b>', 'one and yet another')
# Fragments at both the start and the end.
test_ctx('<b>one</b> and then <b>And yet another </b>', 'one and yet another')
# Fragments separated only by a single space.
test_ctx('<b>one</b> <b>And yet another </b>', 'one and yet another')
# The whole text is one fragment: two words, then a single word.
test_ctx('<b>only one</b>', 'only one')
test_ctx('<b>only</b>', 'only')

In [169]:
import torch
# Scratch check: assigning through a list of indices overwrites exactly those
# positions of the tensor in place.
t = torch.tensor([5, 8, 2, 4])
t[[1, 2, 3]] = -1
t

tensor([ 5, -1, -1, -1])