In [1]:
from datasets import load_dataset
from datasets import DatasetDict
from transformers import AutoTokenizer

In [2]:
# Entity labels for token classification. Each entity type appears with a "B-" prefix and
# again with an "I-" prefix; 'O' comes last. A label's position in this list is its class id.
label_list = [
    # "B-" labels
    'B-ACCOUNTNUM',
    'B-BUILDINGNUM',
    'B-CITY',
    'B-CREDITCARDNUMBER',
    'B-DATEOFBIRTH',
    'B-DRIVERLICENSENUM',
    'B-EMAIL',
    'B-GIVENNAME',
    'B-IDCARDNUM',
    'B-PASSWORD',
    'B-SOCIALNUM',
    'B-STREET',
    'B-SURNAME',
    'B-TAXNUM',
    'B-TELEPHONENUM',
    'B-USERNAME',
    'B-ZIPCODE',
    # "I-" labels (same entity types, same order)
    'I-ACCOUNTNUM',
    'I-BUILDINGNUM',
    'I-CITY',
    'I-CREDITCARDNUMBER',
    'I-DATEOFBIRTH',
    'I-DRIVERLICENSENUM',
    'I-EMAIL',
    'I-GIVENNAME',
    'I-IDCARDNUM',
    'I-PASSWORD',
    'I-SOCIALNUM',
    'I-STREET',
    'I-SURNAME',
    'I-TAXNUM',
    'I-TELEPHONENUM',
    'I-USERNAME',
    'I-ZIPCODE',
    # the single label without an entity type
    'O',
]

# Lookup tables in both directions: class id -> label string, and label string -> class id.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [3]:
# Load the multilingual cased BERT tokenizer.
# Truncation is requested per call (`tokenizer(text, truncation=True)`, as the cells below do),
# not at load time: extra keyword arguments to `from_pretrained` are only forwarded to the
# tokenizer constructor, where `truncation` does not configure anything.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [4]:
# Fetch the ai4privacy PII-masking dataset and pick out the two splits used below.
dataset = load_dataset("ai4privacy/pii-masking-400k")
training_set, valid_set = dataset["train"], dataset["validation"]

In [5]:
# Display the column schema of the training split.
training_set.features

{'source_text': Value(dtype='string', id=None),
 'locale': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'split': Value(dtype='string', id=None),
 'privacy_mask': [{'label': Value(dtype='string', id=None),
   'start': Value(dtype='int64', id=None),
   'end': Value(dtype='int64', id=None),
   'value': Value(dtype='string', id=None),
   'label_index': Value(dtype='int64', id=None)}],
 'uid': Value(dtype='int64', id=None),
 'masked_text': Value(dtype='string', id=None),
 'mbert_tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'mbert_token_classes': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [6]:
# Take the first training record: keep its raw text for the tokenizer comparison in the
# next cell, and display the word pieces that ship with the dataset for that record.
first_record = training_set[0]
example = first_record['source_text']
first_record["mbert_tokens"]

['<',
 'p',
 '>',
 'My',
 'child',
 'fa',
 '##oz',
 '##zs',
 '##d',
 '##3',
 '##7',
 '##9',
 '##22',
 '##3',
 '(',
 'DO',
 '##B',
 ':',
 'May',
 '/',
 '58',
 ')',
 'will',
 'under',
 '##go',
 'treatment',
 'with',
 'Dr',
 '.',
 'fa',
 '##oz',
 '##zs',
 '##d',
 '##3',
 '##7',
 '##9',
 '##22',
 '##3',
 ',',
 'office',
 'at',
 'Hill',
 'Road',
 '.',
 'Our',
 'Z',
 '##IP',
 'code',
 'is',
 '281',
 '##70',
 '-',
 '639',
 '##2',
 '.',
 'Con',
 '##sul',
 '##t',
 'policy',
 'M',
 '.',
 'UE',
 '.',
 '227',
 '##99',
 '##5',
 '.',
 'Contact',
 'number',
 ':',
 '007',
 '##0',
 '.',
 '606',
 '.',
 '322',
 '.',
 '624',
 '##4',
 '.',
 'Hand',
 '##le',
 'transaction',
 '##s',
 'with',
 '622',
 '##5',
 '##42',
 '##7',
 '##22',
 '##04',
 '##12',
 '##9',
 '##6',
 '##3',
 '.',
 'Que',
 '##ries',
 '?',
 'Em',
 '##ail',
 ':',
 'fa',
 '##oz',
 '##zs',
 '##d',
 '##3',
 '##7',
 '##9',
 '##22',
 '##3',
 '@',
 'out',
 '##lo',
 '##ok',
 '.',
 'com',
 '.',
 '<',
 '/',
 'p',
 '>']

In [7]:
# Tokenize the same text ourselves. Compared with the dataset's `mbert_tokens`, the tokens
# are identical except for the special token added at the first and at the last position.
tokenized_inputs = tokenizer(example, truncation=True)
example_input_ids = tokenized_inputs["input_ids"]
tokenizer.convert_ids_to_tokens(example_input_ids)

['[CLS]',
 '<',
 'p',
 '>',
 'My',
 'child',
 'fa',
 '##oz',
 '##zs',
 '##d',
 '##3',
 '##7',
 '##9',
 '##22',
 '##3',
 '(',
 'DO',
 '##B',
 ':',
 'May',
 '/',
 '58',
 ')',
 'will',
 'under',
 '##go',
 'treatment',
 'with',
 'Dr',
 '.',
 'fa',
 '##oz',
 '##zs',
 '##d',
 '##3',
 '##7',
 '##9',
 '##22',
 '##3',
 ',',
 'office',
 'at',
 'Hill',
 'Road',
 '.',
 'Our',
 'Z',
 '##IP',
 'code',
 'is',
 '281',
 '##70',
 '-',
 '639',
 '##2',
 '.',
 'Con',
 '##sul',
 '##t',
 'policy',
 'M',
 '.',
 'UE',
 '.',
 '227',
 '##99',
 '##5',
 '.',
 'Contact',
 'number',
 ':',
 '007',
 '##0',
 '.',
 '606',
 '.',
 '322',
 '.',
 '624',
 '##4',
 '.',
 'Hand',
 '##le',
 'transaction',
 '##s',
 'with',
 '622',
 '##5',
 '##42',
 '##7',
 '##22',
 '##04',
 '##12',
 '##9',
 '##6',
 '##3',
 '.',
 'Que',
 '##ries',
 '?',
 'Em',
 '##ail',
 ':',
 'fa',
 '##oz',
 '##zs',
 '##d',
 '##3',
 '##7',
 '##9',
 '##22',
 '##3',
 '@',
 'out',
 '##lo',
 '##ok',
 '.',
 'com',
 '.',
 '<',
 '/',
 'p',
 '>',
 '[SEP]']

In [8]:
# align label and tokens
def align_labels(examples):
    """Tokenize one record and attach a label id to every token that is produced.

    Despite the plural name, `examples` is a single record: the function is mapped over
    the dataset with `batched=False`.

    Args:
        examples: mapping that provides
            - "source_text": the raw text to tokenize, and
            - "mbert_token_classes": a list with one label string (a key of `label2id`)
              per word piece of that text, with no entries for special tokens.

    Returns:
        The tokenizer output for "source_text" with an additional "labels" entry:
        -100 for the leading and the trailing special token, and the label id of each
        word piece in between.

    Raises:
        KeyError: if a label string that is kept after truncation is not in `label2id`.

    Note:
        Assumes the tokenizer reproduces the dataset's word pieces one-to-one and wraps
        them in exactly one special token on each side (this holds for the example
        inspected earlier in the notebook, but is not verified here). If the dataset
        provides fewer classes than there are word pieces, "labels" is shorter than
        "input_ids"; no padding is added.
    """
    tokenized_inputs = tokenizer(examples["source_text"], truncation=True)

    # Number of word pieces that survived truncation: everything except the two special
    # tokens. Deriving the cut-off from the encoding keeps labels aligned with input_ids
    # for any tokenizer length limit, instead of hard-coding 512-token arithmetic.
    num_word_pieces = len(tokenized_inputs["input_ids"]) - 2

    word_piece_labels = [
        label2id[privacy_class]
        for privacy_class in examples["mbert_token_classes"][:num_word_pieces]
    ]

    # -100 marks the special-token positions, which carry no label
    # (presumably the ignore index of the training loss; confirm against the training code).
    tokenized_inputs["labels"] = [-100] + word_piece_labels + [-100]
    return tokenized_inputs

In [9]:
# Tokenize every record of both splits and attach its label ids, one record per call.
tokenized_training_set, tokenized_valid_set = (
    split.map(align_labels, batched=False)
    for split in (training_set, valid_set)
)

In [10]:
# Columns to drop from both tokenized splits before saving.
removed_columns = ['locale', 'split', 'privacy_mask', 'uid', 'mbert_tokens', 'mbert_token_classes']

# Apply the same column removal to the training and the validation split.
tokenized_training_set, tokenized_valid_set = (
    split.remove_columns(removed_columns)
    for split in (tokenized_training_set, tokenized_valid_set)
)

In [11]:
# Bundle both processed splits under their split names and write the bundle to disk.
splits_by_name = {
    "train": tokenized_training_set,
    "validation": tokenized_valid_set,
}
tokenized_datasets = DatasetDict(splits_by_name)

tokenized_datasets.save_to_disk("./tokenized_dataset/tokenized_data")

Saving the dataset (0/1 shards):   0%|          | 0/325517 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/81379 [00:00<?, ? examples/s]