In [1]:
import re
from datasets import load_dataset, load_from_disk, DatasetDict
from transformers import AutoTokenizer

In [2]:
label_list = [
    'B-ACCOUNTNUM',
    'B-BUILDINGNUM',
    'B-CITY',
    'B-CREDITCARDNUMBER',
    'B-DATEOFBIRTH',
    'B-DRIVERLICENSENUM',
    'B-EMAIL',
    'B-GIVENNAME',
    'B-IDCARDNUM',
    'B-PASSWORD',
    'B-SOCIALNUM',
    'B-STREET',
    'B-SURNAME',
    'B-TAXNUM',
    'B-TELEPHONENUM',
    'B-USERNAME',
    'B-ZIPCODE',
    'I-ACCOUNTNUM',
    'I-BUILDINGNUM',
    'I-CITY',
    'I-CREDITCARDNUMBER',
    'I-DATEOFBIRTH',
    'I-DRIVERLICENSENUM',
    'I-EMAIL',
    'I-GIVENNAME',
    'I-IDCARDNUM',
    'I-PASSWORD',
    'I-SOCIALNUM',
    'I-STREET',
    'I-SURNAME',
    'I-TAXNUM',
    'I-TELEPHONENUM',
    'I-USERNAME',
    'I-ZIPCODE',
    'O',
]

id2label = {idx: label for idx, label in enumerate(label_list)}
label2id = {label: idx for idx, label in enumerate(label_list)}
label_set = set(l[2:] for l in label_list[:-1])

In [3]:
def generate_sequence_labels(text, privacy_mask):
    # sort privacy mask by start position
    privacy_mask = sorted(privacy_mask, key=lambda x: x['start'], reverse=True)
    
    # replace sensitive pieces of text with labels
    for item in privacy_mask:
        label = item['label']
        start = item['start']
        end = item['end']
        value = item['value']
        # count the number of words in the value
        word_count = len(value.split())
        
        # replace the sensitive information with the appropriate number of [label] placeholders
        replacement = " ".join([f"{label}" for _ in range(word_count)])
        text = text[:start] + replacement + text[end:]
        
    words = text.split()
    # assign labels to each word
    labels = []
    for word in words:
        match = re.search(r"(\w+)", word)  # match any word character
        if match:
            label = match.group(1)
            if label in label_set:
                labels.append(label)
            else:
                # any other word is labeled as "O"
                labels.append("O")
        else:
            labels.append("O")
    return labels

In [4]:
k = 0
def tokenize_and_align_labels(examples):
    words = [t.split() for t in examples["source_text"]]
    tokenized_inputs = tokenizer(words, truncation=True, is_split_into_words=True, max_length=512)
    source_labels = [
        generate_sequence_labels(text, mask)
        for text, mask in zip(examples["source_text"], examples["privacy_mask"])
    ]

    labels = []
    valid_idx = []
    for i, label in enumerate(source_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # map tokens to their respective word.
        previous_label = None
        label_ids = [-100]
        try:
            for word_idx in word_ids:
                if word_idx is None:
                    continue
                elif label[word_idx] == "O":
                    label_ids.append(label2id["O"])
                    continue
                elif previous_label == label[word_idx]:
                    label_ids.append(label2id[f"I-{label[word_idx]}"])
                else:
                    label_ids.append(label2id[f"B-{label[word_idx]}"])
                previous_label = label[word_idx]
            label_ids = label_ids[:511] + [-100]
            labels.append(label_ids)
            # print(word_ids)
            # print(label_ids)
        except:
            global k
            k += 1
            # print(f"{word_idx = }")
            # print(f"{len(label) = }")
            labels.append([-100] * len(tokenized_inputs["input_ids"][i]))
        """
        except:
            print(f"{word_ids[-2] = }")
            print(f"{len(label) = }")
            print("Unvalid data detected")
            labels.append([-100] * len(word_ids))
        """

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# number of invalid data
print(k)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/mdeberta-v3-base", truncation=True, max_length=512)
dataset = load_dataset("ai4privacy/pii-masking-400k")
train = dataset["train"]
valid = dataset["validation"]



In [6]:
token_train = train.map(tokenize_and_align_labels, batched=True)
token_valid = valid.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/325517 [00:00<?, ? examples/s]

Map:   0%|          | 0/81379 [00:00<?, ? examples/s]

In [7]:
k

933

In [8]:
tokenized_datasets = DatasetDict({
    "train": token_train,
    "validation": token_valid
})

tokenized_datasets.save_to_disk("./tokenized_dataset/gen_tokenized_data")

Saving the dataset (0/2 shards):   0%|          | 0/325517 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/81379 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, load_from_disk

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/mdeberta-v3-base", truncation=True, max_length=512)
train = load_from_disk("./tokenized_dataset/gen_tokenized_data")["train"]



In [12]:
example = tokenizer.convert_ids_to_tokens(train[0]["source_text"])
tokens = tokenizer(train[0])
labels = train[0]['labels']

In [13]:
len(labels) == len(tokens)

True

In [14]:
for i in range(1, len(train[0]["labels"])-1): 
    print(f"token: '{tokens[i]}';\tlabel: '{id2label[labels[i]]}'")

token: '▁<';	label: 'O'
token: 'p';	label: 'O'
token: '>';	label: 'O'
token: 'My';	label: 'O'
token: '▁child';	label: 'O'
token: '▁fao';	label: 'B-USERNAME'
token: 'zz';	label: 'I-USERNAME'
token: 's';	label: 'I-USERNAME'
token: 'd';	label: 'I-USERNAME'
token: '379';	label: 'I-USERNAME'
token: '223';	label: 'I-USERNAME'
token: '▁(';	label: 'O'
token: 'DOB';	label: 'O'
token: ':';	label: 'O'
token: '▁May';	label: 'B-DATEOFBIRTH'
token: '/';	label: 'I-DATEOFBIRTH'
token: '58)';	label: 'I-DATEOFBIRTH'
token: '▁will';	label: 'O'
token: '▁under';	label: 'O'
token: 'go';	label: 'O'
token: '▁treatment';	label: 'O'
token: '▁with';	label: 'O'
token: '▁Dr';	label: 'O'
token: '.';	label: 'O'
token: '▁fao';	label: 'B-USERNAME'
token: 'zz';	label: 'I-USERNAME'
token: 's';	label: 'I-USERNAME'
token: 'd';	label: 'I-USERNAME'
token: '379';	label: 'I-USERNAME'
token: '223';	label: 'I-USERNAME'
token: ',';	label: 'I-USERNAME'
token: '▁office';	label: 'O'
token: '▁at';	label: 'O'
token: '▁Hill';	label: '