In [2]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [3]:
pip install datasets



In [55]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load the dataset
dataset = load_dataset("Isotonic/pii-masking-200k")

# Split the dataset into training, validation, and test sets
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
validation_test_split = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
dataset_splits = DatasetDict({
    "train": split_dataset["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
})

# Reduce dataset sizes for faster processing
subset_splits = dataset_splits.map(
    lambda split: split.shuffle(seed=42).select(range(5000)) if len(split) > 5000 else split
)

# Confirm reduced dataset sizes
print(f"Training size: {len(subset_splits['train'])}")
print(f"Validation size: {len(subset_splits['validation'])}")
print(f"Test size: {len(subset_splits['test'])}")

Training size: 167408
Validation size: 20926
Test size: 20927


In [16]:
# Create a mapping for the bio_labels
label_list = []

# Extract unique labels from the 'bio_labels' column
def get_unique_labels(dataset):
    unique_labels = set()
    for row in dataset["bio_labels"]:
        unique_labels.update(row)  # Add all labels in the current row to the set
    return sorted(unique_labels)  # Return sorted labels for consistency

# Apply the function to the entire dataset
unique_labels = get_unique_labels(dataset["train"])  # Run on training set or all splits if needed

# Print the unique labels
print("Unique BIO Labels:", unique_labels)

# Combine with the existing label_list
label_list = sorted(set(label_list + list(unique_labels)))  # Ensure no duplicates
print("Final Label List:", label_list)
print(len(label_list))

Unique BIO Labels: ['B-ACCOUNTNAME', 'B-ACCOUNTNUMBER', 'B-AGE', 'B-AMOUNT', 'B-BIC', 'B-BITCOINADDRESS', 'B-BUILDINGNUMBER', 'B-CITY', 'B-COMPANYNAME', 'B-COUNTY', 'B-CREDITCARDCVV', 'B-CREDITCARDISSUER', 'B-CREDITCARDNUMBER', 'B-CURRENCY', 'B-CURRENCYCODE', 'B-CURRENCYNAME', 'B-CURRENCYSYMBOL', 'B-DATE', 'B-DOB', 'B-EMAIL', 'B-ETHEREUMADDRESS', 'B-EYECOLOR', 'B-FIRSTNAME', 'B-GENDER', 'B-HEIGHT', 'B-IBAN', 'B-IP', 'B-IPV4', 'B-IPV6', 'B-JOBAREA', 'B-JOBTITLE', 'B-JOBTYPE', 'B-LASTNAME', 'B-LITECOINADDRESS', 'B-MAC', 'B-MASKEDNUMBER', 'B-MIDDLENAME', 'B-NEARBYGPSCOORDINATE', 'B-ORDINALDIRECTION', 'B-PASSWORD', 'B-PHONEIMEI', 'B-PHONENUMBER', 'B-PIN', 'B-PREFIX', 'B-SECONDARYADDRESS', 'B-SEX', 'B-SSN', 'B-STATE', 'B-STREET', 'B-TIME', 'B-URL', 'B-USERAGENT', 'B-USERNAME', 'B-VEHICLEVIN', 'B-VEHICLEVRM', 'B-ZIPCODE', 'I-ACCOUNTNAME', 'I-ACCOUNTNUMBER', 'I-AGE', 'I-AMOUNT', 'I-BIC', 'I-BITCOINADDRESS', 'I-BUILDINGNUMBER', 'I-CITY', 'I-COMPANYNAME', 'I-COUNTY', 'I-CREDITCARDCVV', 'I-CREDI

In [56]:
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Preprocessing function
def preprocess_batch(batch):
    """Preprocess a batch of text into tokenized input and aligned labels."""
    tokenized = tokenizer(
        batch["unmasked_text"],  # Raw text input column
        truncation=True,
        padding="max_length",  # Don't pad here, do it later in collate_fn
        max_length=512,  # Limit sequence length
        return_tensors="pt",
        return_attention_mask=True  # Ensure attention_mask is included
    )
    word_ids = tokenized.word_ids()  # Maps tokens back to original words
    aligned_labels = []
    for word_id in word_ids:
        if word_id is None:  # Special tokens
            aligned_labels.append(-100)  # Ignore these tokens during evaluation
        else:
            # Convert string label to integer using the label_to_id dictionary
            aligned_labels.append(label_to_id[batch["bio_labels"][word_id]])
    tokenized["labels"] = torch.tensor(aligned_labels)
    return tokenized

# Define a collate function for DataLoader
def collate_fn(batch):
    """Collate a batch of tokenized sequences with padding."""
    # Tokenize and align labels for each example in the batch
    tokenized_batch = [preprocess_batch(row) for row in batch]

    # Pad all sequences to the maximum length in the batch
    padded_batch = tokenizer.pad(
        tokenized_batch,
        padding=True,  # Pad to the max sequence length in this batch
        return_tensors="pt"  # Return as pytorch tensors
    )
    return padded_batch

# Define metrics calculation function
def compute_metrics(predictions, references):
    acc = accuracy_score(references, predictions)
    precision = precision_score(references, predictions, average="weighted", zero_division=1)
    recall = recall_score(references, predictions, average="weighted", zero_division=1)
    f1 = f1_score(references, predictions, average="weighted", zero_division=1)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [57]:
# Create DataLoader for validation set
validation_dataloader = DataLoader(subset_splits["validation"], batch_size=16, collate_fn=collate_fn)

# Evaluate the model on the validation set
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for batch in validation_dataloader:
        inputs = {key: value.to(device) for key, value in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Collect true and predicted labels
        true_labels.extend(labels.cpu().numpy().tolist())
        predicted_labels.extend(predictions.cpu().numpy().tolist())

# Flatten and compute metrics
true_labels_flat = [label for sublist in true_labels for label in sublist if label != -100]
predicted_labels_flat = [label for sublist in predicted_labels for label in sublist if label != -100]

metrics = compute_metrics(predicted_labels_flat, true_labels_flat)
print("Evaluation Metrics:", metrics)

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: The size of tensor a (768) must match the size of tensor b (512) at non-singleton dimension 3

* deal with the redaction issue
  * 'biolabels' column
  * find the unique labels
  * add them to the label map

* don't need to finetune deBERTa anymore
  * just calculate its accuracy

* can also finetune a not-finetuned deBERTa model if you have time

In [4]:
train_set = load_dataset("ai4privacy/pii-masking-300k", split='train')
dataset = train_set.shuffle().select(range(100))

# Define label mapping
label_map = {"O": 0, "TIME": 1, "DATE": 2, "LASTNAME1": 3, "LASTNAME2": 4, "EMAIL": 5, "SOCIALNUMBER": 6}

# Function to tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example['source_text'], truncation=True, padding='max_length', max_length=512)
    labels = ["O"] * len(tokenized_inputs["input_ids"])

    # Adjust indexing to fit the tokenization
    example['source_text'] = example['source_text'][:512]  # Ensure source_text matches max_length

    for span in example["privacy_mask"]:
        start, end, label_name = span["start"], span["end"], span["label"]
        label_id = label_map.get(label_name, 0)

        span_tokens = tokenizer(example["source_text"][start:end], add_special_tokens=False).tokens()
        # Properly create and align labels (assuming BERT tokenizer)
        for i, token in enumerate(tokenized_inputs.tokens()):
            if token in span_tokens:
                labels[i] = label_id

    numeric_labels = [label_map.get(label, -100) for label in labels]
    numeric_labels += [-100] * (len(tokenized_inputs["input_ids"]) - len(numeric_labels))

    tokenized_inputs["labels"] = numeric_labels
    return tokenized_inputs


# Apply function to dataset
encoded_dataset = dataset.map(tokenizer, batched=False)

# Convert to PyTorch tensors
input_ids = torch.tensor(encoded_dataset["input_ids"])
attention_mask = torch.tensor(encoded_dataset["attention_mask"])
labels = torch.tensor(encoded_dataset["labels"])

# Define metric computation
metric = evaluate.load("accuracy")  # You may want to load relevant metrics for token classification

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)  # Note: axis=2 for token classification
    true_labels = labels != -100  # Masking out unnecessary labels

    # Flatten predictions and true_labels to compute accuracy
    flattened_predictions = predictions[true_labels]
    flattened_labels = labels[true_labels]

    results = metric.compute(references=flattened_labels, predictions=flattened_predictions)
    return results


# Tokenize validation set
val_set = load_dataset("ai4privacy/pii-masking-300k", split='validation')  # Ensure you have a validation set
small_val = val_set.shuffle().select(range(100))
encoded_small_val = small_val.map(tokenizer, batched=False)

# Training arguments
train_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    gradient_accumulation_steps=2,  # Adjust if needed
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_small_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

README.md:   0%|          | 0.00/15.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/555M [00:00<?, ?B/s]

1english_openpii_30k.jsonl:   0%|          | 0.00/103M [00:00<?, ?B/s]

dutch_openpii_28k.jsonl:   0%|          | 0.00/102M [00:00<?, ?B/s]

french_openpii_31k.jsonl:   0%|          | 0.00/114M [00:00<?, ?B/s]

german_openpii_30k.jsonl:   0%|          | 0.00/108M [00:00<?, ?B/s]

italian_openpii_29k.jsonl:   0%|          | 0.00/104M [00:00<?, ?B/s]

spanish_openpii_29k.jsonl:   0%|          | 0.00/102M [00:00<?, ?B/s]

1english_openpii_8k.jsonl:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

dutch_openpii_7k.jsonl:   0%|          | 0.00/27.0M [00:00<?, ?B/s]

french_openpii_8k.jsonl:   0%|          | 0.00/30.7M [00:00<?, ?B/s]

german_openpii_8k.jsonl:   0%|          | 0.00/29.2M [00:00<?, ?B/s]

italian_openpiii_8k.jsonl:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

spanish_openpii_8k.jsonl:   0%|          | 0.00/27.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/177677 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/47728 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [6]:
type(dataset)

In [None]:
# Run training
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
from transformers import pipeline

# Initialize a token classification pipeline
pii_pipeline = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to apply PII masking to a specific text column
def redact_column(dataset, column_name):
    redacted_texts = []

    for text in dataset[column_name]:
        # Get model predictions for the text
        predictions = pii_pipeline(text)

        # Replace PII spans with placeholders
        redacted_text = text
        for entity in sorted(predictions, key=lambda x: x['start'], reverse=True):  # Sort in reverse to avoid shifting positions
            label = entity['entity_group']
            redacted_text = redacted_text[:entity['start']] + f"[{label}]" + redacted_text[entity['end']:]

        redacted_texts.append(redacted_text)

    return redacted_texts

In [None]:
df = pd.DataFrame({
    'source_text': [
        'Her name is Jane and her phone number is 805-847-4340.',
        'Please reach out to John Smith at john.smith@example.com for more details.',
        'His credit card number is 1234-5678-9012-3456, the routing number is 834245680, and it expires on 11/25.',
        'You can contact Mary at 555-123-4567 or at her office in 123 Elm Street, Denver, CO 62704.',
        'The passport number is K12345678 issued by the US Department of State.',
        "Robert's social security number is 987-65-4320, which needs to be redacted.",
        "The user's login attempt was from IP address 192.168.0.1.",
        "Patient Sarah Taylor's medical record indicates a diagnosis of hypertension, and her insurance ID is ABCD123456.",
        "I live in Denver, CO."
    ]
})

redact_column(df, 'source_text')

In [None]:
# Use the model directly
inputs = tokenizer(
    "Her name is Jane and her phone number is 805-847-4340", add_special_tokens=False, return_tensors="pt"
)

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Note that tokens are classified rather then input words which means that there might be more predicted token classes than words.
# Multiple token classes might account for the same word
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

labels = predicted_token_class_ids
loss = model(**inputs, labels=labels).loss

In [None]:
# Tokenize the input text to get tokenized words for each label
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# # Print each token with its predicted label
# for token, label in zip(tokens, predicted_tokens_classes):
#     print(f"{token}: {label}")

# Reconstruct the sentence with labels
labeled_sentence = []
for token, label in zip(tokens, predicted_tokens_classes):
    if not token.startswith("##"):  # Handle subword tokens
        labeled_sentence.append(f"{token}({label})")
    else:
        # Append subword tokens without a space
        labeled_sentence[-1] = labeled_sentence[-1][:-len(label)-2] + token[2:] + f"({label})"

# Join the labeled sentence into a single string
result_sentence = " ".join(labeled_sentence)

# Print the result
print(result_sentence)

In [None]:
# Use the model directly
inputs = tokenizer(
    "Her name is Jane and her phone number is 512-247-6993", add_special_tokens=False, return_tensors="pt"
)

with torch.no_grad():
    logits = model(**inputs).logits

predicted_token_class_ids = logits.argmax(-1)

# Map token class IDs to labels
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]

# Tokenize the input text to get the words for each token
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Reconstruct the original input sentence
original_sentence = "Her name is Jane and her phone number is 512-247-6993"  # This is the input

# Create the label sentence (space-separated labels for each token)
label_sentence = []
for token, label in zip(tokens, predicted_tokens_classes):
    if not token.startswith("##"):  # Handle subword tokens
        label_sentence.append(label)
    else:
        # Subword tokens don't affect labels but ensure they follow token merging rules
        continue

label_sentence = " ".join(label_sentence)

# Print both sentences
print(f"original sentence: {original_sentence}")
print(f"label sentence: {label_sentence}")