In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [11]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load the dataset
dataset = load_dataset("Isotonic/pii-masking-200k")

# Proportional split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Calculate proportional max sizes (capped at 3,000 for each split)
total_max_samples = 3000 / (train_ratio + val_ratio + test_ratio)
train_max = int(train_ratio * total_max_samples)
val_max = int(val_ratio * total_max_samples)
test_max = int(test_ratio * total_max_samples)

# Split the dataset
split_dataset = dataset["train"].train_test_split(test_size=(1 - train_ratio), seed=42)
validation_test_split = split_dataset["test"].train_test_split(test_size=test_ratio / (test_ratio + val_ratio), seed=42)

# Create a DatasetDict for the splits
dataset_splits = DatasetDict({
    "train": split_dataset["train"].shuffle(seed=42).select(range(min(len(split_dataset["train"]), train_max))),
    "validation": validation_test_split["train"].shuffle(seed=42).select(range(min(len(validation_test_split["train"]), val_max))),
    "test": validation_test_split["test"].shuffle(seed=42).select(range(min(len(validation_test_split["test"]), test_max))),
})

# Confirm reduced dataset sizes
print(f"Training size: {len(dataset_splits['train'])}")
print(f"Validation size: {len(dataset_splits['validation'])}")
print(f"Test size: {len(dataset_splits['test'])}")

Training size: 2100
Validation size: 450
Test size: 450


In [2]:
# Create a mapping for the bio_labels
label_list = []

# Extract unique labels from the 'bio_labels' column
def get_unique_labels(dataset):
    unique_labels = set()
    for row in dataset["bio_labels"]:
        unique_labels.update(row)  # Add all labels in the current row to the set
    return sorted(unique_labels)  # Return sorted labels for consistency

# Apply the function to the entire dataset
unique_labels = get_unique_labels(dataset["train"])  # Run on training set or all splits if needed

# Print the unique labels
print("Unique BIO Labels:", unique_labels)

# Combine with the existing label_list
label_list = sorted(set(label_list + list(unique_labels)))  # Ensure no duplicates
print("Final Label List:", label_list)
print(len(label_list))

Unique BIO Labels: ['B-ACCOUNTNAME', 'B-ACCOUNTNUMBER', 'B-AGE', 'B-AMOUNT', 'B-BIC', 'B-BITCOINADDRESS', 'B-BUILDINGNUMBER', 'B-CITY', 'B-COMPANYNAME', 'B-COUNTY', 'B-CREDITCARDCVV', 'B-CREDITCARDISSUER', 'B-CREDITCARDNUMBER', 'B-CURRENCY', 'B-CURRENCYCODE', 'B-CURRENCYNAME', 'B-CURRENCYSYMBOL', 'B-DATE', 'B-DOB', 'B-EMAIL', 'B-ETHEREUMADDRESS', 'B-EYECOLOR', 'B-FIRSTNAME', 'B-GENDER', 'B-HEIGHT', 'B-IBAN', 'B-IP', 'B-IPV4', 'B-IPV6', 'B-JOBAREA', 'B-JOBTITLE', 'B-JOBTYPE', 'B-LASTNAME', 'B-LITECOINADDRESS', 'B-MAC', 'B-MASKEDNUMBER', 'B-MIDDLENAME', 'B-NEARBYGPSCOORDINATE', 'B-ORDINALDIRECTION', 'B-PASSWORD', 'B-PHONEIMEI', 'B-PHONENUMBER', 'B-PIN', 'B-PREFIX', 'B-SECONDARYADDRESS', 'B-SEX', 'B-SSN', 'B-STATE', 'B-STREET', 'B-TIME', 'B-URL', 'B-USERAGENT', 'B-USERNAME', 'B-VEHICLEVIN', 'B-VEHICLEVRM', 'B-ZIPCODE', 'I-ACCOUNTNAME', 'I-ACCOUNTNUMBER', 'I-AGE', 'I-AMOUNT', 'I-BIC', 'I-BITCOINADDRESS', 'I-BUILDINGNUMBER', 'I-CITY', 'I-COMPANYNAME', 'I-COUNTY', 'I-CREDITCARDCVV', 'I-CREDI

In [3]:
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Define metrics calculation function
def compute_metrics(predictions, references):
    acc = accuracy_score(references, predictions)
    precision = precision_score(references, predictions, average="weighted", zero_division=1)
    recall = recall_score(references, predictions, average="weighted", zero_division=1)
    f1 = f1_score(references, predictions, average="weighted", zero_division=1)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [4]:
pip install peft accelerate



In [12]:
# Map string labels to integers
def map_labels(example):
    example["bio_labels"] = [label_to_id[label] for label in example["bio_labels"]]
    return example

# Apply the mapping
dataset_splits['train'] = dataset_splits['train'].map(map_labels)
dataset_splits['validation'] = dataset_splits['validation'].map(map_labels)
dataset_splits['test'] = dataset_splits['test'].map(map_labels)

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [13]:
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

# Tokenization and label alignment function
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokenised_text"],
        truncation=True,
        max_length=128,  # Specify a maximum sequence length
        padding="max_length",  # Pad to max length for consistent tensor size
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples["bio_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Ignore special tokens
            else:
                aligned_labels.append(label[word_id])
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset_splits.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokenised_text", "bio_labels"]  # Adjust based on column names
)

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'language', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'language', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'language', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 450
    })
})

In [10]:
# LoRA Configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["output.dense"],  # Adjust based on inspected module names
    lora_dropout=0.1,
    bias="none",
    task_type="TOKEN_CLS",  # Specify the task type
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Use "epoch" for evaluation after each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=True,  # Enable mixed precision
    report_to="none",  # Disable reporting to external tools
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,5.228,4.437744


KeyboardInterrupt: 

# Below are unfinished code

In [None]:
# Apply tokenization to the dataset
tokenized_dataset = dataset_splits.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokenised_text", "bio_labels"]  # Adjust based on column names
)

# Create DataLoaders
train_loader = DataLoader(tokenized_dataset["train"], batch_size=16, shuffle=True)
val_loader = DataLoader(tokenized_dataset["validation"], batch_size=16)
test_loader = DataLoader(tokenized_dataset["test"], batch_size=16)

In [None]:
tokenized_dataset['train'].features

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
test_loader = DataLoader(
    tokenized_dataset["test"],
    batch_size=16,
    collate_fn=data_collator
)

In [None]:
# Evaluation function
def evaluate_model(model, dataloader):
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            # Convert batch elements to device
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Flatten lists for metric computation
    all_predictions = [p for batch in all_predictions for p in batch]
    all_labels = [l for batch in all_labels for l in batch]

    return compute_metrics(all_predictions, all_labels)

# Evaluate on test set
test_metrics = evaluate_model(model, test_loader)

# Print metrics
print("Test Set Metrics:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")

* deal with the redaction issue
  * 'biolabels' column
  * find the unique labels
  * add them to the label map

* don't need to finetune deBERTa anymore
  * just calculate its accuracy

* can also finetune a not-finetuned deBERTa model if you have time