In [None]:
# standard library
import ast
import json
import random
from collections import Counter

# data handling and manipulation
import numpy as np
import pandas as pd

# machine learning with pytorch and transformers
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# progress bar
from tqdm.notebook import tqdm

In [None]:
# BIO-format tags for the PII entity types; a tag's position in this list is its integer id
pii_labels = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM',
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME',
    'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM',
    'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O',
]

# forward lookup: integer id -> BIO tag
pii_id2label = {idx: tag for idx, tag in enumerate(pii_labels)}

# reverse lookup: BIO tag -> integer id
pii_label2id = {tag: idx for idx, tag in enumerate(pii_labels)}

# size of the tag set
pii_num_labels = len(pii_id2label)

In [None]:
# display the id -> label mapping so the integer ids used in the data can be read off
pii_id2label

{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [None]:
# load the pre-trained deberta-v3 tokenizer
# (the same 'microsoft/deberta-v3-base' checkpoint name is used when the model is loaded,
# so tokenizer and model vocabularies match)
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
def tokenize_and_align_labels(tokens, ner_tags, max_length=30, tokenizer_override=None):
    """Tokenize one pre-split example and attach a label to every sub-word token.

    Args:
        tokens: list of word strings (the text is already split into words).
        ner_tags: list of integer label ids, indexed by word position in ``tokens``.
        max_length: fixed output length. Longer inputs are truncated and shorter
            ones are padded to exactly this many positions. Defaults to 30, the
            value that was previously hard-coded.
            NOTE(review): with truncation enabled, everything past this limit is
            dropped together with its labels — confirm 30 is long enough for the
            documents being tagged.
        tokenizer_override: optional tokenizer to use instead of the notebook-level
            ``tokenizer``. It is called with the same keyword arguments and must
            return a mapping that also exposes ``word_ids()``.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry: one label per output
        position. Positions that do not belong to any word (``word_ids()`` yields
        ``None`` for them) get ``-100``; every other position gets the tag of the
        word it came from, so all sub-word pieces of a word share that word's tag.
    """
    # fall back to the notebook-level tokenizer unless the caller supplied one
    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override

    # tokenize, truncate and pad the pre-split words to a fixed length
    tokenized_inputs = active_tokenizer(tokens, truncation=True, is_split_into_words=True,
                                        padding='max_length', max_length=max_length)

    # -100 marks positions without a source word; it is the default ignore_index
    # of CrossEntropyLoss, so those positions do not contribute to the loss
    tokenized_inputs['labels'] = [
        -100 if word_idx is None else ner_tags[word_idx]
        for word_idx in tokenized_inputs.word_ids()
    ]

    return tokenized_inputs

In [None]:
# add augmentation data
# NOTE(review): read_csv cannot restore Python lists, so the 'tokens' and 'ner_tags'
# columns presumably arrive as string-encoded list literals and need parsing before
# they can be tokenized — confirm against the file
aug_train = pd.read_csv('/content/drive/MyDrive/personal_project/DEBERTA/AUG_NEW.csv')
aug_train

Unnamed: 0,tokens,ner_tags
0,"['www.loganrodriguez.net', '.', '<ying.chen@we...","[5, 12, 0, 12, 3, 9, 9, 9, 12, 6, 12, 4, 10, 1..."
1,"['<lian.ma@webmail.com.cn>', '.', 'Hua', 'Sun'...","[0, 12, 2, 8, 12, 5, 12, 3, 9, 9, 12, 1, 12, 4..."
2,"['id987654', '.', 'Olga', 'Ivanova', '.', 'www...","[1, 12, 2, 8, 12, 5, 12, 6, 12, 4, 10, 12, 3, ..."
3,"['245', 'E', '24th', 'St', 'Apt', '12E,', 'New...","[4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 2,..."
4,"['tech_geek_101', '.', 'Feng', 'Liu', '.', '<m...","[6, 12, 2, 8, 12, 0, 12, 5, 12, 4, 10, 10, 10,..."
...,...,...
20995,"['We', 'just', 'opened', 'a', 'new', 'cafe', '...","[12, 12, 12, 12, 12, 12, 12, 4, 10, 10, 10, 10..."
20996,"['I', 'live', 'in', '45', 'Quincy', 'St', 'Cam...","[12, 12, 12, 4, 10, 10, 10, 10, 10, 12, 12, 12..."
20997,"['Our', 'new', 'shop', 'is', 'at', '1439', 'El...","[12, 12, 12, 12, 12, 4, 10, 10, 10, 10, 10, 10..."
20998,"['Our', 'office', 'is', 'situated', 'at', '1',...","[12, 12, 12, 12, 12, 4, 10, 10, 10, 10, 10, 10..."


In [None]:
# process, tokenize, and align labels for training and testing data
def data_process(data, test_size=0.1, random_state=42):
    """Turn raw records into train/test frames plus their tokenized counterparts.

    Args:
        data: sized iterable of records; each record is indexed with ``'tokens'``
            (the words) and ``'labels'`` (BIO tag strings found in ``pii_label2id``).
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        ``(train_df, test_df, train_tokenized_inputs, test_tokenized_inputs)``.
        The two frames have the columns ``'tokens'`` and ``'ner_tags'`` (integer
        ids); the two lists hold one ``tokenize_and_align_labels`` result per row
        of the corresponding frame, in row order.

    Raises:
        KeyError: if a record carries a tag that is missing from ``pii_label2id``.
    """
    words_list = []
    labels_list = []

    # copy the words and map every BIO tag string to its integer id
    for record in tqdm(data, total=len(data)):
        words_list.append(list(record['tokens']))
        labels_list.append([pii_label2id[label] for label in record['labels']])

    # one row per document, list-valued cells
    df = pd.DataFrame({'tokens': words_list, 'ner_tags': labels_list})

    # split the dataframe into train and test sets
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    def _tokenize_frame(frame):
        # tokenize every row of `frame`, keeping row order
        return [
            tokenize_and_align_labels(row_tokens, row_tags)
            for row_tokens, row_tags in zip(frame['tokens'], frame['ner_tags'])
        ]

    # apply tokenization and alignment to the train and test sets
    train_tokenized_inputs = _tokenize_frame(train_df)
    test_tokenized_inputs = _tokenize_frame(test_df)

    return train_df, test_df, train_tokenized_inputs, test_tokenized_inputs

In [None]:
# google drive path for training data
data_path = '/content/drive/MyDrive/personal_project/pii/pii_data'

# load the data; the context manager closes the file handle as soon as parsing
# is done, and the encoding is pinned so the result does not depend on the locale
with open(f'{data_path}/train.json', encoding='utf-8') as train_file:
    data = json.load(train_file)

In [None]:
# get the processed data
# (the tokenized inputs returned here are built inside data_process from the rows
# of `data` only, using the default 90/10 split with random_state=42)
train_df, test_df, train_tokenized_inputs, test_tokenized_inputs = data_process(data)

  0%|          | 0/6807 [00:00<?, ?it/s]

In [None]:
def _parse_list_cell(cell):
    """Return ``cell`` as a list, decoding a string-encoded list literal if needed."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# the augmentation CSV stores its list columns as text; turn them back into real
# lists so they have the same schema as the rows produced by data_process
aug_rows = pd.DataFrame({
    'tokens': aug_train['tokens'].apply(_parse_list_cell),
    'ner_tags': aug_train['ner_tags'].apply(
        lambda cell: [int(tag) for tag in _parse_list_cell(cell)]
    ),
})

# train_tokenized_inputs was built inside data_process, before the augmentation
# rows existed; tokenize them here as well, otherwise they would only show up in
# train_df and never reach the tensors the model is trained on
train_tokenized_inputs = train_tokenized_inputs + [
    tokenize_and_align_labels(row_tokens, row_tags)
    for row_tokens, row_tags in zip(aug_rows['tokens'], aug_rows['ner_tags'])
]

# NOTE: run this cell once per data_process call — running it again appends the
# augmentation rows a second time
train_df = pd.concat([train_df, aug_rows], ignore_index=True)
train_df

Unnamed: 0,tokens,ner_tags
0,"[Final, Assignment, :, Track, Your, Trash, \n\...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
1,"[Reflection, –, Storytelling, \n\n, Challenge,...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
2,"[LEARNING, LAUNCH, \n\n, Challenge, \n\n, I, c...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
3,"[Brain, Planning, :, \n\n, Difficulties, :, \n...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
4,"[The, 3D, -, Printing, -, Life, \n\n, Descript...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1..."
...,...,...
27121,"['We', 'just', 'opened', 'a', 'new', 'cafe', '...","[12, 12, 12, 12, 12, 12, 12, 4, 10, 10, 10, 10..."
27122,"['I', 'live', 'in', '45', 'Quincy', 'St', 'Cam...","[12, 12, 12, 4, 10, 10, 10, 10, 10, 12, 12, 12..."
27123,"['Our', 'new', 'shop', 'is', 'at', '1439', 'El...","[12, 12, 12, 12, 12, 4, 10, 10, 10, 10, 10, 10..."
27124,"['Our', 'office', 'is', 'situated', 'at', '1',...","[12, 12, 12, 12, 12, 4, 10, 10, 10, 10, 10, 10..."


In [None]:
import ast

def clean_ner_tags(tag_list):
    """Normalise one ``ner_tags`` cell to a list of ints when it is string-encoded.

    Args:
        tag_list: either a string holding a Python list literal (e.g. ``"[1, 12]"``)
            or a value that is already parsed.

    Returns:
        A new list of ints when ``tag_list`` is a string that parses to an iterable
        of int-convertible items. In every other case (non-string input, text that
        does not parse, items that cannot be converted) ``tag_list`` is returned
        unchanged.
    """
    # anything that is not text is treated as already parsed
    if not isinstance(tag_list, str):
        return tag_list

    try:
        # try to evaluate the string as a literal Python expression
        cleaned = ast.literal_eval(tag_list)
        # ensure all elements are integers
        return [int(tag) for tag in cleaned]
    except (ValueError, TypeError, SyntaxError, OverflowError):
        # only parse/conversion failures are tolerated here; unrelated errors
        # (e.g. KeyboardInterrupt) are no longer swallowed
        return tag_list

# normalise the column: string-encoded rows become lists of ints, while rows
# that clean_ner_tags cannot parse (including already-parsed lists) are kept as they are
train_df['ner_tags'] = train_df['ner_tags'].apply(clean_ner_tags)

In [None]:
# use the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'using device: {device}')

using device: cuda


In [None]:
def _stack_field(encodings, field):
    # gather one field from every tokenized example into a single tensor on `device`
    return torch.tensor([encoding[field] for encoding in encodings]).to(device)


# training split as PyTorch tensors
train_input_ids = _stack_field(train_tokenized_inputs, 'input_ids')
train_attention_mask = _stack_field(train_tokenized_inputs, 'attention_mask')
train_labels = _stack_field(train_tokenized_inputs, 'labels')

# testing split as PyTorch tensors
test_input_ids = _stack_field(test_tokenized_inputs, 'input_ids')
test_attention_mask = _stack_field(test_tokenized_inputs, 'attention_mask')
test_labels = _stack_field(test_tokenized_inputs, 'labels')

In [None]:
# map-style dataset over pre-tokenized NER examples
class NERDataset(Dataset):
    """Wrap three index-aligned sequences so a ``DataLoader`` can batch them.

    Item ``i`` is a dict with the keys ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'``, each holding element ``i`` of the matching constructor argument.
    """

    def __init__(self, input_ids, attention_mask, labels):
        # keep references to the three parallel sequences as given
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # the dataset size is defined by the number of input-id rows
        n_samples = len(self.input_ids)
        return n_samples

    def __getitem__(self, idx):
        # assemble one sample from the three sequences
        sample = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            labels=self.labels[idx],
        )
        return sample

# create an instance of NERDataset and a DataLoader for the training dataset
# (batches of 16; shuffle=True makes the DataLoader reshuffle the samples on every pass)
# NOTE(review): no random seed is set in the visible cells, so the shuffle order
# presumably differs from run to run — confirm whether that is acceptable
train_dataset = NERDataset(train_input_ids, train_attention_mask, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# create an instance of NERDataset and a DataLoader for the testing dataset
# (same batch size, samples kept in their original order)
test_dataset = NERDataset(test_input_ids, test_attention_mask, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# load and define the deberta-v3-base model
# num_labels sets the size of the token-classification head to the number of PII tags;
# the loaded model is moved to the selected device straight away
model = AutoModelForTokenClassification.from_pretrained('microsoft/deberta-v3-base',
                                                        num_labels=pii_num_labels).to(device)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# number of epochs
num_epochs = 7

# multiply the number of epochs and number of batches to calculate the total training steps
# (one optimizer/scheduler step is taken per batch in the training loop)
num_training_steps = num_epochs * len(train_dataloader)

# select 10% of total training steps as warming steps
num_warmup_steps = int(0.1 * num_training_steps)

# initialize the AdamW optimizer over all model parameters with a peak learning rate of 2e-5
# NOTE(review): no weight_decay argument is passed, so any decay comes from the
# optimizer's default — confirm that default matches the intended regularization
optimizer = AdamW(model.parameters(), lr=2e-5)

# linearly increases the learning rate during a warm-up period and then decreases it linearly to zero over the remaining training steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)



In [None]:
def compute_class_weights(labels, mu=0.15):
    """Compute log-smoothed inverse-frequency weights with a mean of 1.

    For every distinct label the raw weight is ``log1p(mu * total / count)``,
    where ``count`` is the number of occurrences of that label and ``total`` is
    the number of labels overall. The raw weights are then rescaled so that
    their average over the distinct labels equals 1.

    Args:
        labels: flat iterable of hashable labels.
        mu: scale applied to the inverse frequency before the logarithm.

    Returns:
        dict mapping each distinct label to its weight; empty when ``labels``
        is empty.
    """
    label_counts = Counter(labels)
    total_counts = sum(label_counts.values())

    # logarithmic scaling of the inverse frequency (rarer labels get larger weights)
    class_weights = {label: np.log1p(mu * total_counts / count) for label, count in label_counts.items()}

    # normalization to make the average weight 1
    weight_sum = sum(class_weights.values())
    class_weights = {label: weight / weight_sum * len(class_weights) for label, weight in class_weights.items()}

    return class_weights

# flatten the nested lists in 'ner_tags' to create a single list of all labels
all_labels = [label for sublist in train_df['ner_tags'].tolist() for label in sublist]
class_weights = compute_class_weights(all_labels)

# give the street-address tags 10x their computed weight; the ids are looked up by
# name instead of being hard-coded, and a tag that never occurs in the data starts
# from the same 1.0 fallback the weight tensor uses instead of raising KeyError
for boosted_label in ('B-STREET_ADDRESS', 'I-STREET_ADDRESS'):
    boosted_id = pii_label2id[boosted_label]
    class_weights[boosted_id] = class_weights.get(boosted_id, 1.0) * 10

# create a tensor of weights, one entry per label id (1.0 for ids without a computed weight)
weights = torch.tensor([class_weights.get(i, 1.0) for i in range(len(pii_labels))], dtype=torch.float).to(device)

# initialize the weighted loss function
loss_fn = CrossEntropyLoss(weight=weights)

In [None]:
# store losses for potential plotting
train_losses = []

# set the model to training mode
model.train()

# loop over training epochs
for epoch in range(num_epochs):

    # running sum of the batch losses for this epoch
    epoch_train_loss = 0

    # loop over batches in the training dataset
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):

        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # ensure correct computation location (GPU) for input data
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # forward pass for the logits only: labels are not passed to the model because
        # its built-in loss would be computed and then thrown away
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # class-weighted loss over all token positions, flattened to
        # (positions, classes) vs (positions,); positions labelled -100 are
        # skipped by CrossEntropyLoss's default ignore_index
        loss = loss_fn(outputs.logits.view(-1, pii_num_labels), labels.view(-1))

        # backwards propagation
        loss.backward()
        optimizer.step()

        # update learning rate schedule
        scheduler.step()

        # read the scalar once and reuse it for both the epoch total and the history
        batch_loss = loss.item()

        # accumulate loss for this epoch
        epoch_train_loss += batch_loss

        # store loss for plotting
        train_losses.append(batch_loss)

    # divide the total epoch loss by the number of batches in the dataloader
    avg_train_loss = epoch_train_loss / len(train_dataloader)

    # show the result
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")

Training Epoch 1/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 1/7, Training Loss: 0.014429705754476727


Training Epoch 2/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 2/7, Training Loss: 0.014730598929415818


Training Epoch 3/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 3/7, Training Loss: 0.014862154175037416


Training Epoch 4/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 4/7, Training Loss: 0.015822965210234817


Training Epoch 5/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 5/7, Training Loss: 0.015519850582299892


Training Epoch 6/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 6/7, Training Loss: 0.014640200829539781


Training Epoch 7/7:   0%|          | 0/383 [00:00<?, ?it/s]

Epoch 7/7, Training Loss: 0.01535481041002175


In [None]:
# Specify the path in your Google Drive where you want to save the model
# NOTE(review): the '...' segment looks like a placeholder — replace it with a real
# folder before running
model_save_path = '/content/drive/MyDrive/.../ner_model.pth'

# Save the model's state dictionary
# (only what model.state_dict() returns is written here; the tokenizer and the
# id/label mappings are not part of this file)
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/personal_project/DEBERTA/CONTEXT_ADDRESS_2.pth
