In [1]:
import json
import argparse
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from itertools import chain
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from tqdm import tqdm



In [2]:
# Maximum number of tokens per chunk; passed as `max_length` to the tokenizer in tokenize().
INFERENCE_MAX_LENGTH = 768
# Checkpoint directory from which config.json, the tokenizer and the model weights are loaded.
TRAINING_MODEL_PATH = "/kaggle/input/pii-data-detection-models/checkpoint-1250/checkpoint-1250"

In [3]:
# Read the checkpoint's config.json. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open(Path(TRAINING_MODEL_PATH) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Class names in class-id order: the position of a name in this list is its id.
all_labels = [
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
    "O",
]

# Both lookup tables are derived from the single list above so that the three
# structures cannot drift apart.
id2label = dict(enumerate(all_labels))
label2id = {name: class_id for class_id, name in enumerate(all_labels)}

In [4]:
# Load the test documents. Later cells read the "tokens" column (tokenize)
# and the "document" column (inference) of this frame.
df = pd.read_json("../kaggle_dataset/competition/test.json")

# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

def get_labels(word_ids, word_labels):
    """Translate per-token word indexes into per-token label ids.

    Args:
        word_ids: one entry per token; either an index into `word_labels`
            or None for a token that belongs to no word.
        word_labels: label names, one per word.

    Returns:
        A list as long as `word_ids` holding `label2id[...]` of the word each
        token belongs to, or -100 where the word index is None
        (presumably the value the training loss ignores -- confirm).
    """
    return [
        -100 if word_idx is None else label2id[word_labels[word_idx]]
        for word_idx in word_ids
    ]

# Tokenize texts, possibly generating more than one tokenized sample for each text
def tokenize(df, to_tensor=True, with_labels=True):
    """Tokenize every row of `df` into fixed-length chunks.

    A text longer than INFERENCE_MAX_LENGTH tokens yields several chunks
    (overflowing tokens are returned with stride 0 rather than dropped), so
    the output can contain more samples than `df` has rows.

    Args:
        df: frame with a "tokens" column (list of words per row) and, when
            `with_labels` is true, a "labels" column (one label per word).
        to_tensor: convert every field of the result with `torch.as_tensor`.
        with_labels: also emit a "labels" field built with `get_labels`.

    Returns:
        The tokenizer output extended with "wids" (per-token word index, -1
        where the tokenizer reports None) and optionally "labels"; a plain
        dict of tensors when `to_tensor` is true.
    """
    # Chunking is what differs from a long-context (Longformer-style) setup:
    # read these tokenizer parameters carefully.
    batch = tokenizer(
        df["tokens"].tolist(),
        is_split_into_words=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
        padding="max_length",
        stride=0,
        return_overflowing_tokens=True,
    )

    chunk_labels = []
    chunk_word_ids = []
    # overflow_to_sample_mapping[i] is the row of `df` that chunk i came from.
    for chunk_idx, row_idx in enumerate(batch["overflow_to_sample_mapping"]):
        # Word indexes of this chunk (per the original author's note these are
        # indexes into the full, un-chunked text).
        word_ids = batch.word_ids(chunk_idx)

        if with_labels:
            # Labels of the whole text, projected onto this chunk's tokens.
            row_labels = df["labels"].iloc[row_idx]
            chunk_labels.append(get_labels(word_ids, row_labels))

        chunk_word_ids.append([-1 if w is None else w for w in word_ids])

    if with_labels:
        batch["labels"] = chunk_labels
    batch["wids"] = chunk_word_ids

    if not to_tensor:
        return batch
    return {name: torch.as_tensor(values) for name, values in batch.items()}

In [5]:
class PIIDataset(Dataset):
    """Map-style dataset over the column-oriented output of `tokenize`.

    `tokenized_ds` maps field names to sequences that share the same first
    dimension; sample `i` is the dict of every field's `i`-th element.
    """

    def __init__(self, tokenized_ds):
        self.data = tokenized_ds

    def __len__(self):
        # The number of samples is taken from the "input_ids" field.
        return len(self.data["input_ids"])

    def __getitem__(self, index):
        sample = {}
        for field in self.data.keys():
            sample[field] = self.data[field][index]
        return sample

# Chunk the test documents (no labels exist at inference time) and wrap the
# chunks so they can be fed to the model one batch at a time.
tokenized_test = tokenize(df, with_labels=False)
test_dataset = PIIDataset(tokenized_test)
test_dataloader = DataLoader(test_dataset, batch_size=1)

# Load the fine-tuned checkpoint. `ignore_mismatched_sizes` is deliberately NOT
# set: if the checkpoint's classification head did not match `all_labels`, that
# flag would silently replace it with freshly initialised weights and every
# prediction would be noise. For inference a size mismatch must be an error.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
)

In [6]:
# Move the model's parameters to the GPU before running inference (requires a CUDA device).
model.cuda()

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwis

In [7]:
def inference(df, dl, threshold=0.9):
    """Run the token classifier over `dl` and collect word-level predictions.

    Relies on the module-level `model`, `id2label` and `label2id`.

    For every token the class with the highest logit is predicted, unless the
    softmax probability of "O" is below `threshold`; in that case the most
    probable class other than "O" is used instead. A word is reported once per
    document: the first of its tokens (in chunk order) that receives a label
    other than "O" decides the word's label.

    Args:
        df: frame with "document" and "tokens" columns. Rows are addressed by
            position, matching `overflow_to_sample_mapping` (and the `.iloc`
            lookup in `tokenize`), so any index is accepted.
        dl: iterable of batches, each a dict with tensor fields "input_ids",
            "attention_mask", "overflow_to_sample_mapping" and "wids".
        threshold: "O" probability below which "O" is overruled.

    Returns:
        DataFrame with columns "document", "token" (word index), "label",
        "token_str" and "row_id" (0..n-1), one row per reported word.
    """
    o_index = label2id["O"]
    # Send batches to wherever the model lives instead of assuming "cuda".
    device = next(model.parameters()).device

    # Plain lists give cheap positional access inside the token loop.
    documents = df["document"].tolist()
    tokens_by_row = df["tokens"].tolist()

    # Per document: word indexes already reported, accumulated across chunks
    # so a word is emitted only once.
    seen_words_idx = defaultdict(set)

    document, token, label, token_str = [], [], [], []

    for batch in tqdm(dl):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)

        # No autograd graph is needed for inference; skipping it saves memory.
        with torch.no_grad():
            logits = model(ids, attention_mask=mask, return_dict=False)[0]
        logits = logits.detach().float().cpu().numpy()
        del ids, mask

        # Softmax with the per-token maximum subtracted first, so np.exp
        # cannot overflow on large logits.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp_shifted = np.exp(shifted)
        probs = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

        best_any = logits.argmax(-1)
        # Best class with "O" excluded, wherever "O" sits in the label list.
        probs_without_o = probs.copy()
        probs_without_o[..., o_index] = -np.inf
        best_without_o = probs_without_o.argmax(-1)
        preds_final = np.where(probs[..., o_index] < threshold, best_without_o, best_any)

        text_ids = batch["overflow_to_sample_mapping"].tolist()
        for k, (chunk_preds, text_id) in enumerate(zip(preds_final, text_ids)):
            word_ids = batch["wids"][k].tolist()
            chunk_labels = [id2label[class_id] for class_id in chunk_preds.tolist()]
            seen = seen_words_idx[text_id]

            for word_idx, pred_label in zip(word_ids, chunk_labels):
                # Skip tokens without a word (-1), "O" predictions and words
                # already reported for this document.
                if word_idx == -1 or pred_label == "O" or word_idx in seen:
                    continue
                document.append(documents[text_id])
                token.append(word_idx)
                token_str.append(tokens_by_row[text_id][word_idx])
                label.append(pred_label)
                seen.add(word_idx)

    pred_df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })
    pred_df["row_id"] = list(range(len(pred_df)))

    return pred_df

In [8]:
# Run the model over every test chunk; the result has one row per word that
# received a label other than "O".
pred_df = inference(df, test_dataloader)

100%|██████████| 14/14 [00:04<00:00,  3.03it/s]


In [9]:
# Write the submission file with its four columns in this order; the
# "token_str" column of pred_df is not written.
submission = pred_df.loc[:, ["row_id", "document", "token", "label"]]
submission.to_csv("submission.csv", index=False)