In [None]:
import json
import re
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from tqdm import tqdm

In [None]:
# Chunking settings handed to the tokenizer in tokenize(): `max_length` caps the
# number of tokens per chunk and `stride` is passed alongside
# `return_overflowing_tokens=True` so long documents are split into several chunks.
INFERENCE_MAX_LENGTH = 1024
INFERENCE_STRIDE = 256
# Checkpoint directories to load; predictions are averaged over every entry.
TRAINING_MODEL_PATH = ["/kaggle/input/pii-data-detection-models/checkpoint-900"]

In [None]:
# Canonical tag list: a tag's position in this list is its class id.
all_labels = [
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
    "O",
]

# Both lookup tables are derived from the list so the three can never disagree.
id2label = dict(enumerate(all_labels))
label2id = {tag: class_id for class_id, tag in id2label.items()}

In [None]:
# Load the competition test set; later cells read its "document" and "tokens" columns.
df = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")

def get_labels(word_ids, word_labels):
    """Translate per-token word indices into label ids.

    Args:
        word_ids: One entry per token; either an index into ``word_labels``
            or ``None``.
        word_labels: Label strings of the words, indexed by word position.

    Returns:
        A list the same length as ``word_ids``: ``-100`` where the word index
        is ``None``, otherwise ``label2id`` of that word's label.
    """
    return [
        -100 if word_idx is None else label2id[word_labels[word_idx]]
        for word_idx in word_ids
    ]



# Tokenize texts, possibly generating more than one tokenized sample for each text
def tokenize(df, to_tensor=True, with_labels=True):
    """Tokenize the pre-split words of every row, chunking long documents.

    Uses the module-level ``tokenizer``. Every chunk is padded to
    ``INFERENCE_MAX_LENGTH``; ``return_overflowing_tokens=True`` with
    ``stride=INFERENCE_STRIDE`` lets one document produce several chunks.

    Args:
        df: DataFrame with a "tokens" column (list of words per row) and,
            when ``with_labels`` is true, a "labels" column.
        to_tensor: When true, return a plain dict whose values went through
            ``torch.as_tensor``; otherwise return the tokenizer output object.
        with_labels: When true, add a "labels" entry built with ``get_labels``.

    Returns:
        The tokenizer output extended with "wids" (word index per token, ``-1``
        where the tokenizer reports ``None``) and optionally "labels".
    """
    # This is what's different from a Longformer: the document is cut into
    # fixed-size chunks instead of being fed whole. Read the parameters carefully.
    encoded = tokenizer(
        df["tokens"].tolist(),
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=INFERENCE_MAX_LENGTH,
        stride=INFERENCE_STRIDE,
        return_overflowing_tokens=True,
    )

    # Chunk -> index of the row it was cut from.
    sample_map = encoded["overflow_to_sample_mapping"]
    # Word index of every token, chunk by chunk.
    chunk_word_ids = [encoded.word_ids(chunk_no) for chunk_no in range(len(sample_map))]

    if with_labels:
        # Labels are looked up in the full, un-chunked label list of the source row.
        encoded["labels"] = [
            get_labels(word_ids, df["labels"].iloc[text_idx])
            for text_idx, word_ids in zip(sample_map, chunk_word_ids)
        ]

    # None cannot be stored in a tensor, so it is encoded as -1.
    encoded["wids"] = [
        [-1 if w is None else w for w in word_ids]
        for word_ids in chunk_word_ids
    ]

    if not to_tensor:
        return encoded
    return {key: torch.as_tensor(val) for key, val in encoded.items()}

In [None]:
class PIIDataset(Dataset):
    """Dataset view over a column-oriented mapping of tokenized chunks.

    ``tokenized_ds`` maps each field name to an indexable sequence; item ``i``
    is the dict of every field's ``i``-th element.
    """

    def __init__(self, tokenized_ds):
        self.data = tokenized_ds

    def __len__(self):
        # The number of chunks is taken from the "input_ids" field.
        return len(self.data["input_ids"])

    def __getitem__(self, index):
        return {field: self.data[field][index] for field in self.data.keys()}

In [None]:
def inferenceV4(df, dl):
    """Run the module-level ``model`` over ``dl`` and average probabilities per word.

    A word can be seen several times: once per sub-token and once per chunk it
    appears in. Every occurrence's class-probability vector is summed and then
    divided by the number of occurrences.

    Args:
        df: Unused; kept so existing callers keep working.
        dl: Iterable of batches, each a dict with tensor entries "input_ids",
            "attention_mask", "overflow_to_sample_mapping" and "wids"
            (``-1`` marks tokens that belong to no word).

    Returns:
        Nested ``defaultdict``: text index -> word index -> numpy array of
        averaged class probabilities.
    """
    # These 2 dictionaries hold text-level data, accumulated across all chunks.
    token_pred = defaultdict(lambda: defaultdict(int))
    token_cnt = defaultdict(lambda: defaultdict(int))

    # Follow the model's own device instead of assuming "cuda".
    device = next(model.parameters()).device
    model.eval()

    # no_grad: inference only, so no autograd graph should be built per batch.
    with torch.no_grad():
        for batch in tqdm(dl):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            logits = model(ids, attention_mask=mask, return_dict=False)[0]
            # torch.softmax is numerically stable; exp(x) / sum(exp(x)) turns
            # large logits into inf / inf = NaN.
            preds_softmax = torch.softmax(logits.float(), dim=-1).cpu().numpy()
            del ids, mask, logits

            text_ids = batch["overflow_to_sample_mapping"].tolist()
            # Go over each chunk's predictions together with its source text index.
            for k, (chunk_preds, text_id) in enumerate(zip(preds_softmax, text_ids)):
                word_ids = batch["wids"][k].tolist()
                for idx, word_idx in enumerate(word_ids):
                    if word_idx != -1:
                        token_pred[text_id][word_idx] += chunk_preds[idx]
                        token_cnt[text_id][word_idx] += 1

    # Turn the sums into means.
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            token_pred[text_id][word_idx] /= token_cnt[text_id][word_idx]

    return token_pred

In [None]:
# Ensemble accumulator: text index -> word index -> class probabilities,
# averaged over every checkpoint in TRAINING_MODEL_PATH.
final_token_pred = defaultdict(lambda: defaultdict(int))
for model_path in TRAINING_MODEL_PATH:
    # `tokenizer` and `model` are deliberately bound as module-level names:
    # tokenize() and inferenceV4() read them as globals, not as parameters,
    # so each must be assigned before the corresponding call below.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenized_test = tokenize(df, with_labels=False)

    test_dataset = PIIDataset(tokenized_test)
    test_dataloader = DataLoader(test_dataset, batch_size=1)
    # NOTE(review): ignore_mismatched_sizes=True presumably lets a checkpoint whose
    # classification head does not have len(all_labels) outputs load without an
    # error — confirm every checkpoint was trained with these 13 labels.
    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )
    model.cuda()
    token_pred = inferenceV4(df, test_dataloader)
    # Each model contributes 1/len(TRAINING_MODEL_PATH) of its probabilities.
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            final_token_pred[text_id][word_idx] += token_pred[text_id][word_idx] / len(TRAINING_MODEL_PATH)

In [None]:
# Turn the averaged probabilities into one (document, token, label) row per
# word that is not tagged "O".
document, token, label = [], [], []
for text_id, word_preds in final_token_pred.items():
    for word_idx, probs in word_preds.items():
        # Index 12 is the "O" class. Unless "O" reaches 0.55 probability, the
        # best of the 12 PII classes wins; otherwise the overall argmax is used.
        if probs[12] < 0.55:
            final_pred = probs[:12].argmax(-1)
        else:
            final_pred = probs.argmax(-1)
        predicted_tag = id2label[final_pred]
        if predicted_tag != "O":
            document.append(df.loc[text_id, "document"])
            token.append(word_idx)
            label.append(predicted_tag)

pred_df = pd.DataFrame({"document": document, "token": token, "label": label})

In [None]:
def score(row):
    """Return the ensemble probability of the label predicted for one row.

    Args:
        row: Object with ``document``, ``token`` and ``label`` attributes
            (a row of ``pred_df``).

    Returns:
        The entry of ``final_token_pred`` for that document/token at the
        class id of ``row.label``.
    """
    # Index label of the first row of the global `df` holding this document id;
    # final_token_pred is keyed by that same index.
    doc_idx = df.index[df["document"] == row.document][0]
    # Read the ensemble average (final_token_pred), which is what the labels were
    # derived from, rather than `token_pred`, which only holds the last model.
    return final_token_pred[doc_idx][row.token][label2id[row.label]]

In [None]:
# Build the column from an explicit list instead of DataFrame.apply(axis=1):
# when pred_df has no rows (nothing was tagged as PII), apply() hands back an
# empty DataFrame rather than a Series and the column assignment fails.
pred_df["score"] = pd.Series(
    [score(row) for row in pred_df.itertuples(index=False)],
    index=pred_df.index,
    dtype="float64",
)

## Postprocess

In [None]:
# Keep only the columns post-processing needs. This rebinds `df`, so the cells
# above cannot be re-run after this point without reloading the data first.
df = df[["document", "tokens"]].copy()

In [None]:
# One row per token: explode the list column "tokens" and rename it to "token".
df = df.explode(["tokens"]).reset_index(drop=True).rename(columns={"tokens": "token"})

In [None]:
# Preserve the token text under "token_str", then overwrite "token" with the
# row's running count within its document so it can be joined against pred_df.
df["token_str"] = df["token"]
df["token"] = df.groupby("document").cumcount()

In [None]:
# Attach predictions to the full token table; tokens without a prediction
# come out of the left join with a missing label and are marked "O".
new_pred_df = df.merge(
    pred_df[["document", "token", "label", "score"]],
    how="left",
    on=["document", "token"],
)
new_pred_df["label"] = new_pred_df["label"].fillna("O")

In [None]:
def pp(new_pred_df):
    """Normalise the B-/I- prefixes of predicted entity spans.

    The frame is scanned top to bottom. A span is a run of consecutive rows of
    one document that carry the same entity type and consecutive token
    numbers. For each span:

    * if the token just before it is a newline and the token before that has
      the same entity type in the same document, the newline and the whole
      span are relabelled ``I-<type>`` (continuation across a line break);
    * otherwise the first row becomes ``B-<type>`` and the rest ``I-<type>``;
      a NAME_STUDENT span containing an initial such as ``"J."`` is then
      reset to ``"O"`` with score 0.

    Rows whose label is changed get score 1.

    Args:
        new_pred_df: DataFrame with columns "document", "token", "token_str",
            "label" and "score". Rows are addressed with ``df.loc[i]`` for
            ``i`` in ``0..len-1``, so a default 0..n-1 index is required.

    Returns:
        A modified copy; the input frame is left untouched.
    """
    df = new_pred_df.copy()
    i = 0
    while i < len(df):
        st = i
        doc = df.loc[st, "document"]
        tok = df.loc[st, "token"]
        pred_tok = df.loc[st, "label"]
        if pred_tok == "O":
            i += 1
            continue
        lab = pred_tok.split("-")[1]
        cur_doc = doc
        cur_lab = lab
        last_tok = tok
        cur_tok = last_tok
        # Advance i to the first row that no longer belongs to the span.
        while i < len(df) and cur_doc == doc and cur_lab == lab and last_tok == cur_tok:
            last_tok = cur_tok + 1
            i += 1
            # The bounds check has to come before any row access: a span that
            # reaches the last row of the frame used to raise KeyError here.
            if i >= len(df):
                break
            cur_doc = df.loc[i, "document"]
            cur_tok = df.loc[i, "token"]
            if df.loc[i, "label"] == "O":
                break
            cur_lab = df.loc[i, "label"].split("-")[1]

        # exception: the span continues an entity that was interrupted by a newline
        if st - 2 >= 0 and df.loc[st - 2, "document"] == df.loc[st, "document"] and df.loc[st - 1, "token_str"] == "\n" and df.loc[st - 2, "label"] != "O" and df.loc[st - 2, "label"].split("-")[1] == lab:
            df.loc[st - 1, "label"] = "I-" + lab
            df.loc[st - 1, "score"] = 1
            for j in range(st, i):
                if df.loc[j, "label"] != "I-" + lab:
                    df.loc[j, "score"] = 1
                    df.loc[j, "label"] = "I-" + lab
            continue

        # fix: first row of the span is B-, every following row is I-
        for j in range(st, i):
            if j == st:
                if df.loc[j, "label"] != "B-" + lab:
                    df.loc[j, "score"] = 1
                    df.loc[j, "label"] = "B-" + lab
            else:
                if df.loc[j, "label"] != "I-" + lab:
                    df.loc[j, "score"] = 1
                    df.loc[j, "label"] = "I-" + lab

        # Drop name spans containing a two-character initial (capital letter + ".").
        # .loc slicing is label-based and inclusive, hence the i - 1 upper bound.
        if lab == "NAME_STUDENT" and any(len(item) == 2 and item[0].isupper() and item[1] == "." for item in df.loc[st:i-1, "token_str"]):
            for j in range(st, i):
                df.loc[j, "score"] = 0
                df.loc[j, "label"] = "O"

    return df

In [None]:
# Apply the span clean-up; pp() works on a copy and returns it.
new_pred_df = pp(new_pred_df)

In [None]:
# Inspect the post-processed token table (one row per token, "O" rows included).
new_pred_df

In [None]:
# Keep only tokens tagged as PII and renumber the rows from 0.
new_pred_df = new_pred_df.query("label != 'O'").reset_index(drop=True)

In [None]:
# Collect index labels of predictions that fail simple sanity checks:
#   * I-PHONE_NUM tokens that are ")" or contain no digit;
#   * B-EMAIL tokens that contain no "@".
rows_to_delete = []
for idx, row in new_pred_df.iterrows():
    token_text = row.token_str
    if row.label == "I-PHONE_NUM":
        drop_row = token_text == ")" or re.search(r"\d", token_text) is None
    elif row.label == "B-EMAIL":
        drop_row = "@" not in token_text
    else:
        drop_row = False
    if drop_row:
        rows_to_delete.append(idx)

In [None]:
# Remove the rejected predictions by index label; drop() returns a new frame.
new_pred_df = new_pred_df.drop(index=rows_to_delete)

In [None]:
# Sequential 0-based id for the rows that remain after the drop.
new_pred_df["row_id"] = list(range(len(new_pred_df)))

In [None]:
# Write the submission columns to submission.csv, without the DataFrame index.
new_pred_df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

In [None]:
# Preview the submitted rows together with the token text.
# NOTE(review): this output shows the tokens tagged as PII — clear it before sharing the notebook.
new_pred_df[["row_id", "document", "token", "label", "token_str"]]