In [1]:
import warnings
import json
import argparse
from itertools import chain
from functools import partial
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

2025-04-14 19:59:03.292693: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744660743.532520      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744660743.596926      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed (int): Value applied to every generator. Defaults to 42.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different kernels from run to run, which would
    # undo the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [3]:
# Hugging Face model identifiers a run can choose from (ConfigCreator -> MODEL_NAME).
models = ["google-bert/bert-base-uncased", "microsoft/deberta-v3-base"]

# Candidate values for each tunable setting; a run picks one entry per list.
lr = [2e-5, 1e-5, 2e-3]   # stored as Config.LR
sl = [768, 1024]          # stored as Config.MAX_SEQ_LEN (tokenizer max_length)
frac = [0.6, 1]           # stored as Config.FRAC (share of negative documents sampled)
bs = [4, 8]               # stored as Config.BS
dataset = [1, 2]          # Config.USE_DATASETS becomes range(<this value>)

In [4]:
class ConfigCreator:
    """Holds the tunable settings of a run and renders them as a ``Config`` class."""

    def __init__(self, model_name=models[1], lr = lr[0], sl = sl[0], frac = frac[0], bs = bs[0], dataset = dataset[0]):
        """Remember the chosen settings.

        Args:
            model_name (str): Model identifier of the form ``"<org>/<name>"``.
            lr (float): Value exposed as ``Config.LR``.
            sl (int): Value exposed as ``Config.MAX_SEQ_LEN``.
            frac (float): Value exposed as ``Config.FRAC``.
            bs (int): Value exposed as ``Config.BS``.
            dataset (int): How many entries of ``Config.DATASETS`` to use,
                counted from index 0.
        """
        self.model_name = model_name
        self.lr = lr
        self.sl = sl
        self.frac = frac
        self.bs = bs
        self.dataset = dataset

    def get_config(self):
        """Build and return a fresh ``Config`` class for the stored settings.

        Loading the tokenizer happens while the class body executes, so every
        call reads the tokenizer files again.

        Returns:
            type: A class whose attributes carry the run configuration.
        """
        # Part after the "/" of the model identifier; used in both paths below.
        short_name = self.model_name.split('/')[1]

        class Config:
            # --- model / data selection ---
            MODEL_NAME = self.model_name
            MAX_SEQ_LEN = self.sl

            MODEL_SAVE_PATH = f"/kaggle/working/{short_name}"
            SEED = 42
            FRAC = self.frac

            NUM_FOLDS = 3

            DATASETS = {
                0: "/kaggle/input/pii-detection-removal-from-educational-data/train.json",
                1: "/kaggle/input/pii-dd-mistral-generated/mixtral-8x7b-v1.json",
            }
            # Indices into DATASETS: the first `dataset` entries.
            USE_DATASETS = range(self.dataset)

            # --- tokenizer and optimisation settings ---
            TOKENIZER = AutoTokenizer.from_pretrained(f"/kaggle/input/pii-detection-v1/{short_name}_fold_0")
            LR = self.lr
            WARMUP = 1e-1
            WD = 1e-2
            BS = self.bs
            LOG = 10
            ACC_STEPS = 2
            EPOCHS = 3

        return Config

In [5]:
class Preprocessor():
    """Turns the raw PII JSON documents into tokenized ``datasets.Dataset`` objects.

    Training documents are rebuilt into plain text, tokenized with
    ``config.TOKENIZER`` and given one label id per tokenizer token. Test
    documents are tokenized and given a character -> token-index map instead.
    """

    def __init__(self, config):
        """Store the config and resolve the training files it selects.

        Args:
            config: Object exposing ``DATASETS`` (index -> path) and
                ``USE_DATASETS`` (iterable of indices); other methods also read
                ``TOKENIZER``, ``MAX_SEQ_LEN``, ``FRAC``, ``SEED`` and ``NUM_FOLDS``.
        """
        self.config = config
        self.data_paths = [self.config.DATASETS[i] for i in self.config.USE_DATASETS]

    def __build_text_and_labels(self, example, target):
        """
        Rebuild the text and produce one label per character.

        Args:
            example (dict): Must contain "tokens", "given_labels" and "trailing_whitespace".
            target (iterable): Label names that count as a positive (PII) token.

        Returns:
            tuple: (text, aligned_labels, targets) where text is the rebuilt string,
            aligned_labels is a numpy array with one label per character of text, and
            targets is a list with 1 for every token whose label is in target, else 0.
        """
        target_labels = set(target)  # constant-time membership inside the loop
        text = []
        aligned_labels = []
        targets = []

        for token, label, has_whitespace in zip(example["tokens"], example["given_labels"], example["trailing_whitespace"]):
            text.append(token)
            # Every character of the token carries the token's label.
            aligned_labels.extend([label] * len(token))
            targets.append(1 if label in target_labels else 0)

            if has_whitespace:
                # The separating space is never part of an entity.
                text.append(" ")
                aligned_labels.append("O")

        return "".join(text), np.array(aligned_labels), targets

    def __tokenize_text(self, text, max_seq_len, truncation=True):
        """
        Tokenize the full text with the configured tokenizer.

        Args:
            text (str): The text to tokenize.
            max_seq_len (int): Passed to the tokenizer as max_length.
            truncation (bool): Passed to the tokenizer as truncation.

        Returns:
            The tokenizer output, requested with return_offsets_mapping=True.
        """
        return self.config.TOKENIZER(text, return_offsets_mapping=True, truncation=truncation, max_length=max_seq_len)

    def __adjust_start_index(self, start_idx, text):
        """
        Skip one leading whitespace character of a token span.

        Args:
            start_idx (int): The start offset of the token within text.
            text (str): The full text.

        Returns:
            int: start_idx + 1 when text[start_idx] is whitespace and another
            character follows; otherwise start_idx unchanged.
        """
        # The bounds check keeps the result a valid index into the
        # per-character label array when the text ends with whitespace.
        if start_idx + 1 < len(text) and text[start_idx].isspace():
            return start_idx + 1
        return start_idx

    def __align_labels_to_tokens(self, offset_mapping, text, aligned_labels, ltoi):
        """
        Assign a label id to every tokenizer token using its character offsets.

        Args:
            offset_mapping (list): (start, end) character offsets, one pair per token.
            text (str): The full rebuilt text.
            aligned_labels (numpy array): One label name per character of text.
            ltoi (dict): Mapping from label name to label id.

        Returns:
            list: One label id per entry of offset_mapping.
        """
        tokenized_labels = []

        for start, end in offset_mapping:
            # (0, 0) offsets mark special tokens (like [CLS]); they get "O".
            if start == 0 and end == 0:
                tokenized_labels.append(ltoi["O"])
                continue

            # A token takes the label of its first non-whitespace character.
            start_idx = self.__adjust_start_index(start, text)
            tokenized_labels.append(ltoi[aligned_labels[start_idx]])

        return tokenized_labels

    def __load_data(self, paths):
        """
        Read the JSON files and down-sample documents without any PII label.

        Args:
            paths (list): Paths of JSON files readable by pandas.read_json.

        Returns:
            list: One dict per kept document. All documents with at least one
            "B-"/"I-" label come first, followed by a config.FRAC sample
            (seeded with config.SEED) of the remaining documents.
        """
        df_list = [pd.read_json(path) for path in paths]
        df = pd.concat(df_list, ignore_index=True)
        print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns")
        print("Preprocessing Data..")

        # True for documents containing at least one entity label. Kept as a
        # standalone mask so no helper column has to be added and dropped again.
        has_entity = df["labels"].apply(
            lambda row: any(label[:2] in ("B-", "I-") for label in row)
        ).astype(bool)

        negative_samples = df[~has_entity].sample(frac=self.config.FRAC, random_state=self.config.SEED)

        new_df = pd.concat([df[has_entity], negative_samples], ignore_index=True)

        # Round-trip through JSON to get plain Python records.
        df_json = new_df.to_json(orient="records")

        return json.loads(df_json)

    def _preprocess_example(self, example, ltoi, max_seq_len, targets):
        """
        Tokenize one document and attach token-level labels.

        Args:
            example (dict): Must contain "tokens", "given_labels" and "trailing_whitespace".
            ltoi (dict): Mapping from label name to label id.
            max_seq_len (int): Maximum length of the tokenized sequence.
            targets (iterable): Label names that count as a positive (PII) token.

        Returns:
            dict: The tokenizer output plus "labels" (label ids), "length"
            (number of input ids) and "group" (1 if any token label is in
            targets, else 0).
        """
        # Step 1: Rebuild text and per-character labels
        text, aligned_labels, target_flags = self.__build_text_and_labels(example, targets)

        # Step 2: Tokenize the text
        tokenized_output = self.__tokenize_text(text, max_seq_len)

        # Step 3: Align token-level labels
        tokenized_labels = self.__align_labels_to_tokens(tokenized_output["offset_mapping"], text, aligned_labels, ltoi)

        # Step 4: Return the final tokenized output with labels and sequence length
        sequence_length = len(tokenized_output["input_ids"])

        return {
            **tokenized_output,
            "labels": tokenized_labels,
            "length": sequence_length,
            "group": 1 if sum(target_flags) > 0 else 0
        }

    def get_dataset(self):
        """
        Build the labelled dataset from the configured training files.

        Returns:
            dict: "dataset" (the processed Dataset with "fold" and
            class-encoded "group" columns), "labels" (sorted label names),
            "ltoi" / "itol" (name <-> id mappings) and "targets" (label names
            treated as positive).
        """
        data = self.__load_data(self.data_paths)

        dataset = Dataset.from_dict({
            "full_text": [x["full_text"] for x in data],
            "document": [str(x["document"]) for x in data],
            "tokens": [x["tokens"] for x in data],
            "trailing_whitespace": [x["trailing_whitespace"] for x in data],
            "given_labels": [x["labels"] for x in data],
        })

        # Label ids follow the alphabetical order of the labels present in the data.
        labels = sorted(set(chain.from_iterable(x["labels"] for x in data)))
        ltoi = {l: i for i, l in enumerate(labels)}
        itol = {v: k for k, v in ltoi.items()}

        targets = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL']

        dataset = dataset.map(self._preprocess_example, fn_kwargs={
            "ltoi": ltoi,
            "max_seq_len": self.config.MAX_SEQ_LEN,
            "targets": targets,
        }, num_proc=8)

        # Add fold information for cross-validation (round-robin by row position)
        dataset = dataset.add_column("fold", [i % self.config.NUM_FOLDS for i in range(len(dataset))])
        dataset = dataset.class_encode_column("group")

        return {
            "dataset": dataset,
            "labels": labels,
            "ltoi": ltoi,
            "itol": itol,
            "targets": targets,
        }

    def get_test_dataset(self):
        """
        Load and tokenize the competition test file.

        Returns:
            Dataset: The test documents with the tokenizer output and a
            "token_map" column holding, for every character of the rebuilt
            text, the index of the source token (-1 for separating spaces).
        """
        # Context manager so the file handle is closed right after parsing.
        with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json") as test_file:
            data = json.load(test_file)

        dataset = Dataset.from_dict({
            "full_text": [x["full_text"] for x in data],
            "document": [x["document"] for x in data],
            "tokens": [x["tokens"] for x in data],
            "trailing_whitespace": [x["trailing_whitespace"] for x in data],
        })

        def tokenize(data, tokenizer):
            """Rebuild one document's text, tokenize it and record the char -> token map."""
            text, token_map = [], []

            for idx, (tok, ws) in enumerate(zip(data["tokens"], data["trailing_whitespace"])):
                text.append(tok)
                token_map.extend([idx] * len(tok))

                if ws:
                    text.append(" ")
                    token_map.append(-1)

            tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=self.config.MAX_SEQ_LEN)

            return {**tokenized, "token_map": token_map}

        dataset = dataset.map(tokenize, fn_kwargs={"tokenizer": self.config.TOKENIZER}, num_proc=2)

        return dataset

In [None]:
class ModelInference(object):
    """
    Runs inference with a saved token-classification model for PII detection.

    The class prepares the test data, loads a saved checkpoint into a Hugging
    Face ``Trainer``, post-processes the logits and writes ``submission.csv``.

    Attributes:
        config: Configuration object containing model settings and parameters
        fold: Integer used by ``load_data`` to split train / validation rows
        preprocessor: Preprocessor instance for data preparation
        tokenizer: Tokenizer taken from ``config.TOKENIZER``
        dataset: Dict returned by ``Preprocessor.get_dataset`` (set by ``load_data``)
        trainer: Hugging Face Trainer instance (set by ``prep_model``)
        train_dataset: Training dataset split (set by ``load_data``)
        eval_dataset: Evaluation dataset split (set by ``load_data``)
        train_losses, eval_losses, train_f1_scores, eval_f1_scores, steps:
            Lists initialised empty for metric tracking; not filled by any
            method of this class.
    """

    # Column used for the "O" label when the saved id2label has no "O" entry.
    _DEFAULT_O_INDEX = 12

    def __init__(self, config, fold):
        """
        Initialize the ModelInference object.

        Args:
            config: Configuration object containing model settings
            fold: Integer indicating which fold ``load_data`` holds out for validation
        """
        self.config = config
        self.preprocessor = Preprocessor(self.config)
        self.tokenizer = self.config.TOKENIZER

        self.dataset = None
        self.trainer = None
        self.train_dataset = None
        self.eval_dataset = None

        # Store metrics for plotting
        self.train_losses = []
        self.eval_losses = []
        self.train_f1_scores = []
        self.eval_f1_scores = []
        self.steps = []

        self.fold = fold

    def load_data(self):
        """
        Load the labelled dataset and split it by ``self.fold``.

        Rows whose "fold" column equals ``self.fold`` become the validation
        split; all other rows become the training split.
        """
        self.dataset = self.preprocessor.get_dataset()

        # Split data based on current fold
        current_fold = self.fold
        self.train_dataset = self.dataset["dataset"].filter(lambda example: example["fold"] != current_fold)
        self.eval_dataset = self.dataset["dataset"].filter(lambda example: example["fold"] == current_fold)

        print(f"Train dataset size: {len(self.train_dataset)}")
        print(f"Validation dataset size: {len(self.eval_dataset)}")

    def compute_metrics(self, eval_pred):
        """
        Compute the weighted F1 score over all non-ignored token positions.

        Requires ``load_data`` to have run, because label ids are mapped back
        to names through ``self.dataset["itol"]``.

        Args:
            eval_pred: Tuple (predictions, labels); predictions are argmax-ed
                over axis 2.

        Returns:
            dict: ``{"f1": <weighted F1>}``
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=2)
        labels = np.asarray(labels)

        # Positions labelled -100 are excluded from evaluation. Boolean
        # indexing walks the arrays row by row, position by position.
        keep = labels != -100
        true_predictions = predictions[keep]
        true_labels = labels[keep]

        # Convert numeric labels to string labels for the F1 calculation
        true_pred_labels = [self.dataset["itol"][p] for p in true_predictions]
        true_gold_labels = [self.dataset["itol"][l] for l in true_labels]

        f1 = f1_score(true_gold_labels, true_pred_labels, average="weighted")

        return {
            "f1": f1,
        }

    def prep_model(self, fold):
        """
        Load a saved checkpoint and wrap it in a Trainer for prediction.

        Args:
            fold: Integer indicating which fold's saved model to load
        """
        model = AutoModelForTokenClassification.from_pretrained(f"/kaggle/input/pii-detection-v1/{self.config.MODEL_NAME.split('/')[1]}_fold_{fold}")

        collator = DataCollatorForTokenClassification(self.tokenizer, pad_to_multiple_of=16)

        args = TrainingArguments(
            ".",
            per_device_eval_batch_size=1,
            report_to="none",
        )

        print(f"Predicting on {args.device}")

        self.trainer = Trainer(
            model=model,
            args=args,
            data_collator=collator,
            tokenizer=self.config.TOKENIZER,
        )

    @staticmethod
    def _softmax(logits):
        """Return the softmax of ``logits`` over the last axis, computed stably."""
        # Subtracting the per-row maximum leaves the result unchanged but
        # keeps np.exp from overflowing on large logits.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp = np.exp(shifted)
        return exp / exp.sum(axis=-1, keepdims=True)

    @staticmethod
    def _outside_label_index(id2label, default=12):
        """Return the integer id whose label is "O", or ``default`` if none is."""
        for idx, name in id2label.items():
            if name == "O":
                return int(idx)
        return default

    @staticmethod
    def _predict_labels(logits, o_index, threshold):
        """Pick a label id per token, preferring an entity when "O" is not confident.

        Where the softmax probability of "O" is below ``threshold`` the best
        non-"O" label is chosen; elsewhere the plain argmax of the logits.
        """
        probs = ModelInference._softmax(logits)
        best_overall = logits.argmax(-1)

        # Probabilities are >= 0, so -1 removes the "O" column from the argmax.
        entity_probs = probs.copy()
        entity_probs[..., o_index] = -1.0
        best_entity = entity_probs.argmax(-1)

        return np.where(probs[..., o_index] < threshold, best_entity, best_overall)

    @staticmethod
    def _decode_predictions(preds, token_maps, offset_mappings, token_lists, documents, itol):
        """Translate per-subword label ids into unique (document, token, label) rows.

        Args:
            preds: One sequence of label ids per document.
            token_maps: Per document, the source-token index of every character
                (-1 for separating spaces).
            offset_mappings: Per document, the (start, end) offsets of each subword.
            token_lists: Per document, the source tokens.
            documents: Per document, its identifier.
            itol (dict): Maps ``str(label_id)`` to the label name.

        Returns:
            tuple: Four parallel lists (document, token, label, token_str).
        """
        document, token, label, token_str = [], [], [], []

        for p, token_map, offsets, tokens, doc in zip(preds, token_maps, offset_mappings, token_lists, documents):
            # Reset per document: the same (label, token index) in another
            # document is a different prediction and must be kept.
            seen = set()

            for token_pred, (start_idx, end_idx) in zip(p, offsets):
                label_pred = itol[str(token_pred)]

                # (0, 0) offsets belong to special tokens.
                if start_idx + end_idx == 0:
                    continue

                # Subword starts on the separating space: move to the next char.
                if token_map[start_idx] == -1:
                    start_idx += 1

                # Skip over source tokens that consist only of whitespace.
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]

                if label_pred == "O" or token_id == -1:
                    continue

                key = (label_pred, token_id)
                if key in seen:
                    continue
                seen.add(key)

                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])

        return document, token, label, token_str

    def run(self):
        """
        Run the inference pipeline.

        This method performs the following steps:
        1. Loads the test dataset
        2. Makes predictions using the model
        3. Processes predictions with softmax and thresholding
        4. Creates final predictions dataframe
        5. Saves predictions to ``submission.csv``
        """
        test_dataset = self.preprocessor.get_test_dataset()

        # NOTE(review): the fold-2 checkpoint is hard-coded here and self.fold
        # is not consulted — confirm this is intended.
        self.prep_model(2)
        predictions_ = [self.trainer.predict(test_dataset).predictions]

        # Averaging over a list leaves room for several checkpoints; with one
        # entry it returns that checkpoint's logits.
        predictions = np.mean(predictions_, axis=0)

        # Label names are read from the fold-0 checkpoint's config.json.
        config_path = f"/kaggle/input/pii-detection-v1/{self.config.MODEL_NAME.split('/')[1]}_fold_0/config.json"
        with open(config_path) as config_file:
            itol = json.load(config_file)["id2label"]

        threshold = 0.9
        o_index = self._outside_label_index(itol, self._DEFAULT_O_INDEX)
        preds_final = self._predict_labels(predictions, o_index, threshold)

        # Process predictions and create output dataframe
        document, token, label, token_str = self._decode_predictions(
            preds_final,
            test_dataset["token_map"],
            test_dataset["offset_mapping"],
            test_dataset["tokens"],
            test_dataset["document"],
            itol,
        )

        df = pd.DataFrame({
            "document": document,
            "token": token,
            "label": label,
            "token_str": token_str
        })
        df["row_id"] = list(range(len(df)))
        # `display` is the IPython built-in available in notebook kernels.
        display(df.head(100))

        df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

In [7]:
# Assemble the run configuration from the first choice of every setting,
# using the second entry of `models`.
cfg = ConfigCreator(
    model_name=models[1],
    lr=lr[0],
    sl=sl[0],
    frac=frac[0],
    bs=bs[0],
    dataset=dataset[0],
).get_config()

# Banner announcing which model is used for inference.
print(f"\n{'='*50}")
print(f"Inference model: {cfg.MODEL_NAME}")
print(f"{'='*50}\n")

# Predict on the test set; run() writes submission.csv.
model_inference = ModelInference(cfg, 0)
model_inference.run()


Inference model: microsoft/deberta-v3-base



Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

Predicting on cuda:0


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
