In [1]:
import json
import re
import argparse
import os
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification, LongformerTokenizerFast
from datasets import Dataset
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from tqdm import tqdm

2024-04-23 07:41:07.913567: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 07:41:07.913679: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 07:41:08.049433: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Value passed as `stride` to the tokenizer when a document overflows into several chunks.
INFERENCE_STRIDE = 256

# Fold checkpoints that are run on the tokens exactly as they appear in test.json.
TRAINING_MODEL_PATH_NO_REPLACE = [
    "/kaggle/input/kfold-ex-6-avg-0-9757/fold0/model_0.9745/checkpoint-2500",
    "/kaggle/input/kfold-ex-6-avg-0-9757/fold1/model_0.9754/checkpoint-2600",
    "/kaggle/input/kfold-ex-6-avg-0-9757/fold2/model_0.9794/checkpoint-2600",
    "/kaggle/input/kfold-ex-6-avg-0-9757/fold3/model_0.9735/checkpoint-1900"
]
# One maximum chunk length per checkpoint, in the same order as the paths above.
INFERENCE_MAX_LENGTH_NO_REPLACE = [896] * len(TRAINING_MODEL_PATH_NO_REPLACE)

# Fold checkpoints that are run on tokens where "\n\n" has been replaced.
TRAINING_MODEL_PATH_REPLACE = [
    "/kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold0/model_0.9794/checkpoint-2100",
    "/kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold1/model_0.9787/checkpoint-2100",
    "/kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold2/model_0.9756/checkpoint-2300",
    "/kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold3/model_0.9757/checkpoint-2000"
]
# One maximum chunk length per checkpoint, in the same order as the paths above.
INFERENCE_MAX_LENGTH_REPLACE = [896] * len(TRAINING_MODEL_PATH_REPLACE)

# Every checkpoint of the ensemble; its length is the divisor used when averaging models.
TRAINING_MODEL_PATH = [*TRAINING_MODEL_PATH_NO_REPLACE, *TRAINING_MODEL_PATH_REPLACE]

In [3]:
# Ordered BIO tag list; the position of a tag in this list is its class id.
all_labels = [
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
    'O'
]

# Class id -> tag and tag -> class id, both derived from the list above so the
# three structures cannot drift apart.
id2label = dict(enumerate(all_labels))
label2id = {tag: class_id for class_id, tag in id2label.items()}

In [4]:
def get_labels(word_ids, word_labels):
    """Expand word-level labels to one class id per sub-token.

    Args:
        word_ids: For each sub-token, the index of the word it belongs to, or
            ``None`` when the sub-token belongs to no word.
        word_labels: Label strings indexed by word position.

    Returns:
        A list as long as ``word_ids`` holding ``label2id[...]`` of the owning
        word, or ``-100`` where the word index is ``None``.
    """
    return [
        -100 if position is None else label2id[word_labels[position]]
        for position in word_ids
    ]

# Tokenize texts, possibly generating more than one tokenized sample for each text


def tokenize(df, tokenizer, inference_max_length, to_tensor=True, with_labels=True):
    """Tokenize pre-split documents into fixed-length, overlapping chunks.

    Args:
        df: DataFrame with a ``tokens`` column (one list of words per row) and,
            when ``with_labels`` is true, a ``labels`` column of word labels.
        tokenizer: Callable tokenizer; presumably a Hugging Face fast tokenizer,
            since the result must offer ``word_ids(i)`` and
            ``overflow_to_sample_mapping`` - TODO confirm.
        inference_max_length: ``max_length`` of every chunk.
        to_tensor: When true, return a plain dict of ``torch`` tensors instead
            of the tokenizer's own output object.
        with_labels: When true, add a ``labels`` entry built with ``get_labels``.

    Returns:
        The encoding with an extra ``wids`` entry (word index per sub-token,
        ``-1`` where there is none) and optionally ``labels``.
    """
    # A long document overflows into several chunks; consecutive chunks share
    # INFERENCE_STRIDE tokens and every chunk is padded to the maximum length.
    encoded = tokenizer(
        df['tokens'].tolist(),
        is_split_into_words=True,
        return_overflowing_tokens=True,
        stride=INFERENCE_STRIDE,
        max_length=inference_max_length,
        padding="max_length",
        truncation=True,
    )

    if with_labels:
        encoded['labels'] = []
    encoded['wids'] = []

    # `text_idx` is the row of `df` that chunk number `chunk_no` was cut from.
    for chunk_no, text_idx in enumerate(encoded['overflow_to_sample_mapping']):
        # Word indexes refer to positions in the full, un-chunked document.
        word_ids = encoded.word_ids(chunk_no)

        if with_labels:
            document_labels = df['labels'].iloc[text_idx]
            encoded['labels'].append(get_labels(word_ids, document_labels))

        # None cannot be stored in a tensor, so it is encoded as -1.
        encoded['wids'].append([-1 if w is None else w for w in word_ids])

    if not to_tensor:
        return encoded
    return {key: torch.as_tensor(val) for key, val in encoded.items()}


class PIIDataset(Dataset):
    """Map-style dataset over a mapping of equally long, column-like fields."""

    def __init__(self, tokenized_ds):
        # Mapping of field name -> indexable collection; it must contain 'input_ids'.
        self.data = tokenized_ds

    def __len__(self):
        # The number of samples is the number of 'input_ids' rows.
        return len(self.data['input_ids'])

    def __getitem__(self, index):
        # One sample is the `index`-th entry of every field.
        return {field: column[index] for field, column in self.data.items()}

In [5]:
def inferenceV4(df, dl, model, gpu_id):
    """Run a model over chunked inputs and average class probabilities per word.

    Changes from the previous version:
      * The forward pass runs under ``torch.no_grad()`` so no autograd state is
        kept during inference. The context manager is entered here, inside the
        function, because it is the code executed by the worker threads.
      * The softmax subtracts the per-token maximum logit before
        exponentiating, so large logits no longer overflow to ``inf`` and
        produce ``NaN`` probabilities.

    Args:
        df: Unused; kept so existing callers keep working.
        dl: Iterable of batches. Each batch is a mapping with ``input_ids``,
            ``attention_mask``, ``overflow_to_sample_mapping`` and ``wids``
            tensors (``wids`` holds ``-1`` for sub-tokens without a word).
        model: Callable returning a tuple whose first item is the logits tensor
            of shape (batch, sequence, classes).
        gpu_id: Index into the module-level ``devices`` list, which must be
            defined before this function is called.

    Returns:
        ``defaultdict`` mapping ``text_id -> word_idx -> probability vector``,
        where each vector is the mean over all sub-tokens (across all chunks
        seen in ``dl``) that belong to that word.
    """
    device = devices[gpu_id]

    # Text-level accumulators that merge the contribution of every chunk:
    # summed probabilities and number of contributing sub-tokens per word.
    token_pred = defaultdict(lambda: defaultdict(int))
    token_cnt = defaultdict(lambda: defaultdict(int))

    for batch in dl:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        with torch.no_grad():
            logits = model(ids, attention_mask=mask, return_dict=False)[0]
        preds = logits.detach().cpu().numpy()
        del ids, mask, logits

        # Numerically stable softmax over the class axis.
        shifted = preds - preds.max(axis=2, keepdims=True)
        exp_preds = np.exp(shifted)
        preds_softmax = exp_preds / exp_preds.sum(axis=2, keepdims=True)

        text_ids = batch['overflow_to_sample_mapping'].tolist()
        for k, (chunk_preds, text_id) in enumerate(zip(preds_softmax, text_ids)):
            # Word ids are absolute positions in the original, un-chunked text.
            word_ids = batch['wids'][k].numpy()

            for idx, word_idx in enumerate(word_ids):
                if word_idx != -1:
                    token_pred[text_id][word_idx] += chunk_preds[idx]
                    token_cnt[text_id][word_idx] += 1

    # Turn the sums into means.
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            token_pred[text_id][word_idx] /= token_cnt[text_id][word_idx]

    return token_pred

In [6]:
def split_dict_tensor(input_dict):
    """Split every tensor of a dict in two along its first dimension.

    When the dict has a 1-D ``overflow_to_sample_mapping`` entry (document id
    per chunk), the cut is moved from the plain midpoint to the document
    boundary closest to it. This keeps all chunks of a document in the same
    half: callers average overlapping words inside each half and then add the
    halves, so a document that straddles the cut would have the words in its
    overlapping region counted twice.

    Entries whose first dimension differs from the mapping's length, and dicts
    without a mapping, are split at their own midpoint as before. If every
    chunk belongs to the same document there is no boundary, and the midpoint
    is used as a fallback.

    Args:
        input_dict: Mapping of name -> tensor (or anything ``torch.tensor``
            accepts). The mapping is presumably grouped by document, i.e.
            chunks of one document are contiguous - verify against the caller.

    Returns:
        Tuple ``(first_half, second_half)`` of dicts with the same keys.
    """
    # Convert non-tensor values to tensors up front.
    tensors = {
        key: value if isinstance(value, torch.Tensor) else torch.tensor(value)
        for key, value in input_dict.items()
    }

    n_chunks = None
    cut = None
    mapping = tensors.get('overflow_to_sample_mapping')
    if mapping is not None and mapping.dim() == 1 and mapping.shape[0] > 1:
        n_chunks = mapping.shape[0]
        # Positions i where chunk i belongs to a different document than chunk i - 1.
        boundaries = (torch.nonzero(mapping[1:] != mapping[:-1]).flatten() + 1).tolist()
        if boundaries:
            midpoint = n_chunks // 2
            cut = min(boundaries, key=lambda position: abs(position - midpoint))

    split_dict_1 = {}
    split_dict_2 = {}
    for key, value in tensors.items():
        length = value.shape[0]

        if length == 0:
            # Nothing to split: both halves get an (independent) empty tensor.
            split_dict_1[key] = value
            split_dict_2[key] = value.clone()
            continue

        if cut is not None and length == n_chunks:
            split_index = cut
        else:
            split_index = length // 2

        first, second = torch.split(value, [split_index, length - split_index], dim=0)
        split_dict_1[key] = first
        split_dict_2[key] = second

    return split_dict_1, split_dict_2

In [7]:
# LIBRARIES TO CLEAN MEMORY
import ctypes
import gc
import threading
import time
# Handle to the C library, used below (and after every model) to call malloc_trim.
# NOTE(review): "libc.so.6" is the glibc name, so this presumably only loads on Linux.
libc = ctypes.CDLL("libc.so.6")
# Run a garbage-collection pass first, then call malloc_trim(0) on the C heap.
_ = gc.collect()
libc.malloc_trim(0)
# The two CUDA devices the inference cells place one model copy on each.
device0 = torch.device("cuda:0")
device1 = torch.device("cuda:1")

In [8]:
# Load the test set and replace every '\n\n' token with "|" before running the
# TRAINING_MODEL_PATH_REPLACE checkpoints.
df = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")
df.tokens = df.tokens.apply(lambda x: ["|" if i == '\n\n' else i for i in x])

# Ensemble accumulator: final_token_pred[text_id][word_idx] receives each model's
# per-word prediction divided by len(TRAINING_MODEL_PATH).
final_token_pred = defaultdict(lambda: defaultdict(int))
# NOTE(review): English translation of the note in the string below:
# "Small issue: the split may cut one document into two parts for inference;
# words in the overlapping region are then not averaged properly, which may
# cause problems."
'''
有点小问题，可能切分的时候会把一个文档的切成两个部分去推理，这样重合的部分的word没有很好的平均概率的时候，可能会有点问题
'''
for idx, model_path in enumerate(TRAINING_MODEL_PATH_REPLACE):

    print('#'*25)
    print('=> Inferring', model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # The same checkpoint is loaded twice, one copy per device.
    model0 = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    ).to(device0)
    model1 = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    ).to(device1)
    # Indexed by gpu_id; `devices` is also read as a global by inferenceV4.
    models = [model0, model1]
    devices = [device0, device1]

    # Tokenize the whole test set into chunks (no labels at inference time).
    tokenized_test = tokenize(
        df=df, 
        inference_max_length=INFERENCE_MAX_LENGTH_REPLACE[idx], 
        with_labels=False, 
        tokenizer=tokenizer
    )
    # Sanity check: every tokenized field has the same first-dimension length.
    last_shape = list(tokenized_test.values())[0].shape[0]
    for k, v in tokenized_test.items():
        assert last_shape == v.shape[0]
        last_shape = v.shape[0]

    # Split the chunks into two parts, one per GPU.
    sub_df_1, sub_df_2 = split_dict_tensor(tokenized_test)  # marked by the author as the source of the issue noted above
    # Create a lock to synchronize the threads
    # (it is only held around the print calls below).
    lock = threading.Lock()

    # Each thread appends its token_pred dict here; order depends on which finishes first.
    single_pred = []
    # Define a function for inference

    def inference_thread(gpu_id, lock, tokenized_test):
        with lock:
            print(f"Thread {gpu_id} started on GPU {gpu_id}")
        # Wrap this thread's part of the chunks in a dataset and feed it one chunk per batch.
        test_dataset = PIIDataset(tokenized_test)
        test_dataloader = DataLoader(test_dataset, batch_size=1)

        token_pred = inferenceV4(df=df, dl=test_dataloader, model=models[gpu_id], gpu_id=gpu_id)
        with lock:
            print(f"Thread {gpu_id} finished on GPU {gpu_id}")

        single_pred.append(token_pred)

    # Create two threads for inference
    thread1 = threading.Thread(target=inference_thread, args=(0, lock, sub_df_1))
    thread2 = threading.Thread(target=inference_thread, args=(1, lock, sub_df_2))

    # Start the threads
    thread1.start()
    thread2.start()

    # Wait for both threads to finish
    thread1.join()
    thread2.join()

    print("Both threads have finished.")
    print()
    # Add both parts to the ensemble accumulator. A word present in both parts
    # is added twice for this model (see the note above).
    for tmp_pred in single_pred:
        for text_id in tmp_pred:
            for word_idx in tmp_pred[text_id]:
                final_token_pred[text_id][word_idx] += tmp_pred[text_id][word_idx] / len(TRAINING_MODEL_PATH)

    # CLEAN MEMORY
    del model0, model1, models, tokenizer
    torch.cuda.empty_cache()
    _ = gc.collect()
    libc.malloc_trim(0)

#########################
=> Inferring /kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold0/model_0.9794/checkpoint-2100
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 1 finished on GPU 1
Thread 0 finished on GPU 0
Both threads have finished.

#########################
=> Inferring /kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold1/model_0.9787/checkpoint-2100
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU 1
Both threads have finished.

#########################
=> Inferring /kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold2/model_0.9756/checkpoint-2300
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU 1
Both threads have finished.

#########################
=> Inferring /kaggle/input/kfold-ex-18-avg-0-97735-replace-nn/fold3/model_0.9757/checkpoint-2000
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU

In [9]:
# Reload the test set with its original tokens (no '\n\n' replacement) for the
# TRAINING_MODEL_PATH_NO_REPLACE checkpoints. Predictions keep accumulating in
# the existing `final_token_pred`, which this cell does not re-create.
df = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")
# NOTE(review): English translation of the note in the string below:
# "Small issue: the split may cut one document into two parts for inference;
# words in the overlapping region are then not averaged properly, which may
# cause problems."
'''
有点小问题，可能切分的时候会把一个文档的切成两个部分去推理，这样重合的部分的word没有很好的平均概率的时候，可能会有点问题
'''
for idx, model_path in enumerate(TRAINING_MODEL_PATH_NO_REPLACE):

    print('#'*25)
    print('=> Inferring', model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # The same checkpoint is loaded twice, one copy per device.
    model0 = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    ).to(device0)
    model1 = AutoModelForTokenClassification.from_pretrained(
        model_path,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    ).to(device1)
    # Indexed by gpu_id; `devices` is also read as a global by inferenceV4.
    models = [model0, model1]
    devices = [device0, device1]

    # Tokenize the whole test set into chunks (no labels at inference time).
    tokenized_test = tokenize(
        df=df, 
        inference_max_length=INFERENCE_MAX_LENGTH_NO_REPLACE[idx], 
        with_labels=False, 
        tokenizer=tokenizer
    )
    # Sanity check: every tokenized field has the same first-dimension length.
    last_shape = list(tokenized_test.values())[0].shape[0]
    for k, v in tokenized_test.items():
        assert last_shape == v.shape[0]
        last_shape = v.shape[0]

    # Split the chunks into two parts, one per GPU.
    sub_df_1, sub_df_2 = split_dict_tensor(tokenized_test)  # marked by the author as the source of the issue noted above
    # Create a lock to synchronize the threads
    # (it is only held around the print calls below).
    lock = threading.Lock()

    # Each thread appends its token_pred dict here; order depends on which finishes first.
    single_pred = []
    # Define a function for inference

    def inference_thread(gpu_id, lock, tokenized_test):
        with lock:
            print(f"Thread {gpu_id} started on GPU {gpu_id}")
        # Wrap this thread's part of the chunks in a dataset and feed it one chunk per batch.
        test_dataset = PIIDataset(tokenized_test)
        test_dataloader = DataLoader(test_dataset, batch_size=1)

        token_pred = inferenceV4(df=df, dl=test_dataloader, model=models[gpu_id], gpu_id=gpu_id)
        with lock:
            print(f"Thread {gpu_id} finished on GPU {gpu_id}")

        single_pred.append(token_pred)

    # Create two threads for inference
    thread1 = threading.Thread(target=inference_thread, args=(0, lock, sub_df_1))
    thread2 = threading.Thread(target=inference_thread, args=(1, lock, sub_df_2))

    # Start the threads
    thread1.start()
    thread2.start()

    # Wait for both threads to finish
    thread1.join()
    thread2.join()

    print("Both threads have finished.")
    print()
    # Add both parts to the ensemble accumulator. A word present in both parts
    # is added twice for this model (see the note above).
    for tmp_pred in single_pred:
        for text_id in tmp_pred:
            for word_idx in tmp_pred[text_id]:
                final_token_pred[text_id][word_idx] += tmp_pred[text_id][word_idx] / len(TRAINING_MODEL_PATH)

    # CLEAN MEMORY
    del model0, model1, models, tokenizer
    torch.cuda.empty_cache()
    _ = gc.collect()
    libc.malloc_trim(0)

#########################
=> Inferring /kaggle/input/kfold-ex-6-avg-0-9757/fold0/model_0.9745/checkpoint-2500
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU 1
Both threads have finished.

#########################
=> Inferring /kaggle/input/kfold-ex-6-avg-0-9757/fold1/model_0.9754/checkpoint-2600
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU 1
Both threads have finished.

#########################
=> Inferring /kaggle/input/kfold-ex-6-avg-0-9757/fold2/model_0.9794/checkpoint-2600
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU 1
Both threads have finished.

#########################
=> Inferring /kaggle/input/kfold-ex-6-avg-0-9757/fold3/model_0.9735/checkpoint-1900
Thread 0 started on GPU 0
Thread 1 started on GPU 1
Thread 0 finished on GPU 0
Thread 1 finished on GPU 1
Both threads have finished.



In [10]:
# Turn the ensembled per-word probability vectors into a table of predicted
# (non-'O') tokens: one row per word with its document id, word index, tag and score.
document, token, label, score = [], [], [], []
for text_id, word_probs in final_token_pred.items():
    for word_idx, probs in word_probs.items():
        # Index 12 is the 'O' class. When its probability is below the threshold
        # the best of the 12 entity classes is taken instead of the overall best.
        # NOTE(review): with a threshold of 0.0 and non-negative averaged softmax
        # values, the first branch is never taken as written.
        if probs[12] < 0.0:
            final_pred = probs[:12].argmax(-1)
        else:
            final_pred = probs.argmax(-1)
        tmp_score = probs[final_pred]

        if id2label[final_pred] == 'O':
            continue

        # text_id is the row label of the document in `df`.
        document.append(df.loc[text_id, "document"])
        token.append(word_idx)
        label.append(id2label[final_pred])
        score.append(tmp_score)

pred_df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "score": score
})

In [11]:
# Order predictions by document, then by token position, with a fresh 0..n-1 index.
pred_df = pred_df.sort_values(by=['document', 'token'], ignore_index=True)

In [12]:
# Display the non-'O' predictions, ordered by document and token position.
pred_df

Unnamed: 0,document,token,label,score
0,7,9,B-NAME_STUDENT,0.999237
1,7,10,I-NAME_STUDENT,0.999578
2,7,482,B-NAME_STUDENT,0.998972
3,7,483,I-NAME_STUDENT,0.999484
4,7,741,B-NAME_STUDENT,0.998966
5,7,742,I-NAME_STUDENT,0.999478
6,10,0,B-NAME_STUDENT,0.99931
7,10,1,I-NAME_STUDENT,0.999655
8,10,464,B-NAME_STUDENT,0.999031
9,10,465,I-NAME_STUDENT,0.999305


Postprocess

In [13]:
# Keep only the document id and its word list; work on an independent copy.
df = df.loc[:, ['document', 'tokens']].copy()

In [14]:
# One row per word: unpack each document's word list and rename the column.
df = df.explode('tokens', ignore_index=True).rename(columns={'tokens': 'token'})

In [15]:
# Keep the word text in `token_str` and turn `token` into the word's running
# position inside its document (0, 1, 2, ...).
df = df.assign(
    token_str=df['token'],
    token=df.groupby('document').cumcount(),
)

In [16]:
# Attach predictions to every word of every document; words without a
# prediction row get the label 'O' (their score stays missing).
new_pred_df = (
    df.merge(
        pred_df[['document', 'token', 'label', "score"]],
        how='left',
        on=['document', 'token'],
    )
    .assign(label=lambda merged: merged['label'].fillna('O'))
)

In [17]:
def pp(new_pred_df):
    """Post-process token predictions into consistent BIO spans.

    The frame is scanned top to bottom. A span is a run of consecutive rows
    that share the same document and entity type, have contiguous ``token``
    indices and are not labelled ``'O'``. For each span:

    * If the row two positions before the span start is in the same document
      and has the same entity type, and the row in between has ``token_str``
      equal to ``'\\n'``, the span is treated as a continuation: the ``'\\n'``
      row and every span row are labelled ``I-<type>``.
    * Otherwise the first row is forced to ``B-<type>`` and the rest to
      ``I-<type>``; then a ``NAME_STUDENT`` span containing an initial such as
      ``"J."`` (upper-case letter followed by a dot) is dropped entirely
      (label ``'O'``, score 0).

    Rows whose label is rewritten get score 1.

    Fix over the previous version: the scan checks the row bound *before*
    reading the next row, so a span that reaches the last row of the frame no
    longer raises ``KeyError``.

    Args:
        new_pred_df: DataFrame with columns ``document``, ``token``,
            ``token_str``, ``label`` and ``score``. Rows are addressed with
            ``.loc[0 .. len-1]``, so the index must be the default 0-based
            range index.

    Returns:
        A modified copy; the input frame is left untouched.
    """
    df = new_pred_df.copy()
    n_rows = len(df)
    i = 0
    while i < n_rows:
        st = i
        doc = df.loc[st, "document"]
        tok = df.loc[st, "token"]
        pred_tok = df.loc[st, "label"]
        if pred_tok == 'O':
            i += 1
            continue
        lab = pred_tok.split('-')[1]
        cur_doc = doc
        cur_lab = lab
        last_tok = tok
        cur_tok = last_tok
        # Advance `i` to the first row that no longer belongs to the span [st, i).
        while i < n_rows and cur_doc == doc and cur_lab == lab and last_tok == cur_tok:
            last_tok = cur_tok + 1
            i += 1
            if i >= n_rows:
                # The span ends on the last row; there is no next row to read.
                break
            cur_doc = df.loc[i, "document"]
            cur_tok = df.loc[i, "token"]
            if df.loc[i, "label"] == 'O':
                break
            cur_lab = df.loc[i, "label"].split('-')[1]

        # exception: the span continues a same-type span that was only
        # interrupted by a single '\n' token, so join them.
        if st - 2 >= 0 and df.loc[st - 2, "document"] == df.loc[st, "document"] and df.loc[st - 1, "token_str"] == '\n' and df.loc[st - 2, "label"] != 'O' and df.loc[st - 2, "label"].split('-')[1] == lab:
            df.loc[st - 1, "label"] = 'I-' + lab
            df.loc[st - 1, "score"] = 1
            for j in range(st, i):
                if df.loc[j, "label"] != 'I-' + lab:
                    df.loc[j, "score"] = 1
                    df.loc[j, "label"] = 'I-' + lab
            continue

        # fix: enforce B- on the first row of the span and I- on the others.
        for j in range(st, i):
            expected = ('B-' if j == st else 'I-') + lab
            if df.loc[j, "label"] != expected:
                df.loc[j, "score"] = 1
                df.loc[j, "label"] = expected

        # Drop student-name spans that contain an initial like "J.".
        # `.loc` slicing is inclusive, hence `i - 1`.
        if lab == 'NAME_STUDENT' and any(len(item) == 2 and item[0].isupper() and item[1] == "." for item in df.loc[st:i-1, 'token_str']):
            for j in range(st, i):
                df.loc[j, "score"] = 0
                df.loc[j, "label"] = 'O'

    return df

In [18]:
# Apply the span post-processing; pp returns a modified copy that replaces new_pred_df.
new_pred_df = pp(new_pred_df)

In [19]:
# Display the post-processed frame (every word of every document, 'O' rows included).
new_pred_df

Unnamed: 0,document,token,token_str,label,score
0,7,0,Design,O,
1,7,1,Thinking,O,
2,7,2,for,O,
3,7,3,innovation,O,
4,7,4,reflexion,O,
...,...,...,...,...,...
8500,123,1689,(,O,
8501,123,1690,https://www.melessa.uni-,O,
8502,123,1691,muenchen.de/team/vorstandssprecher/schmidt/pub...,O,
8503,123,1692,),O,


In [20]:
# Keep only the words predicted as PII and renumber the rows from 0.
new_pred_df = new_pred_df.loc[new_pred_df['label'] != 'O'].reset_index(drop=True)

In [21]:
# Collect the index labels of predictions rejected by simple text rules:
#   * I-PHONE_NUM tokens that are ')' or contain no digit,
#   * B-EMAIL tokens that contain no '@'.
rows_to_delete = []
for idx, row in new_pred_df.iterrows():
    if row.label == 'I-PHONE_NUM':
        if row.token_str == ')' or re.search(r'\d', row.token_str) is None:
            rows_to_delete.append(idx)
    elif row.label == 'B-EMAIL' and '@' not in row.token_str:
        rows_to_delete.append(idx)

In [22]:
# Remove the rejected predictions; a new frame is returned, nothing is changed in place.
new_pred_df = new_pred_df.drop(index=rows_to_delete)

In [23]:
# Consecutive row ids 0..n-1 in the current row order (the index has gaps after the drop).
new_pred_df["row_id"] = range(len(new_pred_df))

In [27]:
# Write the submission file with exactly these four columns and no index column.
new_pred_df.to_csv(
    "submission.csv",
    columns=["row_id", "document", "token", "label"],
    index=False,
)

In [28]:
# Display the submitted rows together with the word text for a visual check.
new_pred_df[["row_id", "document", "token", "label", "token_str"]]

Unnamed: 0,row_id,document,token,label,token_str
0,0,7,9,B-NAME_STUDENT,Nathalie
1,1,7,10,I-NAME_STUDENT,Sylla
2,2,7,482,B-NAME_STUDENT,Nathalie
3,3,7,483,I-NAME_STUDENT,Sylla
4,4,7,741,B-NAME_STUDENT,Nathalie
5,5,7,742,I-NAME_STUDENT,Sylla
6,6,10,0,B-NAME_STUDENT,Diego
7,7,10,1,I-NAME_STUDENT,Estrada
8,8,10,464,B-NAME_STUDENT,Diego
9,9,10,465,I-NAME_STUDENT,Estrada
