In [None]:
# Model identifier/path passed to both AutoTokenizer and AutoModelForTokenClassification below.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-large"  # your model path
# Maximum tokenized length; tokenize() truncates longer documents to this many tokens.
TRAINING_MAX_LENGTH = 768  # I use 1280 locally
# Directory handed to TrainingArguments(output_dir=...).
OUTPUT_DIR = "output"  # your output path

In [None]:
!pip install seqeval evaluate -q

In [None]:
import json
import argparse
import os
import pandas as pd
import numpy as np
import evaluate
import torch
from itertools import chain
from functools import partial
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features, DatasetDict
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from tqdm import tqdm

In [None]:
# Load the competition training documents (a JSON list of per-document dicts).
# The context manager closes the file handle deterministically, and the explicit
# encoding keeps the essay text from depending on the platform default.
with open("../kaggle_dataset/competition/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)

print(len(data))
print(data[0].keys())

In [None]:
# Gather every distinct tag found in the training labels and assign each one
# an integer id that follows the sorted order of the tag strings.
all_labels = sorted({tag for doc in data for tag in doc["labels"]})
id2label = dict(enumerate(all_labels))
label2id = {tag: idx for idx, tag in id2label.items()}

print(id2label)

In [None]:
# Tags that tokenize() counts towards a document's `target_num`;
# the "O" tag is not part of this list.
target = [
    "B-EMAIL", "B-ID_NUM", "B-NAME_STUDENT", "B-PHONE_NUM",
    "B-STREET_ADDRESS", "B-URL_PERSONAL", "B-USERNAME", "I-ID_NUM",
    "I-NAME_STUDENT", "I-PHONE_NUM", "I-STREET_ADDRESS", "I-URL_PERSONAL"
]

In [None]:
def tokenize(example, tokenizer, label2id):
    """Tokenize one document and project its word-level labels onto the tokens.

    The document text is rebuilt from ``example["tokens"]`` and
    ``example["trailing_whitespace"]`` while a parallel per-character label
    list is recorded (inserted spaces are labelled ``"O"``). Each token
    returned by ``tokenizer`` then receives the label of the first
    non-whitespace character it covers.

    Args:
        example: Mapping with ``tokens``, ``provided_labels`` and
            ``trailing_whitespace`` sequences of equal length.
        tokenizer: Callable accepting ``(text, return_offsets_mapping=...,
            truncation=..., max_length=...)`` and returning a mapping that
            also exposes ``offset_mapping`` and ``input_ids`` attributes.
        label2id: Mapping from label string to integer id; must contain ``"O"``.

    Returns:
        dict with every key of the tokenizer output plus ``labels`` (one id
        per token), ``length`` (number of tokens), ``target_num`` (number of
        words whose label is in the module-level ``target`` list) and
        ``group`` (1 if ``target_num > 0`` else 0).

    Note:
        Reads the module-level ``target`` and ``TRAINING_MAX_LENGTH``.
    """
    text_parts = []
    char_labels = []  # one label per character of the rebuilt text
    target_num = 0

    for word, label, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text_parts.append(word)
        char_labels.extend([label] * len(word))

        if label in target:
            target_num += 1

        # The separator between words never carries an entity label.
        if has_space:
            text_parts.append(" ")
            char_labels.append("O")

    # Join once and reuse the same string for tokenization and offset lookups.
    text = "".join(text_parts)

    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=TRAINING_MAX_LENGTH
    )

    char_labels = np.array(char_labels)
    last_char = len(text) - 1
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:

        # Tokens reported with an empty (0, 0) span (e.g. the CLS token) get "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # When the token starts with whitespace, read the label of the next
        # character instead. The bound check avoids indexing past the end of
        # the text when the whitespace is its very last character.
        if text[start_idx].isspace() and start_idx < last_char:
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0
    }

In [None]:
# Tokenizer that matches the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# Column-oriented view of the raw documents. The word-level labels are stored
# as `provided_labels`; tokenize() writes token-level ids to a `labels` column.
ds = Dataset.from_dict(
    dict(
        full_text=[doc["full_text"] for doc in data],
        document=[str(doc["document"]) for doc in data],
        tokens=[doc["tokens"] for doc in data],
        trailing_whitespace=[doc["trailing_whitespace"] for doc in data],
        provided_labels=[doc["labels"] for doc in data],
    )
)

In [None]:
%%time
# Tokenize every document with two worker processes; adds the tokenizer outputs
# plus the `labels`, `length`, `target_num` and `group` columns from tokenize().
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id}, num_proc=2)
# Re-encode `group` as a class-label column.
# NOTE(review): presumably meant for a stratified split, but the later
# train_test_split call does not pass stratify_by_column -- confirm.
ds = ds.class_encode_column("group")

In [None]:
# Sanity check on the first document: the non-"O" word-level labels printed
# first should line up with the non-"O" token-level labels printed second.
x = ds[0]

# Word-level (token, label) pairs as provided in the dataset.
for t, l in zip(x["tokens"], x["provided_labels"]):
    if l != "O":
        print((t, l))

print("*" * 40)

# Tokenizer-level (token, label) pairs produced by tokenize().
for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[l] != "O":
        print((t, id2label[l]))

## Competition Metrics

In [None]:
def compute_metrics(p, all_labels, beta=5):
    """Compute recall, precision and an F-beta score for a Trainer evaluation.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` the
            gold label ids, with ``-100`` marking positions to ignore.
        all_labels: Sequence mapping a label id to its label string.
        beta: Weight of recall relative to precision in the F-score.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        dict with ``recall``, ``precision`` and ``f1``. The ``f1`` key holds
        the F-beta score (the key name is kept because the training
        configuration selects the best model by ``"f1"``). It is ``0.0``
        when precision and recall are both zero, instead of failing with a
        division by zero or producing NaN.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label id -100) and map ids back to label strings.
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    # Guard the degenerate case where the model predicts no correct entity.
    if denominator > 0:
        f_beta = (1 + beta_sq) * recall * precision / denominator
    else:
        f_beta = 0.0

    return {
        "recall": recall,
        "precision": precision,
        "f1": f_beta
    }

In [None]:
# Token-classification model loaded from the pretrained checkpoint, with one
# output class per entry in `all_labels` and the id<->label maps stored in its config.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
# Batch collator for token classification; pads sequence lengths to a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [None]:
# Optional partial freezing of the backbone; with the values below nothing is frozen.
FREEZE_EMBEDDINGS = False
# Number of leading encoder layers to freeze (0 disables layer freezing).
FREEZE_LAYERS = 0

if FREEZE_EMBEDDINGS:
    print("Freezing embeddings.")
    # Exclude the embedding parameters from gradient updates.
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False

if FREEZE_LAYERS > 0:
    print(f"Freezing {FREEZE_LAYERS} layers.")
    # Exclude the first FREEZE_LAYERS encoder layers from gradient updates.
    for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        for param in layer.parameters():
            param.requires_grad = False

In [None]:
# Hold out 30% of the documents; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on `group` -- confirm that is intended.
final_ds = ds.train_test_split(test_size=0.30, seed=42)
# Display the resulting splits.
final_ds

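## Note: with a 50/50 split and seed=48, the resulting `train` split contains every label, so in that setup `train` is used as the actual test set and `test` as the actual training set. (The split cell above currently uses test_size=0.30 and seed=42, and the cells below use the splits as named, so this note may be stale.)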

In [None]:
# Label statistics for the training split: per-tag counts and the number of
# documents with / without any non-"O" label.
train_ds = final_ds["train"].to_pandas()
have_label = 0
no_label = 0
label_dict = {}
for _, row in tqdm(train_ds.iterrows(), total=len(train_ds)):
    doc_labels = row["provided_labels"].tolist()
    if any(tag != "O" for tag in doc_labels):
        have_label += 1
    else:
        no_label += 1
    # Tally every tag occurrence, including "O".
    for tag in doc_labels:
        label_dict[tag] = label_dict.get(tag, 0) + 1

for tag in sorted(label_dict):
    print(f"{tag}: {label_dict[tag]}")

print("Number of examples with no label:", no_label)
print("Number of examples with labels:", have_label)

In [None]:
# Build the validation frame from the held-out split: keep every document that
# has at least one non-"O" label plus an equally sized random sample of
# documents whose labels are all "O".
test_ds = final_ds["test"].to_pandas()

have_label = 0
no_label = 0
label_dict = {}
has_label_flags = []
for _, row in tqdm(test_ds.iterrows(), total=len(test_ds)):
    labels = row["provided_labels"].tolist()
    has_label = any(label != "O" for label in labels)
    has_label_flags.append(int(has_label))
    if has_label:
        have_label += 1
    else:
        no_label += 1
    # Tally every tag occurrence, including "O".
    for label in labels:
        label_dict[label] = label_dict.get(label, 0) + 1

# Assign the flag column once instead of writing cell-by-cell inside the loop.
test_ds["is_have_label"] = has_label_flags

for key in sorted(label_dict):
    print(f"{key}: {label_dict[key]}")

print("Number of examples with no label:", no_label)
print("Number of examples with labels:", have_label)

test_ds1 = test_ds[test_ds["is_have_label"] == 1]
print(test_ds1.shape)
test_ds0 = test_ds[test_ds["is_have_label"] == 0]
print(test_ds0.shape)
# Seed the down-sampling so the validation set is identical across runs, and
# cap the sample size so it cannot exceed the number of available rows.
test_ds0 = test_ds0.sample(min(len(test_ds0), len(test_ds1)), random_state=42)
test_ds = pd.concat([test_ds0, test_ds1])

# Drop the helper column and renumber the rows.
test_ds = test_ds.drop(columns=["is_have_label"]).reset_index(drop=True)
print("Final shape of test_ds:", test_ds.shape)

## The extra dataset is used only for training, never for validation

In [None]:
# Load the additional documents that are used for training only.
# The context manager closes the file handle deterministically, and the explicit
# encoding keeps the text from depending on the platform default.
with open("../kaggle_dataset/pjm_gpt_2k_0126_fixed.json", encoding="utf-8") as extra_file:
    extra_data = json.load(extra_file)

# Same column layout and preprocessing as the competition data.
extra_ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in extra_data],
    "document": [str(x["document"]) for x in extra_data],
    "tokens": [x["tokens"] for x in extra_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in extra_data],
    "provided_labels": [x["labels"] for x in extra_data],
})
extra_ds = extra_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id}, num_proc=2)
extra_ds = extra_ds.class_encode_column("group")
print(extra_ds)
extra_ds = extra_ds.to_pandas()

# Append the extra rows to the training frame and shuffle with a fixed seed.
train_ds = pd.concat([train_ds, extra_ds])
train_ds = train_ds.sample(len(train_ds), random_state=42)
train_ds.reset_index(drop=True, inplace=True)
print(train_ds.shape)

In [None]:
# Convert the pandas frames back into datasets for the Trainer.
train_dataset = Dataset.from_pandas(train_ds)
test_dataset = Dataset.from_pandas(test_ds)

# Replace the earlier split with the augmented train set and the balanced test set.
final_ds = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print(final_ds)

## Training

In [None]:
# Training configuration.
# - Per-device batch size 1 with 16 gradient-accumulation steps.
# - Logging, evaluation and checkpointing all happen every 50 steps.
# - The best checkpoint is chosen by the "f1" value returned by compute_metrics
#   (higher is better) and reloaded when training ends.
# NOTE(review): `evaluation_strategy` here and `tokenizer=` on the Trainer may be
# deprecated or renamed in newer transformers releases -- confirm against the
# installed version.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    warmup_steps=25,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    report_to="none",
    gradient_accumulation_steps=16,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=6,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
    metric_for_best_model="f1",
    greater_is_better=True,
    weight_decay=0.001
)
# Trainer wired to the augmented train split, the balanced test split and the
# metric function with `all_labels` bound via functools.partial.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=final_ds["train"],
    eval_dataset=final_ds["test"],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [None]:
%%time
# Run fine-tuning with the configuration held in `args`.
trainer.train()

In [None]:
# Save the trainer's current model to the relative directory "final".
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint rather than the last one -- confirm.
trainer.save_model("final")
# Ask PyTorch to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()

In [None]:
def delete_optimizer_files(directory):
    """Remove every ``optimizer.pt`` found anywhere below ``directory``.

    Walks the tree rooted at ``directory`` and prints one
    ``Deleted: <path>`` line for each file that is removed.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        if "optimizer.pt" not in filenames:
            continue
        optimizer_path = os.path.join(folder, "optimizer.pt")
        os.remove(optimizer_path)
        print(f"Deleted: {optimizer_path}")

# Remove optimizer.pt files from the Kaggle working directory and everything below it.
directory_path = "/kaggle/working/"
delete_optimizer_files(directory_path)