In [None]:
import json
import argparse
import pandas as pd
import numpy as np
import torch
import evaluate
import os
import random

from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset, features, DatasetDict
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from itertools import chain
from functools import partial
from tqdm import tqdm

In [None]:
# Model identifier handed to the tokenizer and model `from_pretrained` loaders.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-large"
# Passed to the tokenizer as `max_length` (with truncation enabled) in tokenize().
TRAINING_MAX_LENGTH = 768  
# Used as `output_dir` of the TrainingArguments.
OUTPUT_DIR = "output"

In [None]:
# Load both training sources. The context managers close the file handles
# deterministically, and the explicit encoding keeps the read independent of
# the platform's default text encoding.
with open("../kaggle_dataset/train_split.json", encoding="utf-8") as f:
    train1 = json.load(f)
with open("../kaggle_dataset/pjm_gpt_2k_0126_fixed.json", encoding="utf-8") as f:
    train2 = json.load(f)
train = train1 + train2

# Interleave the two sources. A dedicated seeded RNG makes the order identical
# on every "Restart & Run All" without touching the global `random` state.
random.Random(42).shuffle(train)

In [None]:
# Load the held-out split; the context manager closes the handle and the
# explicit encoding avoids relying on the platform default.
with open("../kaggle_dataset/test_split.json", encoding="utf-8") as f:
    test = json.load(f)

In [None]:
# Sanity check: number of training documents and the fields of the first one.
print(len(train))
print(train[0].keys())

In [None]:
# Sanity check: number of held-out documents and the fields of the first one.
print(len(test))
print(test[0].keys())

In [None]:
# Build the label vocabulary from BOTH splits. tokenize() looks every tag up in
# label2id, so a tag that occurs only in the training data would otherwise raise
# a KeyError. "O" is always included because tokenize() assigns it to special
# tokens and trailing whitespace regardless of what the data contains.
all_labels = sorted(
    {"O"} | {label for doc in chain(train, test) for label in doc["labels"]}
)
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print(id2label)

In [None]:
# Tags that tokenize() counts toward `target_num` (and therefore `group`).
# "O" is not part of this list.
target = [
    "B-EMAIL", "B-ID_NUM", "B-NAME_STUDENT", "B-PHONE_NUM", 
    "B-STREET_ADDRESS", "B-URL_PERSONAL", "B-USERNAME", "I-ID_NUM", 
    "I-NAME_STUDENT", "I-PHONE_NUM", "I-STREET_ADDRESS", "I-URL_PERSONAL"
]

In [None]:
def tokenize(example, tokenizer, label2id, max_length=None, target_labels=None):
    """Tokenize one document and project its word-level tags onto the model tokens.

    The document text is rebuilt from ``example["tokens"]`` and
    ``example["trailing_whitespace"]``. Every character inherits the tag of the
    word it belongs to (``"O"`` for the spaces inserted after a word), and each
    tokenizer token takes the tag of its first non-whitespace character.

    Args:
        example: Mapping with the parallel sequences ``"tokens"``,
            ``"provided_labels"`` and ``"trailing_whitespace"``.
        tokenizer: Callable invoked as ``tokenizer(text, return_offsets_mapping=True,
            truncation=True, max_length=...)``. Its result must be a mapping that
            also exposes ``offset_mapping`` and ``input_ids`` as attributes.
        label2id: Mapping from tag string to integer id; must contain ``"O"``.
        max_length: Truncation length given to the tokenizer. ``None`` (default)
            uses the notebook-level ``TRAINING_MAX_LENGTH``.
        target_labels: Tags counted as targets. ``None`` (default) uses the
            notebook-level ``target`` list.

    Returns:
        dict: all tokenizer outputs plus
            ``"labels"`` (one label id per tokenizer token),
            ``"length"`` (number of input ids),
            ``"target_num"`` (how many words carry a target tag; counted on the
            whole document, i.e. before truncation) and
            ``"group"`` (1 if ``target_num > 0`` else 0).

    Raises:
        KeyError: if a tag is missing from ``label2id``.
    """
    if max_length is None:
        max_length = TRAINING_MAX_LENGTH
    if target_labels is None:
        target_labels = target
    # Set membership instead of scanning a list for every word.
    target_labels = frozenset(target_labels)

    pieces = []
    char_labels = []  # one tag per character of the rebuilt text
    target_num = 0

    for tok, tag, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(tok)
        char_labels.extend([tag] * len(tok))

        if tag in target_labels:
            target_num += 1

        # The separator after a word is never part of an entity.
        if has_space:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)
    tokenized = tokenizer(
        text, return_offsets_mapping=True, truncation=True, max_length=max_length
    )

    last_char = len(text) - 1
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # Special tokens are reported with an empty (0, 0) span.
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # A token may start on the whitespace preceding its word; read the tag
        # of the next character instead. The bound check keeps a token that
        # sits on the very last (whitespace) character from indexing past the
        # end of the text.
        if text[start_idx].isspace() and start_idx < last_char:
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0,
    }

In [None]:
# Load the tokenizer for the checkpoint named by TRAINING_MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

In [None]:
# Column-wise view of the training documents. The word-level tags are stored as
# "provided_labels"; tokenize() later writes the token-level ids to "labels".
train_ds = Dataset.from_dict(
    {
        "full_text": [doc["full_text"] for doc in train],
        "document": [str(doc["document"]) for doc in train],
        "tokens": [doc["tokens"] for doc in train],
        "trailing_whitespace": [doc["trailing_whitespace"] for doc in train],
        "provided_labels": [doc["labels"] for doc in train],
    }
)

In [None]:
%%time
# Tokenize and label-align every training document using two worker processes,
# then class-encode the "group" flag that tokenize() produced.
train_ds = train_ds.map(
    function=tokenize,
    fn_kwargs=dict(tokenizer=tokenizer, label2id=label2id),
    num_proc=2,
)
train_ds = train_ds.class_encode_column(column="group")
print(train_ds)

In [None]:
# Column-wise view of the held-out documents, with the same columns as the
# training dataset. The word-level tags are stored as "provided_labels".
test_ds = Dataset.from_dict(
    {
        "full_text": [doc["full_text"] for doc in test],
        "document": [str(doc["document"]) for doc in test],
        "tokens": [doc["tokens"] for doc in test],
        "trailing_whitespace": [doc["trailing_whitespace"] for doc in test],
        "provided_labels": [doc["labels"] for doc in test],
    }
)

In [None]:
%%time
# Tokenize and label-align every held-out document using two worker processes,
# then class-encode the "group" flag that tokenize() produced.
test_ds = test_ds.map(
    function=tokenize,
    fn_kwargs=dict(tokenizer=tokenizer, label2id=label2id),
    num_proc=2,
)
test_ds = test_ds.class_encode_column(column="group")
print(test_ds)

In [None]:
# Spot-check the label alignment on the first training example: list the
# word-level tags first, then the tags attached to the tokenizer's tokens.
# "O" entries are skipped in both listings.
x = train_ds[0]

for t, l in zip(x["tokens"], x["provided_labels"]):
    if l == "O":
        continue
    print((t, l))

print("*" * 100)

for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[l] == "O":
        continue
    print((t, id2label[l]))

In [None]:
def compute_metrics(p, all_labels, beta=5):
    """Compute entity-level recall, precision and an F-beta score for the Trainer.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores and is reduced with ``argmax`` over axis 2; ``labels`` holds
            the gold label ids, with ``-100`` marking positions to ignore.
        all_labels: Sequence mapping a label id to its tag string.
        beta: Weight of recall relative to precision in the F-score. The default
            of 5 reproduces the original ``(1 + 5*5) * r * p / (5*5 * p + r)``.

    Returns:
        dict with keys ``"recall"``, ``"precision"`` and ``"f1"``. The ``"f1"``
        entry is the F-beta score; the key name is kept because the training
        arguments select the best model by ``metric_for_best_model="f1"``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (label -100) once and reuse the surviving pairs
    # for both the predicted and the gold tag sequences.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # When the model finds no entity at all both scores are 0 and the
    # denominator vanishes; report 0.0 instead of failing or returning NaN.
    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    if denominator > 0:
        fbeta = (1 + beta_sq) * recall * precision / denominator
    else:
        fbeta = 0.0

    return {
        "recall": recall,
        "precision": precision,
        "f1": fbeta,
    }

In [None]:
# Token-classification model built from the pretrained checkpoint, with one output
# class per entry of all_labels; the id<->label maps are handed to the model too.
# NOTE(review): ignore_mismatched_sizes=True presumably tolerates a checkpoint
# head whose shape differs from num_labels — confirm against the transformers docs.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
# Batch collator for token classification, configured with pad_to_multiple_of=16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [None]:
# Optional partial freezing of the backbone; both switches are off by default.
FREEZE_EMBEDDINGS = False
FREEZE_LAYERS = 0

if FREEZE_EMBEDDINGS:
    print("Freezing embeddings.")
    for weight in model.deberta.embeddings.parameters():
        weight.requires_grad = False

if FREEZE_LAYERS > 0:
    print(f"Freezing {FREEZE_LAYERS} layers.")
    # Walk the parameters of the first FREEZE_LAYERS encoder layers in one pass.
    frozen_blocks = model.deberta.encoder.layer[:FREEZE_LAYERS]
    for weight in chain.from_iterable(block.parameters() for block in frozen_blocks):
        weight.requires_grad = False

In [None]:
# Bundle both tokenized splits so the Trainer can pick them up by name.
final_ds = DatasetDict()
final_ds["train"] = train_ds
final_ds["test"] = test_ds

print(final_ds)

## Training

In [None]:
# Training configuration. Evaluation and checkpointing share the same 50-step
# cadence, and the best checkpoint is chosen by the "f1" value that
# compute_metrics() returns.
args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    warmup_steps=25,
    learning_rate=1e-5,
    num_train_epochs=3,
    # Batch size 1 per device combined with 16 accumulation steps below:
    # gradients from 16 examples per device are accumulated for each update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    report_to="none",
    gradient_accumulation_steps=16,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=6,
    save_only_model=True,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
    # "f1" is the key under which compute_metrics() reports its F-beta score.
    metric_for_best_model="f1",
    greater_is_better=True,
    weight_decay=0.001
)
# The held-out split serves as the evaluation set during training.
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=final_ds["train"], 
    eval_dataset=final_ds["test"], 
    data_collator=collator, 
    tokenizer=tokenizer,
    # Bind the label list so the Trainer can call compute_metrics(p) with one argument.
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [None]:
%%time
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the trainer's model under "final".
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint rather than the last step — confirm against the Trainer docs.
trainer.save_model("final")
# Ask PyTorch to release its cached CUDA memory.
torch.cuda.empty_cache()

In [None]:
def delete_optimizer_files(directory):
    """Remove every file named ``optimizer.pt`` found under ``directory``.

    The tree is walked recursively with ``os.walk``; each removed path is
    reported on stdout as ``Deleted: <path>``. Nothing is returned.
    """
    doomed_name = "optimizer.pt"
    for folder, _subdirs, filenames in os.walk(directory):
        # A directory holds at most one entry with this exact name.
        if doomed_name not in filenames:
            continue
        victim = os.path.join(folder, doomed_name)
        os.remove(victim)
        print(f"Deleted: {victim}")

# Sweep the current working directory and everything below it for optimizer.pt files.
# NOTE(review): "./" is broader than OUTPUT_DIR — confirm that removing optimizer
# states outside the checkpoint folder is intended.
directory_path = "./"
delete_optimizer_files(directory_path)