In [1]:
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
OUTPUT_DIR = "output"

In [2]:
!pip install seqeval evaluate -q

In [3]:
from pathlib import Path
import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd



In [4]:
data = json.load(open('/kaggle/input/14-03-24-data-training-sample/25_03_24_data_dirty.json'))

In [5]:
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v: k for k,v in label2id.items()}

print(id2label)

{0: 'CARD_NUMBER', 1: 'EMAIL', 2: 'NAME', 3: 'O', 4: 'PHONE_NUM', 5: 'STREET_ADDRESS', 6: 'URL_PERSONAL', 7: 'USERNAME'}


## ♟️ Data Tokenization
- This tokenizer is actually special, comparing to usual NLP challenges

In [6]:
def tokenize(example, tokenizer, label2id):

    # rebuild text from tokens
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True,
                          truncation=False)


    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length, "token_map": labels}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "provided_labels": [x["labels"] for x in data],
})
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id}, num_proc=3)
# ds = ds.class_encode_column("group")

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



    

#0:   0%|          | 0/400 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/400 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/400 [00:00<?, ?ex/s]

In [8]:
x = ds[0]

for token, label in zip(x["tokens"], x["provided_labels"]):
    if label != "O":
        print((token, label))

print("*" * 100)

for token, label in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[label] != "O":
        print((token, id2label[label]))

('Мишин', 'NAME')
('maria_faulkner@silva', 'EMAIL')
('1', 'STREET_ADDRESS')
('160975', 'STREET_ADDRESS')
('84892411578', 'PHONE_NUM')
('https://instagram.com/ysullivan', 'URL_PERSONAL')
('juancampos', 'USERNAME')
('Мишин', 'NAME')
****************************************************************************************************
('▁Ми', 'NAME')
('ш', 'NAME')
('ин', 'NAME')
('▁maria', 'EMAIL')
('_', 'EMAIL')
('fau', 'EMAIL')
('lk', 'EMAIL')
('ner', 'EMAIL')
('@', 'EMAIL')
('sil', 'EMAIL')
('va', 'EMAIL')
('▁1', 'STREET_ADDRESS')
('▁160', 'STREET_ADDRESS')
('975', 'STREET_ADDRESS')
('▁848', 'PHONE_NUM')
('924', 'PHONE_NUM')
('11', 'PHONE_NUM')
('578', 'PHONE_NUM')
('▁https', 'URL_PERSONAL')
(':', 'URL_PERSONAL')
('/', 'URL_PERSONAL')
('/', 'URL_PERSONAL')
('instagram', 'URL_PERSONAL')
('.', 'URL_PERSONAL')
('com', 'URL_PERSONAL')
('/', 'URL_PERSONAL')
('y', 'URL_PERSONAL')
('s', 'URL_PERSONAL')
('ulli', 'URL_PERSONAL')
('van', 'URL_PERSONAL')
('▁ju', 'USERNAME')
('an', 'USERNAME')
('cam

In [9]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f5_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f5': f5_score
    }
    return results

In [10]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# I decided to uses no eval
final_ds = ds.train_test_split(test_size=0.2, seed=42) # cannot use stratify_by_column='group'
final_ds

DatasetDict({
    train: Dataset({
        features: ['full_text', 'document', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length', 'token_map'],
        num_rows: 960
    })
    test: Dataset({
        features: ['full_text', 'document', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length', 'token_map'],
        num_rows: 240
    })
})

In [12]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="epoch",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type='cosine',
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01
)

trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=final_ds["train"], 
    eval_dataset=final_ds["test"], 
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [13]:
%%time
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Recall,Precision,F5
1,0.0351,0.034481,0.920028,0.876356,0.918268
2,0.0197,0.022231,0.953251,0.929431,0.952312




CPU times: user 4min 38s, sys: 7.9 s, total: 4min 46s
Wall time: 4min 46s


TrainOutput(global_step=960, training_loss=0.09566930920506517, metrics={'train_runtime': 285.999, 'train_samples_per_second': 6.713, 'train_steps_per_second': 3.357, 'total_flos': 451831090275840.0, 'train_loss': 0.09566930920506517, 'epoch': 2.0})

## 💾 Save models
- Сlick on "Save version" (top right) and "Save & Run All (Commit)"

In [14]:
trainer.save_model("deberta3base_ru_pii")
tokenizer.save_pretrained("deberta3base_ru_pii")

('deberta3base_ru_pii/tokenizer_config.json',
 'deberta3base_ru_pii/special_tokens_map.json',
 'deberta3base_ru_pii/spm.model',
 'deberta3base_ru_pii/added_tokens.json',
 'deberta3base_ru_pii/tokenizer.json')

In [15]:
model_path = "/kaggle/working/deberta3base_ru_pii"

In [16]:
def tokenize(example, tokenizer):
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=False)
    
        
    return {
        **tokenized,
        "token_map": token_map,
    }

In [17]:
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

tokenizer = AutoTokenizer.from_pretrained(model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)

   

#0:   0%|          | 0/600 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/600 [00:00<?, ?ex/s]

In [18]:
model = AutoModelForTokenClassification.from_pretrained(model_path)
collator = DataCollatorForTokenClassification(tokenizer)
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=1, 
    report_to="none",
)
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

In [19]:
predictions = trainer.predict(ds).predictions
pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(predictions.shape[0],predictions.shape[1],1)

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [20]:
config = json.load(open("/kaggle/working/output/checkpoint-500/config.json"))
id2label = config["id2label"]
preds = predictions.argmax(-1)
preds_without_O = pred_softmax[:,:,:7].argmax(-1)
O_preds = pred_softmax[:,:,7]

preds_final = np.where(O_preds < 0.9, preds_without_O , preds)

In [21]:
pairs = []
processed = []
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"],
                                              ds["offset_mapping"], ds["tokens"],
                                              ds["document"]):
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0:
            continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred not in ("O", "EMAIL", "PHONE_NUMBER") and token_id != -1:
            pair = (doc, token_id)

            if pair not in pairs:
                processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
                pairs.append(pair)

In [22]:
df = pd.DataFrame(processed)
df.head(100)

Unnamed: 0,document,token,label,token_str
0,1111111001,1,NAME,Мишин
1,1111111001,65,STREET_ADDRESS,1
2,1111111001,67,STREET_ADDRESS,160975
3,1111111001,95,PHONE_NUM,84892411578
4,1111111001,120,URL_PERSONAL,https://instagram.com/ysullivan
...,...,...,...,...
95,1111111009,157,CARD_NUMBER,5698169340608
96,1111111009,172,NAME,Казимир
97,1111111009,173,NAME,Логинов
98,1111111010,0,NAME,Гедеон
