In [1]:
from collections import defaultdict
import os
import torch.nn as nn
from tqdm import tqdm
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import evaluate
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
import torch
import wandb
from functools import partial
from itertools import chain
import argparse
import json
TRAINING_MAX_LENGTH = 1024  # I use 1280 locally
OUTPUT_DIR = 'output'  # your output path
TRAINING_MODEL_PATH = "/home/xuanming/LLM/debertav3-large"
TRAINING_MODEL_PATH = "/home/xuanming/kaggle/PII/output/cv958-checkpoint-5100/"
data = json.load(open("./train.json"))

print(len(data))
print(data[0].keys())

6807
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])


In [2]:
wandb.init(
    project="PII",
    name="aug_data",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mlullabies[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
wandb.run.id

'v3jj4ur2'

In [2]:
from collections import OrderedDict
from copy import deepcopy
from transformers.trainer_callback import (
    CallbackHandler,
    DefaultFlowCallback,
    PrinterCallback,
    ProgressCallback,
    TrainerCallback,
    TrainerControl,
    TrainerState,
)


class EMA:
    def __init__(self, model, decay=0.9):
        self.module = deepcopy(model)
        self.module.eval()
        self.decay = decay
        self.device = device  # perform ema on different device from model if set
        if self.device is not None:
            self.module.to(device=device)

    def _update(self, model, update_fn):
        with torch.no_grad():
            for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()):
                if self.device is not None:
                    model_v = model_v.to(device=self.device)
                ema_v.copy_(update_fn(ema_v, model_v))

    def update(self, model):
        self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m)

    def set(self, model):
        self._update(model, update_fn=lambda e, m: m)


class EMACallback(TrainerCallback):
    def __init__(self, trainer, decay=0.99, use_ema_weights=True) -> None:
        super().__init__()
        self._trainer = trainer
        self.decay = decay
        self.use_ema_weights = use_ema_weights
        self.ema = None

    def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Event called at the end of the initialization of the [`Trainer`].
        """
        self.ema = EMA(self._trainer.model, decay=self.decay, device=None)
        return control

    def store(self, parameters):
        "Save the current parameters for restoring later."
        self.collected_params = [param.clone() for param in parameters]

    def restore(self, parameters):
        """
        Restore the parameters stored with the `store` method.
        Useful to validate the model with EMA parameters without affecting the
        original optimization process.
        """
        for c_param, param in zip(self.collected_params, parameters):
            param.data.copy_(c_param.data)

    def copy_to(self, shadow_parameters, parameters):
        "Copy current parameters into given collection of parameters."
        for s_param, param in zip(shadow_parameters, parameters):
            if param.requires_grad:
                param.data.copy_(s_param.data)

    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Event called at the end of a training step. If using gradient accumulation, one training step might take
        several inputs.
        """
        self.ema.update(self._trainer.model)
        self.store(self._trainer.model.parameters())
        self.copy_to(self.ema.module.parameters(),
                     self._trainer.model.parameters())
        return control

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Event called after an evaluation phase.
        """
        self.restore(self._trainer.model.parameters())
        return control

    def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Event called at the end of training.
        """
        if self.use_ema_weights:
            self.copy_to(self.ema.module.parameters(),
                         self._trainer.model.parameters())
            # msg = "Model weights replaced with the EMA version."
            # log_main_process(_logger, logging.INFO, msg)
        return control

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """
        Event called after a checkpoint save.
        """
        checkpoint_folder = f"ema-checkpoint-{self._trainer.state.global_step}"
        run_dir = self.args.output_dir
        output_dir = os.path.join(run_dir, checkpoint_folder)
        self.copy_to(self.ema.module.parameters(),
                     self._trainer.model.parameters())
        self._trainer.save_model(output_dir, _internal_call=True)
        self.restore(self._trainer.model.parameters())
        return control

In [5]:
class AWP:
    def __init__(self, model, adv_param="weight", adv_lr=0.1, adv_eps=1e-4):
        self.model = model
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.backup = {}
        self.backup_eps = {}

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # 在损失函数之前获得梯度
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}


class CustomTrainer(Trainer):
    def __init__(self,
                 model=None,
                 args=None,
                 data_collator=None,
                 train_dataset=None,
                 eval_dataset=None,
                 tokenizer=None,
                 model_init=None,
                 compute_metrics=None,
                 callbacks=None,
                 optimizers=(None, None),
                 preprocess_logits_for_metrics=None,
                 awp_lr=0.1,
                 awp_eps=1e-4,
                 awp_start_epoch=0.5):

        super().__init__(model=model,
                         args=args,
                         data_collator=data_collator,
                         train_dataset=train_dataset,
                         eval_dataset=eval_dataset,
                         tokenizer=tokenizer,
                         model_init=model_init,
                         compute_metrics=compute_metrics,
                         callbacks=callbacks,
                         optimizers=optimizers,
                         preprocess_logits_for_metrics=preprocess_logits_for_metrics)

        self.awp_lr = awp_lr
        self.awp_eps = awp_eps
        self.awp_start_epoch = awp_start_epoch

    def compute_loss(self, model, inputs, return_outputs=False):
        sample_weights = inputs.pop("sample_weights")
        wids = inputs.pop("wids")
        overflow_to_sample_mapping = inputs.pop("overflow_to_sample_mapping")
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(
            [50.0] * 12 + [1.0], device=model.device), reduction="none")
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels), labels.view(-1))
        loss *= sample_weights.view(-1)

        return (loss.mean(), outputs) if return_outputs else loss.mean()

    def training_step(self, model, inputs):
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument `labels`. Check your model's documentation for all accepted arguments.

        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        o_inputs = inputs.copy()
        # inputs = self._prepare_inputs(inputs)
        inputs = self._prepare_inputs(inputs)
      #  print('---'*60)
      #  print(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)

        ########################
        # AWP
        if self.awp_lr != 0 and self.state.epoch >= self.awp_start_epoch:
           # print(inputs)
           # print('Start amp')
            self.awp = AWP(model, adv_lr=self.awp_lr, adv_eps=self.awp_eps)
            self.awp._save()
            self.awp._attack_step()
            with self.compute_loss_context_manager():
                awp_loss = self.compute_loss(self.awp.model, o_inputs)

            if self.args.n_gpu > 1:
                awp_loss = awp_loss.mean()  # mean() to average on multi-gpu parallel training

            if self.use_apex:
                with amp.scale_loss(awp_loss, self.optimizer) as awp_scaled_loss:
                    awp_scaled_loss.backward()
            else:
                self.accelerator.backward(awp_loss)
            self.awp._restore()
        ########################

        return loss.detach() / self.args.gradient_accumulation_steps

In [7]:
# extra_labels = set(['B-NAME_OTHER','I-NAME_OTHER'])
extra_labels = set()
all_labels = sorted(
    list(set(chain(*[x["labels"] for x in data])) | extra_labels))
label2id = {l: i for i, l in enumerate(all_labels)}
id2label = {v: k for k, v in label2id.items()}

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [8]:
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM',
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM',
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

In [9]:
import joblib
fake_b_student_name = joblib.load('fake_b_student_name.pkl')
fake_i_student_name = joblib.load('fake_i_student_name.pkl')

In [10]:
# This function is a simple map between text_split and entities
# We have verified that we have a 1:1 mapping above
# See above: (df_texts['text_split'].str.len() == df_texts['entities'].str.len()).all() == True
def get_labels(word_ids, word_labels, doc_idx):
    label_ids = []
    sample_weights = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
            sample_weights.append(1)
        else:
            label_ids.append(label2id[word_labels[word_idx]])
            if (doc_idx, word_idx) in fake_b_student_name:
                sample_weights.append(50)
            elif (doc_idx, word_idx) in fake_i_student_name:
                sample_weights.append(50)
            else:
                sample_weights.append(1)

    return label_ids, sample_weights

# Tokenize texts, possibly generating more than one tokenized sample for each text


def tokenize(df, to_tensor=True, with_labels=True):

    # This is what's different from a longformer
    # Read the parameters with attention
    encoded = tokenizer(df['tokens'].tolist(),
                        is_split_into_words=True,
                        return_overflowing_tokens=True,
                        stride=128,
                        max_length=1024,
                        padding="max_length",
                        truncation=True)

    if with_labels:
        encoded['labels'] = []

    encoded['wids'] = []
    encoded['sample_weights'] = []
    n = len(encoded['overflow_to_sample_mapping'])
    for i in range(n):

        # Map back to original row
        text_idx = encoded['overflow_to_sample_mapping'][i]

        # Get word indexes (this is a global index that takes into consideration the chunking :D )
        word_ids = encoded.word_ids(i)

        if with_labels:
            # Get word labels of the full un-chunked text
            word_labels = df['labels'].iloc[text_idx]
            # Get the labels associated with the word indexes
            label_ids, sample_weights = get_labels(
                word_ids, word_labels, df['document'].iloc[text_idx])
            encoded['labels'].append(label_ids)
            encoded['sample_weights'].append(sample_weights)
        encoded['wids'].append([w if w is not None else -1 for w in word_ids])

    if to_tensor:
        encoded = {key: torch.as_tensor(val) for key, val in encoded.items()}
    return encoded

In [11]:
train_df = pd.read_json('./train_split.json')
valid_df = pd.read_json('./test_split.json')

In [11]:
ai_data = pd.read_csv(
    './moredata_dataset_fixed.csv').rename(columns={"text": "full_text"})
ai_data['tokens'] = ai_data['tokens'].apply(lambda x: eval(x))
ai_data['labels'] = ai_data['labels'].apply(lambda x: eval(x))

In [12]:
aug_data1 = pd.read_json(
    './generated_data/lzc_more_data_merged_augmented.json')

In [14]:
aug_data2 = pd.read_json(
    './generated_data/lzc_persuade_2.0_based_augmented.json')

In [21]:
extra_df = pd.read_json('./persuade_train_v0.json')

In [22]:
mixtral = pd.read_json('./mixtral-8x7b-v1.json')

In [23]:
train_df = pd.concat([train_df, extra_df, aug_data1,
                     aug_data2, mixtral, ai_data]).reset_index(drop=True)

In [24]:
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [33]:
train_df.query()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,22462,Mind Mapping - Reflection\n\nChallenge & Selec...,"[Mind, Mapping, -, Reflection, \n\n, Challenge...","[True, True, True, False, False, True, True, F...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,15020,Reflection – Mind Mapping Tool\n\nChallenge & ...,"[Reflection, –, Mind, Mapping, Tool, \n\n, Cha...","[True, True, True, True, False, False, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,16659,Design Thinking for Innovation: Visualization ...,"[Design, Thinking, for, Innovation, :, Visuali...","[True, True, True, False, True, True, False, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,13767,Example reflection – Learning Launch\n\nChalle...,"[Example, reflection, –, Learning, Launch, \n\...","[True, True, True, True, False, False, False, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,19430,Challenge & Selection:\n\nA visualization is a...,"[Challenge, &, Selection, :, \n\n, A, visualiz...","[True, True, False, False, False, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...
4756,10508,Reflection – Mind Mapping\n\n1. Challenge\n\nW...,"[Reflection, –, Mind, Mapping, \n\n, 1, ., Cha...","[True, True, True, False, False, False, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4757,20516,Reflection: Learning Launch\n\nChallenge\n\nI ...,"[Reflection, :, Learning, Launch, \n\n, Challe...","[False, True, True, False, False, False, False...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4758,16234,Reflection\n\nChallenge I’m working as ...,"[Reflection, \n\n, Challenge, , I, ’m, , ...","[False, False, True, False, False, True, False...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4759,15598,Challenge\n\nThe problem I present is about a ...,"[Challenge, \n\n, The, problem, I, present, is...","[False, False, True, True, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [12]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
tokenized_train = tokenize(train_df)
tokenized_valid = tokenize(valid_df)

In [13]:
class PIIDataset(Dataset):
    def __init__(self, tokenized_ds):
        self.data = tokenized_ds

    def __getitem__(self, index):
        item = {k: self.data[k][index] for k in self.data.keys()}
        return item

    def __len__(self):
        return len(self.data['input_ids'])

In [14]:
train_dataset = PIIDataset(tokenized_train)
valid_dataset = PIIDataset(tokenized_valid)

In [15]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score


def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)

    results = {
        'recall': recall,
        'precision': precision,
        'f5': f1_score
    }
    return results

In [16]:
# load training data and create reference dataframe ---
df = valid_df[['document', 'tokens', 'labels']].copy()
df = df.explode(['tokens', 'labels']).reset_index(drop=True).rename(
    columns={'tokens': 'token', 'labels': 'label'})
df['token_str'] = df['token']
df['token'] = df.groupby('document').cumcount()

label_list = df['label'].unique().tolist()

reference_df = df[df['label'] != 'O'].copy()
reference_df = reference_df.reset_index().rename(columns={'index': 'row_id'})
reference_df = reference_df[['row_id', 'document',
                             'token', 'label', 'token_str']].copy()
reference_df

Unnamed: 0,row_id,document,token,label,token_str
0,9,6435,9,B-NAME_STUDENT,Jose
1,10,6435,10,I-NAME_STUDENT,Martinez
2,3516,9854,0,B-NAME_STUDENT,Waseem
3,3517,9854,1,I-NAME_STUDENT,Mabunda
4,3519,9854,3,B-STREET_ADDRESS,591
...,...,...,...,...,...
883,1484958,5606,791,B-ID_NUM,143860010348
884,1484962,5606,795,B-ID_NUM,Kh:217952887271
885,1485225,13317,12,B-NAME_STUDENT,Oscar
886,1488283,6393,3,B-NAME_STUDENT,Rania


In [17]:
from collections import defaultdict
from typing import Dict


class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


def compute_lb_metrics(pred_df, gt_df):
    """
    Compute the LB metric (lb) and other auxiliary metrics
    """

    references = {(row.document, row.token, row.label)
                  for row in gt_df.itertuples()}
    predictions = {(row.document, row.token, row.label)
                   for row in pred_df.itertuples()}

    score_per_type = defaultdict(PRFScore)
    references = set(references)

    for ex in predictions:
        pred_type = ex[-1]  # (document, token, label)
        if pred_type != 'O':
            pred_type = pred_type[2:]  # avoid B- and I- prefix

        if pred_type not in score_per_type:
            score_per_type[pred_type] = PRFScore()

        if ex in references:
            score_per_type[pred_type].tp += 1
            references.remove(ex)
        else:
            score_per_type[pred_type].fp += 1

    for doc, tok, ref_type in references:
        if ref_type != 'O':
            ref_type = ref_type[2:]  # avoid B- and I- prefix

        if ref_type not in score_per_type:
            score_per_type[ref_type] = PRFScore()
        score_per_type[ref_type].fn += 1

    totals = PRFScore()

    for prf in score_per_type.values():
        totals += prf

    results = {
        "precision": totals.precision,
        "recall": totals.recall,
        "f5": totals.f5
    }
    # for k, v in score_per_type.items():
    #     v = v.to_dict()
    #     if k != 'O':
    #         results[f"precision_{k}"] = v["p"]
    #         results[f"recall_{k}"] = v["r"]
    #         results[f"f5_{k}"] = v["f5"]

    return results


def compute_metricsV2(p, valid_df, reference_df, valid_dataset, id2label):
    token_pred = defaultdict(lambda: defaultdict(int))
    token_cnt = defaultdict(lambda: defaultdict(int))

    preds, labels = p
    assert preds.shape[0] == len(valid_dataset)
    preds_softmax = np.exp(preds) / np.sum(np.exp(preds),
                                           axis=2).reshape(preds.shape[0], preds.shape[1], 1)

    for preds, batch in zip(preds_softmax, valid_dataset):
        word_ids = batch['wids'].numpy()
        text_id = batch['overflow_to_sample_mapping'].item()
        for idx, word_idx in enumerate(word_ids):
            if word_idx != -1:
                token_pred[text_id][word_idx] += preds[idx]
                token_cnt[text_id][word_idx] += 1
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            token_pred[text_id][word_idx] /= token_cnt[text_id][word_idx]

    document, token, label, token_str = [], [], [], []
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            pred = token_pred[text_id][word_idx].argmax(-1)
            if id2label[pred] != 'O':
                document.append(valid_df.loc[text_id, "document"])
                token.append(word_idx)
                label.append(id2label[pred])
                token_str.append(valid_df.loc[text_id, "tokens"][word_idx])
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

    results = compute_lb_metrics(df, reference_df)

    return results

In [18]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [19]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    gradient_accumulation_steps=8,
    logging_steps=50,
    warmup_ratio=0.05,
    learning_rate=1e-5,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    lr_scheduler_type='cosine',
    metric_for_best_model="f5",
    greater_is_better=True,
    weight_decay=0.01,
    save_only_model=True,
    remove_unused_columns=False
)
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metricsV2, valid_df=valid_df,
                            reference_df=reference_df, valid_dataset=valid_dataset, id2label=id2label),
)

In [20]:
trainer.train()
wandb.finish()

Step,Training Loss,Validation Loss,Precision,Recall,F5
50,0.0081,0.026503,0.803522,0.976351,0.968341
100,0.0036,0.021182,0.913143,0.899775,0.900282
150,0.008,0.029087,0.915691,0.880631,0.881929
200,0.0069,0.023545,0.814259,0.977477,0.969999
250,0.0072,0.022038,0.821023,0.976351,0.969298
300,0.0015,0.019784,0.854271,0.957207,0.952792
350,0.0042,0.025781,0.881393,0.912162,0.910939
400,0.0041,0.043266,0.85336,0.943694,0.939867
450,0.0415,0.0305,0.802871,0.94482,0.938438
500,0.0137,0.01706,0.931981,0.879505,0.881413


KeyboardInterrupt: 

2

In [3]:
import joblib

In [4]:
token_pred = joblib.load('./token_pred.pkl')

EOFError: 

In [13]:
from collections import defaultdict
from typing import Dict


class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


def compute_metrics(pred_df, gt_df):
    """
    Compute the LB metric (lb) and other auxiliary metrics
    """

    references = {(row.document, row.token, row.label)
                  for row in gt_df.itertuples()}
    predictions = {(row.document, row.token, row.label)
                   for row in pred_df.itertuples()}

    score_per_type = defaultdict(PRFScore)
    references = set(references)

    for ex in predictions:
        pred_type = ex[-1]  # (document, token, label)
        if pred_type != 'O':
            pred_type = pred_type[2:]  # avoid B- and I- prefix

        if pred_type not in score_per_type:
            score_per_type[pred_type] = PRFScore()

        if ex in references:
            score_per_type[pred_type].tp += 1
            references.remove(ex)
        else:
            score_per_type[pred_type].fp += 1

    for doc, tok, ref_type in references:
        if ref_type != 'O':
            ref_type = ref_type[2:]  # avoid B- and I- prefix

        if ref_type not in score_per_type:
            score_per_type[ref_type] = PRFScore()
        score_per_type[ref_type].fn += 1

    totals = PRFScore()

    for prf in score_per_type.values():
        totals += prf

    return {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
        "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items() if k != 'O'},
    }

In [14]:
valid_df

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,6435,WEEK 5: FINAL ASSIGNMENT\n\nLEARNING LAUNCHING...,"[WEEK, 5, :, FINAL, ASSIGNMENT, \n\n, LEARNING...","[True, False, True, True, False, False, True, ...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,18447,Visualization\n\nChallenge\n\nAs Payment syste...,"[Visualization, \n\n, Challenge, \n\n, As, Pay...","[False, False, False, False, True, True, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,20872,Storytelling\n\nChallenge and Selection Sto...,"[Storytelling, \n\n, Challenge, and, Selection...","[False, False, True, True, True, False, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,13277,Information system for urban planning document...,"[Information, system, for, urban, planning, do...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,9854,Waseem Mabunda 591 Smith Centers Apt. 656\nJo...,"[Waseem, Mabunda, , 591, Smith, Centers, Apt,...","[True, True, False, True, True, True, False, T...","[B-NAME_STUDENT, I-NAME_STUDENT, O, B-STREET_A..."
...,...,...,...,...,...
2041,14125,Example Reflection – Visualization\n\nChalleng...,"[Example, Reflection, –, Visualization, \n\n, ...","[True, True, True, False, False, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2042,22235,ENVISION\n\nChallenge & Selection\n\nThe first...,"[ENVISION, \n\n, Challenge, &, Selection, \n\n...","[False, False, True, True, False, False, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2043,14865,Reﬂection- Storytelling as a Tool\n\nChallenge...,"[Reﬂection-, Storytelling, as, a, Tool, \n\n, ...","[True, True, True, True, False, False, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2044,19993,Challenge & Selection\n\nThe first tool I used...,"[Challenge, &, Selection, \n\n, The, first, to...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [18]:
# This function is a simple map between text_split and entities
# We have verified that we have a 1:1 mapping above
# See above: (df_texts['text_split'].str.len() == df_texts['entities'].str.len()).all() == True
def get_labels(word_ids, word_labels):
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        else:
            label_ids.append(label2id[word_labels[word_idx]])
    return label_ids

# Tokenize texts, possibly generating more than one tokenized sample for each text


def tokenize(df, to_tensor=True, with_labels=True):

    # This is what's different from a longformer
    # Read the parameters with attention
    encoded = tokenizer(df['tokens'].tolist(),
                        is_split_into_words=True,
                        return_overflowing_tokens=True,
                        stride=128,
                        max_length=1024,
                        padding="max_length",
                        truncation=True)

    if with_labels:
        encoded['labels'] = []

    encoded['wids'] = []
    n = len(encoded['overflow_to_sample_mapping'])
    for i in range(n):

        # Map back to original row
        text_idx = encoded['overflow_to_sample_mapping'][i]

        # Get word indexes (this is a global index that takes into consideration the chunking :D )
        word_ids = encoded.word_ids(i)

        if with_labels:
            # Get word labels of the full un-chunked text
            word_labels = df['labels'].iloc[text_idx]

            # Get the labels associated with the word indexes
            label_ids = get_labels(word_ids, word_labels)
            encoded['labels'].append(label_ids)
        encoded['wids'].append([w if w is not None else -1 for w in word_ids])

    if to_tensor:
        encoded = {key: torch.as_tensor(val) for key, val in encoded.items()}
    return encoded

In [19]:
# load training data and create reference dataframe ---
df = valid_df[['document', 'tokens', 'labels']].copy()
df = df.explode(['tokens', 'labels']).reset_index(drop=True).rename(
    columns={'tokens': 'token', 'labels': 'label'})
df['token_str'] = df['token']
df['token'] = df.groupby('document').cumcount()

label_list = df['label'].unique().tolist()

reference_df = df[df['label'] != 'O'].copy()
reference_df = reference_df.reset_index().rename(columns={'index': 'row_id'})
reference_df = reference_df[['row_id', 'document',
                             'token', 'label', 'token_str']].copy()
reference_df

Unnamed: 0,row_id,document,token,label,token_str
0,9,6435,9,B-NAME_STUDENT,Jose
1,10,6435,10,I-NAME_STUDENT,Martinez
2,3516,9854,0,B-NAME_STUDENT,Waseem
3,3517,9854,1,I-NAME_STUDENT,Mabunda
4,3519,9854,3,B-STREET_ADDRESS,591
...,...,...,...,...,...
883,1484958,5606,791,B-ID_NUM,143860010348
884,1484962,5606,795,B-ID_NUM,Kh:217952887271
885,1485225,13317,12,B-NAME_STUDENT,Oscar
886,1488283,6393,3,B-NAME_STUDENT,Rania


In [20]:
tokenized_train = tokenize(train_df)
tokenized_valid = tokenize(valid_df)

In [21]:
test_dataset = PIIDataset(tokenized_valid)
test_dataloader = DataLoader(test_dataset, batch_size=1)

In [22]:
# for idx, id in zip(tokenized_valid['wids'][0],tokenized_valid['input_ids'][0]):
#     print(idx, id)
#     print(tokenizer.convert_ids_to_tokens([id]))

In [23]:
def inference(df, dl):

    # These 2 dictionaries will hold text-level data
    # Helping in the merging process by accumulating data
    # Through all the chunks

    seen_words_idx = defaultdict(set)

    document, token, label, token_str = [], [], [], []

    for batch in tqdm(dl):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        preds = model(ids, attention_mask=mask, return_dict=False)[
            0].cpu().detach().numpy()
        pred_softmax = np.exp(preds) / np.sum(np.exp(preds),
                                              axis=2).reshape(preds.shape[0], preds.shape[1], 1)
        preds = preds.argmax(-1)
        preds_without_O = pred_softmax[:, :, :12].argmax(-1)
        O_preds = pred_softmax[:, :, 12]
        threshold = 0.9
        preds_final = np.where(O_preds < threshold, preds_without_O, preds)

        del ids, mask

        # Go over each prediction, getting the text_id reference

        for k, (chunk_preds, text_id) in enumerate(zip(preds_final, batch['overflow_to_sample_mapping'].tolist())):
            # The word_ids are absolute references in the original text
            word_ids = batch['wids'][k].numpy()

            # Map from ids to labels
            chunk_preds = [id2label[i] for i in chunk_preds]

            for idx, word_idx in enumerate(word_ids):
                if word_idx != -1 and chunk_preds[idx] != 'O' and word_idx not in seen_words_idx[text_id]:
                    document.append(df.loc[text_id, "document"])
                    token.append(word_idx)
                    token_str.append(df.loc[text_id, "tokens"][word_idx])
                    label.append(chunk_preds[idx])
                    seen_words_idx[text_id].add(word_idx)
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })
    df["row_id"] = list(range(len(df)))

    return df

In [24]:
def inferenceV2(df, dl, thres=0.9):

    # These 2 dictionaries will hold text-level data
    # Helping in the merging process by accumulating data
    # Through all the chunks

    token_pred = defaultdict(lambda: defaultdict(int))

    document, token, label, token_str = [], [], [], []

    for batch in tqdm(dl):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        preds = model(ids, attention_mask=mask, return_dict=False)[
            0].cpu().detach().numpy()
        preds_softmax = np.exp(preds) / np.sum(np.exp(preds),
                                               axis=2).reshape(preds.shape[0], preds.shape[1], 1)

        del ids, mask

        # Go over each prediction, getting the text_id reference

        for k, (chunk_preds, text_id) in enumerate(zip(preds_softmax, batch['overflow_to_sample_mapping'].tolist())):
            # The word_ids are absolute references in the original text
            word_ids = batch['wids'][k].numpy()

            for idx, word_idx in enumerate(word_ids):
                if word_idx != -1:
                    token_pred[text_id][word_idx] += chunk_preds[idx]

    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            pred = token_pred[text_id][word_idx].argmax(-1)
            pred_without_O = token_pred[text_id][word_idx][:12].argmax(-1)
            if token_pred[text_id][word_idx][12] < thres:
                final_pred = pred_without_O
            else:
                final_pred = pred
            if id2label[final_pred] != 'O':
                document.append(df.loc[text_id, "document"])
                token.append(word_idx)
                label.append(id2label[final_pred])
                token_str.append(df.loc[text_id, "tokens"][word_idx])
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })
    df["row_id"] = list(range(len(df)))

    return df

In [25]:
def inferenceV3(df, dl, thres=0.9):

    # These 2 dictionaries will hold text-level data
    # Helping in the merging process by accumulating data
    # Through all the chunks

    token_pred = defaultdict(lambda: defaultdict(int))
    seen_words_idx = defaultdict(set)

    for batch in tqdm(dl):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        preds = model(ids, attention_mask=mask, return_dict=False)[
            0].cpu().detach().numpy()
        preds_softmax = np.exp(preds) / np.sum(np.exp(preds),
                                               axis=2).reshape(preds.shape[0], preds.shape[1], 1)

        del ids, mask

        # Go over each prediction, getting the text_id reference

        for k, (chunk_preds, text_id) in enumerate(zip(preds_softmax, batch['overflow_to_sample_mapping'].tolist())):
            # The word_ids are absolute references in the original text
            word_ids = batch['wids'][k].numpy()

            for idx, word_idx in enumerate(word_ids):
                if word_idx != -1 and word_idx not in seen_words_idx[text_id]:
                    token_pred[text_id][word_idx] += chunk_preds[idx]
                    seen_words_idx[text_id].add(word_idx)
    return token_pred


model = model.cuda()

In [26]:
def inferenceV4(df, dl, thres=0.9):

    # These 2 dictionaries will hold text-level data
    # Helping in the merging process by accumulating data
    # Through all the chunks

    token_pred = defaultdict(lambda: defaultdict(int))
    token_cnt = defaultdict(lambda: defaultdict(int))

    for batch in tqdm(dl):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        preds = model(ids, attention_mask=mask, return_dict=False)[
            0].cpu().detach().numpy()
        preds_softmax = np.exp(preds) / np.sum(np.exp(preds),
                                               axis=2).reshape(preds.shape[0], preds.shape[1], 1)

        del ids, mask

        # Go over each prediction, getting the text_id reference

        for k, (chunk_preds, text_id) in enumerate(zip(preds_softmax, batch['overflow_to_sample_mapping'].tolist())):
            # The word_ids are absolute references in the original text
            word_ids = batch['wids'][k].numpy()

            for idx, word_idx in enumerate(word_ids):
                if word_idx != -1:
                    token_pred[text_id][word_idx] += chunk_preds[idx]
                    token_cnt[text_id][word_idx] += 1
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            token_pred[text_id][word_idx] /= token_cnt[text_id][word_idx]
    return token_pred


model = model.cuda()

In [27]:
def inferenceV5(df, dl, thres=0.9):

    # These 2 dictionaries will hold text-level data
    # Helping in the merging process by accumulating data
    # Through all the chunks

    token_pred = defaultdict(lambda: defaultdict(int))
    token_cnt = defaultdict(lambda: defaultdict(int))

    for batch in tqdm(dl):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        preds = model(ids, attention_mask=mask, return_dict=False)[
            0].cpu().detach().numpy()
        preds_softmax = np.exp(preds) / np.sum(np.exp(preds),
                                               axis=2).reshape(preds.shape[0], preds.shape[1], 1)

        del ids, mask

        # Go over each prediction, getting the text_id reference
        seen_words_idx = set()
        for k, (chunk_preds, text_id) in enumerate(zip(preds_softmax, batch['overflow_to_sample_mapping'].tolist())):
            # The word_ids are absolute references in the original text
            word_ids = batch['wids'][k].numpy()

            for idx, word_idx in enumerate(word_ids):
                if word_idx != -1 and word_idx not in seen_words_idx:
                    token_pred[text_id][word_idx] += chunk_preds[idx]
                    seen_words_idx.add(word_idx)
                    token_cnt[text_id][word_idx] += 1

    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            token_pred[text_id][word_idx] /= token_cnt[text_id][word_idx]
    return token_pred

In [69]:
pred_df2 = inferenceV2(valid_df, test_dataloader, 0)

100%|██████████| 2051/2051 [08:19<00:00,  4.11it/s]


In [26]:
pred_df = inference(valid_df, test_dataloader)

100%|██████████| 2787/2787 [01:28<00:00, 31.51it/s]


In [178]:
token_pred = inferenceV3(valid_df, test_dataloader)

100%|██████████| 2787/2787 [01:48<00:00, 25.59it/s]


In [28]:
token_pred4 = inferenceV4(valid_df, test_dataloader)

100%|██████████| 2290/2290 [02:44<00:00, 13.93it/s]


In [81]:
token_pred5 = inferenceV5(valid_df, test_dataloader)

100%|██████████| 2290/2290 [02:44<00:00, 13.93it/s]


In [29]:
def evaluate(token_pred, df, thres):
    document, token, label, token_str = [], [], [], []
    for text_id in token_pred:
        for word_idx in token_pred[text_id]:
            pred = token_pred[text_id][word_idx].argmax(-1)
            pred_without_O = token_pred[text_id][word_idx][:12].argmax(-1)
            if token_pred[text_id][word_idx][12] < thres:
                final_pred = pred_without_O
            else:
                final_pred = pred
            if id2label[final_pred] != 'O':
                document.append(df.loc[text_id, "document"])
                token.append(word_idx)
                label.append(id2label[final_pred])
                token_str.append(df.loc[text_id, "tokens"][word_idx])
    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })
    df["row_id"] = list(range(len(df)))

    eval_dict = compute_metrics(df, reference_df)
    m = eval_dict['ents_f5']
    print(f"LB = {round(m, 3)}")
    print(json.dumps(eval_dict['ents_per_type'], indent=4))
    return df

In [30]:
valid_df

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,6435,WEEK 5: FINAL ASSIGNMENT\n\nLEARNING LAUNCHING...,"[WEEK, 5, :, FINAL, ASSIGNMENT, \n\n, LEARNING...","[True, False, True, True, False, False, True, ...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,18447,Visualization\n\nChallenge\n\nAs Payment syste...,"[Visualization, \n\n, Challenge, \n\n, As, Pay...","[False, False, False, False, True, True, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,20872,Storytelling\n\nChallenge and Selection Sto...,"[Storytelling, \n\n, Challenge, and, Selection...","[False, False, True, True, True, False, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,13277,Information system for urban planning document...,"[Information, system, for, urban, planning, do...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,9854,Waseem Mabunda 591 Smith Centers Apt. 656\nJo...,"[Waseem, Mabunda, , 591, Smith, Centers, Apt,...","[True, True, False, True, True, True, False, T...","[B-NAME_STUDENT, I-NAME_STUDENT, O, B-STREET_A..."
...,...,...,...,...,...
2041,14125,Example Reflection – Visualization\n\nChalleng...,"[Example, Reflection, –, Visualization, \n\n, ...","[True, True, True, False, False, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2042,22235,ENVISION\n\nChallenge & Selection\n\nThe first...,"[ENVISION, \n\n, Challenge, &, Selection, \n\n...","[False, False, True, True, False, False, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2043,14865,Reﬂection- Storytelling as a Tool\n\nChallenge...,"[Reﬂection-, Storytelling, as, a, Tool, \n\n, ...","[True, True, True, True, False, False, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2044,19993,Challenge & Selection\n\nThe first tool I used...,"[Challenge, &, Selection, \n\n, The, first, to...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [36]:
pred_df4 = evaluate(token_pred4, valid_df, 0.8)

LB = 0.965
{
    "NAME_STUDENT": {
        "p": 0.6775618374558304,
        "r": 0.9845956354300385,
        "f5": 0.9677294123356139
    },
    "URL_PERSONAL": {
        "p": 0.7407407407407407,
        "r": 0.975609756097561,
        "f5": 0.963855421686747
    },
    "STREET_ADDRESS": {
        "p": 0.5263157894736842,
        "r": 0.9090909090909091,
        "f5": 0.8843537414965987
    },
    "ID_NUM": {
        "p": 0.8275862068965517,
        "r": 0.8888888888888888,
        "f5": 0.8863636363636362
    },
    "EMAIL": {
        "p": 0.9333333333333333,
        "r": 1.0,
        "f5": 0.9972602739726028
    },
    "USERNAME": {
        "p": 0.6,
        "r": 1.0,
        "f5": 0.975
    },
    "PHONE_NUM": {
        "p": 1.0,
        "r": 1.0,
        "f5": 1.0
    }
}


In [88]:
pred_df = evaluate(token_pred4, valid_df, 0.8)

LB = 0.971
{
    "NAME_STUDENT": {
        "p": 0.6710069444444444,
        "r": 0.9922978177150192,
        "f5": 0.974354002036166
    },
    "URL_PERSONAL": {
        "p": 0.6153846153846154,
        "r": 0.975609756097561,
        "f5": 0.9541284403669725
    },
    "EMAIL": {
        "p": 0.7,
        "r": 1.0,
        "f5": 0.9837837837837837
    },
    "ID_NUM": {
        "p": 0.5909090909090909,
        "r": 0.9629629629629629,
        "f5": 0.9401947148817803
    },
    "STREET_ADDRESS": {
        "p": 1.0,
        "r": 0.9090909090909091,
        "f5": 0.9122807017543859
    },
    "PHONE_NUM": {
        "p": 0.6190476190476191,
        "r": 1.0,
        "f5": 0.9768786127167631
    },
    "USERNAME": {
        "p": 0.6666666666666666,
        "r": 0.6666666666666666,
        "f5": 0.6666666666666666
    }
}


In [31]:
new_pred_df = pd.merge(df[['document', 'token', 'token_str']], pred_df[[
                       'document', 'token', 'label']], on=['document', 'token'], how='left')
new_pred_df['label'] = new_pred_df['label'].fillna('O')
new_pred_df = new_pred_df[['document', 'token', 'token_str', 'label']]

Unnamed: 0,document,token,token_str,label
0,6435,0,WEEK,O
1,6435,1,5,O
2,6435,2,:,O
3,6435,3,FINAL,O
4,6435,4,ASSIGNMENT,O
...,...,...,...,...
1497395,10311,1333,in,O
1497396,10311,1334,the,O
1497397,10311,1335,future,O
1497398,10311,1336,.,O


In [79]:
df.query("token_str == ' '")

Unnamed: 0,document,token,label,token_str
33,6435,33,O,
52,6435,52,O,
73,6435,73,O,
94,6435,94,O,
112,6435,112,O,
...,...,...,...,...
1497304,10311,1242,O,
1497323,10311,1261,O,
1497346,10311,1284,O,
1497363,10311,1301,O,


In [33]:
def get_pred(token_pred, doc, token, df):
    tmp = token_pred[df.query('document == @doc').index.values[0]][token]
    for n, s in zip(all_labels, tmp):
        print(n, round(s, 3))

In [37]:
result = pred_df4.merge(
    reference_df, on=['document', 'token'], how='outer', indicator=True)
unique_to_df1 = result[result['_merge'] == 'left_only']
unique_to_df2 = result[result['_merge'] == 'right_only']
both = result[result['_merge'] == 'both']

In [38]:
both.query('label_x != label_y')

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
548,19280,55,B-ID_NUM,30407059,548.0,637273.0,I-ID_NUM,30407059,both
970,3202,34,B-URL_PERSONAL,nYZqnhEXw,970.0,1151942.0,I-URL_PERSONAL,nYZqnhEXw,both
1084,4090,2,B-NAME_STUDENT,Aakash,1084.0,1302896.0,I-NAME_STUDENT,Aakash,both
1088,4090,768,B-NAME_STUDENT,Aakash,1088.0,1303662.0,I-NAME_STUDENT,Aakash,both


In [39]:
unique_to_df2

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
1267,9854,9,,,,3525.0,I-STREET_ADDRESS,\n,right_only
1268,9911,365,,,,90342.0,B-NAME_STUDENT,Cobus,right_only
1269,9911,366,,,,90343.0,I-NAME_STUDENT,Mpanza,right_only
1270,12512,787,,,,104552.0,B-NAME_STUDENT,Jordi,right_only
1271,12512,792,,,,104557.0,B-NAME_STUDENT,Jordi,right_only
1272,20984,6,,,,212105.0,B-ID_NUM,V69230,right_only
1273,8918,0,,,,335121.0,B-NAME_STUDENT,Chris,right_only
1274,19280,54,,,,637272.0,B-ID_NUM,Z.S.,right_only
1275,6117,62,,,,709974.0,B-NAME_STUDENT,Hari,right_only
1276,6117,63,,,,709975.0,I-NAME_STUDENT,Sharma,right_only


In [35]:
train_df_o = pd.read_json('./train_split.json')

In [62]:
document_list = train_df_o['document']

In [62]:
pred_df.query('document == 3202')

Unnamed: 0,document,token,label,token_str,row_id
1007,3202,23,B-NAME_STUDENT,Ahmed,1007
1008,3202,24,I-NAME_STUDENT,Salem,1008
1009,3202,31,B-URL_PERSONAL,tps://www.facebook.com/bclark,1009
1010,3202,33,B-URL_PERSONAL,https://www.youtube.com/channel/UC1ElAcppeuhfet,1010
1011,3202,34,B-ID_NUM,nYZqnhEXw,1011
1012,3202,36,B-NAME_STUDENT,Ahmed,1012
1013,3202,37,I-NAME_STUDENT,Salem,1013


In [66]:
df.query('document == 3202 and token == 34')

Unnamed: 0,document,token,label,token_str
1151942,3202,34,I-URL_PERSONAL,nYZqnhEXw


In [None]:
df.query('document == 3202 and token == 32')

In [44]:
df.query('document == 3202 and token == 32')

Unnamed: 0,document,token,label,token_str
1151940,3202,32,O,\n\n


In [41]:
both.query('label_x != label_y and document in @document_list')

Unnamed: 0,document,token,label,token_str
0,6435,0,O,WEEK
1,6435,1,O,5
2,6435,2,O,:
3,6435,3,O,FINAL
4,6435,4,O,ASSIGNMENT
...,...,...,...,...
1497395,10311,1333,O,in
1497396,10311,1334,O,the
1497397,10311,1335,O,future
1497398,10311,1336,O,.


In [71]:
unique_to_df2.query('document in @document_list')

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
105503,11442,156,,,,6547866.0,I-STREET_ADDRESS,\n,right_only


In [278]:
get_pred(token_pred, 9911, 366, valid_df)

B-EMAIL 0.0
B-ID_NUM 0.001
B-NAME_STUDENT 0.001
B-PHONE_NUM 0.0
B-STREET_ADDRESS 0.0
B-URL_PERSONAL 0.002
B-USERNAME 0.0
I-ID_NUM 0.0
I-NAME_STUDENT 0.009
I-PHONE_NUM 0.0
I-STREET_ADDRESS 0.001
I-URL_PERSONAL 0.0
O 0.986


In [64]:
pred_df.query("document == 11900")

Unnamed: 0,document,token,label,token_str,row_id
24838,11900,0,B-NAME_STUDENT,Lisa,24838
24839,11900,1,I-NAME_STUDENT,Lee,24839
24840,11900,3,B-ID_NUM,723847538279,24840
24841,11900,22,B-NAME_STUDENT,John,24841
24842,11900,37,B-NAME_STUDENT,John,24842
24843,11900,90,B-NAME_STUDENT,John,24843
24844,11900,105,B-NAME_STUDENT,John,24844
24845,11900,129,B-NAME_STUDENT,John,24845
24846,11900,147,B-NAME_STUDENT,John,24846
24847,11900,239,B-NAME_STUDENT,Lisa,24847


In [65]:
reference_df.query("document == 11900")

Unnamed: 0,row_id,document,token,label,token_str
23949,1745426,11900,0,B-NAME_STUDENT,Lisa
23950,1745427,11900,1,I-NAME_STUDENT,Lee
23951,1745429,11900,3,B-ID_NUM,723847538279
23952,1745665,11900,239,B-NAME_STUDENT,Lisa
23953,1745667,11900,241,B-NAME_STUDENT,Benjamin
23954,1745669,11900,243,B-ID_NUM,534516353860
23955,1745756,11900,330,B-NAME_STUDENT,Bikram
23956,1745757,11900,331,I-NAME_STUDENT,Das
23957,1745759,11900,333,B-ID_NUM,871483046449
23958,1745795,11900,369,B-NAME_STUDENT,Maria


In [68]:
df.query("document == 11900 and token == 240")

Unnamed: 0,document,token,label,token_str
1745666,11900,240,O,\n\n


In [54]:
print(list(train_df.query("document == 'pj_440'")['full_text'])[0])

In today's modern world, where technology has become an integral part of our lives, it is not surprising to see students like Aaron Cervantes embracing the digital age. With his email address, clarkalicia@yahoo.com, Aaron is ready to connect with the world and explore new opportunities. His username, iharris, reflects his desire to leave a digital footprint and make his mark in the online community.

But Aaron is not just a face in the crowd. His unique ID number, BQXK76992179312948, sets him apart and allows him to be easily identified among his peers. Whether it's for academic purposes or administrative matters, this ID number ensures that Aaron's presence is acknowledged and accounted for.

When it comes to communication, Aaron is not limited to just one phone number. With 516-821-8326 and 359.301.7707x8667, he is always within reach. Whether it's a quick text or a lengthy conversation, Aaron's phone numbers ensure that he is accessible to his friends, family, and colleagues.

In th

In [169]:
print(list(df.query(
    "label != 'O' and label != 'B-NAME_STUDENT' and label != 'I-NAME_STUDENT'and document in @document_list")['token_str']))

['https://youtu.be/AsSLvOLkUYn', 'vmartinez@hotmail.com', '188408534931', 'http://www.jackson.com/appcategory.html', 'https://youtu.be/1pM3sb7AMPs', 'https://glenn.org/postsindex.jsp', 'https://youtu.be/Kb-hqNGr5lJ', '982645662261', '409046248321', 'boydcynthia@yahoo.com', 'holmespatrick', 'http://osborne.org/main/posts/tagprivacy.asp', 'http://thomas.biz/list/taghome.php', 'http://mcneil.org/list/taghomepage.htm', 'http://mcneil.org/list/taghomepage.htm', '762035863358', 'lisarose@gmail.com', 'diazkristen@gmail.com', 'https://www.facebook.com/amanda37', 'lisarose@gmail.com', 'diazkristen@gmail.com', 'https://www.facebook.com/amanda37', 'https://alvarado.com/categoriesindex.html', '172801513686', '172801513686', '208798413907', '347376430553', '943995368223', 'ras21', 'https://www.smith-flores.com/mainlogin.htm', 'http://www.jackson.com/categories/search/tagsmain.html', 'http://fisher.com/category/wp-contentregister.htm', '779875708882', '800306846075', '955487471144', 'https://www.lop

In [145]:
test = pp(new_pred_df.query("document == 11442").reset_index(drop=True))

157


In [146]:
test.query("document == 11442 and token == 156")

Unnamed: 0,document,token,token_str,label
156,11442,156,\n,I-STREET_ADDRESS


In [148]:
test.query("document == 11442 and token == 158")

Unnamed: 0,document,token,token_str,label
158,11442,158,",",I-STREET_ADDRESS


In [112]:
test.query("token == 156")

Unnamed: 0,document,token,token_str,label
156,11442,156,\n,I-STREET_ADDRESS


In [144]:
def pp(new_pred_df):
    df = new_pred_df.copy()
    i = 0
    new_df = pd.DataFrame()
    while i < len(df):
        st = i
        doc = df.loc[st, "document"]
        tok = df.loc[st, "token"]
        pred_tok = df.loc[st, "label"]
        if pred_tok == 'O':
            i += 1
            continue
        lab = pred_tok.split('-')[1]
        cur_doc = doc
        cur_lab = lab
        last_tok = tok
        cur_tok = last_tok
        # prefix = []
        while i < len(df) and cur_doc == doc and cur_lab == lab and last_tok == cur_tok:
            # prefix.append(pred_tok.split('-')[0])
            last_tok = cur_tok + 1
            i += 1
            cur_doc = df.loc[i, "document"]
            cur_tok = df.loc[i, "token"]
            if i >= len(df) or df.loc[i, "label"] == 'O':
                break
            cur_lab = df.loc[i, "label"].split('-')[1]
        # exception
        if st - 2 >= 0 and df.loc[st - 2, "document"] == df.loc[st, "document"] and df.loc[st - 1, "token_str"] == '\n' and df.loc[st - 2, "label"] != 'O' and df.loc[st - 2, "label"].split('-')[1] == lab:
            df.loc[st - 1, "label"] = 'I-' + lab
            for j in range(st, i):
                df.loc[j, "label"] = 'I-' + lab
            print(st)
            continue

        # fix
        for j in range(st, i):
            if j == st:
                df.loc[j, "label"] = 'B-' + lab
            else:
                df.loc[j, "label"] = 'I-' + lab

    df = pd.concat([df, new_df]).reset_index(drop=True)

    return df

In [None]:
'-'

In [96]:
pred_df.query("document == 11442")

Unnamed: 0,document,token,label,token_str,row_id
91607,11442,150,B-STREET_ADDRESS,743,91607
91608,11442,151,I-STREET_ADDRESS,Erika,91608
91609,11442,152,I-STREET_ADDRESS,Bypass,91609
91610,11442,153,I-STREET_ADDRESS,Apt,91610
91611,11442,154,I-STREET_ADDRESS,.,91611
91612,11442,155,I-STREET_ADDRESS,419,91612
91613,11442,157,I-STREET_ADDRESS,Andreahaven,91613
91614,11442,158,I-STREET_ADDRESS,",",91614
91615,11442,159,I-STREET_ADDRESS,IL,91615
91616,11442,160,I-STREET_ADDRESS,54207,91616


In [99]:
reference_df.query("document == 11442")

Unnamed: 0,row_id,document,token,label,token_str
88453,6547860,11442,150,B-STREET_ADDRESS,743
88454,6547861,11442,151,I-STREET_ADDRESS,Erika
88455,6547862,11442,152,I-STREET_ADDRESS,Bypass
88456,6547863,11442,153,I-STREET_ADDRESS,Apt
88457,6547864,11442,154,I-STREET_ADDRESS,.
88458,6547865,11442,155,I-STREET_ADDRESS,419
88459,6547866,11442,156,I-STREET_ADDRESS,\n
88460,6547867,11442,157,I-STREET_ADDRESS,Andreahaven
88461,6547868,11442,158,I-STREET_ADDRESS,","
88462,6547869,11442,159,I-STREET_ADDRESS,IL


In [83]:
tokenizer("asd,ads")

{'input_ids': [1, 283, 407, 261, 34998, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [40]:
from collections import defaultdict
from typing import Dict


class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


def compute_metrics(pred_df, gt_df):
    """
    Compute the LB metric (lb) and other auxiliary metrics
    """

    references = {(row.document, row.token, row.label)
                  for row in gt_df.itertuples()}
    predictions = {(row.document, row.token, row.label)
                   for row in pred_df.itertuples()}

    score_per_type = defaultdict(PRFScore)
    references = set(references)

    for ex in predictions:
        pred_type = ex[-1]  # (document, token, label)
        if pred_type != 'O':
            pred_type = pred_type[2:]  # avoid B- and I- prefix

        if pred_type not in score_per_type:
            score_per_type[pred_type] = PRFScore()

        if ex in references:
            score_per_type[pred_type].tp += 1
            references.remove(ex)
        else:
            score_per_type[pred_type].fp += 1

    for doc, tok, ref_type in references:
        if ref_type != 'O':
            ref_type = ref_type[2:]  # avoid B- and I- prefix

        if ref_type not in score_per_type:
            score_per_type[ref_type] = PRFScore()
        score_per_type[ref_type].fn += 1

    totals = PRFScore()

    for prf in score_per_type.values():
        totals += prf

    return {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
        "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items() if k != 'O'},
    }

In [182]:
df = valid_df[['document', 'tokens', 'labels']].copy()
df = df.explode(['tokens', 'labels']).reset_index(drop=True).rename(
    columns={'tokens': 'token', 'labels': 'label'})
df

Unnamed: 0,document,token,label
0,6435,WEEK,O
1,6435,5,O
2,6435,:,O
3,6435,FINAL,O
4,6435,ASSIGNMENT,O
...,...,...,...
1497395,10311,in,O
1497396,10311,the,O
1497397,10311,future,O
1497398,10311,.,O


In [117]:
df = df.sort_values(by=["document", "token"]).reset_index(drop=True)

In [124]:
df

Unnamed: 0,document,token,label,token_str
0,112,0,O,Reflection
1,112,1,O,–
2,112,2,O,Learning
3,112,3,O,Launch
4,112,4,O,\n\n
...,...,...,...,...
1497395,22687,815,O,process
1497396,22687,816,O,explained
1497397,22687,817,O,above
1497398,22687,818,O,.


In [175]:
i = 0
total = []
last_idx = defaultdict(list)
hm_min = defaultdict(lambda: 1 << 60)
while i < len(df):
    st = i
    doc = df.loc[st, "document"]
    if df.loc[st, "label"] == 'O':
        i += 1
        continue
    lab = df.loc[st, "label"].split('-')[1]
    cur_doc = doc
    cur_lab = lab
    prefix = []
    while i < len(df) and cur_doc == doc and cur_lab == lab:
        prefix.append(df.loc[i, "label"].split('-')[0])
        i += 1
        cur_doc = df.loc[i, "document"]
        if df.loc[i, "label"] == 'O':
            break
        cur_lab = df.loc[i, "label"].split('-')[1]
    total.append([prefix, st, i])
    if last_idx[lab] and st - last_idx[lab][-1] <= hm_min[lab]:
        print(lab, st, st - last_idx[lab][-1])
        hm_min[lab] = st - last_idx[lab][-1]
    last_idx[lab].append(i - 1)

NAME_STUDENT 535 529
NAME_STUDENT 3784 295
NAME_STUDENT 6520 194
URL_PERSONAL 6536 1944
NAME_STUDENT 7271 5
EMAIL 20086 13553
URL_PERSONAL 24101 2
EMAIL 46294 566
USERNAME 47659 578
NAME_STUDENT 48917 4
ID_NUM 67470 44364
ID_NUM 67474 4
ID_NUM 67718 4
ID_NUM 68020 4
ID_NUM 68245 4
EMAIL 78219 431
PHONE_NUM 78223 428
EMAIL 78246 27
PHONE_NUM 78250 24
NAME_STUDENT 111888 2
NAME_STUDENT 217566 2
NAME_STUDENT 217569 2
NAME_STUDENT 217572 2
NAME_STUDENT 222358 2
NAME_STUDENT 222453 2
NAME_STUDENT 459084 2
NAME_STUDENT 459086 2
NAME_STUDENT 459088 2
NAME_STUDENT 490665 2
NAME_STUDENT 525249 2
URL_PERSONAL 637752 2
URL_PERSONAL 637946 2


In [190]:
df.loc[24099]

document                              3202
token                                   31
label                       B-URL_PERSONAL
token_str    tps://www.facebook.com/bclark
Name: 24099, dtype: object

In [191]:
pred_df

Unnamed: 0,document,token,label,token_str,row_id
0,6435,9,B-NAME_STUDENT,Jose,0
1,6435,10,I-NAME_STUDENT,Martinez,1
2,9854,0,B-NAME_STUDENT,Waseem,2
3,9854,1,I-NAME_STUDENT,Mabunda,3
4,9854,3,B-STREET_ADDRESS,591,4
...,...,...,...,...,...
1289,5606,795,B-ID_NUM,Kh:217952887271,1289
1290,13317,12,B-NAME_STUDENT,Oscar,1290
1291,6393,3,B-NAME_STUDENT,Rania,1291
1292,6393,4,I-NAME_STUDENT,Mohammed,1292


In [192]:
pred_df

Unnamed: 0,document,token,label,token_str,row_id
0,6435,9,B-NAME_STUDENT,Jose,0
1,6435,10,I-NAME_STUDENT,Martinez,1
2,9854,0,B-NAME_STUDENT,Waseem,2
3,9854,1,I-NAME_STUDENT,Mabunda,3
4,9854,3,B-STREET_ADDRESS,591,4
...,...,...,...,...,...
1289,5606,795,B-ID_NUM,Kh:217952887271,1289
1290,13317,12,B-NAME_STUDENT,Oscar,1290
1291,6393,3,B-NAME_STUDENT,Rania,1291
1292,6393,4,I-NAME_STUDENT,Mohammed,1292


Unnamed: 0,document,token,label,token_str,row_id
0,6435,9,B-NAME_STUDENT,Jose,0
1,6435,10,I-NAME_STUDENT,Martinez,1
2,9854,0,B-NAME_STUDENT,Waseem,2
3,9854,1,I-NAME_STUDENT,Mabunda,3
4,9854,3,B-STREET_ADDRESS,591,4
...,...,...,...,...,...
1306,5606,795,B-ID_NUM,Kh:217952887271,1306
1307,13317,12,B-NAME_STUDENT,Oscar,1307
1308,6393,3,B-NAME_STUDENT,Rania,1308
1309,6393,4,I-NAME_STUDENT,Mohammed,1309


In [354]:
def get_pred_token(pred_df, valid_df_expolde, doc, token):
    if len(pred_df.query("document == @doc and token == @token")) == 0:
        return 'O'
    return list(pred_df.query("document == @doc and token == @token")['label'])[0]

In [361]:
valid_df_explode = df.copy()

In [365]:
df

Unnamed: 0,document,token,label,token_str
0,112,0,O,Reflection
1,112,1,O,–
2,112,2,O,Learning
3,112,3,O,Launch
4,112,4,O,\n\n
...,...,...,...,...
1497395,22687,815,O,process
1497396,22687,816,O,explained
1497397,22687,817,O,above
1497398,22687,818,O,.


In [413]:
def pp(new_pred_df):
    df = new_pred_df.copy()
    i = 0
    new_df = pd.DataFrame()
    while i < len(df):
        st = i
        doc = df.loc[st, "document"]
        tok = df.loc[st, "token"]
        pred_tok = df.loc[st, "label"]
        if pred_tok == 'O':
            i += 1
            continue
        lab = pred_tok.split('-')[1]
        cur_doc = doc
        cur_lab = lab
        last_tok = tok
        cur_tok = last_tok
        # prefix = []
        while i < len(df) and cur_doc == doc and cur_lab == lab and last_tok == cur_tok:
            # prefix.append(pred_tok.split('-')[0])
            last_tok = cur_tok + 1
            i += 1
            cur_doc = df.loc[i, "document"]
            cur_tok = df.loc[i, "token"]
            if i >= len(df) or df.loc[i, "label"] == 'O':
                break
            cur_lab = df.loc[i, "label"].split('-')[1]
        # exception
        if st - 2 >= 0 and df.loc[st - 2, "document"] == df.loc[st, "document"] and df.loc[st - 1, "token"] == '\n' and df.loc[st - 2, "label"] != 'O' and df.loc[st - 2, "label"].split('-')[1] == lab:
            new_df = df.loc[[st], :].copy()
            new_df.loc[st, "token"] = df.loc[st, "token"] - 1
            new_df.loc[st, "token_str"] = df.loc[st - 1, "token_str"]
            print(new_df)
            continue

        # fix
        for j in range(st, i):
            if j == st:
                df.loc[j, "label"] = 'B-' + lab
            else:
                df.loc[j, "label"] = 'I-' + lab

    return df

In [409]:
new_pred_df.loc[6, "document"]

112

In [408]:
new_pred_df.loc[5, "document"]

112

In [412]:
new_pred_df.loc[5, "token"]

5

In [411]:
new_pred_df.loc[6, "token"]

6

In [414]:
pred_df_pp = pp(new_pred_df)

In [415]:
pred_df_pp1 = pred_df_pp.query("label != 'O'")

In [416]:
pred_df_pp1

Unnamed: 0,document,token,token_str,label
5,112,5,Francisco,B-NAME_STUDENT
6,112,6,Ferreira,I-NAME_STUDENT
535,166,0,Pepa,B-NAME_STUDENT
536,166,1,Medrano,I-NAME_STUDENT
1523,214,4,Fareed,B-NAME_STUDENT
...,...,...,...,...
1434753,22106,683,SOFÍA,I-NAME_STUDENT
1434754,22106,684,CARMONA,I-NAME_STUDENT
1434755,22106,685,DÍAZ-,I-NAME_STUDENT
1436734,22124,271,Luke,B-NAME_STUDENT


In [417]:
eval_dict = compute_metrics(pred_df_pp1, reference_df)
m = eval_dict['ents_f5']
print(f"LB = {round(m, 3)}")

LB = 0.978


In [418]:
result = pred_df_pp.merge(
    reference_df, on=['document', 'token'], how='outer', indicator=True)
unique_to_df1 = result[result['_merge'] == 'left_only']
unique_to_df2 = result[result['_merge'] == 'right_only']
both = result[result['_merge'] == 'both']

In [419]:
both.query("label_x != label_y")

Unnamed: 0,document,token,token_str_x,label_x,row_id,label_y,token_str_y,_merge
24102,3202,34,nYZqnhEXw,B-ID_NUM,1151942.0,I-URL_PERSONAL,nYZqnhEXw,both
117837,7786,623,jacob59,I-PHONE_NUM,213662.0,B-USERNAME,jacob59,both
568254,12483,6,Weyhacy_7000693584,I-NAME_STUDENT,981333.0,B-ID_NUM,Weyhacy_7000693584,both


In [337]:
pred_df_pp.query("document == 6435")

Unnamed: 0,document,token,label,token_str,row_id
0,6435,9,B-NAME_STUDENT,Jose,0
1,6435,10,B-NAME_STUDENT,Martinez,1


In [336]:
pred_df_pp.query("document == 6435")

Unnamed: 0,document,token,label,token_str,row_id
0,6435,9,B-NAME_STUDENT,Jose,0
1,6435,10,B-NAME_STUDENT,Martinez,1


In [None]:
aa

In [209]:
eval_dict = compute_metrics(pred_df, reference_df)
m = eval_dict['ents_f5']
print(f"LB = {round(m, 3)}")

LB = 0.971


In [210]:
eval_dict = compute_metrics(pred_df_pp, reference_df)
m = eval_dict['ents_f5']
print(f"LB = {round(m, 3)}")

LB = 0.971


In [112]:
# load training data and create reference dataframe ---
df = valid_df[['document', 'tokens', 'labels']].copy()
df = df.explode(['tokens', 'labels']).reset_index(drop=True).rename(
    columns={'tokens': 'token', 'labels': 'label'})
df['token_str'] = df['token']
df['token'] = df.groupby('document').cumcount()

label_list = df['label'].unique().tolist()

reference_df = df[df['label'] != 'O'].copy()
reference_df = reference_df.reset_index().rename(columns={'index': 'row_id'})
reference_df = reference_df[['row_id', 'document',
                             'token', 'label', 'token_str']].copy()
reference_df

Unnamed: 0,row_id,document,token,label,token_str
0,9,6435,9,B-NAME_STUDENT,Jose
1,10,6435,10,I-NAME_STUDENT,Martinez
2,3516,9854,0,B-NAME_STUDENT,Waseem
3,3517,9854,1,I-NAME_STUDENT,Mabunda
4,3519,9854,3,B-STREET_ADDRESS,591
...,...,...,...,...,...
883,1484958,5606,791,B-ID_NUM,143860010348
884,1484962,5606,795,B-ID_NUM,Kh:217952887271
885,1485225,13317,12,B-NAME_STUDENT,Oscar
886,1488283,6393,3,B-NAME_STUDENT,Rania


In [72]:
eval_dict = compute_metrics(pred_df2, reference_df)
m = eval_dict['ents_f5']
print(f"LB = {round(m, 3)}")

LB = 0.973


In [40]:
eval_dict = compute_metrics(pred_df, reference_df)
m = eval_dict['ents_f5']
print(f"LB = {round(m, 3)}")

LB = 0.968


In [41]:
eval_dict3 = compute_metrics(pred_df3, reference_df)
m = eval_dict3['ents_f5']
print(f"LB = {round(m, 3)}")

LB = 0.971


In [42]:
# fine grained results
print(json.dumps(eval_dict['ents_per_type'], indent=4))

{
    "NAME_STUDENT": {
        "p": 0.64149377593361,
        "r": 0.9922978177150192,
        "f5": 0.9718568665377176
    },
    "ID_NUM": {
        "p": 0.49019607843137253,
        "r": 0.9259259259259259,
        "f5": 0.8953168044077136
    },
    "PHONE_NUM": {
        "p": 0.9285714285714286,
        "r": 1.0,
        "f5": 0.9970501474926253
    },
    "EMAIL": {
        "p": 0.7368421052631579,
        "r": 1.0,
        "f5": 0.9864498644986449
    },
    "URL_PERSONAL": {
        "p": 0.5,
        "r": 0.975609756097561,
        "f5": 0.9411764705882353
    },
    "USERNAME": {
        "p": 0.75,
        "r": 1.0,
        "f5": 0.9873417721518988
    },
    "STREET_ADDRESS": {
        "p": 0.9090909090909091,
        "r": 0.9090909090909091,
        "f5": 0.9090909090909092
    }
}


In [43]:
# fine grained results
print(json.dumps(eval_dict3['ents_per_type'], indent=4))

{
    "NAME_STUDENT": {
        "p": 0.6531645569620254,
        "r": 0.993581514762516,
        "f5": 0.97405614714424
    },
    "ID_NUM": {
        "p": 0.5952380952380952,
        "r": 0.9259259259259259,
        "f5": 0.9065550906555091
    },
    "PHONE_NUM": {
        "p": 0.7647058823529411,
        "r": 1.0,
        "f5": 0.9883040935672515
    },
    "EMAIL": {
        "p": 0.7368421052631579,
        "r": 1.0,
        "f5": 0.9864498644986449
    },
    "URL_PERSONAL": {
        "p": 0.5797101449275363,
        "r": 0.975609756097561,
        "f5": 0.9506398537477149
    },
    "USERNAME": {
        "p": 1.0,
        "r": 1.0,
        "f5": 1.0
    },
    "STREET_ADDRESS": {
        "p": 1.0,
        "r": 0.9090909090909091,
        "f5": 0.9122807017543859
    }
}


In [221]:
def display_bad_case(bad_case, df):
    for idx, row in bad_case.iterrows():
        d = row.document
        t = row.token
        tokens = list(valid_df.query("document == @d")["tokens"])[0]
        print("*" * 20)
        print(d, t)
        print(tokens[t])
        print(tokens[max(0, (t - 3)): t + 3])
        print(" ".join(tokens[max(0, (t - 10)): t + 10]))
        print("*" * 20)

In [None]:
def morepred(pred_df, reference_df):
    pred_df.merge(reference_df, on=['document',
                  'token'], how='outer', indicator=True)

In [186]:
result = pred_df.merge(
    reference_df, on=['document', 'token'], how='outer', indicator=True)
unique_to_df1 = result[result['_merge'] == 'left_only']
unique_to_df2 = result[result['_merge'] == 'right_only']
both = result[result['_merge'] == 'both']

In [216]:
both.query('label_x != label_y')

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
21,7779,589,B-NAME_STUDENT,Leroy,21.0,9898.0,I-NAME_STUDENT,Leroy,both
24,7779,654,B-NAME_STUDENT,Leroy,24.0,9963.0,I-NAME_STUDENT,Leroy,both
26,7779,741,B-NAME_STUDENT,Leroy,26.0,10050.0,I-NAME_STUDENT,Leroy,both
28,7779,755,B-NAME_STUDENT,Leroy,28.0,10064.0,I-NAME_STUDENT,Leroy,both
141,6243,470,B-PHONE_NUM,820)913,141.0,135251.0,I-PHONE_NUM,820)913,both
471,19280,55,B-ID_NUM,30407059,471.0,637273.0,I-ID_NUM,30407059,both
826,3202,34,B-URL_PERSONAL,nYZqnhEXw,826.0,1151942.0,I-URL_PERSONAL,nYZqnhEXw,both


In [215]:
display_bad_case(both.query('label_x != label_y'), valid_df)

********************
7779 589
daily weekly or monthly . When i discussed with Sullivan Leroy ,   he explained me that each six month
********************
********************
7779 654
n’t   call it exactly that . So if Sullivan Leroy can create that kind of product for farmers or
********************
********************
7779 741
the end of my story   I presented them Sullivan Leroy whom I had invited .. All of my colleagues
********************
********************
7779 755
invited .. All of my colleagues   really appreciated Sullivan Leroy participation to the meeting . He shared his experience
********************
********************
6243 470
Jana Telfah   Email : nbarker@hotmail.com   Mobile : ( 820)913 - 3241x894 

 Therefore , we proposed a rethinking
********************
********************
19280 55
Technical and Artistic Theatre and Performing Art division 

 Z.S. 30407059 

 Challenge 

 Working in the Technical and Artistic
********************
********************
3202 34


In [222]:
display_bad_case(unique_to_df2, valid_df)

********************
9854 9


['Apt', '.', '656', '\n', 'Joshuamouth', ',']
Waseem Mabunda   591 Smith Centers Apt . 656 
 Joshuamouth , RI 95963 ( The Netherlands )  
********************
********************
7779 823
Sullivan
['the', 'end', 'of', 'Sullivan', 'Leroy', '’s']
challenge was not   impossible . At the end of Sullivan Leroy ’s speech , they all realised that we
********************
********************
7779 824
Leroy
['end', 'of', 'Sullivan', 'Leroy', '’s', 'speech']
was not   impossible . At the end of Sullivan Leroy ’s speech , they all realised that we could
********************
********************
11288 484
Juan
['youngest', 'brother', '.', 'Juan', "'s", 'arrival']
life like the birth of my   youngest brother . Juan 's arrival was a life - altering event that
********************
********************
9911 365
Cobus
['this', 'tool', 'from', 'Cobus', 'Mpanza', ',']
“ Systems Mapping ” . I learnt this tool from Cobus Mpanza , who also attended   Alexandr Ospina ’s
*******

In [198]:
both.query('label_x != label_y and document == 7779 and token == 589')['

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
21,7779,589,B-NAME_STUDENT,Leroy,21.0,9898.0,I-NAME_STUDENT,Leroy,both


In [191]:
unique_to_df1

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
14,9854,15,I-STREET_ADDRESS,The,14.0,,,,left_only
15,9854,16,I-STREET_ADDRESS,Netherlands,15.0,,,,left_only
17,9854,21,B-EMAIL,vpi@mn.nl,17.0,,,,left_only
42,20415,321,B-NAME_STUDENT,Vibhor,42.0,,,,left_only
56,21578,0,B-NAME_STUDENT,Brett,56.0,,,,left_only
...,...,...,...,...,...,...,...,...,...
1040,19567,843,B-NAME_STUDENT,Jah,1040.0,,,,left_only
1041,19567,844,I-NAME_STUDENT,Page,1041.0,,,,left_only
1042,19567,1108,B-NAME_STUDENT,Jah,1042.0,,,,left_only
1043,19567,1109,I-NAME_STUDENT,Page,1043.0,,,,left_only


In [232]:
valid_df.query("document == 9854")["labels"].values

array([list(['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'B-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'I-STREET_ADDRESS', 'O', 'O', 'O', 'O', 'O', 'B-PHONE_NUM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [229]:
valid_df.query("document == 9854")["tokens"].values

array([list(['Waseem', 'Mabunda', ' ', '591', 'Smith', 'Centers', 'Apt', '.', '656', '\n', 'Joshuamouth', ',', 'RI', '95963', '(', 'The', 'Netherlands', ')', ' ', '410.526.1667', ' ', 'vpi@mn.nl', '\n\n', 'Mind', 'Mapping', ',', '     ', 'Challenge', ':', '    ', 'For', 'several', 'years', 'I', 'have', 'been', 'working', 'for', 'an', 'Asset', 'manager', 'in', 'the', 'Netherlands', '.', 'During', 'this', 'period', 'I', 'have', 'been', 'involved', 'in', 'many', ' ', 'projects', '.', 'Certainly', 'in', 'the', 'world', 'of', 'asset', 'management', ',', 'much', 'has', 'changed', 'in', 'recent', 'years', 'in', 'the', 'area', 'of', 'Law', 'and', 'Regulations', '.', ' ', 'What', 'I', 'mainly', 'experience', 'in', 'these', 'projects', 'is', 'that', 'all', 'departments', 'have', 'a', 'different', 'interest', 'in', 'starting', 'a', 'new', 'project', '.', 'This', ' ', 'certainly', 'does', 'not', 'benefit', 'the', 'project', '.', 'How', 'do', 'you', 'get', 'everyone', 'to', 'complete', 'a', 'projec

In [188]:
unique_to_df2

Unnamed: 0,document,token,label_x,token_str_x,row_id_x,row_id_y,label_y,token_str_y,_merge
1073,9854,9,,,,3525.0,I-STREET_ADDRESS,\n,right_only
1074,7779,823,,,,10132.0,B-NAME_STUDENT,Sullivan,right_only
1075,7779,824,,,,10133.0,I-NAME_STUDENT,Leroy,right_only
1076,11288,484,,,,68953.0,B-NAME_STUDENT,Juan,right_only
1077,9911,365,,,,90342.0,B-NAME_STUDENT,Cobus,right_only
1078,9911,366,,,,90343.0,I-NAME_STUDENT,Mpanza,right_only
1079,19280,54,,,,637272.0,B-ID_NUM,Z.S.,right_only
1080,9421,82,,,,646337.0,B-URL_PERSONAL,http://www.moore.com/,right_only
1081,6117,62,,,,709974.0,B-NAME_STUDENT,Hari,right_only
1082,6117,63,,,,709975.0,I-NAME_STUDENT,Sharma,right_only


In [179]:
pred_df

Unnamed: 0,document,token,label,token_str,row_id
0,6435,9,B-NAME_STUDENT,Jose,0
1,6435,10,I-NAME_STUDENT,Martinez,1
2,9854,0,B-NAME_STUDENT,Waseem,2
3,9854,1,I-NAME_STUDENT,Mabunda,3
4,9854,3,B-STREET_ADDRESS,591,4
...,...,...,...,...,...
1068,5606,795,B-ID_NUM,Kh:217952887271,1068
1069,13317,12,B-NAME_STUDENT,Oscar,1069
1070,6393,3,B-NAME_STUDENT,Rania,1070
1071,6393,4,I-NAME_STUDENT,Mohammed,1071
