In [1]:
import csv
import json
import random
import re
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns

import torch

from datasets import (
    Dataset,
    DatasetDict,
    concatenate_datasets,
    load_dataset,
    load_from_disk,
)
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

import evaluate

from nameparser import HumanName
from names_dataset import NameDataset, NameWrapper
from ethnicseer import EthnicClassifier
import nltk
from nltk.corpus import wordnet as wn

import pycountry_convert as pc
import pycountry
import pickle

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, cohen_kappa_score

from transformers import BertTokenizerFast, BertForTokenClassification
from datasets import ClassLabel
from evaluate import load as load_metric
from sklearn.metrics import cohen_kappa_score
from itertools import combinations
import krippendorff




In [2]:
# Hugging Face checkpoint identifier; reused below for the tokenizer, the model
# weights and as part of output / result file paths.
model_name = "bert-base-cased"

In [5]:
# Load the four dataset splits from disk (paths are relative to this notebook).
# NOTE(review): presumably written by a separate preprocessing step; each split
# is expected to provide "tokens" and "labels" columns (see the tokenisation cell).
conll_main = load_from_disk("../splits/conll_main")
conll_clean = load_from_disk("../splits/conll_clean")

ontonotes_main = load_from_disk("../splits/ontonotes_main")
ontonotes_clean = load_from_disk("../splits/ontonotes_clean")

# Load GPU

In [6]:
# Report whether a CUDA device is available.
# NOTE(review): `device` is not referenced again in the cells visible here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


# Tokenisation & Alignment

In [7]:
# OntoNotes integer tag id -> BIO tag string.
ontonotes_id_to_label = {
    0: "O", 1: "B-CARDINAL", 2: "B-DATE", 3: "I-DATE", 4: "B-PERSON", 5: "I-PERSON",
    6: "B-NORP", 7: "B-GPE", 8: "I-GPE", 9: "B-LAW", 10: "I-LAW", 11: "B-ORG", 12: "I-ORG",
    13: "B-PERCENT", 14: "I-PERCENT", 15: "B-ORDINAL", 16: "B-MONEY", 17: "I-MONEY",
    18: "B-WORK_OF_ART", 19: "I-WORK_OF_ART", 20: "B-FAC", 21: "B-TIME", 22: "I-CARDINAL",
    23: "B-LOC", 24: "B-QUANTITY", 25: "I-QUANTITY", 26: "I-NORP", 27: "I-LOC",
    28: "B-PRODUCT", 29: "I-TIME", 30: "B-EVENT", 31: "I-EVENT", 32: "I-FAC",
    33: "B-LANGUAGE", 34: "I-PRODUCT", 35: "I-ORDINAL", 36: "I-LANGUAGE"
}

# CoNLL BIO tag string -> integer id, plus the inverse map used to decode
# predicted / gold ids back into tag strings.
conll_label_to_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3,
                     'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = {v: k for k, v in conll_label_to_id.items()}

# OntoNotes entity type -> CoNLL entity type.
# NOTE(review): None presumably marks types without a CoNLL counterpart; this
# mapping is not used in the cells visible here -- confirm where it is applied.
ontonotes_to_conll_entity = {
    "PERSON": "PER", "ORG": "ORG", "GPE": "LOC", "LOC": "LOC",
    "NORP": "MISC", "FAC": "MISC", "EVENT": "MISC", "WORK_OF_ART": "MISC",
    "LAW": "MISC", "PRODUCT": "MISC", "LANGUAGE": "MISC",
    "DATE": None, "TIME": None, "PERCENT": None, "MONEY": None,
    "QUANTITY": None, "ORDINAL": None, "CARDINAL": None
}

In [8]:
def process_data(data_list):
    """Collapse sub-token tags into one tag per word by majority vote.

    Args:
        data_list: iterable of mappings, each providing
            - 'word_ids': word index per sub-token, with None for positions
              that belong to no word;
            - 'predictions': predicted tag per sub-token, None positions
              already removed;
            - 'gold': gold tag per sub-token, aligned with 'predictions'.

    Returns:
        (predictions, gold): two lists with one inner list per example, each
        inner list holding one tag per word.
    """

    def majority(tags):
        # Counter.most_common keeps first-seen order among equal counts, so a
        # tie resolves to the tag of the earliest sub-token of the word.
        return Counter(tags).most_common(1)[0][0]

    def process_single(data):
        # Positions with word id None have no entry in predictions / gold,
        # so drop them to keep the indices aligned.
        word_ids = [w for w in data['word_ids'] if w is not None]
        predictions = data['predictions']
        gold = data['gold']

        pred_groups = []
        gold_groups = []
        previous_word_id = None
        for idx, word_id in enumerate(word_ids):
            # A change of word id starts a new word; consecutive sub-tokens
            # sharing an id are collected into the same group.
            if word_id != previous_word_id:
                pred_groups.append([])
                gold_groups.append([])
                previous_word_id = word_id
            pred_groups[-1].append(predictions[idx])
            gold_groups[-1].append(gold[idx])

        return ([majority(group) for group in pred_groups],
                [majority(group) for group in gold_groups])

    processed_predictions_list = []
    processed_gold_list = []

    for data in data_list:
        processed_predictions, processed_gold = process_single(data)
        processed_predictions_list.append(processed_predictions)
        processed_gold_list.append(processed_gold)

    return processed_predictions_list, processed_gold_list


def evaluate_predictions(p, test_data):
    """Print a word-level classification report and return the flat tag lists.

    Args:
        p: 3-tuple whose first two items are per-token class scores and the
            matching label ids (the object returned by `trainer.predict` is
            passed here); the third item is ignored.
        test_data: the tokenised dataset that was predicted on; it must expose
            `.map` and a 'word_ids' column.

    Returns:
        (flat_pred, flat_gold): word-level predicted and gold tag strings,
        flattened over all examples.
    """
    predictions, labels, _ = p

    pred_indices = [np.argmax(seq_scores, axis=-1) for seq_scores in predictions]

    # Positions labelled -100 carry no gold label and are dropped from both
    # sides so predictions and gold stay aligned.
    pred_tags = [[id2label[pred_id]
                  for pred_id, label_id in zip(pred_seq, label_seq)
                  if label_id != -100]
                 for pred_seq, label_seq in zip(pred_indices, labels)]
    gold_tags = [[id2label[label_id] for label_id in label_seq if label_id != -100]
                 for label_seq in labels]

    def add_preds(example, idx):
        length = len(example['word_ids'])
        example['predictions'] = pred_tags[idx][:length]
        example['gold'] = gold_tags[idx][:length]
        return example

    test_data = test_data.map(add_preds, with_indices=True)

    # Sub-token tags -> one tag per word (majority vote).
    pred, gold = process_data(test_data)

    flat_pred = [label for seq in pred for label in seq]
    flat_gold = [label for seq in gold for label in seq]

    print(classification_report(flat_gold, flat_pred, zero_division=0))

    return (flat_pred, flat_gold)

# Model

In [9]:
# Fast tokenizer matching the chosen checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Tag strings in id order; the positions match the ids in `conll_label_to_id`.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG',
              'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split sentences and align word labels to sub-tokens.

    Every sub-token receives the label of the word it came from; positions
    whose word id is None receive -100.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "labels" (list of per-word label id lists).

    Returns:
        The tokenizer output extended with "labels" (per-sub-token label ids)
        and "word_ids" (per-sub-token word index or None).
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_special_tokens_mask=True,
        return_offsets_mapping=True,
    )

    batch_word_ids = [
        encoding.word_ids(batch_index=row)
        for row in range(len(examples["labels"]))
    ]

    encoding["labels"] = [
        [-100 if word_idx is None else word_labels[word_idx]
         for word_idx in word_ids]
        for word_labels, word_ids in zip(examples["labels"], batch_word_ids)
    ]
    encoding["word_ids"] = batch_word_ids
    return encoding

In [10]:
def _tokenize_once(dataset):
    """Tokenise `dataset` unless it already carries a "word_ids" column.

    The split names are rebound in place, so re-running this cell would feed
    the already sub-token-level "labels" back into `tokenize_and_align_labels`,
    which indexes them by word position and would silently produce wrong
    labels. The guard makes the cell safe to run more than once.
    """
    if "word_ids" in dataset.column_names:
        return dataset
    return dataset.map(tokenize_and_align_labels, batched=True)


conll_main = _tokenize_once(conll_main)
conll_clean = _tokenize_once(conll_clean)
ontonotes_main = _tokenize_once(ontonotes_main)
ontonotes_clean = _tokenize_once(ontonotes_clean)

Map:   0%|          | 0/4149 [00:00<?, ? examples/s]

Map:   0%|          | 0/15343 [00:00<?, ? examples/s]

# Deciding params

In [11]:
# Splits used for this run: fine-tune on `conll_main`, predict on `conll_clean`.
train_data = conll_main
test_data = conll_clean

In [12]:
# Short tag for the training set; embedded in saved-model and result file names.
# Keep it in sync with `train_data` above.
train_data_name = 'conll'

In [13]:
# Token-classification model with one output class per entry in `label_list`.
mod = BertForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list))

# Metric object used by `compute_metrics` below.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score predictions against gold labels with the loaded `metric`.

    Args:
        p: pair of (class scores, label ids); the scores are arg-maxed over
            axis 2 to obtain predicted class ids.

    Returns:
        The result of `metric.compute` on the tag-string sequences, with
        positions labelled -100 removed.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # -100 marks positions that must not be scored.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[pred_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    return metric.compute(predictions=true_predictions, references=true_labels)


# Collator used to batch the tokenised examples for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=f"./output/{model_name}",
    eval_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    # No evaluation and no checkpoints are produced during training
    # (eval_strategy / save_strategy are "no"), so there is never a "best"
    # checkpoint to reload; state that explicitly instead of requesting one.
    load_best_model_at_end=False,
    report_to="none",
    fp16=True,
    save_strategy="no",
)

trainer = Trainer(
    model=mod,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    # `tokenizer=` is deprecated on Trainer (it triggers the warning captured
    # in this cell's output); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


In [14]:
# Fine-tune the model on `train_data`.
# NOTE(review): the stored output of this cell ends in KeyboardInterrupt, so the
# saved outputs further down may not correspond to a full run of this cell.
trainer.train()

Step,Training Loss



KeyboardInterrupt



In [None]:
# Save the fine-tuned model under a name built from the checkpoint and training-set tag.
trainer.save_model(f"./saved_model/{model_name}_{train_data_name}_self")

In [None]:
# Run inference on the held-out split; the result is unpacked as a 3-tuple by
# `evaluate_predictions`.
predictions = trainer.predict(test_data)

In [None]:
# Persist the raw prediction output so it can be analysed without re-running inference.
# NOTE(review): open(..., 'wb') does not create ./results; the directory must already exist.
with open(f'./results/{model_name}_{train_data_name}_results.pkl', 'wb') as f:
    pickle.dump(predictions, f)

In [None]:
# Word-level report on the test split; x = flat predicted tags, y = flat gold tags.
x, y = evaluate_predictions(predictions, test_data)


Map:   6%|▌         | 3423/61371 [00:00<00:14, 3949.93 examples/s]


Map:   7%|▋         | 4000/61371 [00:00<00:15, 3806.36 examples/s]


Map:   8%|▊         | 4905/61371 [00:01<00:11, 4906.18 examples/s]


Map:   9%|▉         | 5741/61371 [00:01<00:12, 4630.69 examples/s]


Map:  10%|█         | 6380/61371 [00:01<00:12, 4241.78 examples/s]


Map:  11%|█▏        | 7000/61371 [00:01<00:13, 3976.48 examples/s]


Map:  12%|█▏        | 7505/61371 [00:01<00:12, 4188.58 examples/s]


Map:  13%|█▎        | 7971/61371 [00:01<00:12, 4292.33 examples/s]


Map:  14%|█▍        | 8842/61371 [00:02<00:15, 3405.72 examples/s]


Map:  15%|█▌        | 9343/61371 [00:02<00:15, 3355.63 examples/s]


Map:  16%|█▋        | 10000/61371 [00:02<00:15, 3352.10 examples/s]


Map:  18%|█▊        | 10819/61371 [00:02<00:11, 4250.51 examples/s]


Map:  19%|█▉        | 11605/61371 [00:02<00:12, 3862.75 examples/s]


Map:  20%|██        | 12358/61371 [00:03<00:14, 3465.85 examples/s]


Map:  21%|██        | 13000/61371 [00:03<00:14, 3236.08 examples/s]


Map:  22%|██▏       | 13800/61371 [00:03<00:11, 4021.49 examples/s]


Map:  23%|██▎       | 14355/61371 [00:03<00:12, 3678.32 examples/s]


Map:  24%|██▍       | 15000/61371 [00:03<00:13, 3344.73 examples/s]


Map:  26%|██▌       | 15689/61371 [00:04<00:11, 3969.16 examples/s]


Map:  27%|██▋       | 16334/61371 [00:04<00:13, 3446.55 examples/s]


Map:  28%|██▊       | 17000/61371 [00:04<00:13, 3268.28 examples/s]


Map:  29%|██▉       | 17658/61371 [00:04<00:11, 3843.36 examples/s]


Map:  30%|██▉       | 18372/61371 [00:04<00:11, 3627.40 examples/s]


Map:  31%|███       | 19000/61371 [00:05<00:12, 3263.29 examples/s]


Map:  32%|███▏      | 19601/61371 [00:05<00:11, 3740.75 examples/s]


Map:  33%|███▎      | 20307/61371 [00:05<00:12, 3402.17 examples/s]


Map:  34%|███▍      | 21000/61371 [00:05<00:12, 3315.34 examples/s]


Map:  35%|███▌      | 21559/61371 [00:05<00:10, 3710.69 examples/s]


Map:  36%|███▌      | 22000/61371 [00:05<00:13, 2995.15 examples/s]


Map:  37%|███▋      | 22689/61371 [00:06<00:10, 3701.51 examples/s]


Map:  38%|███▊      | 23343/61371 [00:06<00:10, 3565.11 examples/s]


Map:  39%|███▉      | 24000/61371 [00:06<00:11, 3304.84 examples/s]


Map:  40%|████      | 24657/61371 [00:06<00:09, 3899.92 examples/s]


Map:  41%|████▏     | 25371/61371 [00:06<00:09, 3719.30 examples/s]


Map:  42%|████▏     | 26000/61371 [00:06<00:10, 3490.35 examples/s]


Map:  43%|████▎     | 26474/61371 [00:07<00:09, 3716.51 examples/s]


Map:  44%|████▍     | 27000/61371 [00:07<00:11, 3005.92 examples/s]


Map:  45%|████▌     | 27649/61371 [00:07<00:09, 3626.51 examples/s]


Map:  46%|████▌     | 28370/61371 [00:07<00:10, 3280.91 examples/s]


Map:  47%|████▋     | 29000/61371 [00:07<00:09, 3262.25 examples/s]


Map:  49%|████▊     | 29804/61371 [00:08<00:07, 4124.50 examples/s]


Map:  49%|████▉     | 30373/61371 [00:08<00:08, 3815.99 examples/s]


Map:  51%|█████     | 31000/61371 [00:08<00:08, 3527.60 examples/s]


Map:  52%|█████▏    | 31782/61371 [00:08<00:06, 4343.86 examples/s]


Map:  53%|█████▎    | 32556/61371 [00:08<00:07, 3888.75 examples/s]


Map:  54%|█████▍    | 33273/61371 [00:08<00:07, 3649.21 examples/s]


Map:  55%|█████▌    | 33836/61371 [00:09<00:06, 4004.20 examples/s]


Map:  56%|█████▌    | 34324/61371 [00:09<00:08, 3172.78 examples/s]


Map:  57%|█████▋    | 35000/61371 [00:09<00:07, 3297.49 examples/s]


Map:  58%|█████▊    | 35703/61371 [00:09<00:06, 3978.73 examples/s]


Map:  59%|█████▉    | 36396/61371 [00:09<00:06, 3982.89 examples/s]


Map:  60%|██████    | 37000/61371 [00:09<00:06, 3934.40 examples/s]


Map:  61%|██████▏   | 37681/61371 [00:10<00:05, 4527.81 examples/s]


Map:  62%|██████▏   | 38326/61371 [00:10<00:05, 4096.99 examples/s]


Map:  64%|██████▎   | 39000/61371 [00:10<00:06, 3677.32 examples/s]


Map:  65%|██████▍   | 39627/61371 [00:10<00:05, 4176.27 examples/s]


Map:  66%|██████▌   | 40281/61371 [00:10<00:05, 3537.26 examples/s]


Map:  67%|██████▋   | 40885/61371 [00:10<00:05, 4009.18 examples/s]


Map:  67%|██████▋   | 41377/61371 [00:11<00:06, 3131.14 examples/s]


Map:  68%|██████▊   | 42000/61371 [00:11<00:05, 3393.04 examples/s]


Map:  70%|██████▉   | 42761/61371 [00:11<00:04, 4211.99 examples/s]


Map:  71%|███████   | 43342/61371 [00:11<00:04, 3747.86 examples/s]


Map:  72%|███████▏  | 44000/61371 [00:11<00:05, 3358.73 examples/s]


Map:  73%|███████▎  | 44506/61371 [00:11<00:04, 3671.14 examples/s]


Map:  73%|███████▎  | 45000/61371 [00:12<00:08, 1925.16 examples/s]


Map:  75%|███████▍  | 45751/61371 [00:12<00:05, 2633.87 examples/s]


Map:  75%|███████▌  | 46317/61371 [00:12<00:05, 2602.65 examples/s]


Map:  77%|███████▋  | 47000/61371 [00:13<00:05, 2794.26 examples/s]


Map:  77%|███████▋  | 47554/61371 [00:13<00:04, 3230.41 examples/s]


Map:  78%|███████▊  | 48000/61371 [00:13<00:05, 2666.21 examples/s]


Map:  79%|███████▉  | 48709/61371 [00:13<00:03, 3417.37 examples/s]


Map:  80%|████████  | 49301/61371 [00:13<00:03, 3189.61 examples/s]


Map:  81%|████████▏ | 49905/61371 [00:13<00:03, 3717.29 examples/s]


Map:  83%|████████▎ | 50649/61371 [00:14<00:03, 3233.72 examples/s]


Map:  84%|████████▎ | 51322/61371 [00:14<00:03, 2949.18 examples/s]


Map:  85%|████████▍ | 52000/61371 [00:14<00:03, 2960.14 examples/s]


Map:  86%|████████▌ | 52787/61371 [00:14<00:02, 3750.68 examples/s]


Map:  87%|████████▋ | 53328/61371 [00:14<00:02, 3435.47 examples/s]


Map:  88%|████████▊ | 54000/61371 [00:15<00:02, 3285.30 examples/s]


Map:  89%|████████▉ | 54592/61371 [00:15<00:01, 3749.32 examples/s]


Map:  90%|█████████ | 55333/61371 [00:15<00:01, 3295.10 examples/s]


Map:  91%|█████████ | 56000/61371 [00:15<00:01, 3401.63 examples/s]


Map:  92%|█████████▏| 56671/61371 [00:15<00:01, 3993.89 examples/s]


Map:  93%|█████████▎| 57295/61371 [00:16<00:01, 3571.33 examples/s]


Map:  95%|█████████▍| 57996/61371 [00:16<00:00, 4220.68 examples/s]


Map:  96%|█████████▌| 58632/61371 [00:16<00:00, 3377.28 examples/s]


Map:  97%|█████████▋| 59347/61371 [00:16<00:00, 2780.84 examples/s]


Map:  98%|█████████▊| 60000/61371 [00:16<00:00, 3023.13 examples/s]


Map:  99%|█████████▉| 60705/61371 [00:17<00:00, 3675.60 examples/s]


Map: 100%|██████████| 61371/61371 [00:19<00:00, 909.80 examples/s] 


Map: 100%|██████████| 61371/61371 [00:19<00:00, 3162.99 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.83      0.89      0.86     17495
      B-MISC       0.64      0.73      0.68     10657
       B-ORG       0.63      0.68      0.65     13041
       B-PER       0.90      0.93      0.91     15547
       I-LOC       0.71      0.66      0.68      5367
      I-MISC       0.66      0.37      0.47      7305
       I-ORG       0.90      0.71      0.79     18313
       I-PER       0.95      0.88      0.91     11086
           O       0.99      0.99      0.99   1011383

    accuracy                           0.97   1110194
   macro avg       0.80      0.76      0.77   1110194
weighted avg       0.97      0.97      0.97   1110194



## Stanford

In [None]:
# Read the challenge-set spreadsheet without a header row, so columns are
# addressed by integer position in the parsing cell below.
df = pd.read_excel(
    "./Guided-Adversarial-Augmentation-main/Guided-Adversarial-Augmentation-main/data/data/conll2003/challenge_set.xlsx", header=None)

In [None]:
def _parse_int_cell(value, default=999):
    """Return `value` parsed as an int, or `default` when it is not an integer."""
    try:
        return int(str(value).strip())
    except (ValueError, TypeError):
        return default


# Each record starts at a row whose first cell begins with "GUID" and is read
# as: GUID row, quality row, aug_type row, tokens row, labels row. The cursor
# then advances six rows, as in the original layout handling.
examples = []
skipped_examples = 0

i = 0
while i < len(df):
    row = df.iloc[i]
    if str(row[0]).startswith("GUID"):
        if i + 4 >= len(df):
            # Truncated final record: the token / label rows are missing, and
            # reading them would raise IndexError. Nothing complete can follow.
            skipped_examples += 1
            break

        guid = str(row[1]).strip()
        quality = _parse_int_cell(df.iloc[i + 1][1])
        aug_type = _parse_int_cell(df.iloc[i + 2][1])

        # The first cell of each row is skipped; empty cells are dropped.
        tokens_row = df.iloc[i + 3].dropna().tolist()[1:]
        labels_row = df.iloc[i + 4].dropna().tolist()[1:]
        labels_row = [label.strip()
                      for label in labels_row if label.strip() != ""]

        if len(tokens_row) == len(labels_row):
            examples.append({
                "guid": guid,
                "quality": quality,
                "aug_type": aug_type,
                "tokens": tokens_row,
                "labels": labels_row
            })
        else:
            # Token / label counts disagree, so the record cannot be aligned.
            skipped_examples += 1

        i += 6
    else:
        i += 1

if skipped_examples:
    print(f"Skipped {skipped_examples} malformed challenge-set record(s).")

challenge_dataset = Dataset.from_list(examples)

In [None]:
# CoNLL BIO tag -> integer id.
# NOTE(review): this repeats the mapping already defined in the
# "Tokenisation & Alignment" cell with the same contents; keep the two in sync
# or drop one of them.
conll_label_to_id = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6,
    'B-MISC': 7, 'I-MISC': 8,
}


def encode_labels(example):
    """Replace the tag strings in example["labels"] with their integer ids.

    Tags missing from `conll_label_to_id` are encoded as 0 (the id of 'O').
    The example is modified in place and returned.
    """
    label_ids = []
    for label in example["labels"]:
        label_ids.append(conll_label_to_id.get(label, 0))
    example["labels"] = label_ids
    return example

In [None]:
# Convert tag strings to ids. NOTE: rebinding the same name makes this cell
# non-idempotent; a second run would look up integer ids in the string-keyed
# map and encode every label as 0.
challenge_dataset = challenge_dataset.map(encode_labels)


Map: 100%|██████████| 1418/1418 [00:00<00:00, 10944.56 examples/s]




In [21]:
# Tokenise the challenge set and align its labels to sub-tokens.
stanford_encoded = challenge_dataset.map(
    tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/1418 [00:00<?, ? examples/s]


Map:  71%|███████   | 1000/1418 [00:00<00:00, 2132.63 examples/s]


Map: 100%|██████████| 1418/1418 [00:00<00:00, 2498.01 examples/s]


Map: 100%|██████████| 1418/1418 [00:00<00:00, 2388.34 examples/s]




In [22]:
# Predict on the challenge set with the trainer built above.
stanford_results = trainer.predict(stanford_encoded)

In [23]:
# Persist the challenge-set predictions.
# NOTE(review): unlike the other result files written by this notebook, this
# path has no ".pkl" extension.
with open(f'./results/{model_name}_{train_data_name}_stanford', 'wb') as f:
    pickle.dump(stanford_results, f)

In [24]:
# Word-level report on the challenge set; x = flat predicted tags, y = flat gold tags.
x, y = evaluate_predictions(stanford_results, stanford_encoded)


Map:   0%|          | 0/1418 [00:00<?, ? examples/s]


Map:  46%|████▌     | 646/1418 [00:00<00:00, 6385.92 examples/s]


Map: 100%|█████████▉| 1414/1418 [00:00<00:00, 5309.18 examples/s]


Map: 100%|██████████| 1418/1418 [00:00<00:00, 4913.02 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.63      0.86      0.72       539
      B-MISC       0.93      0.92      0.93       319
       B-ORG       0.70      0.85      0.77      1214
       B-PER       0.88      0.64      0.74      1067
       I-LOC       0.38      0.86      0.53        79
      I-MISC       0.87      0.88      0.87        82
       I-ORG       0.87      0.94      0.90      2026
       I-PER       1.00      0.63      0.77       939
           O       1.00      0.99      1.00     22466

    accuracy                           0.96     28731
   macro avg       0.81      0.84      0.80     28731
weighted avg       0.96      0.96      0.96     28731



## Personal

In [25]:
# Name database used by `get_names`, and the countries it reports.
nd = NameDataset()
country_codes = nd.get_country_codes()

# Continent code (as returned by pycountry_convert) -> display name.
continent_names = {
    'AF': 'Africa',
    'NA': 'North America',
    'OC': 'Oceania',
    'AN': 'Antarctica',
    'AS': 'Asia',
    'EU': 'Europe',
    'SA': 'South America',
}

# continent name -> list of alpha-2 country codes
continent_to_code = defaultdict(list)

# continent name -> list of (alpha-2 code, country name) pairs
continent_to_countries = defaultdict(list)

# Key insertion order follows `country_codes`; the per-continent evaluation
# loop further down iterates the keys of `continent_to_code` in this order.
for code in country_codes:
    try:
        alpha_2 = code.alpha_2
        country = pycountry.countries.get(alpha_2=alpha_2)
        if not country:
            # Code unknown to pycountry: skip without reporting.
            continue

        continent_code = pc.country_alpha2_to_continent_code(alpha_2)
        continent = continent_names[continent_code]

        continent_to_code[continent].append(alpha_2)
        continent_to_countries[continent].append((alpha_2, country.name))

    except Exception as e:
        # Best effort: any failure for one entry is reported and that entry is skipped.
        print(f"Skipping {code} due to error: {e}")
        continue

In [26]:
def get_names(country, n=10):
    """Return the unique top names for `country`, in sorted order.

    Args:
        country: alpha-2 country code passed to `nd.get_top_names`.
        n: number of top names requested per gender.

    Returns:
        Sorted list of distinct names collected from the 'M' and 'F' entries;
        empty when the country is absent from the lookup result.

    The result is sorted because iterating a set of strings yields an order
    that changes between interpreter sessions (string hash randomisation),
    which would make seeded `random.choice` / `random.sample` calls on this
    list irreproducible.
    """
    names_dict = nd.get_top_names(n=n, country_alpha2=country)
    names = set()
    # .get(..., {}) covers a country missing from the lookup result.
    country_names = names_dict.get(country, {}) if country in names_dict else {}
    for gender in ['M', 'F']:
        if gender in country_names:
            names.update(country_names[gender])
    return sorted(names)


def tokenize(text):
    """Split `text` into word tokens and single-character punctuation tokens."""
    # \w+ takes runs of word characters; [^\w\s] emits every other
    # non-whitespace character as its own token.
    tokens = re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
    return tokens


def tag_tokens(tokens, person_names):
    """Tag every occurrence of each person name in `tokens` with B-PER / I-PER.

    Args:
        tokens: list of token strings for one sentence.
        person_names: iterable of name strings to look for.

    Returns:
        List of tags, one per token: "B-PER" on the first token of a match,
        "I-PER" on the remaining tokens of the match, "O" elsewhere.

    Each name is matched both as whitespace-separated words and as split by
    `tokenize`, so a name containing punctuation (e.g. "Mary-Jane") is found
    in sentences produced by `tokenize` as well as in pre-split input.
    Empty or whitespace-only names are ignored: an empty token list would
    otherwise match at every position and index past the end of `tags`.
    """
    tags = ["O"] * len(tokens)
    for name in person_names:
        candidates = [name.split()]
        regex_tokens = tokenize(name)
        if regex_tokens != candidates[0]:
            candidates.append(regex_tokens)

        for name_tokens in candidates:
            n_len = len(name_tokens)
            if n_len == 0:
                continue
            for i in range(len(tokens) - n_len + 1):
                if tokens[i:i + n_len] == name_tokens:
                    tags[i] = "B-PER"
                    for j in range(i + 1, i + n_len):
                        tags[j] = "I-PER"
    return tags


def generate_challenge_dataset(num_samples=300):
    """Generate templated sentences whose person names are also common nouns.

    Relies on notebook-level `chosen_countries`, `common_nouns` and
    `sentence_templates`, which must be defined before calling.

    Args:
        num_samples: number of sentences to generate.

    Returns:
        List of sentences, each a list of (token, tag) pairs. It may hold
        fewer than `num_samples` entries when generation stops after 1000
        draws of a country that has no usable names.
    """
    dataset = []
    failures = 0
    max_failures = 1000
    # country -> names that are also common nouns. The lookup does not depend
    # on the loop state, so it is done once per country instead of once per
    # generated sample.
    filtered_by_country = {}

    while len(dataset) < num_samples:
        if failures >= max_failures:
            print(f"Stopped after {failures} failed attempts.")
            break

        country = random.choice(chosen_countries)
        if country not in filtered_by_country:
            filtered_by_country[country] = [
                name for name in get_names(country, n=10)
                if name.lower() in common_nouns]
        filtered_names = filtered_by_country[country]

        if not filtered_names:
            failures += 1
            if failures % 10 == 0:
                print(f"{failures} failed attempts so far.")
            continue

        # Pick one or two names as needed
        if len(filtered_names) == 1:
            name1 = filtered_names[0]
            name2 = None
        else:
            name1, name2 = random.sample(filtered_names, 2)

        template = random.choice(sentence_templates)

        if "{name2}" in template and name2 is None:
            # Only one usable name: reuse it for both slots.
            sentence = template.format(name=name1, name2=name1)
            person_names = [name1]
        elif "{name2}" in template:
            sentence = template.format(name=name1, name2=name2)
            person_names = [name1, name2]
        else:
            sentence = template.format(name=name1)
            person_names = [name1]

        tokens = tokenize(sentence)
        tags = tag_tokens(tokens, person_names)
        dataset.append(list(zip(tokens, tags)))
        # The per-sample progress print was removed: it emitted one output
        # line per generated sentence.

    return dataset


def save_to_csv(dataset, filepath="default_name.csv"):
    """Write a tagged dataset to `filepath` as one CSV row per token.

    Args:
        dataset: sequence of sentences, each a sequence of (token, tag) pairs.
        filepath: destination path; the file is overwritten.

    The file starts with the header row sentence_id, token, tag, where
    sentence_id is the position of the sentence in `dataset`.
    """
    header = ["sentence_id", "token", "tag"]
    with open(filepath, mode="w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(
            [sentence_id, token, tag]
            for sentence_id, sentence in enumerate(dataset)
            for token, tag in sentence
        )

In [27]:
def evaluate_csv(file, common=False):
    """Evaluate the current `trainer` on a token-per-row challenge CSV.

    Args:
        file: path to a CSV with columns sentence_id, token, tag (the layout
            written by `save_to_csv`).
        common: when True the predictions are pickled to the fixed
            "..._challenge_common.pkl" name; otherwise the name is derived
            from the CSV file name.

    Side effects:
        Pickles the raw prediction output under ./results (the directory must
        already exist) and prints a word-level classification report.

    Raises:
        KeyError: if a tag in the file is not a key of `conll_label_to_id`.

    NOTE: the derived result name used to be built with `file[:-3]`, which
    kept the dot of ".csv" and produced "<name>..pkl". The extension is now
    stripped fully; pickles written by earlier runs still carry the
    double-dot name.
    """
    print('evaluating', file)
    # keep_default_na=False: with pandas' default NA parsing, tokens such as
    # "NA", "null" or "nan" would be read as NaN instead of text.
    scripted_challenge = pd.read_csv(
        file, keep_default_na=False, dtype={"token": str, "tag": str})

    examples = []
    for _, group in scripted_challenge.groupby("sentence_id"):
        tags = group["tag"].tolist()
        examples.append({
            "tokens": group["token"].tolist(),
            "ner_tags": tags,
            "labels": [conll_label_to_id[tag] for tag in tags],
        })

    challenge_dataset = Dataset.from_list(examples)
    tokenized = challenge_dataset.map(tokenize_and_align_labels, batched=True)
    challenge_predictions = trainer.predict(tokenized)

    if common:
        out_path = f'./results/{model_name}_{train_data_name}_challenge_common.pkl'
    else:
        # Use only the file name, without directories or the ".csv" extension.
        stem = file.replace("\\", "/").rsplit("/", 1)[-1]
        if stem.lower().endswith(".csv"):
            stem = stem[:-4]
        out_path = f'./results/{model_name}_{train_data_name}_challenge_{stem}.pkl'

    with open(out_path, 'wb') as f:
        pickle.dump(challenge_predictions, f)

    # Called for its printed report; the returned tag lists are not needed here.
    evaluate_predictions(challenge_predictions, tokenized)
    print()
    print('-'*30)

In [28]:
# Evaluate one challenge CSV per continent. File names are derived from the keys
# of `continent_to_code`, and the files are looked up relative to the working
# directory.
for x in list(continent_to_code.keys()):
    evaluate_csv(x+'_ner_challenge.csv')

evaluating North America_ner_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7929.91 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7634.21 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8529.72 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8228.55 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       1.00      0.96      0.98      1145
       I-LOC       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00         0
       I-PER       0.99      1.00      0.99        75
           O       1.00      0.98      0.99      9569

    accuracy                           0.98     10789
   macro avg       0.43      0.42      0.42     10789
weighted avg       1.00      0.98      0.99     10789


------------------------------
evaluating South America_ner_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:01<00:00, 981.75 examples/s]


Map: 100%|██████████| 1000/1000 [00:01<00:00, 976.81 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8611.46 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8373.82 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       1.00      0.97      0.99      1164
       I-LOC       0.00      0.00      0.00         0
       I-PER       1.00      1.00      1.00         2
           O       1.00      0.98      0.99      9624

    accuracy                           0.98     10790
   macro avg       0.50      0.49      0.50     10790
weighted avg       1.00      0.98      0.99     10790


------------------------------
evaluating Africa_ner_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7277.42 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7016.00 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8508.04 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8260.94 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
      B-MISC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       1.00      0.96      0.98      1163
       I-LOC       0.00      0.00      0.00         0
       I-PER       1.00      1.00      1.00         9
           O       1.00      0.98      0.99      9682

    accuracy                           0.98     10854
   macro avg       0.43      0.42      0.42     10854
weighted avg       1.00      0.98      0.99     10854


------------------------------
evaluating Asia_ner_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7559.37 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7283.50 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8337.12 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8115.79 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       0.97      0.97      0.97      1134
       I-LOC       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00         0
       I-PER       0.38      1.00      0.55         6
           O       1.00      0.98      0.99      9764

    accuracy                           0.97     10904
   macro avg       0.34      0.42      0.36     10904
weighted avg       1.00      0.97      0.99     10904


------------------------------
evaluating Europe_ner_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7511.20 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7213.85 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8468.76 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8234.88 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       1.00      0.97      0.98      1136
       I-LOC       0.00      0.00      0.00         0
           O       1.00      0.98      0.99      9682

    accuracy                           0.98     10818
   macro avg       0.40      0.39      0.39     10818
weighted avg       1.00      0.98      0.99     10818


------------------------------
evaluating Oceania_ner_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7705.15 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7417.38 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8673.34 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8431.34 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       1.00      0.91      0.95      1148
       I-LOC       0.00      0.00      0.00         0
           O       1.00      0.98      0.99      9548

    accuracy                           0.97     10696
   macro avg       0.40      0.38      0.39     10696
weighted avg       1.00      0.97      0.99     10696


------------------------------


In [29]:
# Evaluate the common-noun challenge set; common=True selects the fixed
# "..._challenge_common.pkl" result name.
evaluate_csv('./common_nouns_challenge.csv', True)

evaluating ./common_nouns_challenge.csv



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7812.19 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7495.83 examples/s]




  _warn_prf(average, modifier, msg_start, len(result))



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8682.39 examples/s]


Map: 100%|██████████| 1000/1000 [00:00<00:00, 8448.88 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       B-PER       1.00      0.94      0.97      1168
           O       1.00      1.00      1.00      9336

    accuracy                           0.99     10504
   macro avg       0.50      0.49      0.49     10504
weighted avg       1.00      0.99      1.00     10504


------------------------------


In [30]:
# Seed-variance experiment: fine-tune a fresh model once per seed and collect
# the word-level predictions on `test_data` for each run.
seeds = [33, 42, 57, 106, 812, ]
results = []

for seed in seeds:
    set_seed(seed)

    seed_model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(conll_label_to_id)
    )

    seed_args = TrainingArguments(
        output_dir=f"./output/{seed}",
        seed=seed,
        eval_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        weight_decay=0.01,
        # No evaluation and no checkpoints are produced during training
        # (eval_strategy / save_strategy are "no"), so there is never a "best"
        # checkpoint to reload; state that explicitly instead of requesting one.
        load_best_model_at_end=False,
        report_to="none",
        fp16=True,
        logging_steps=1000,
        save_strategy="no",
    )

    seed_trainer = Trainer(
        model=seed_model,
        args=seed_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        # `tokenizer=` is deprecated on Trainer (it triggers the warning
        # captured in this cell's output); `processing_class=` replaces it.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    seed_trainer.train()

    pred = seed_trainer.predict(test_data)

    x, y = evaluate_predictions(pred, test_data)
    eval_result = {'predictions': x, 'seed': seed}
    results.append(eval_result)
    # Rewritten after every seed so completed runs survive an interruption.
    with open(f'./results/{model_name}_{train_data_name}_seed_var_results.pkl', 'wb') as f:
        pickle.dump(results, f)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  seed_trainer = Trainer(


Step,Training Loss
1000,0.1764
2000,0.0523
3000,0.0312



Map:   0%|          | 0/61371 [00:00<?, ? examples/s]


Map:   1%|          | 695/61371 [00:00<00:08, 6875.55 examples/s]


Map:   3%|▎         | 1598/61371 [00:00<00:10, 5634.68 examples/s]


Map:   4%|▍         | 2441/61371 [00:00<00:10, 5360.98 examples/s]


Map:   5%|▍         | 3000/61371 [00:00<00:10, 5348.57 examples/s]


Map:   6%|▋         | 3875/61371 [00:00<00:09, 6349.27 examples/s]


Map:   8%|▊         | 4936/61371 [00:00<00:08, 6510.32 examples/s]


Map:   9%|▉         | 5781/61371 [00:00<00:08, 6198.78 examples/s]


Map:  11%|█         | 6794/61371 [00:01<00:08, 6085.19 examples/s]


Map:  12%|█▏        | 7540/61371 [00:01<00:09, 5492.70 examples/s]


Map:  14%|█▍        | 8445/61371 [00:01<00:10, 5229.55 examples/s]


Map:  15%|█▍        | 9000/61371 [00:01<00:10, 5220.72 examples/s]


Map:  16%|█▌        | 9776/61371 [00:01<00:08, 5796.57 examples/s]


Map:  17%|█▋        | 10418/61371 [00:01<00:09, 5631.24 examples/s]


Map:  18%|█▊        | 11000/61371 [00:01<00:09, 5518.57 examples/s]


Map:  19%|█▉        | 11658/61371 [00:02<00:08, 5788.94 examples/s]


Map:  20%|██        | 12407/61371 [00:02<00:08, 5503.16 examples/s]


Map:  21%|██        | 13000/61371 [00:02<00:09, 5340.51 examples/s]


Map:  23%|██▎       | 13865/61371 [00:02<00:07, 6183.03 examples/s]


Map:  24%|██▍       | 14793/61371 [00:02<00:07, 6081.49 examples/s]


Map:  26%|██▌       | 15718/61371 [00:02<00:07, 5851.98 examples/s]


Map:  27%|██▋       | 16386/61371 [00:02<00:08, 5575.11 examples/s]


Map:  28%|██▊       | 17000/61371 [00:02<00:08, 5415.22 examples/s]


Map:  29%|██▉       | 17771/61371 [00:03<00:07, 5965.48 examples/s]


Map:  31%|███       | 18757/61371 [00:03<00:07, 5861.81 examples/s]


Map:  32%|███▏      | 19379/61371 [00:03<00:07, 5620.81 examples/s]


Map:  33%|███▎      | 20000/61371 [00:03<00:07, 5428.41 examples/s]


Map:  34%|███▍      | 20775/61371 [00:03<00:06, 5995.24 examples/s]


Map:  35%|███▌      | 21653/61371 [00:03<00:07, 5673.45 examples/s]


Map:  37%|███▋      | 22416/61371 [00:03<00:07, 5468.41 examples/s]


Map:  37%|███▋      | 23000/61371 [00:04<00:07, 5377.70 examples/s]


Map:  39%|███▉      | 23783/61371 [00:04<00:06, 5972.63 examples/s]


Map:  40%|████      | 24762/61371 [00:04<00:06, 5862.79 examples/s]


Map:  41%|████▏     | 25400/61371 [00:04<00:06, 5659.12 examples/s]


Map:  42%|████▏     | 26000/61371 [00:04<00:06, 5499.20 examples/s]


Map:  43%|████▎     | 26596/61371 [00:04<00:06, 5610.68 examples/s]


Map:  45%|████▍     | 27378/61371 [00:04<00:06, 5266.60 examples/s]


Map:  46%|████▌     | 28000/61371 [00:04<00:06, 5142.26 examples/s]


Map:  47%|████▋     | 28850/61371 [00:05<00:05, 5960.98 examples/s]


Map:  49%|████▊     | 29887/61371 [00:05<00:05, 6151.10 examples/s]


Map:  50%|█████     | 30754/61371 [00:05<00:05, 5963.75 examples/s]


Map:  51%|█████     | 31379/61371 [00:05<00:05, 5652.77 examples/s]


Map:  52%|█████▏    | 32000/61371 [00:05<00:05, 5481.30 examples/s]


Map:  53%|█████▎    | 32704/61371 [00:05<00:04, 5860.02 examples/s]


Map:  55%|█████▍    | 33569/61371 [00:05<00:05, 5356.29 examples/s]


Map:  56%|█████▌    | 34404/61371 [00:06<00:05, 5100.98 examples/s]


Map:  57%|█████▋    | 35000/61371 [00:06<00:05, 5131.67 examples/s]


Map:  58%|█████▊    | 35791/61371 [00:06<00:04, 5774.82 examples/s]


Map:  59%|█████▉    | 36442/61371 [00:06<00:04, 5645.64 examples/s]


Map:  61%|██████    | 37404/61371 [00:06<00:04, 5812.26 examples/s]


Map:  62%|██████▏   | 38000/61371 [00:06<00:04, 5614.13 examples/s]


Map:  63%|██████▎   | 38787/61371 [00:06<00:03, 6168.38 examples/s]


Map:  65%|██████▍   | 39741/61371 [00:06<00:03, 5983.03 examples/s]


Map:  66%|██████▌   | 40620/61371 [00:07<00:03, 5585.66 examples/s]


Map:  68%|██████▊   | 41470/61371 [00:07<00:03, 5481.11 examples/s]


Map:  69%|██████▉   | 42446/61371 [00:07<00:03, 5782.47 examples/s]


Map:  71%|███████   | 43366/61371 [00:07<00:03, 5808.90 examples/s]


Map:  72%|███████▏  | 44000/61371 [00:07<00:03, 5515.43 examples/s]


Map:  73%|███████▎  | 44571/61371 [00:07<00:03, 5555.54 examples/s]


Map:  74%|███████▍  | 45397/61371 [00:08<00:03, 5248.99 examples/s]


Map:  75%|███████▍  | 46000/61371 [00:08<00:02, 5218.51 examples/s]


Map:  76%|███████▌  | 46743/61371 [00:08<00:02, 5742.90 examples/s]


Map:  78%|███████▊  | 47666/61371 [00:08<00:02, 5548.69 examples/s]


Map:  79%|███████▉  | 48412/61371 [00:08<00:02, 5366.67 examples/s]


Map:  80%|███████▉  | 49000/61371 [00:08<00:02, 5236.47 examples/s]


Map:  81%|████████  | 49615/61371 [00:08<00:02, 5450.73 examples/s]


Map:  82%|████████▏ | 50342/61371 [00:08<00:02, 5168.44 examples/s]


Map:  83%|████████▎ | 51000/61371 [00:09<00:02, 5016.65 examples/s]


Map:  84%|████████▍ | 51731/61371 [00:09<00:01, 5560.68 examples/s]


Map:  85%|████████▌ | 52429/61371 [00:09<00:01, 5443.89 examples/s]


Map:  86%|████████▋ | 53000/61371 [00:09<00:01, 5411.23 examples/s]


Map:  88%|████████▊ | 53787/61371 [00:09<00:01, 6047.04 examples/s]


Map:  89%|████████▉ | 54654/61371 [00:09<00:01, 5676.46 examples/s]


Map:  90%|█████████ | 55450/61371 [00:09<00:01, 5553.51 examples/s]


Map:  92%|█████████▏| 56409/61371 [00:10<00:00, 5767.90 examples/s]


Map:  93%|█████████▎| 57000/61371 [00:10<00:00, 5602.81 examples/s]


Map:  94%|█████████▍| 57715/61371 [00:10<00:00, 5975.25 examples/s]


Map:  95%|█████████▌| 58555/61371 [00:10<00:00, 5396.29 examples/s]


Map:  97%|█████████▋| 59412/61371 [00:11<00:01, 1667.32 examples/s]


Map:  98%|█████████▊| 60000/61371 [00:11<00:00, 1994.17 examples/s]


Map:  99%|█████████▉| 60806/61371 [00:11<00:00, 2626.22 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 2404.97 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 4894.10 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.83      0.89      0.86     17495
      B-MISC       0.63      0.74      0.68     10657
       B-ORG       0.63      0.68      0.65     13041
       B-PER       0.90      0.91      0.91     15547
       I-LOC       0.71      0.65      0.68      5367
      I-MISC       0.65      0.39      0.49      7305
       I-ORG       0.91      0.71      0.80     18313
       I-PER       0.94      0.87      0.90     11086
           O       0.99      0.99      0.99   1011383

    accuracy                           0.97   1110194
   macro avg       0.80      0.76      0.77   1110194
weighted avg       0.97      0.97      0.97   1110194



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  seed_trainer = Trainer(


Step,Training Loss
1000,0.1709
2000,0.0521
3000,0.0307



Map:   0%|          | 0/61371 [00:00<?, ? examples/s]


Map:   1%|▏         | 849/61371 [00:00<00:07, 8394.82 examples/s]


Map:   3%|▎         | 1911/61371 [00:00<00:10, 5911.08 examples/s]


Map:   5%|▍         | 2877/61371 [00:00<00:09, 5901.32 examples/s]


Map:   6%|▋         | 3879/61371 [00:00<00:09, 6104.64 examples/s]


Map:   8%|▊         | 4929/61371 [00:00<00:08, 6322.59 examples/s]


Map:   9%|▉         | 5754/61371 [00:00<00:09, 6038.82 examples/s]


Map:  10%|█         | 6397/61371 [00:01<00:09, 5750.85 examples/s]


Map:  11%|█▏        | 7000/61371 [00:01<00:09, 5577.42 examples/s]


Map:  13%|█▎        | 7799/61371 [00:01<00:09, 5491.79 examples/s]


Map:  14%|█▍        | 8448/61371 [00:01<00:10, 5218.63 examples/s]


Map:  15%|█▍        | 9000/61371 [00:01<00:09, 5238.82 examples/s]


Map:  16%|█▌        | 9760/61371 [00:01<00:08, 5831.30 examples/s]


Map:  17%|█▋        | 10422/61371 [00:01<00:08, 5670.80 examples/s]


Map:  18%|█▊        | 11000/61371 [00:01<00:09, 5535.56 examples/s]


Map:  19%|█▉        | 11652/61371 [00:02<00:08, 5797.65 examples/s]


Map:  20%|██        | 12373/61371 [00:02<00:09, 5435.53 examples/s]


Map:  21%|██        | 13000/61371 [00:02<00:09, 5297.56 examples/s]


Map:  23%|██▎       | 13863/61371 [00:02<00:07, 6150.11 examples/s]


Map:  24%|██▍       | 14788/61371 [00:02<00:07, 6058.98 examples/s]


Map:  26%|██▌       | 15718/61371 [00:02<00:07, 5832.69 examples/s]


Map:  27%|██▋       | 16387/61371 [00:02<00:08, 5568.19 examples/s]


Map:  28%|██▊       | 17000/61371 [00:02<00:08, 5361.60 examples/s]


Map:  29%|██▉       | 17763/61371 [00:03<00:07, 5904.15 examples/s]


Map:  31%|███       | 18737/61371 [00:03<00:07, 5801.19 examples/s]


Map:  32%|███▏      | 19372/61371 [00:03<00:07, 5528.01 examples/s]


Map:  33%|███▎      | 20000/61371 [00:03<00:07, 5365.24 examples/s]


Map:  34%|███▍      | 20786/61371 [00:03<00:06, 5968.22 examples/s]


Map:  35%|███▌      | 21645/61371 [00:03<00:07, 5621.57 examples/s]


Map:  37%|███▋      | 22408/61371 [00:03<00:07, 5418.04 examples/s]


Map:  37%|███▋      | 23000/61371 [00:04<00:07, 5348.96 examples/s]


Map:  39%|███▊      | 23780/61371 [00:04<00:06, 5941.76 examples/s]


Map:  40%|████      | 24761/61371 [00:04<00:06, 5820.93 examples/s]


Map:  41%|████▏     | 25397/61371 [00:04<00:06, 5632.75 examples/s]


Map:  42%|████▏     | 26000/61371 [00:04<00:06, 5506.11 examples/s]


Map:  43%|████▎     | 26578/61371 [00:04<00:06, 5571.83 examples/s]


Map:  45%|████▍     | 27381/61371 [00:04<00:06, 5238.60 examples/s]


Map:  46%|████▌     | 28000/61371 [00:04<00:06, 5141.82 examples/s]


Map:  47%|████▋     | 28828/61371 [00:05<00:05, 5905.60 examples/s]


Map:  48%|████▊     | 29445/61371 [00:05<00:05, 5783.48 examples/s]


Map:  50%|████▉     | 30380/61371 [00:05<00:05, 5841.81 examples/s]


Map:  51%|█████     | 31000/61371 [00:05<00:05, 5585.35 examples/s]


Map:  52%|█████▏    | 31776/61371 [00:05<00:04, 6126.63 examples/s]


Map:  53%|█████▎    | 32715/61371 [00:05<00:04, 5921.42 examples/s]


Map:  55%|█████▍    | 33556/61371 [00:05<00:05, 5338.79 examples/s]


Map:  56%|█████▌    | 34425/61371 [00:06<00:05, 5166.11 examples/s]


Map:  57%|█████▋    | 35000/61371 [00:06<00:05, 5159.40 examples/s]


Map:  58%|█████▊    | 35795/61371 [00:06<00:04, 5790.97 examples/s]


Map:  59%|█████▉    | 36437/61371 [00:06<00:04, 5663.43 examples/s]


Map:  61%|██████    | 37401/61371 [00:06<00:04, 5835.15 examples/s]


Map:  62%|██████▏   | 38000/61371 [00:06<00:04, 5647.99 examples/s]


Map:  63%|██████▎   | 38778/61371 [00:06<00:03, 6172.09 examples/s]


Map:  65%|██████▍   | 39750/61371 [00:06<00:03, 5976.10 examples/s]


Map:  66%|██████▌   | 40618/61371 [00:07<00:03, 5591.67 examples/s]


Map:  68%|██████▊   | 41464/61371 [00:07<00:03, 5468.59 examples/s]


Map:  69%|██████▉   | 42452/61371 [00:07<00:03, 5796.46 examples/s]


Map:  71%|███████   | 43362/61371 [00:07<00:03, 5829.07 examples/s]


Map:  72%|███████▏  | 44000/61371 [00:07<00:03, 5527.76 examples/s]


Map:  73%|███████▎  | 44563/61371 [00:07<00:03, 5548.32 examples/s]


Map:  74%|███████▍  | 45397/61371 [00:08<00:03, 5248.88 examples/s]


Map:  75%|███████▍  | 46000/61371 [00:08<00:02, 5139.59 examples/s]


Map:  76%|███████▌  | 46756/61371 [00:08<00:02, 5706.28 examples/s]


Map:  78%|███████▊  | 47653/61371 [00:08<00:02, 5458.74 examples/s]


Map:  79%|███████▉  | 48380/61371 [00:08<00:02, 5267.33 examples/s]


Map:  80%|███████▉  | 49000/61371 [00:08<00:02, 5193.14 examples/s]


Map:  81%|████████  | 49617/61371 [00:08<00:02, 5422.48 examples/s]


Map:  82%|████████▏ | 50356/61371 [00:08<00:02, 5141.81 examples/s]


Map:  83%|████████▎ | 51000/61371 [00:09<00:02, 4997.81 examples/s]


Map:  84%|████████▍ | 51724/61371 [00:09<00:01, 5527.97 examples/s]


Map:  85%|████████▌ | 52407/61371 [00:09<00:01, 5409.84 examples/s]


Map:  86%|████████▋ | 53000/61371 [00:09<00:01, 5350.40 examples/s]


Map:  88%|████████▊ | 53779/61371 [00:09<00:01, 5975.50 examples/s]


Map:  89%|████████▉ | 54660/61371 [00:09<00:01, 5673.23 examples/s]


Map:  90%|█████████ | 55456/61371 [00:09<00:01, 5550.20 examples/s]


Map:  92%|█████████▏| 56408/61371 [00:10<00:00, 5740.95 examples/s]


Map:  93%|█████████▎| 57000/61371 [00:10<00:00, 5586.06 examples/s]


Map:  94%|█████████▍| 57710/61371 [00:10<00:00, 5948.28 examples/s]


Map:  95%|█████████▌| 58537/61371 [00:10<00:00, 5361.97 examples/s]


Map:  97%|█████████▋| 59416/61371 [00:11<00:01, 1601.33 examples/s]


Map:  98%|█████████▊| 60000/61371 [00:11<00:00, 1911.28 examples/s]


Map:  99%|█████████▉| 60801/61371 [00:11<00:00, 2522.46 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 2384.19 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 4850.14 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.83      0.89      0.86     17495
      B-MISC       0.65      0.73      0.68     10657
       B-ORG       0.63      0.68      0.66     13041
       B-PER       0.90      0.91      0.91     15547
       I-LOC       0.70      0.65      0.68      5367
      I-MISC       0.66      0.36      0.47      7305
       I-ORG       0.90      0.71      0.79     18313
       I-PER       0.94      0.88      0.91     11086
           O       0.99      0.99      0.99   1011383

    accuracy                           0.97   1110194
   macro avg       0.80      0.76      0.77   1110194
weighted avg       0.97      0.97      0.97   1110194



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  seed_trainer = Trainer(


Step,Training Loss
1000,0.1714
2000,0.0506
3000,0.0301



Map:   0%|          | 0/61371 [00:00<?, ? examples/s]


Map:   1%|          | 634/61371 [00:00<00:09, 6271.89 examples/s]


Map:   2%|▏         | 1306/61371 [00:00<00:11, 5404.33 examples/s]


Map:   3%|▎         | 1905/61371 [00:00<00:10, 5639.48 examples/s]


Map:   5%|▍         | 2875/61371 [00:00<00:10, 5749.30 examples/s]


Map:   6%|▋         | 3874/61371 [00:00<00:09, 6037.50 examples/s]


Map:   8%|▊         | 4935/61371 [00:00<00:08, 6346.55 examples/s]


Map:   9%|▉         | 5763/61371 [00:00<00:09, 6068.98 examples/s]


Map:  10%|█         | 6396/61371 [00:01<00:09, 5785.41 examples/s]


Map:  11%|█▏        | 7000/61371 [00:01<00:09, 5610.20 examples/s]


Map:  13%|█▎        | 7808/61371 [00:01<00:09, 5530.47 examples/s]


Map:  14%|█▎        | 8431/61371 [00:01<00:10, 5219.88 examples/s]


Map:  15%|█▍        | 9000/61371 [00:01<00:10, 5216.41 examples/s]


Map:  16%|█▌        | 9770/61371 [00:01<00:08, 5838.61 examples/s]


Map:  17%|█▋        | 10420/61371 [00:01<00:08, 5686.60 examples/s]


Map:  18%|█▊        | 11000/61371 [00:01<00:09, 5568.88 examples/s]


Map:  19%|█▉        | 11651/61371 [00:02<00:08, 5819.19 examples/s]


Map:  20%|██        | 12391/61371 [00:02<00:08, 5492.34 examples/s]


Map:  21%|██        | 13000/61371 [00:02<00:09, 5304.95 examples/s]


Map:  23%|██▎       | 13874/61371 [00:02<00:07, 6186.36 examples/s]


Map:  24%|██▍       | 14796/61371 [00:02<00:07, 6070.38 examples/s]


Map:  26%|██▌       | 15686/61371 [00:02<00:07, 5777.74 examples/s]


Map:  27%|██▋       | 16386/61371 [00:02<00:08, 5534.48 examples/s]


Map:  28%|██▊       | 17000/61371 [00:02<00:08, 5387.02 examples/s]


Map:  29%|██▉       | 17749/61371 [00:03<00:07, 5891.47 examples/s]


Map:  30%|██▉       | 18375/61371 [00:03<00:07, 5629.11 examples/s]


Map:  31%|███       | 19000/61371 [00:03<00:07, 5440.30 examples/s]


Map:  32%|███▏      | 19758/61371 [00:03<00:06, 5976.21 examples/s]


Map:  33%|███▎      | 20394/61371 [00:03<00:07, 5685.97 examples/s]


Map:  34%|███▍      | 21000/61371 [00:03<00:07, 5485.55 examples/s]


Map:  35%|███▌      | 21644/61371 [00:03<00:06, 5734.04 examples/s]


Map:  36%|███▋      | 22386/61371 [00:03<00:07, 5416.85 examples/s]


Map:  37%|███▋      | 23000/61371 [00:04<00:07, 5331.18 examples/s]


Map:  39%|███▊      | 23766/61371 [00:04<00:06, 5928.26 examples/s]


Map:  40%|███▉      | 24376/61371 [00:04<00:06, 5643.50 examples/s]


Map:  41%|████      | 25000/61371 [00:04<00:06, 5418.41 examples/s]


Map:  42%|████▏     | 25786/61371 [00:04<00:05, 6056.64 examples/s]


Map:  43%|████▎     | 26594/61371 [00:04<00:06, 5553.13 examples/s]


Map:  45%|████▍     | 27375/61371 [00:04<00:06, 5216.06 examples/s]


Map:  46%|████▌     | 28000/61371 [00:04<00:06, 5124.65 examples/s]


Map:  47%|████▋     | 28838/61371 [00:05<00:05, 5893.52 examples/s]


Map:  49%|████▊     | 29864/61371 [00:05<00:05, 6074.71 examples/s]


Map:  50%|█████     | 30774/61371 [00:05<00:05, 5954.28 examples/s]


Map:  51%|█████     | 31390/61371 [00:05<00:05, 5680.21 examples/s]


Map:  52%|█████▏    | 32000/61371 [00:05<00:05, 5504.50 examples/s]


Map:  53%|█████▎    | 32701/61371 [00:05<00:04, 5869.18 examples/s]


Map:  55%|█████▍    | 33586/61371 [00:05<00:05, 5412.95 examples/s]


Map:  56%|█████▌    | 34421/61371 [00:06<00:05, 5188.62 examples/s]


Map:  57%|█████▋    | 35000/61371 [00:06<00:05, 5170.66 examples/s]


Map:  58%|█████▊    | 35773/61371 [00:06<00:04, 5766.96 examples/s]


Map:  59%|█████▉    | 36434/61371 [00:06<00:04, 5687.11 examples/s]


Map:  61%|██████    | 37400/61371 [00:06<00:04, 5866.89 examples/s]


Map:  62%|██████▏   | 38000/61371 [00:06<00:04, 5685.18 examples/s]


Map:  63%|██████▎   | 38785/61371 [00:06<00:03, 6222.45 examples/s]


Map:  65%|██████▍   | 39745/61371 [00:06<00:03, 6015.55 examples/s]


Map:  66%|██████▌   | 40621/61371 [00:07<00:03, 5610.33 examples/s]


Map:  68%|██████▊   | 41468/61371 [00:07<00:03, 5496.22 examples/s]


Map:  69%|██████▉   | 42437/61371 [00:07<00:03, 5772.62 examples/s]


Map:  71%|███████   | 43362/61371 [00:07<00:03, 5808.96 examples/s]


Map:  72%|███████▏  | 44000/61371 [00:07<00:03, 5522.78 examples/s]


Map:  73%|███████▎  | 44564/61371 [00:07<00:03, 5546.54 examples/s]


Map:  74%|███████▍  | 45400/61371 [00:08<00:03, 5253.65 examples/s]


Map:  75%|███████▍  | 46000/61371 [00:08<00:02, 5188.10 examples/s]


Map:  76%|███████▌  | 46750/61371 [00:08<00:02, 5732.76 examples/s]


Map:  78%|███████▊  | 47675/61371 [00:08<00:02, 5536.41 examples/s]


Map:  79%|███████▉  | 48375/61371 [00:08<00:02, 5338.46 examples/s]


Map:  80%|███████▉  | 49000/61371 [00:08<00:02, 5212.76 examples/s]


Map:  81%|████████  | 49624/61371 [00:08<00:02, 5454.60 examples/s]


Map:  82%|████████▏ | 50359/61371 [00:08<00:02, 5155.50 examples/s]


Map:  83%|████████▎ | 51000/61371 [00:09<00:02, 4990.14 examples/s]


Map:  84%|████████▍ | 51713/61371 [00:09<00:01, 5497.31 examples/s]


Map:  85%|████████▌ | 52425/61371 [00:09<00:01, 5451.64 examples/s]


Map:  86%|████████▋ | 53000/61371 [00:09<00:01, 5426.69 examples/s]


Map:  88%|████████▊ | 53783/61371 [00:09<00:01, 6048.19 examples/s]


Map:  89%|████████▉ | 54652/61371 [00:09<00:01, 5693.70 examples/s]


Map:  90%|█████████ | 55423/61371 [00:09<00:01, 5502.41 examples/s]


Map:  91%|█████████ | 56000/61371 [00:09<00:00, 5448.10 examples/s]


Map:  93%|█████████▎| 56801/61371 [00:10<00:00, 6085.22 examples/s]


Map:  94%|█████████▍| 57719/61371 [00:10<00:00, 5855.58 examples/s]


Map:  95%|█████████▌| 58536/61371 [00:10<00:00, 5305.76 examples/s]


Map:  97%|█████████▋| 59416/61371 [00:11<00:01, 1465.88 examples/s]


Map:  98%|█████████▊| 60000/61371 [00:12<00:00, 1762.47 examples/s]


Map:  99%|█████████▉| 60798/61371 [00:12<00:00, 2331.10 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 2293.10 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 4814.91 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.83      0.89      0.86     17495
      B-MISC       0.63      0.73      0.68     10657
       B-ORG       0.64      0.67      0.65     13041
       B-PER       0.90      0.91      0.90     15547
       I-LOC       0.71      0.66      0.69      5367
      I-MISC       0.64      0.36      0.46      7305
       I-ORG       0.90      0.71      0.79     18313
       I-PER       0.93      0.88      0.90     11086
           O       0.99      0.99      0.99   1011383

    accuracy                           0.97   1110194
   macro avg       0.80      0.75      0.77   1110194
weighted avg       0.97      0.97      0.97   1110194



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  seed_trainer = Trainer(


Step,Training Loss
1000,0.1712
2000,0.0531
3000,0.0333



Map:   0%|          | 0/61371 [00:00<?, ? examples/s]


Map:   1%|          | 747/61371 [00:00<00:08, 7389.68 examples/s]


Map:   3%|▎         | 1603/61371 [00:00<00:10, 5678.23 examples/s]


Map:   4%|▍         | 2434/61371 [00:00<00:10, 5369.87 examples/s]


Map:   5%|▍         | 3000/61371 [00:00<00:10, 5392.90 examples/s]


Map:   6%|▋         | 3890/61371 [00:00<00:08, 6432.06 examples/s]


Map:   8%|▊         | 4933/61371 [00:00<00:08, 6581.75 examples/s]


Map:   9%|▉         | 5774/61371 [00:00<00:08, 6231.53 examples/s]


Map:  11%|█         | 6806/61371 [00:01<00:08, 6112.53 examples/s]


Map:  12%|█▏        | 7544/61371 [00:01<00:09, 5534.35 examples/s]


Map:  14%|█▎        | 8434/61371 [00:01<00:10, 5265.14 examples/s]


Map:  15%|█▍        | 9000/61371 [00:01<00:10, 5219.45 examples/s]


Map:  16%|█▌        | 9759/61371 [00:01<00:08, 5756.66 examples/s]


Map:  17%|█▋        | 10416/61371 [00:01<00:09, 5601.56 examples/s]


Map:  18%|█▊        | 11000/61371 [00:01<00:09, 5524.86 examples/s]


Map:  19%|█▉        | 11661/61371 [00:02<00:08, 5799.98 examples/s]


Map:  20%|██        | 12387/61371 [00:02<00:09, 5392.32 examples/s]


Map:  21%|██        | 13000/61371 [00:02<00:09, 5250.87 examples/s]


Map:  23%|██▎       | 13822/61371 [00:02<00:07, 5997.45 examples/s]


Map:  24%|██▍       | 14782/61371 [00:02<00:07, 5946.09 examples/s]


Map:  26%|██▌       | 15712/61371 [00:02<00:07, 5761.63 examples/s]


Map:  27%|██▋       | 16372/61371 [00:02<00:08, 5516.72 examples/s]


Map:  28%|██▊       | 17000/61371 [00:03<00:08, 5331.30 examples/s]


Map:  29%|██▉       | 17767/61371 [00:03<00:07, 5884.59 examples/s]


Map:  30%|██▉       | 18389/61371 [00:03<00:07, 5623.97 examples/s]


Map:  31%|███       | 19000/61371 [00:03<00:07, 5396.20 examples/s]


Map:  32%|███▏      | 19747/61371 [00:03<00:07, 5922.23 examples/s]


Map:  33%|███▎      | 20397/61371 [00:03<00:07, 5661.34 examples/s]


Map:  34%|███▍      | 21000/61371 [00:03<00:07, 5459.22 examples/s]


Map:  35%|███▌      | 21632/61371 [00:03<00:06, 5680.83 examples/s]


Map:  37%|███▋      | 22405/61371 [00:03<00:07, 5429.54 examples/s]


Map:  37%|███▋      | 23000/61371 [00:04<00:07, 5382.39 examples/s]


Map:  39%|███▉      | 23788/61371 [00:04<00:06, 6027.57 examples/s]


Map:  40%|████      | 24766/61371 [00:04<00:06, 5882.28 examples/s]


Map:  41%|████▏     | 25405/61371 [00:04<00:06, 5653.36 examples/s]


Map:  42%|████▏     | 26000/61371 [00:04<00:06, 5496.82 examples/s]


Map:  43%|████▎     | 26577/61371 [00:04<00:06, 5563.66 examples/s]


Map:  45%|████▍     | 27357/61371 [00:04<00:06, 5197.68 examples/s]


Map:  46%|████▌     | 28000/61371 [00:04<00:06, 5145.03 examples/s]


Map:  47%|████▋     | 28826/61371 [00:05<00:05, 5905.65 examples/s]


Map:  48%|████▊     | 29440/61371 [00:05<00:05, 5785.60 examples/s]


Map:  50%|████▉     | 30398/61371 [00:05<00:05, 5937.79 examples/s]


Map:  51%|█████     | 31000/61371 [00:05<00:05, 5683.28 examples/s]


Map:  52%|█████▏    | 31768/61371 [00:05<00:04, 6185.29 examples/s]


Map:  53%|█████▎    | 32709/61371 [00:05<00:04, 5863.26 examples/s]


Map:  55%|█████▍    | 33564/61371 [00:05<00:05, 5379.55 examples/s]


Map:  56%|█████▌    | 34417/61371 [00:06<00:05, 5181.07 examples/s]


Map:  57%|█████▋    | 35000/61371 [00:06<00:05, 5189.30 examples/s]


Map:  58%|█████▊    | 35800/61371 [00:06<00:04, 5828.88 examples/s]


Map:  59%|█████▉    | 36439/61371 [00:06<00:04, 5677.65 examples/s]


Map:  61%|██████    | 37409/61371 [00:06<00:04, 5839.89 examples/s]


Map:  63%|██████▎   | 38396/61371 [00:06<00:03, 5862.43 examples/s]


Map:  64%|██████▎   | 39000/61371 [00:06<00:03, 5662.59 examples/s]


Map:  65%|██████▍   | 39755/61371 [00:06<00:03, 6106.77 examples/s]


Map:  66%|██████▌   | 40616/61371 [00:07<00:03, 5640.77 examples/s]


Map:  68%|██████▊   | 41437/61371 [00:07<00:03, 5474.98 examples/s]


Map:  68%|██████▊   | 42000/61371 [00:07<00:03, 5492.84 examples/s]


Map:  70%|██████▉   | 42859/61371 [00:07<00:02, 6244.24 examples/s]


Map:  71%|███████▏  | 43728/61371 [00:07<00:02, 5988.32 examples/s]


Map:  73%|███████▎  | 44572/61371 [00:07<00:03, 5466.52 examples/s]


Map:  74%|███████▍  | 45393/61371 [00:08<00:03, 5191.88 examples/s]


Map:  75%|███████▍  | 46000/61371 [00:08<00:02, 5158.43 examples/s]


Map:  76%|███████▌  | 46755/61371 [00:08<00:02, 5700.03 examples/s]


Map:  78%|███████▊  | 47678/61371 [00:08<00:02, 5536.99 examples/s]


Map:  79%|███████▉  | 48389/61371 [00:08<00:02, 5289.78 examples/s]


Map:  80%|███████▉  | 49000/61371 [00:08<00:02, 5160.98 examples/s]


Map:  81%|████████  | 49611/61371 [00:08<00:02, 5381.36 examples/s]


Map:  82%|████████▏ | 50349/61371 [00:08<00:02, 5118.05 examples/s]


Map:  83%|████████▎ | 51000/61371 [00:09<00:02, 4976.60 examples/s]


Map:  84%|████████▍ | 51732/61371 [00:09<00:01, 5528.10 examples/s]


Map:  85%|████████▌ | 52435/61371 [00:09<00:01, 5465.23 examples/s]


Map:  86%|████████▋ | 53000/61371 [00:09<00:01, 5382.59 examples/s]


Map:  88%|████████▊ | 53804/61371 [00:09<00:01, 6067.19 examples/s]


Map:  89%|████████▉ | 54660/61371 [00:09<00:01, 5692.38 examples/s]


Map:  90%|█████████ | 55442/61371 [00:09<00:01, 5494.71 examples/s]


Map:  92%|█████████▏| 56403/61371 [00:10<00:00, 5718.19 examples/s]


Map:  93%|█████████▎| 57000/61371 [00:10<00:00, 5583.43 examples/s]


Map:  94%|█████████▍| 57704/61371 [00:10<00:00, 5932.60 examples/s]


Map:  95%|█████████▌| 58558/61371 [00:10<00:00, 5408.43 examples/s]


Map:  97%|█████████▋| 59392/61371 [00:10<00:00, 5130.90 examples/s]


Map:  98%|█████████▊| 60000/61371 [00:12<00:01, 1161.70 examples/s]


Map:  99%|█████████▉| 60802/61371 [00:12<00:00, 1596.02 examples/s]


Map: 100%|██████████| 61371/61371 [00:12<00:00, 1754.65 examples/s]


Map: 100%|██████████| 61371/61371 [00:13<00:00, 4697.90 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.82      0.89      0.86     17495
      B-MISC       0.64      0.73      0.68     10657
       B-ORG       0.64      0.68      0.66     13041
       B-PER       0.90      0.92      0.91     15547
       I-LOC       0.70      0.66      0.68      5367
      I-MISC       0.64      0.39      0.48      7305
       I-ORG       0.91      0.70      0.79     18313
       I-PER       0.95      0.87      0.91     11086
           O       0.99      0.99      0.99   1011383

    accuracy                           0.97   1110194
   macro avg       0.80      0.76      0.77   1110194
weighted avg       0.97      0.97      0.97   1110194



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  seed_trainer = Trainer(


Step,Training Loss
1000,0.1762
2000,0.0536
3000,0.0322



Map:   0%|          | 0/61371 [00:00<?, ? examples/s]


Map:   1%|▏         | 770/61371 [00:00<00:07, 7613.28 examples/s]


Map:   3%|▎         | 1606/61371 [00:00<00:10, 5710.46 examples/s]


Map:   4%|▍         | 2450/61371 [00:00<00:10, 5431.55 examples/s]


Map:   6%|▌         | 3441/61371 [00:00<00:09, 5842.16 examples/s]


Map:   7%|▋         | 4475/61371 [00:00<00:09, 6165.27 examples/s]


Map:   9%|▉         | 5407/61371 [00:00<00:09, 6180.11 examples/s]


Map:  10%|█         | 6400/61371 [00:01<00:09, 5992.81 examples/s]


Map:  12%|█▏        | 7270/61371 [00:01<00:09, 5726.05 examples/s]


Map:  13%|█▎        | 8000/61371 [00:01<00:10, 5073.27 examples/s]


Map:  14%|█▍        | 8876/61371 [00:01<00:08, 5834.68 examples/s]


Map:  16%|█▌        | 9760/61371 [00:01<00:08, 5772.65 examples/s]


Map:  17%|█▋        | 10424/61371 [00:01<00:09, 5649.47 examples/s]


Map:  18%|█▊        | 11339/61371 [00:01<00:08, 5639.98 examples/s]


Map:  20%|█▉        | 11995/61371 [00:02<00:08, 5845.31 examples/s]


Map:  21%|██        | 12779/61371 [00:02<00:08, 5641.84 examples/s]


Map:  22%|██▏       | 13409/61371 [00:02<00:08, 5497.75 examples/s]


Map:  23%|██▎       | 14000/61371 [00:02<00:08, 5471.80 examples/s]


Map:  24%|██▍       | 14776/61371 [00:02<00:07, 6048.22 examples/s]


Map:  26%|██▌       | 15710/61371 [00:02<00:07, 5819.53 examples/s]


Map:  27%|██▋       | 16390/61371 [00:02<00:08, 5558.83 examples/s]


Map:  28%|██▊       | 17000/61371 [00:02<00:08, 5395.93 examples/s]


Map:  29%|██▉       | 17756/61371 [00:03<00:07, 5926.27 examples/s]


Map:  30%|██▉       | 18377/61371 [00:03<00:07, 5619.28 examples/s]


Map:  31%|███       | 19000/61371 [00:03<00:07, 5433.73 examples/s]


Map:  32%|███▏      | 19741/61371 [00:03<00:07, 5941.37 examples/s]


Map:  33%|███▎      | 20386/61371 [00:03<00:07, 5671.95 examples/s]


Map:  34%|███▍      | 21000/61371 [00:03<00:07, 5504.88 examples/s]


Map:  35%|███▌      | 21648/61371 [00:03<00:06, 5759.93 examples/s]


Map:  37%|███▋      | 22420/61371 [00:03<00:07, 5500.79 examples/s]


Map:  37%|███▋      | 23000/61371 [00:04<00:07, 5369.99 examples/s]


Map:  39%|███▊      | 23770/61371 [00:04<00:06, 5972.55 examples/s]


Map:  40%|████      | 24760/61371 [00:04<00:06, 5880.26 examples/s]


Map:  41%|████▏     | 25398/61371 [00:04<00:06, 5603.21 examples/s]


Map:  42%|████▏     | 26000/61371 [00:04<00:06, 5460.41 examples/s]


Map:  43%|████▎     | 26589/61371 [00:04<00:06, 5566.31 examples/s]


Map:  45%|████▍     | 27374/61371 [00:04<00:06, 5226.72 examples/s]


Map:  46%|████▌     | 28000/61371 [00:04<00:06, 5144.30 examples/s]


Map:  47%|████▋     | 28851/61371 [00:05<00:05, 5968.85 examples/s]


Map:  49%|████▊     | 29872/61371 [00:05<00:05, 6114.35 examples/s]


Map:  50%|█████     | 30813/61371 [00:05<00:05, 6064.63 examples/s]


Map:  52%|█████▏    | 31782/61371 [00:05<00:05, 5886.85 examples/s]


Map:  53%|█████▎    | 32711/61371 [00:05<00:05, 5729.08 examples/s]


Map:  55%|█████▍    | 33582/61371 [00:05<00:05, 5380.06 examples/s]


Map:  56%|█████▌    | 34426/61371 [00:06<00:05, 5241.94 examples/s]


Map:  57%|█████▋    | 35000/61371 [00:06<00:05, 5208.86 examples/s]


Map:  58%|█████▊    | 35799/61371 [00:06<00:04, 5820.90 examples/s]


Map:  59%|█████▉    | 36438/61371 [00:06<00:04, 5713.40 examples/s]


Map:  61%|██████    | 37400/61371 [00:06<00:04, 5800.21 examples/s]


Map:  62%|██████▏   | 38000/61371 [00:06<00:04, 5549.62 examples/s]


Map:  63%|██████▎   | 38785/61371 [00:06<00:03, 6103.09 examples/s]


Map:  65%|██████▍   | 39746/61371 [00:06<00:03, 5932.53 examples/s]


Map:  66%|██████▌   | 40616/61371 [00:07<00:03, 5569.23 examples/s]


Map:  68%|██████▊   | 41436/61371 [00:07<00:03, 5426.25 examples/s]


Map:  68%|██████▊   | 42000/61371 [00:07<00:03, 5434.86 examples/s]


Map:  70%|██████▉   | 42860/61371 [00:07<00:02, 6182.45 examples/s]


Map:  71%|███████▏  | 43728/61371 [00:07<00:02, 5938.09 examples/s]


Map:  73%|███████▎  | 44574/61371 [00:07<00:03, 5430.27 examples/s]


Map:  74%|███████▍  | 45389/61371 [00:08<00:03, 5183.16 examples/s]


Map:  75%|███████▍  | 46000/61371 [00:08<00:02, 5162.82 examples/s]


Map:  76%|███████▌  | 46748/61371 [00:08<00:02, 5685.61 examples/s]


Map:  77%|███████▋  | 47346/61371 [00:08<00:02, 5391.53 examples/s]


Map:  78%|███████▊  | 48000/61371 [00:08<00:02, 5109.89 examples/s]


Map:  79%|███████▉  | 48754/61371 [00:08<00:02, 5694.23 examples/s]


Map:  81%|████████  | 49621/61371 [00:08<00:02, 5366.79 examples/s]


Map:  82%|████████▏ | 50330/61371 [00:08<00:02, 5096.32 examples/s]


Map:  83%|████████▎ | 51000/61371 [00:09<00:02, 4974.24 examples/s]


Map:  84%|████████▍ | 51713/61371 [00:09<00:01, 5464.11 examples/s]


Map:  85%|████████▌ | 52432/61371 [00:09<00:01, 5432.17 examples/s]


Map:  86%|████████▋ | 53000/61371 [00:09<00:01, 5380.45 examples/s]


Map:  88%|████████▊ | 53782/61371 [00:09<00:01, 6001.34 examples/s]


Map:  89%|████████▉ | 54653/61371 [00:09<00:01, 5639.05 examples/s]


Map:  90%|█████████ | 55446/61371 [00:09<00:01, 5523.58 examples/s]


Map:  92%|█████████▏| 56406/61371 [00:10<00:00, 5709.54 examples/s]


Map:  93%|█████████▎| 57000/61371 [00:10<00:00, 5588.08 examples/s]


Map:  94%|█████████▍| 57709/61371 [00:10<00:00, 5946.82 examples/s]


Map:  95%|█████████▌| 58554/61371 [00:10<00:00, 5362.08 examples/s]


Map:  97%|█████████▋| 59390/61371 [00:10<00:00, 5003.71 examples/s]


Map:  98%|█████████▊| 60000/61371 [00:15<00:02, 514.30 examples/s] 


Map:  99%|█████████▉| 60798/61371 [00:15<00:00, 731.05 examples/s]


Map: 100%|██████████| 61371/61371 [00:15<00:00, 892.91 examples/s]


Map: 100%|██████████| 61371/61371 [00:15<00:00, 3947.61 examples/s]




              precision    recall  f1-score   support

       B-LOC       0.84      0.89      0.86     17495
      B-MISC       0.63      0.73      0.67     10657
       B-ORG       0.65      0.68      0.66     13041
       B-PER       0.90      0.92      0.91     15547
       I-LOC       0.70      0.66      0.68      5367
      I-MISC       0.66      0.35      0.46      7305
       I-ORG       0.92      0.71      0.80     18313
       I-PER       0.95      0.87      0.91     11086
           O       0.99      0.99      0.99   1011383

    accuracy                           0.97   1110194
   macro avg       0.80      0.76      0.77   1110194
weighted avg       0.97      0.97      0.97   1110194



In [31]:
# Load the per-seed results previously saved for this model / training-set combination.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only point this at result files produced by this project.
seed_results_path = f'./results/{model_name}_{train_data_name}_seed_var_results.pkl'
with open(seed_results_path, 'rb') as results_file:
    seed_res = pickle.load(results_file)

In [32]:
# Pairwise inter-seed agreement: Cohen's kappa for every unordered pair of runs
# in seed_res, followed by the mean over all pairs.
# Each entry of seed_res is read through its 'predictions' key
# (assumes the label sequences have equal length across seeds — TODO confirm).
if len(seed_res) < 2:
    # With fewer than two runs there are no pairs, and the average below
    # would fail with an uninformative ZeroDivisionError.
    raise ValueError(
        f"Need results from at least 2 seeds to compute pairwise kappa, got {len(seed_res)}"
    )

kappa_scores = []

for i, j in combinations(range(len(seed_res)), 2):
    preds_i = seed_res[i]['predictions']
    preds_j = seed_res[j]['predictions']

    kappa = cohen_kappa_score(preds_i, preds_j)
    kappa_scores.append(kappa)

    # i and j are positions in seed_res, not necessarily the seed values themselves.
    print(f"Cohen's kappa between seed {i} and seed {j}: {kappa:.6f}")

average_kappa = sum(kappa_scores) / len(kappa_scores)
print(f"\nAverage Cohen's kappa across all pairs: {average_kappa:.6f}")

Cohen's kappa between seed 0 and seed 1: 0.968839


Cohen's kappa between seed 0 and seed 2: 0.967721


Cohen's kappa between seed 0 and seed 3: 0.969990


Cohen's kappa between seed 0 and seed 4: 0.968262


Cohen's kappa between seed 1 and seed 2: 0.968510


Cohen's kappa between seed 1 and seed 3: 0.968212


Cohen's kappa between seed 1 and seed 4: 0.969408


Cohen's kappa between seed 2 and seed 3: 0.967933


Cohen's kappa between seed 2 and seed 4: 0.966671


Cohen's kappa between seed 3 and seed 4: 0.967895

Average Cohen's kappa across all pairs: 0.968344


In [33]:
# Agreement across all seeds at once via Krippendorff's alpha on nominal labels.
# Each run in seed_res contributes one row of the reliability matrix
# (assumes every run's 'predictions' has the same length — TODO confirm).
prediction_rows = [run['predictions'] for run in seed_res]
data = np.array(prediction_rows)

alpha = krippendorff.alpha(
    reliability_data=data,
    level_of_measurement='nominal',
)

print(f"\nKrippendorff’s alpha (nominal): {alpha:.4f}")


Krippendorff’s alpha (nominal): 0.9683
