In [1]:
import os
import sys

import evaluate
import numpy as np
import pandas as pd
from datasets import load_from_disk
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

sys.path.append("..") # Adds higher directory to path (temporarily) to access our modules
from src.inference import main as inference
from src.train import tokenize_and_align_labels, compute_metrics_extra_args

#### Setting up some constants (loading from .env file)

In [7]:
# Load the variables declared in the .env file into the process environment,
# then read the two paths this notebook depends on (None when a key is unset).
load_dotenv()
LOCAL_DATASET_PATH = os.environ.get('LOCAL_DATASET_PATH')
CHECKPOINT_PATH = os.environ.get('CHECKPOINT_PATH')

#### Evaluating our models (I trained bert-base and bert-large) on train, val and test data to see final performance

In [8]:
def evaluate_on_all_sets(dataset_path, model_path):
    """Evaluate a fine-tuned token-classification model on every dataset split.

    The dataset stored at ``dataset_path`` is tokenized with the tokenizer
    found at ``model_path`` and the model is then scored with
    ``Trainer.evaluate`` on the ``train``, ``val`` and ``test`` splits.

    Args:
        dataset_path: Path handed to ``load_from_disk``; the loaded dataset
            must expose ``'train'``, ``'val'`` and ``'test'`` splits.
        model_path: Path handed to ``from_pretrained`` for both the tokenizer
            and the token-classification model.

    Returns:
        A tuple ``(res_train, res_val, res_test)`` holding the metrics
        returned by ``Trainer.evaluate`` for each split, in that order.
    """
    dataset = load_from_disk(dataset_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Tokenize every split and drop the raw columns so only model inputs remain.
    tokenized_ds = dataset.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=dataset['train'].column_names,
        fn_kwargs={'tokenizer': tokenizer},
    )

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Class id -> tag name, i.e. {0: 'O', 1: 'B-MOUNT', 2: 'I-MOUNT'}.
    label_names = ['O', 'B-MOUNT', 'I-MOUNT']
    id2label = dict(enumerate(label_names))

    model = AutoModelForTokenClassification.from_pretrained(model_path)

    # No training happens here: checkpoint saving and logging are switched off.
    training_args = TrainingArguments(
        output_dir=CHECKPOINT_PATH,
        eval_strategy='epoch',
        save_strategy='no',
        logging_strategy='no',
    )

    metric = evaluate.load('seqeval')

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_ds['train'],
        eval_dataset=tokenized_ds['val'],
        data_collator=data_collator,
        processing_class=tokenizer,
        compute_metrics=compute_metrics_extra_args(metric, id2label),
    )

    # Same order as before: train, val, test.
    return tuple(
        trainer.evaluate(tokenized_ds[split]) for split in ('train', 'val', 'test')
    )

In [5]:
# Locations of the two fine-tuned checkpoints compared in this notebook.
bert_base = '../models/bert-finetuned-NER'
bert_large = '../models/bert-large-finetuned-NER'

In [7]:
# Score the fine-tuned bert-base checkpoint on the train, val and test splits.
res_train_base, res_val_base, res_test_base = evaluate_on_all_sets(
    dataset_path=LOCAL_DATASET_PATH,
    model_path=bert_base,
)

Map: 100%|██████████| 10000/10000 [00:04<00:00, 2378.47 examples/s]


In [9]:
# Score the fine-tuned bert-large checkpoint on the train, val and test splits.
res_train_large, res_val_large, res_test_large = evaluate_on_all_sets(
    dataset_path=LOCAL_DATASET_PATH,
    model_path=bert_large,
)

In [55]:
# One single-column frame per split (rows = metric names), then place the
# three columns side by side so the splits can be compared at a glance.
df_train_base = pd.Series(res_train_base, name='train').to_frame()
df_val_base = pd.Series(res_val_base, name='val').to_frame()
df_test_base = pd.Series(res_test_base, name='test').to_frame()

base_frames = [df_train_base, df_val_base, df_test_base]
all_res_base = pd.concat(base_frames, axis=1)
all_res_base

Unnamed: 0,train,val,test
eval_loss,0.0171,0.052447,0.054503
eval_model_preparation_time,0.002,0.002,0.002
eval_precision,0.887781,0.776042,0.769197
eval_recall,0.911282,0.831512,0.830727
eval_f1,0.899378,0.80282,0.798779
eval_accuracy,0.99421,0.983945,0.982885
eval_runtime,207.3038,130.3598,127.9223
eval_samples_per_second,144.715,76.711,78.172
eval_steps_per_second,18.089,9.589,9.772


In [10]:
# One single-column frame per split (rows = metric names), then place the
# three columns side by side so the splits can be compared at a glance.
df_train_large = pd.Series(res_train_large, name='train').to_frame()
df_val_large = pd.Series(res_val_large, name='val').to_frame()
df_test_large = pd.Series(res_test_large, name='test').to_frame()

large_frames = [df_train_large, df_val_large, df_test_large]
all_res_large = pd.concat(large_frames, axis=1)
all_res_large

Unnamed: 0,train,val,test
eval_loss,0.010216,0.050208,0.052177
eval_model_preparation_time,0.004,0.004,0.004
eval_precision,0.938637,0.817246,0.813042
eval_recall,0.932396,0.84665,0.844976
eval_f1,0.935506,0.831688,0.828702
eval_accuracy,0.996643,0.985701,0.985128
eval_runtime,470.0777,339.0369,346.5321
eval_samples_per_second,63.819,29.495,28.857
eval_steps_per_second,7.977,3.687,3.607


#### The results are not bad, but there is a lot of room for improvement.
Also, as expected, bert-large yields better results than bert-base, especially in terms of precision: it is about 0.04-0.05 higher than for bert-base, which also results in a higher F1-score.
It is worth mentioning that evaluation with bert-large is roughly 2-3 times slower than with bert-base.

#### Demo of inference (using bert-large fine-tuned model)
I found a fairly large dataset for training and decided to stick with it, but I also tried using ChatGPT for data creation and generated some samples.  
Let's use them as examples for inference.

In [2]:
# Read the generated example sentences and keep only the 'tokens' column as a
# plain Python list; the last line previews the first sentence.
csv = pd.read_csv('mountain_ner_examples.csv')['tokens'].tolist()
csv[0]

'The history and cultural significance of Denali are well-documented.'

In [3]:
# Fixed seed so the same demo sentences are picked on every Restart & Run All.
rng = np.random.default_rng(42)
# Draw without replacement so no sentence is shown twice; cap the sample size
# at the number of available sentences.
indexes = rng.choice(len(csv), size=min(5, len(csv)), replace=False)
for i in indexes:
    example = csv[i]
    print('-----------------------')
    print(example)
    print('-----------------------')

    inference(example)

Device set to use cuda:0


-----------------------
Lhotse is part of a mountain range that spans several countries.
-----------------------


Device set to use cuda:0


Word: L, Start: 0, End: 1
Entity: B-MOUNT, Score: 0.9991176724433899

Word: ##hot, Start: 1, End: 4
Entity: I-MOUNT, Score: 0.9998487234115601

Word: ##se, Start: 4, End: 6
Entity: I-MOUNT, Score: 0.999747097492218

-----------------------
Mount Elbrus has been the site of numerous scientific expeditions over the years.
-----------------------


Device set to use cuda:0


Word: Mount, Start: 0, End: 5
Entity: B-MOUNT, Score: 0.9997310042381287

Word: El, Start: 6, End: 8
Entity: I-MOUNT, Score: 0.9998530149459839

Word: ##b, Start: 8, End: 9
Entity: I-MOUNT, Score: 0.9998550415039062

Word: ##rus, Start: 9, End: 12
Entity: I-MOUNT, Score: 0.999805748462677

-----------------------
The ascent of Nanga Parbat requires careful preparation and skill.
-----------------------


Device set to use cuda:0


Word: Nan, Start: 14, End: 17
Entity: B-MOUNT, Score: 0.9980646967887878

Word: ##ga, Start: 17, End: 19
Entity: I-MOUNT, Score: 0.9993358254432678

Word: Pa, Start: 20, End: 22
Entity: I-MOUNT, Score: 0.9996610879898071

Word: ##rb, Start: 22, End: 24
Entity: I-MOUNT, Score: 0.9996837377548218

Word: ##at, Start: 24, End: 26
Entity: I-MOUNT, Score: 0.9996525049209595

-----------------------
One of the most iconic peaks in the world, Kangchenjunga stands tall as a natural wonder.
-----------------------


Device set to use cuda:0


Word: Kang, Start: 43, End: 47
Entity: B-MOUNT, Score: 0.9992875456809998

Word: ##chen, Start: 47, End: 51
Entity: I-MOUNT, Score: 0.9996626377105713

Word: ##jun, Start: 51, End: 54
Entity: I-MOUNT, Score: 0.9996095299720764

Word: ##ga, Start: 54, End: 56
Entity: I-MOUNT, Score: 0.999241828918457

-----------------------
Many stories have been told about the treacherous conditions on Mount Kosciuszko.
-----------------------
Word: Mount, Start: 64, End: 69
Entity: B-MOUNT, Score: 0.9967331886291504

Word: Ko, Start: 70, End: 72
Entity: I-MOUNT, Score: 0.9972622394561768

Word: ##s, Start: 72, End: 73
Entity: I-MOUNT, Score: 0.9997381567955017

Word: ##cius, Start: 73, End: 77
Entity: I-MOUNT, Score: 0.9997338652610779

Word: ##z, Start: 77, End: 78
Entity: I-MOUNT, Score: 0.9995952248573303

Word: ##ko, Start: 78, End: 80
Entity: I-MOUNT, Score: 0.9996440410614014



#### As we can see, the model works well on these examples; now let's try a hard example.
I asked ChatGPT to generate a hard example for us, so let's see how our model performs on it.
(This sentence uses less prominent Colorado 14ers that could challenge NER models since "Wilson" and "Antero" could be mistaken for person names, while "Blanca" could be confused with a given name.)

In [4]:
# Hard sentence: several of these peak names could also be read as person names.
example = "The lesser-known Wilson Peak looms beside Mount Elbert and Blanca Peak, while distant Antero and Shavano pierce the Colorado skyline."
inference(example)

Device set to use cuda:0


Word: Wilson, Start: 17, End: 23
Entity: B-MOUNT, Score: 0.8673344254493713

Word: Peak, Start: 24, End: 28
Entity: I-MOUNT, Score: 0.8126006126403809

Word: Mount, Start: 42, End: 47
Entity: B-MOUNT, Score: 0.9360774755477905

Word: El, Start: 48, End: 50
Entity: I-MOUNT, Score: 0.9280115365982056

Word: ##bert, Start: 50, End: 54
Entity: I-MOUNT, Score: 0.933672308921814

Word: Blanc, Start: 59, End: 64
Entity: B-MOUNT, Score: 0.8041844964027405

Word: ##a, Start: 64, End: 65
Entity: I-MOUNT, Score: 0.8239085078239441

Word: Peak, Start: 66, End: 70
Entity: I-MOUNT, Score: 0.7228032350540161



#### We can see that our model did not catch all the entities (it missed Antero and Shavano), but it found the other three mountains.