## Loading Libraries

In [83]:
from datasets import load_dataset
import pandas as pd
from evaluate import load


## Loading the data

In [84]:
def get_data(dataset):
    """Load the dataset identified by ``dataset`` and return it unchanged.

    The argument is passed straight to ``load_dataset``; whatever that call
    returns is handed back to the caller.
    """
    loaded = load_dataset(dataset)
    return loaded

In [85]:
# Dataset identifiers to evaluate, in order. The position matters: the
# evaluation loop passes each entry's index along with the loaded data, and
# index 0 is special-cased by label_map and recognize_named_entities.
datasets = [
    'gusevski/factrueval2016',
    'RCC-MSU/collection3',
    'conll2003',
    'ghadeermobasher/BC5CDR-Chemical-Disease',
]

In [86]:
def label_map(data, id):
    """Build a pair of lookup callables between tag ids and tag names.

    Args:
        data: Loaded dataset. Only read when ``id`` is non-zero, in which case
            the names come from
            ``data['test'].features['ner_tags'].feature.names``.
        id: Position of the dataset in the evaluation list. For ``0`` a fixed
            PER/ORG/LOC BIO label list is used instead of the dataset's own.

    Returns:
        A tuple ``(to_label, to_id)``: ``to_label(x)`` maps an integer id to
        its tag name and ``to_id(label)`` maps a tag name back to its id.
        Both raise ``KeyError`` for unknown inputs.
    """
    if id != 0:
        names = data['test'].features['ner_tags'].feature.names
    else:
        names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

    index_to_name = dict(enumerate(names))
    name_to_index = {name: index for index, name in enumerate(names)}

    def to_label(x):
        return index_to_name[x]

    def to_id(label):
        return name_to_index[label]

    return to_label, to_id

## Named entity recognition

In [87]:
def recognize_named_entities(dataset, id):
    """Tag the test sentences with a lookup table learned from training data.

    A dictionary keyed by ``(token, previous tag name)`` is built from the
    training sentences and then applied greedily, left to right, to every test
    sentence. The predictions are scored with the ``seqeval`` metric.

    Args:
        dataset: Loaded dataset with a ``'test'`` split and, unless it is the
            only split, a ``'train'`` split. When ``'test'`` is the only split
            it is cut in half: first half for training, second half for
            evaluation.
        id: Position of the dataset in the evaluation list. For ``0`` the
            records are read from ``dataset[split]['data'][0]`` rather than
            from the split itself.

    Returns:
        Tuple ``(overall_f1, overall_precision, overall_recall)`` taken from
        the metric result.

    Note:
        Tag id ``0`` is treated as the outside tag: it is used as the
        "previous tag" of the first token and tokens tagged ``0`` are not
        stored in the lookup table. Prediction starts from the tag name
        ``'O'``, so this assumes id ``0`` maps to ``'O'``.
    """
    id2label, _ = label_map(dataset, id)

    def to_frame(split):
        # Dataset 0 keeps its records nested in the first row of a 'data' column.
        records = dataset[split]['data'][0] if id == 0 else dataset[split]
        return pd.DataFrame(records)

    if len(dataset) == 1:
        # Only one split is available: build the frame once and halve it.
        # The border is derived from the frame that is actually being split,
        # not from the raw split (for dataset 0 those are different containers).
        full_df = to_frame('test')
        border = len(full_df) // 2
        train_df, test_df = full_df.iloc[:border], full_df.iloc[border:]
    else:
        train_df, test_df = to_frame('train'), to_frame('test')

    # (token, previous tag name) -> tag name; later occurrences overwrite earlier ones.
    named_entity_dict = {}
    for tokens, tags in zip(train_df['tokens'], train_df['ner_tags']):
        # Unpacking works for lists and array-like tag sequences alike.
        previous_tags = [0, *tags[:-1]]
        for tok, prev, tag in zip(tokens, previous_tags, tags):
            if tag:
                named_entity_dict[(tok, id2label(prev))] = id2label(tag)

    recognized_entities = []
    for sentence in test_df['tokens']:
        last_tag = 'O'
        pred_sentence = []
        for word in sentence:
            # Unknown (word, previous tag) pairs fall back to the outside tag.
            last_tag = named_entity_dict.get((word, last_tag), 'O')
            pred_sentence.append(last_tag)
        recognized_entities.append(pred_sentence)

    metric = load("seqeval")
    references = [[id2label(tag) for tag in sentence] for sentence in test_df['ner_tags']]
    res = metric.compute(predictions=recognized_entities, references=references)
    return res['overall_f1'], res['overall_precision'], res['overall_recall']

## Evaluate and print results

In [88]:
# Evaluate every configured dataset in turn. The list index is passed along
# because the helper functions special-case the dataset at position 0.
for i, dataset in enumerate(datasets):
    data = get_data(dataset)
    scores = recognize_named_entities(data, i)
    f1, precision, recall = scores
    print(dataset)
    print(f"\tprecision: {precision}, recall: {recall}, f1: {f1}\n")

Repo card metadata block was not found. Setting CardData to empty.


gusevski/factrueval2016
	precision: 0.7132471728594507, recall: 0.6438206343419614, f1: 0.6767579996167847
RCC-MSU/collection3
	precision: 0.22230864965774735, recall: 0.3571607098225444, f1: 0.27404353245757024
conll2003
	precision: 0.3719438182764002, recall: 0.37978045325779036, f1: 0.37582128777923784


Repo card metadata block was not found. Setting CardData to empty.


ghadeermobasher/BC5CDR-Chemical-Disease
	precision: 0.203691184424013, recall: 0.3071668875522479, f1: 0.244949392301126
