In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
from evaluate import load

In [2]:

def compute_metrics(test, predictions):
    """Score predicted tag sequences against the reference tags with seqeval.

    Args:
        test: Reference (gold) tag sequences; forwarded as ``references``.
        predictions: Predicted tag sequences; forwarded as ``predictions``.

    Returns:
        Whatever the loaded ``seqeval`` metric's ``compute`` call returns.
    """
    seqeval = load("seqeval")
    scores = seqeval.compute(references=test, predictions=predictions)
    return scores


In [3]:

# Load the dataset from the Excel file
def load_dataset(excel_path):
    """Read the Excel workbook at ``excel_path`` into a pandas DataFrame.

    Args:
        excel_path: Location of the ``.xlsx`` file, passed unchanged to
            ``pd.read_excel``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with its default options.
    """
    dataset = pd.read_excel(excel_path)
    return dataset


In [4]:
def reformat_tags(path):
    """Load an annotated sheet and add a ``'BIO'`` column derived from BIOES.

    Each row's ``'Entity Type'`` and ``'BIOES'`` values are mapped as follows:

    * entity type ``'NONE'``                 -> ``'O'``
    * ``'B-BIOES'`` or ``'S-BIOES'`` marker  -> ``'B-<Entity Type>'``
    * ``'I-BIOES'`` or ``'E-BIOES'`` marker  -> ``'I-<Entity Type>'``
    * anything else                          -> ``'O'``

    Args:
        path: Excel file to read via ``load_dataset``.

    Returns:
        The loaded DataFrame with the extra ``'BIO'`` column attached.
    """
    # Single-token (S) entities open a span just like B; span-final (E)
    # tokens continue a span just like I.
    begin_markers = ('B-BIOES', 'S-BIOES')
    inside_markers = ('I-BIOES', 'E-BIOES')

    frame = load_dataset(path)
    bio_tags = []
    for _, record in frame.iterrows():
        entity_type, position = record['Entity Type'], record['BIOES']
        if entity_type == 'NONE':
            # The entity type takes precedence over whatever the marker says.
            bio_tags.append('O')
        elif position in begin_markers:
            bio_tags.append('B-' + entity_type)
        elif position in inside_markers:
            bio_tags.append('I-' + entity_type)
        else:
            # Unrecognised marker: treat the token as outside any entity.
            bio_tags.append('O')
    frame['BIO'] = bio_tags
    return frame
    
            

In [5]:
def reformat_structure(df):
    """Collapse a token-per-row frame into one row per sentence.

    Rows are grouped by the ``'Sentence'`` column and every remaining column
    is aggregated into a Python list of that sentence's values.

    Args:
        df: Token-level DataFrame that contains a ``'Sentence'`` column.

    Returns:
        A new DataFrame indexed by ``'Sentence'`` whose cells are lists.
        The input ``df`` is left untouched.
    """
    # groupby() already returns the grouping key as the index, so the former
    # reset_index() / set_index('Sentence', inplace=True) round trip was a
    # no-op and has been dropped.
    return df.groupby('Sentence').agg(list)

In [6]:
# Function for recognizing named entities in text using a dictionary from Excel
def recognize_named_entities(train_path, test_path):
    """Tag the test sentences with a lookup table learned from the training set.

    The training sheet is turned into a dictionary keyed by
    ``(word, previous BIO tag)`` whose value is the BIO tag observed for that
    word. Test sentences are then tagged left to right: each word is looked up
    together with the tag just predicted for the preceding word, and falls
    back to ``'O'`` when the pair was never seen in training.

    Args:
        train_path: Excel file with the annotated training data.
        test_path: Excel file with the annotated test data.

    Returns:
        The result of ``compute_metrics`` comparing the test set's BIO tags
        with the predicted tags.
    """
    train_sentences = reformat_structure(reformat_tags(train_path))
    test_sentences = reformat_structure(reformat_tags(test_path))

    # Learn (word, previous tag) -> tag. When the same key occurs more than
    # once, the last occurrence in the training data wins.
    transitions = {}
    for _, sentence_row in train_sentences.iterrows():
        words = sentence_row['Word']
        # The first word of every sentence is treated as following an 'O' tag.
        previous_tags = ['O'] + sentence_row['BIO'][:-1]
        tags = sentence_row['BIO']
        for word, prev_tag, tag in zip(words, previous_tags, tags):
            # Guard kept from the original; tags built by reformat_tags are
            # non-empty strings, so this is expected to always hold.
            if tag:
                transitions[(word, prev_tag)] = tag

    predicted_tags = []
    for words in test_sentences['Word']:
        previous = 'O'
        sentence_tags = []
        for word in words:
            # Unseen (word, previous tag) pairs are tagged as outside.
            previous = transitions.get((word, previous), 'O')
            sentence_tags.append(previous)
        predicted_tags.append(sentence_tags)

    gold_tags = test_sentences['BIO'].tolist()
    return compute_metrics(gold_tags, predicted_tags)
    


In [7]:

# Locations of the annotated spreadsheets. The dataset is available under
# DOI:10.17632/p6rcwf4p9c.1; point these paths at the downloaded .xlsx files.
train_path = 'train_data.xlsx'
test_path = 'test_data.xlsx'

# Build the lookup tagger from the training sheet and score it on the test sheet.
metrics = recognize_named_entities(train_path, test_path)
metrics



{'City': {'precision': np.float64(0.9565217391304348),
  'recall': np.float64(0.7096774193548387),
  'f1': np.float64(0.8148148148148149),
  'number': np.int64(31)},
 'Country': {'precision': np.float64(0.9333333333333333),
  'recall': np.float64(0.3111111111111111),
  'f1': np.float64(0.4666666666666667),
  'number': np.int64(45)},
 'Country_B': {'precision': np.float64(1.0),
  'recall': np.float64(0.14705882352941177),
  'f1': np.float64(0.25641025641025644),
  'number': np.int64(34)},
 'Date': {'precision': np.float64(0.9058823529411765),
  'recall': np.float64(0.7938144329896907),
  'f1': np.float64(0.8461538461538461),
  'number': np.int64(97)},
 'Organization': {'precision': np.float64(0.7397260273972602),
  'recall': np.float64(0.8059701492537313),
  'f1': np.float64(0.7714285714285715),
  'number': np.int64(67)},
 'Position': {'precision': np.float64(0.9333333333333333),
  'recall': np.float64(0.7368421052631579),
  'f1': np.float64(0.8235294117647058),
  'number': np.int64(19)