In [138]:
import pandas as pd
import os

In [139]:
# Load the Bi-LSTM predictions from CSV, using the first CSV column as the row index.
# The resulting frame is referenced as `df` by the evaluation cells below.
df = pd.read_csv('Error_analysis/Bi-LSTM_predictions.csv', index_col=0)

In [141]:
# Token-level evaluation: the gold (`ent`) and predicted (`prediction`) label
# columns are compared element by element with scikit-learn.
# NOTE: the name `classification_report` is re-bound to seqeval's function of the
# same name by a later import in this notebook, so this cell must keep its own import.
from sklearn.metrics import classification_report

print(classification_report(list(df.ent), list(df.prediction)))

              precision    recall  f1-score   support

      B-DATE       0.93      0.68      0.78        40
       B-LOC       0.83      0.56      0.67        85
     B-MONEY       0.74      0.64      0.69        53
      B-PERS       0.94      0.80      0.86       254
      I-DATE       0.96      0.66      0.78       151
       I-LOC       1.00      0.10      0.18        10
     I-MONEY       0.85      0.93      0.89       123
      I-PERS       0.88      0.63      0.73       224
           O       0.95      0.99      0.97      4310

    accuracy                           0.94      5250
   macro avg       0.90      0.67      0.73      5250
weighted avg       0.94      0.94      0.93      5250



In [132]:
# Evaluation with seqeval. Each label column is wrapped in an outer list, so the
# whole DataFrame is passed as ONE sequence rather than one sequence per sentence.
# NOTE: this import re-binds `classification_report` (previously sklearn's).
# NOTE: `accuracy_score` is imported but not used in this cell.
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

# Single F1 score over the one wrapped sequence.
print(f1_score([list(df.ent)], [list(df.prediction)]))

# Precision / recall / F1 report for the same inputs.
print(classification_report([list(df.ent)], [list(df.prediction)]))

0.6296296296296297
              precision    recall  f1-score   support

        DATE       0.53      0.50      0.51        40
         LOC       0.76      0.53      0.62        85
       MONEY       0.56      0.62      0.59        53
        PERS       0.71      0.62      0.66       254

   micro avg       0.67      0.59      0.63       432
   macro avg       0.64      0.57      0.60       432
weighted avg       0.68      0.59      0.63       432



In [117]:
# Serialise the gold labels as tab-separated text: one "word<TAB>label" line per
# row of `df`, with surrounding whitespace stripped from both fields.
# str.join assembles the text in a single pass instead of re-copying the growing
# string on every loop iteration.
true = ''.join(
    word.strip() + '\t' + label.strip() + '\n'
    for word, label in zip(df.word, df.ent)
)

In [118]:
# Serialise the predicted labels as tab-separated text: one "word<TAB>label" line
# per row of `df`, with surrounding whitespace stripped from both fields.
# str.join assembles the text in a single pass instead of re-copying the growing
# string on every loop iteration.
pred = ''.join(
    word.strip() + '\t' + label.strip() + '\n'
    for word, label in zip(df.word, df.prediction)
)

In [121]:
# Sanity check: column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5250 entries, 0 to 5249
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   word        5250 non-null   object
 1   ent         5250 non-null   object
 2   prediction  5250 non-null   object
dtypes: object(3)
memory usage: 164.1+ KB


In [123]:
from nervaluate import Evaluator

# `true` and `pred` are the tab-separated "word<TAB>label" strings built in the
# cells above; loader="conll" asks nervaluate to parse that text format.
# Only the four listed entity types are evaluated.
evaluator = Evaluator(true, pred, tags=['LOC', 'PERS', 'DATE', 'MONEY'],  loader="conll")

# Returns overall metrics and metrics for each tag

results, results_per_tag = evaluator.evaluate()

# Overall metrics only; the per-tag breakdown is kept in `results_per_tag`.
print(results)

2023-01-31 11:58:03 root DEBUG: Imported 43970 predictions for 44888 true examples


{'ent_type': {'correct': 333, 'incorrect': 15, 'partial': 0, 'missed': 84, 'spurious': 30, 'possible': 432, 'actual': 378, 'precision': 0.8809523809523809, 'recall': 0.7708333333333334, 'f1': 0.8222222222222222}, 'partial': {'correct': 263, 'incorrect': 0, 'partial': 85, 'missed': 84, 'spurious': 30, 'possible': 432, 'actual': 378, 'precision': 0.8082010582010583, 'recall': 0.7071759259259259, 'f1': 0.754320987654321}, 'strict': {'correct': 255, 'incorrect': 93, 'partial': 0, 'missed': 84, 'spurious': 30, 'possible': 432, 'actual': 378, 'precision': 0.6746031746031746, 'recall': 0.5902777777777778, 'f1': 0.6296296296296297}, 'exact': {'correct': 263, 'incorrect': 85, 'partial': 0, 'missed': 84, 'spurious': 30, 'possible': 432, 'actual': 378, 'precision': 0.6957671957671958, 'recall': 0.6087962962962963, 'f1': 0.6493827160493828}}


In [137]:
import pprint as pp

# Pretty-print the 'partial' schema metrics of the PERS tag from the nervaluate
# per-tag results computed above.
pp.pprint(results_per_tag['PERS']['partial'])

{'actual': 219,
 'correct': 158,
 'f1': 0.7758985200845666,
 'incorrect': 0,
 'missed': 45,
 'partial': 51,
 'possible': 254,
 'precision': 0.8378995433789954,
 'recall': 0.7224409448818898,
 'spurious': 10}


In [33]:
# Switch the working directory to the local NER-Evaluation checkout so that the
# `ner_evaluation` package below can be imported.
# NOTE(review): this is a hard-coded absolute path on one machine, and os.chdir
# changes the working directory for the whole kernel — relative paths used in
# other cells (e.g. 'Error_analysis/Bi-LSTM_predictions.csv') are resolved against
# this directory once the cell has run. Consider adding the checkout to sys.path
# instead; left unchanged here because other cells may rely on the new cwd.
os.chdir('C:/Users/Gebruiker/Documents/UA/Stage/NER-Evaluation/')

from ner_evaluation.ner_eval import collect_named_entities
from ner_evaluation.ner_eval import compute_metrics
from ner_evaluation.ner_eval import compute_precision_recall_wrapper, compute_precision_recall

In [None]:
from copy import deepcopy


def _compute_actual_possible(results):
    """Add the 'actual' and 'possible' totals to one dict of outcome counts.

    Args:
        results: dict holding integer 'correct', 'incorrect', 'partial',
            'missed' and 'spurious' counts. It is updated in place.

    Returns:
        The same dict with two extra keys:
        'actual'   = correct + incorrect + partial + spurious
                     (annotations produced by the system)
        'possible' = correct + incorrect + partial + missed
                     (gold annotations that contribute to the score)
    """
    matched = results['correct'] + results['incorrect'] + results['partial']
    results['actual'] = matched + results['spurious']
    results['possible'] = matched + results['missed']
    return results


def compute_metrics(true_named_entities, pred_named_entities, tags):
    """Count entity-level outcomes under four evaluation schemas.

    Every predicted entity is classified as correct / incorrect / partial /
    spurious against the gold entities, and every gold entity that no
    prediction matched or overlapped is counted as missed. Counts are kept
    separately for the 'strict', 'ent_type', 'partial' and 'exact' schemas.
    See http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/
    for the scenario definitions.

    Args:
        true_named_entities: gold entities; each needs `e_type`,
            `start_offset` and `end_offset` attributes and must support
            equality comparison.
        pred_named_entities: predicted entities with the same attributes.
        tags: entity types to evaluate. Entities whose `e_type` is not listed
            are dropped from both inputs before counting.

    Returns:
        (evaluation, evaluation_agg_entities_type)
        evaluation: {schema: counts} for the four schemas, where counts has
            the keys 'correct', 'incorrect', 'partial', 'missed', 'spurious',
            'precision', 'recall', 'actual' and 'possible'. 'precision' and
            'recall' are left at 0 by this function.
        evaluation_agg_entities_type: {tag: {schema: counts}} with the same
            structure per entity type. A spurious prediction is added to
            EVERY tag, so per-tag sums do not add up to the overall counts.
    """

    eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0}

    # overall results: one independent counter dict per evaluation schema

    evaluation = {
        'strict': deepcopy(eval_metrics),
        'ent_type': deepcopy(eval_metrics),
        'partial': deepcopy(eval_metrics),
        'exact': deepcopy(eval_metrics)
    }

    # results by entity type

    evaluation_agg_entities_type = {e: deepcopy(evaluation) for e in tags}

    # keep track of the true entities that some prediction matched/overlapped

    true_which_overlapped_with_pred = []

    # Subset into only the tags that we are interested in.
    # NOTE: we remove the tags we don't want from both the predicted and the
    # true entities. This covers the two cases where mismatches can occur:
    #
    # 1) Where the model predicts a tag that is not present in the true data
    # 2) Where there is a tag in the true data that the model is not capable of
    # predicting.

    true_named_entities = [ent for ent in true_named_entities if ent.e_type in tags]
    pred_named_entities = [ent for ent in pred_named_entities if ent.e_type in tags]

    # go through each predicted named-entity

    for pred in pred_named_entities:
        found_overlap = False

        # Scenario I: Exact match between true and pred

        if pred in true_named_entities:
            true_which_overlapped_with_pred.append(pred)

            # overall results
            evaluation['strict']['correct'] += 1
            evaluation['ent_type']['correct'] += 1
            evaluation['partial']['correct'] += 1
            evaluation['exact']['correct'] += 1

            # aggregated by entity type results
            evaluation_agg_entities_type[pred.e_type]['strict']['correct'] += 1
            evaluation_agg_entities_type[pred.e_type]['ent_type']['correct'] += 1
            evaluation_agg_entities_type[pred.e_type]['partial']['correct'] += 1
            evaluation_agg_entities_type[pred.e_type]['exact']['correct'] += 1

        else:

            # The predicted span does not depend on the true entity, so it is
            # built once per prediction.
            # NOTE(review): range() excludes end_offset. If offsets are
            # inclusive token indices, a single-token entity yields an empty
            # range and can never overlap -- confirm against
            # collect_named_entities.
            pred_range = range(pred.start_offset, pred.end_offset)

            # check for overlaps with any of the true entities

            for true in true_named_entities:

                true_range = range(true.start_offset, true.end_offset)

                # Scenario IV: Offsets match, but entity type is wrong

                if true.start_offset == pred.start_offset and pred.end_offset == true.end_offset \
                        and true.e_type != pred.e_type:

                    # overall results
                    evaluation['strict']['incorrect'] += 1
                    evaluation['ent_type']['incorrect'] += 1
                    evaluation['partial']['correct'] += 1
                    evaluation['exact']['correct'] += 1

                    # aggregated by entity type results
                    evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1
                    evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1
                    evaluation_agg_entities_type[true.e_type]['partial']['correct'] += 1
                    evaluation_agg_entities_type[true.e_type]['exact']['correct'] += 1

                    true_which_overlapped_with_pred.append(true)
                    found_overlap = True

                    break

                # check for an overlap i.e. not exact boundary match, with true entities

                if find_overlap(true_range, pred_range):

                    true_which_overlapped_with_pred.append(true)
                    found_overlap = True

                    # Scenario V: There is an overlap (but offsets do not match
                    # exactly), and the entity type is the same.

                    if pred.e_type == true.e_type:

                        # overall results
                        evaluation['strict']['incorrect'] += 1
                        evaluation['ent_type']['correct'] += 1
                        evaluation['partial']['partial'] += 1
                        evaluation['exact']['incorrect'] += 1

                        # aggregated by entity type results
                        evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1
                        evaluation_agg_entities_type[true.e_type]['ent_type']['correct'] += 1
                        evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1
                        evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1

                    # Scenario VI: Entities overlap, but the entity type is
                    # different.

                    else:

                        # overall results
                        evaluation['strict']['incorrect'] += 1
                        evaluation['ent_type']['incorrect'] += 1
                        evaluation['partial']['partial'] += 1
                        evaluation['exact']['incorrect'] += 1

                        # aggregated by entity type results, counted against
                        # the true entity's type
                        evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1
                        evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1
                        evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1
                        evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1

                    # only the first overlapping true entity is scored
                    break

            # Scenario II: Entities are spurious (i.e., over-generated).

            if not found_overlap:

                # overall results
                evaluation['strict']['spurious'] += 1
                evaluation['ent_type']['spurious'] += 1
                evaluation['partial']['spurious'] += 1
                evaluation['exact']['spurious'] += 1

                # Aggregated by entity type results

                # NOTE: a spurious prediction is applied to ALL target tags,
                # not only to pred.e_type. This means that the sum of the
                # per-tag results will not equal the overall results.

                for tag in tags:

                    evaluation_agg_entities_type[tag]['strict']['spurious'] += 1
                    evaluation_agg_entities_type[tag]['ent_type']['spurious'] += 1
                    evaluation_agg_entities_type[tag]['partial']['spurious'] += 1
                    evaluation_agg_entities_type[tag]['exact']['spurious'] += 1

    # Scenario III: Entity was missed entirely.

    for true in true_named_entities:
        if true in true_which_overlapped_with_pred:
            continue

        # overall results
        evaluation['strict']['missed'] += 1
        evaluation['ent_type']['missed'] += 1
        evaluation['partial']['missed'] += 1
        evaluation['exact']['missed'] += 1

        # for the agg. by e_type
        evaluation_agg_entities_type[true.e_type]['strict']['missed'] += 1
        evaluation_agg_entities_type[true.e_type]['ent_type']['missed'] += 1
        evaluation_agg_entities_type[true.e_type]['partial']['missed'] += 1
        evaluation_agg_entities_type[true.e_type]['exact']['missed'] += 1

    # Compute 'possible', 'actual' according to SemEval-2013 Task 9.1 on the
    # overall results.

    for eval_type in evaluation:
        evaluation[eval_type] = _compute_actual_possible(evaluation[eval_type])

    # Same totals for every schema of every entity type.

    for entity_type, entity_level in evaluation_agg_entities_type.items():
        for eval_type in entity_level:
            evaluation_agg_entities_type[entity_type][eval_type] = _compute_actual_possible(
                entity_level[eval_type]
            )

    return evaluation, evaluation_agg_entities_type


def find_overlap(true_range, pred_range):
    """Return the values shared by two ranges.

    Both arguments may be any iterable of hashable values (e.g. `range`
    objects of token offsets). The result is a set, which is empty (and
    therefore falsy) when the two ranges have nothing in common.

    Examples:
    >>> find_overlap((1, 2), (2, 3))
    {2}
    >>> find_overlap((1, 2), (3, 4))
    set()
    """

    true_set = set(true_range)
    pred_set = set(pred_range)

    overlaps = true_set.intersection(pred_set)

    return overlaps



In [34]:
# Run the project's collect_named_entities helper over the gold and predicted
# label columns (presumably yielding entity spans -- confirm in ner_eval).
# NOTE: `true` and `pred` were bound to tab-separated strings in earlier cells;
# from here on they hold the return values of collect_named_entities instead,
# so the nervaluate cell can no longer be re-run without rebuilding the strings.
true = collect_named_entities(df.ent)
pred = collect_named_entities(df.prediction)

In [35]:
# Print each key of `ent_agg` with its value, one pair per line.
# NOTE(review): `ent_agg` is not assigned in any cell visible in this part of the
# notebook -- presumably it is the second return value of a compute_metrics(...)
# call made in a cell that was deleted or executed out of order. Unless it is
# defined elsewhere, a fresh "Restart & Run All" fails here with NameError.
for key, value in ent_agg.items():
    print(key, value)

LOC {'strict': {'correct': 45, 'incorrect': 5, 'partial': 0, 'missed': 35, 'spurious': 56, 'precision': 0, 'recall': 0, 'actual': 106, 'possible': 85}, 'ent_type': {'correct': 45, 'incorrect': 5, 'partial': 0, 'missed': 35, 'spurious': 56, 'precision': 0, 'recall': 0, 'actual': 106, 'possible': 85}, 'partial': {'correct': 50, 'incorrect': 0, 'partial': 0, 'missed': 35, 'spurious': 56, 'precision': 0, 'recall': 0, 'actual': 106, 'possible': 85}, 'exact': {'correct': 50, 'incorrect': 0, 'partial': 0, 'missed': 35, 'spurious': 56, 'precision': 0, 'recall': 0, 'actual': 106, 'possible': 85}}
PERS {'strict': {'correct': 157, 'incorrect': 35, 'partial': 0, 'missed': 62, 'spurious': 56, 'precision': 0, 'recall': 0, 'actual': 248, 'possible': 254}, 'ent_type': {'correct': 191, 'incorrect': 1, 'partial': 0, 'missed': 62, 'spurious': 56, 'precision': 0, 'recall': 0, 'actual': 248, 'possible': 254}, 'partial': {'correct': 158, 'incorrect': 0, 'partial': 34, 'missed': 62, 'spurious': 56, 'precisio