In [1]:
import json, os
from glob import glob

In [2]:
def evaluate_ner(true_entities, predicted_entities):
	"""Score predicted entities against gold entities by exact match.

	An entity counts as correct only when its start offset, end offset and
	type/label all agree. Both sides are reduced to sets first, so repeated
	identical entities are counted once.

	Args:
		true_entities: iterable of dicts with 'pos' (indexable; items 0 and 1
			are used as start and end) and 'type'.
		predicted_entities: iterable of dicts with 'start', 'end' and 'label'.

	Returns:
		Dict with keys "Precision", "Recall" and "F1-score". A metric whose
		denominator would be zero is reported as 0.
	"""
	gold_spans = set()
	for gold in true_entities:
		gold_spans.add((gold['pos'][0], gold['pos'][1], gold['type']))

	guessed_spans = set()
	for guess in predicted_entities:
		guessed_spans.add((guess['start'], guess['end'], guess['label']))

	hits = len(gold_spans.intersection(guessed_spans))    # in both sets
	spurious = len(guessed_spans.difference(gold_spans))  # predicted only
	missed = len(gold_spans.difference(guessed_spans))    # gold only

	if (hits + spurious) > 0:
		precision = hits / (hits + spurious)
	else:
		precision = 0

	if (hits + missed) > 0:
		recall = hits / (hits + missed)
	else:
		recall = 0

	if (precision + recall) > 0:
		f1_score = 2 * (precision * recall) / (precision + recall)
	else:
		f1_score = 0

	return {"Precision": precision, "Recall": recall, "F1-score": f1_score}


In [3]:
# Model predictions and gold annotations for one dataset (bc5cdr).
pred_file = "data/NEREvals/bc5cdr.json"
exp_file = "data/IE_INSTRUCTIONS/NER/bc5cdr/test.json"

# Context managers close the files promptly; JSON is read as UTF-8 explicitly
# so the result does not depend on the platform's default encoding.
with open(pred_file, encoding="utf-8") as f:
	predictions = json.load(f)
with open(exp_file, encoding="utf-8") as f:
	expected = json.load(f)

# Sanity check: both lists should have the same length, since later cells pair
# them up by position.
len(predictions), len(expected)

(4797, 4797)

In [4]:
# Display the first entry of the loaded predictions to inspect its record layout.
predictions[0]

[{'start': 19,
  'end': 42,
  'text': 'ventricular tachycardia',
  'label': 'Disease',
  'score': 0.36110201478004456},
 {'start': 72,
  'end': 82,
  'text': 'dobutamine',
  'label': 'Chemical',
  'score': 0.8203075528144836},
 {'start': 111,
  'end': 133,
  'text': 'dilated cardiomyopathy',
  'label': 'Disease',
  'score': 0.9815657138824463},
 {'start': 138,
  'end': 162,
  'text': 'congestive heart failure',
  'label': 'Disease',
  'score': 0.9569717645645142}]

In [5]:
# Display the gold entities of the first entry, for comparison with predictions[0].
expected[0]["entities"]

[{'name': 'Torsade de pointes', 'type': 'Disease', 'pos': [0, 18]},
 {'name': 'ventricular tachycardia', 'type': 'Disease', 'pos': [19, 42]},
 {'name': 'dobutamine', 'type': 'Chemical', 'pos': [72, 82]},
 {'name': 'dilated cardiomyopathy', 'type': 'Disease', 'pos': [111, 133]},
 {'name': 'congestive heart failure', 'type': 'Disease', 'pos': [138, 162]}]

In [6]:
# Confidence cut-offs 0.1, 0.2, ..., 0.9.
thrs = [tenth / 10 for tenth in range(1, 10)]

# For each cut-off, score every document separately, keeping only the
# predictions whose confidence is strictly above the cut-off. Documents and
# predictions are paired by position.
scores = {
	cutoff: [
		evaluate_ner(
			gold["entities"],
			[ent for ent in guesses if ent["score"] > cutoff],
		)
		for gold, guesses in zip(expected, predictions)
	]
	for cutoff in thrs
}

In [7]:
import pandas as pd

In [8]:
# Average the per-document metrics at each threshold and display the result.
{
	cutoff: pd.DataFrame(per_doc).mean()
	for cutoff, per_doc in scores.items()
}

{0.1: Precision    0.532425
 Recall       0.594687
 F1-score     0.546447
 dtype: float64,
 0.2: Precision    0.552919
 Recall       0.581224
 F1-score     0.551955
 dtype: float64,
 0.3: Precision    0.563158
 Recall       0.567636
 F1-score     0.550225
 dtype: float64,
 0.4: Precision    0.568285
 Recall       0.553066
 F1-score     0.544819
 dtype: float64,
 0.5: Precision    0.570473
 Recall       0.531965
 F1-score     0.534046
 dtype: float64,
 0.6: Precision    0.570928
 Recall       0.506465
 F1-score     0.519089
 dtype: float64,
 0.7: Precision    0.561612
 Recall       0.466462
 F1-score     0.490973
 dtype: float64,
 0.8: Precision    0.534985
 Recall       0.404136
 F1-score     0.440472
 dtype: float64,
 0.9: Precision    0.419705
 Recall       0.272395
 F1-score     0.313661
 dtype: float64}

In [9]:
# One column per threshold, one row per metric; each cell is the mean of the
# per-document values.
final_scores = pd.DataFrame(
	{
		cutoff: pd.DataFrame(per_doc).mean()
		for cutoff, per_doc in scores.items()
	}
)
final_scores

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
Precision,0.532425,0.552919,0.563158,0.568285,0.570473,0.570928,0.561612,0.534985,0.419705
Recall,0.594687,0.581224,0.567636,0.553066,0.531965,0.506465,0.466462,0.404136,0.272395
F1-score,0.546447,0.551955,0.550225,0.544819,0.534046,0.519089,0.490973,0.440472,0.313661


In [10]:
from glob import glob

# Each prediction file is named after its dataset, and that name also selects
# the gold-annotation directory. Sorted because glob returns files in arbitrary
# order; sorting makes the result order reproducible.
pred_files = sorted(glob("data/NEREvals/*.json"))

all_scores = {}

# Confidence cut-offs 0.1 ... 0.9; the same for every dataset, so built once.
thrs = [
	i / 10 for i in range(1, 10)
]

for pred_file in pred_files:
	# Strip only the file extension, so a dataset name that itself contains
	# ".json" is left intact.
	name = os.path.splitext(os.path.basename(pred_file))[0]
	exp_file = f"data/IE_INSTRUCTIONS/NER/{name}/test.json"

	# Context managers close the files promptly; JSON is read as UTF-8
	# explicitly rather than with the platform default encoding.
	with open(pred_file, encoding="utf-8") as f:
		predictions = json.load(f)
	with open(exp_file, encoding="utf-8") as f:
		expected = json.load(f)

	# Documents are paired with predictions by position, and zip() would
	# silently drop the unmatched tail, so a length mismatch must be an error.
	if len(predictions) != len(expected):
		raise ValueError(
			f"{name}: {len(predictions)} prediction entries but "
			f"{len(expected)} expected entries"
		)

	# Per-document metrics at each cut-off, keeping only predictions whose
	# confidence is strictly above it.
	scores = {
		i: [
			evaluate_ner(k1["entities"], [n for n in k2 if n["score"] > i]) for k1, k2 in zip(expected, predictions)
		] for i in thrs
	}

	# Mean over documents: columns are thresholds, rows are metrics.
	final_scores = pd.DataFrame({
		k: pd.DataFrame(score).mean() for k, score in scores.items()
	})
	all_scores[name] = final_scores.to_dict()

In [11]:
# Number of entries in all_scores (one per dataset name).
len(all_scores)

31

In [12]:
# Spot-check the stored result for a single dataset.
all_scores["ACE 2004"]

{0.1: {'Precision': 0.3112232014017728,
  'Recall': 0.28590096616574445,
  'F1-score': 0.2840903811115719},
 0.2: {'Precision': 0.33739319656807343,
  'Recall': 0.27812621833557793,
  'F1-score': 0.291823340869098},
 0.3: {'Precision': 0.35093781765579796,
  'Recall': 0.2651394719128709,
  'F1-score': 0.2894997544368719},
 0.4: {'Precision': 0.3559235475799515,
  'Recall': 0.24991496256890344,
  'F1-score': 0.2808608478123607},
 0.5: {'Precision': 0.3558707678473689,
  'Recall': 0.23425607301838827,
  'F1-score': 0.26844120811592104},
 0.6: {'Precision': 0.3534385018375166,
  'Recall': 0.21794224863806144,
  'F1-score': 0.25515812593581483},
 0.7: {'Precision': 0.3509974392055673,
  'Recall': 0.20032163717508544,
  'F1-score': 0.23984668679541307},
 0.8: {'Precision': 0.33739492923606224,
  'Recall': 0.17140167577359694,
  'F1-score': 0.21263185367142315},
 0.9: {'Precision': 0.2996100164203612,
  'Recall': 0.12692366131405539,
  'F1-score': 0.1644231897002833}}

In [13]:
# Create the output directory if it is missing, so that running the notebook
# in a fresh checkout does not fail with FileNotFoundError.
os.makedirs("data/Results", exist_ok=True)

# Persist the collected scores as JSON.
with open("data/Results/gliner-med.json", "w") as f:
    json.dump(all_scores, f)