In [1]:
import os

# Process-wide settings, written to the environment before flair is imported
# further down in this cell.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Enumerate GPUs in PCI bus order and expose only device index 1 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Weights & Biases project name. NOTE(review): no W&B call is visible in this
# section, so this is presumably carried over from the training setup — confirm.
os.environ["WANDB_PROJECT"] = "mscft_ner"

from flair.models import SequenceTagger

In [2]:
# Checkpoint to evaluate. Alternative identifier the author noted alongside it:
# "urchade/gliner_small-v2.1".
PATH = "flair_finetuning/debertav3/best-model.pt"

# Load the tagger, then move it onto the CUDA device as a separate step.
model = SequenceTagger.load(PATH)
model = model.cuda()

2024-10-10 13:33:29,566 SequenceTagger predicts: Dictionary with 5 tags: O, B-LOC, I-LOC, <START>, <STOP>


In [3]:
import json
import pandas as pd

# Load the cleaned test split. The context manager closes the file handle
# deterministically (a bare open() leaves that to the garbage collector), and
# the encoding is pinned so the read does not depend on the platform default.
with open("data/accepted_data/TestCleaned.json", encoding="utf-8") as test_file:
    raw_test = json.load(test_file)

# Re-join every record's token list into a single space-separated string.
texts = [" ".join(record["tokenized_text"]) for record in raw_test]

def get_expected(raw):
    """Build the reference location string for one test record.

    Every entry of ``raw["ner"]`` is read as token indices ``(start, end, ...)``
    with an inclusive ``end``; the matching slice of ``raw["tokenized_text"]``
    is joined with spaces to form the entity text.

    Returns the entity texts sorted and joined by single spaces, or an empty
    string when the record has no entities.
    """
    mentions = []
    for span in raw["ner"]:
        first, last = span[0], span[1]
        # `last` is inclusive, hence the + 1 on the slice bound.
        mentions.append(" ".join(raw["tokenized_text"][first:last + 1]))
    mentions.sort()
    return " ".join(mentions)
# Reference location string for every record, aligned index-for-index with `texts`.
ners = [get_expected(record) for record in raw_test]

# One row per test record: the joined text and its expected locations.
test_data = pd.DataFrame({"text": texts, "location": ners})
print(len(test_data))

# Fixed seed so the previewed rows are identical on every Restart & Run All.
test_data.sample(5, random_state=42)

1645


Unnamed: 0,text,location
616,And we know nothing about the damage caused by...,
1480,". @ SBSbroadcasting sends 100 , 000 pounds of ...",Puerto Rico
29,# CycloneIdai update : we aim to reach + 500k ...,Malawi Mozambique Zimbabwe
103,Rush lacrosse club raising $ 50K for Fort McMu...,Fort McMurray
34,Any donation to @ topos helps the victims of t...,Mexico


In [4]:
from flair.data import Sentence

# Wrap every raw string in a flair Sentence. Note that `texts` is rebound here
# from a list of str to a list of Sentence objects.
# NOTE(review): Sentence() tokenizes the joined string itself, which may not
# reproduce the dataset's own `tokenized_text` split — confirm this matches how
# the model was trained.
texts = list(map(Sentence, test_data["text"].tolist()))

# Tag a single sentence first (presumably a quick check before the full run).
model.predict(texts[0])

In [5]:
# Tag every sentence; the predictions are read back from the Sentence objects
# in later cells via get_spans("ner"). verbose=True shows a progress bar.
model.predict(texts, verbose=True)

Batch inference: 100%|██████████| 52/52 [00:23<00:00,  2.19it/s]


In [6]:
# Spot-check the tagging result of one arbitrary sentence.
texts[10]

Sentence[52]: "Kerala Govt has opened a new website to co - ordinate the flood relief efforts . Kindly visit to 1 . Request for help 2 . District Needs 3 . To Contribute 4 . Register as a Volunteer 5 . Contact different camps 6 . To know d registered requests # KeralaFloods2018" → ["Kerala"/LOC]

In [7]:
# Prototype on a single sentence: turn every LOC span into a plain dict holding
# its tag, surface text, start/end positions and confidence score.
pred = [
    {
        "entity": span.tag,
        "word": span.text,
        "start": span.start_position,
        "end": span.end_position,
        "probs": span.score,
    }
    for span in texts[10].get_spans("ner")
    if span.tag == "LOC"
]

In [8]:
# Display the records extracted for the single prototype sentence.
pred

[{'entity': 'LOC',
  'word': 'Kerala',
  'start': 0,
  'end': 6,
  'probs': 0.9961767196655273}]

In [9]:
def _loc_records(sentence):
    """Return one dict per LOC span predicted on ``sentence``.

    Each dict stores the span's tag, text, start/end positions and score under
    the keys ``entity``, ``word``, ``start``, ``end`` and ``probs``.
    """
    return [
        {
            "entity": span.tag,
            "word": span.text,
            "start": span.start_position,
            "end": span.end_position,
            "probs": span.score,
        }
        for span in sentence.get_spans("ner")
        if span.tag == "LOC"
    ]


# One list of LOC records per sentence, in the same order as `texts`.
# The (misspelled) name `preditions` is kept because later cells reference it.
preditions = [_loc_records(sentence) for sentence in texts]

In [10]:
# Spot-check the extracted records of one arbitrary sentence.
preditions[101]

[{'entity': 'LOC',
  'word': 'Clarence River',
  'start': 37,
  'end': 51,
  'probs': 0.9953771531581879}]

In [11]:
# Flatten each sentence's predictions into one string: predicted words sorted
# and space-joined, with "@" standing in when nothing was predicted.
raws = []
for sentence_preds in preditions:
    sorted_words = sorted(item["word"] for item in sentence_preds)
    raws.append(" ".join(sorted_words) or "@")

raws[25]

'Ellicott City Maryland Maryland'

In [12]:
# Number of test records whose expected location string is empty.
test_data["location"].eq("").sum()

461

In [15]:
# Build the reference strings without mutating `test_data`, so that earlier
# cells which inspect the empty locations give the same answer when re-run.
# Empty strings become "@", the same placeholder `raws` uses for "no prediction".
expected = test_data["location"].replace("", "@").tolist()

expected[25]

'Ellicott City Maryland Maryland'

In [16]:
from evaluate import load

# Load the "wer" metric through the `evaluate` library's loader.
wer = load("wer")

# Score the flattened prediction strings against the reference strings
# (an error rate: the threshold search further down minimises it).
wer.compute(predictions=raws, references=expected)

0.16406547392462886

In [17]:
# Confidence score of every predicted LOC span, flattened across all sentences.
probs = [span["probs"] for spans in preditions for span in spans]

# Summary statistics of the score distribution.
pd.Series(probs).describe()

count    1815.000000
mean        0.975094
std         0.073932
min         0.391826
25%         0.991646
50%         0.997539
75%         0.998732
max         0.999396
dtype: float64

In [18]:
import numpy as np

def drop_duplicate(places):
    """Return the distinct items of ``places`` as a sorted list."""
    unique_places = set(places)
    return sorted(unique_places)

def compute_score(thr: float = .5):
    """Return the WER of the predictions after dropping low-confidence spans.

    For every sentence, only predicted words whose ``probs`` is at least
    ``thr`` are kept; they are passed through ``drop_duplicate``, joined with
    spaces, and replaced by "@" when nothing survives. The resulting strings
    are scored against the module-level ``expected`` references.
    """
    filtered = []
    for spans in preditions:
        kept = drop_duplicate(item["word"] for item in spans if item["probs"] >= thr)
        filtered.append(" ".join(kept) or "@")
    return wer.compute(predictions=filtered, references=expected)

# Sweep 50 evenly spaced thresholds in [0, 1], printing each new best
# (lowest) score together with the threshold that produced it.
best_score, best_thr = 1000, 0
for thr in np.linspace(0.0, 1, 50):
    score = compute_score(thr)
    if not score < best_score:
        continue
    best_score, best_thr = score, thr
    print(score, thr)

0.18880852683669586 0.0
0.1884278644842025 0.4081632653061224
0.18804720213170917 0.44897959183673464
0.18652455272173582 0.5306122448979591
0.18614389036924248 0.5510204081632653
0.18500190331176247 0.5714285714285714
0.18462124095926913 0.5918367346938775
0.18424057860677578 0.6122448979591836


* 0.1727126805778491 0.1
* 0.1723916532905297 0.4724137931034482
* 0.17207062600321027 0.5344827586206896
* 0.17142857142857143 0.5655172413793104
* 0.1707865168539326 0.596551724137931
* 0.17046548956661317 0.6586206896551724
* 0.1682182985553772 0.689655172413793
* 0.1666131621187801 0.7206896551724138
* 0.16565008025682182 0.8448275862068965

In [19]:
import numpy as np

def drop_duplicate(places):
    """Return the items of ``places`` as a sorted list, keeping repeats.

    NOTE(review): despite its name this version removes nothing. It rebinds
    the ``drop_duplicate`` defined in the previous cell, so ``compute_score``
    is evaluated here with duplicate words retained.
    """
    ordered = list(places)
    ordered.sort()
    return ordered

def compute_score(thr: float = .5):
    """Return the WER of the predictions restricted to confident spans.

    A predicted word is kept when its ``probs`` is at least ``thr``. Each
    sentence's kept words go through ``drop_duplicate`` and are space-joined;
    a sentence with no kept words is represented by "@". The strings are then
    scored against the module-level ``expected`` references.
    """
    def flatten(spans):
        confident = (item["word"] for item in spans if item["probs"] >= thr)
        return " ".join(drop_duplicate(confident)) or "@"

    flattened = [flatten(spans) for spans in preditions]
    return wer.compute(predictions=flattened, references=expected)

# Repeat the threshold sweep with the duplicate-keeping `drop_duplicate`:
# 50 evenly spaced thresholds in [0, 1], printing each new lowest score.
best_score, best_thr = 1000, 0
for thr in np.linspace(0.0, 1, 50):
    score = compute_score(thr)
    improved = score < best_score
    if improved:
        best_score, best_thr = score, thr
        print(score, thr)

0.16406547392462886 0.0
0.16368481157213552 0.4081632653061224
0.16330414921964218 0.44897959183673464
0.16178149980966883 0.5306122448979591
0.16102017510468214 0.5510204081632653
0.1594975256947088 0.5714285714285714
0.15911686334221545 0.5918367346938775
0.1583555386372288 0.6122448979591836
