In [1]:
import os

# Process-wide settings, applied before the pandas/flair imports further down:
# tokenizer parallelism is switched on, and CUDA is told to order devices by
# PCI bus id and to expose only device "1".
os.environ.update(
	{
		"TOKENIZERS_PARALLELISM": "true",
		"CUDA_DEVICE_ORDER": "PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES": "1",
	}
)

import pandas as pd
from flair.models import SequenceTagger

In [2]:
# Run identifier; it is interpolated into the submission file name later on.
task_id = "27"

# Minimum span score a predicted location needs in order to be kept.
thr = 0.61

# Checkpoint of the fine-tuned flair sequence tagger.
PATH = "flair_finetuning/debertav3/best-model.pt"

# Load the tagger, then move it onto the GPU.
model = SequenceTagger.load(PATH)
model = model.cuda()

2024-10-10 14:01:40,476 SequenceTagger predicts: Dictionary with 5 tags: O, B-LOC, I-LOC, <START>, <STOP>


In [3]:
import re

def split_and_clean(sentence: str):
	"""Tokenize ``sentence`` into word runs and single punctuation marks.

	Every maximal run of word characters becomes one token, every other
	non-whitespace character becomes a token of its own, and whitespace is
	discarded. Returns the tokens as a list, in order of appearance.
	"""
	return [match.group(0) for match in re.finditer(r"[\w]+|[^\s\w]", sentence)]

def normalize_word(word: str):
	"""Return ``word`` without any leading ``#`` / ``@`` characters.

	Only the prefix is stripped; the same characters appearing later in the
	word are left untouched.
	"""
	prefix_len = 0
	# Advance past every leading marker character, in whatever order they occur.
	while prefix_len < len(word) and word[prefix_len] in ("#", "@"):
		prefix_len += 1
	return word[prefix_len:]

In [4]:
# Sanity check of the tokenizer: the mention "@t" is split into "@" and "t".
split_and_clean("EVERYONE wants to help people impacted by @t")

['EVERYONE', 'wants', 'to', 'help', 'people', 'impacted', 'by', '@', 't']

In [5]:
# Read the test tweets and replace missing values with empty strings so that
# every "text" entry can be tokenized.
test_data = pd.read_csv(
	"microsoft-learn-location-mention-recognition-challenge20240905-10153-193u9hv/Test.csv"
).fillna("")

# Re-tokenize each tweet and join the tokens with single spaces, so punctuation
# ends up separated from the surrounding words.
test_data["text"] = test_data["text"].apply(
	lambda tweet: " ".join(split_and_clean(tweet))
)

test_data.sample(10)

Unnamed: 0,tweet_id,text
349,ID_1032595530776342528,"Subcommittee directed by # SC suggests that , ..."
1906,ID_783586053215051776,Maryland Group Sending Hurricane Relief Suppli...
2045,ID_797805580299763712,RT @ ForaDeControlee : # Tsunami # warning # i...
1198,ID_1167778641901686784,See bottom right hand corner picture . That is...
2604,ID_910681795225575424,if youre not from Mexico and want to help ! ! !
1317,ID_1168386123267268608,ὓ4 METAR with 360 km / h gusting forecast at G...
765,ID_1065651388401819648,"My , what a juicy Ohnut . Margaret really outd..."
1902,ID_783539813463027712,WowHurricane Matthews damage in Haiti really m...
882,ID_1108967792890413056,Deeply saddened by the devastation caused by #...
1525,ID_721832188313006080,Devastated victims of Ecuador earthquake beg f...


In [6]:
from tqdm import tqdm

from flair.data import Sentence

# Wrap every cleaned tweet in a flair Sentence.
texts = list(map(Sentence, test_data["text"].tolist()))

# Run the tagger over all sentences; the predicted spans are read back from
# these same Sentence objects in the next cell.
model.predict(texts, verbose=True)
len(texts)

Batch inference: 100%|██████████| 92/92 [00:44<00:00,  2.08it/s]


2942

In [7]:
# One list per sentence, holding a plain dict (tag, surface text, start/end
# positions and score) for every predicted span tagged LOC.
preditions = [
	[
		{
			"entity": span.tag,
			"word": span.text,
			"start": span.start_position,
			"end": span.end_position,
			"probs": span.score,
		}
		for span in sentence.get_spans("ner")
		if span.tag == "LOC"
	]
	for sentence in texts
]

In [8]:
def drop_duplicate(places):
    """Return the distinct items of ``places`` as a sorted list.

    ``places`` may be any iterable (including a generator) of hashable,
    mutually comparable items.
    """
    unique_places = list(set(places))
    unique_places.sort()
    return unique_places

# One submission string per tweet: the distinct location mentions whose score
# reaches `thr`, sorted and space-joined. A tweet with no surviving mention
# gets a single space instead of an empty string.
raws = []
for pred in preditions:
	kept = drop_duplicate(item["word"] for item in pred if item["probs"] >= thr)
	raws.append(" ".join(kept) or " ")

In [9]:
# Build the submission with .assign(), which returns a brand-new frame.
# Writing the column into the test_data[["tweet_id"]] slice directly makes
# pandas emit SettingWithCopyWarning.
submission = test_data[["tweet_id"]].assign(location=raws)

submission.sample(10)

Unnamed: 0,tweet_id,location
573,ID_1042058989045993472,NC New Bern
2795,ID_912963737363021824,Mexico Oaxaca
1983,ID_783882565136490496,Haiti
752,ID_1065398080785199104,California
2124,ID_798007786822979584,NZ NewZealand
1076,ID_1111813790973263872,Nebraska
429,ID_1039405953244954624,Miamis
1957,ID_783820755280605184,Haiti
592,ID_1061325222140231680,California
1632,ID_728534356034084864,Fort McMurray


In [10]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs("submissions", exist_ok=True)
submission.to_csv(f"submissions/{task_id}-FlairSplit.csv", index=False)

In [11]:
# NOTE(review): this displays a path ending in "-Flair.csv", whereas the file
# written by the to_csv call ends in "-FlairSplit.csv" — confirm which name is
# intended before relying on this value.
f"submissions/{task_id}-Flair.csv"

'submissions/27-Flair.csv'