In [1]:
import os

# Process-wide settings, assigned before torch / gliner are imported later in this cell.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Number GPUs by PCI bus id and restrict this process to the device listed as "1".
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Weights & Biases project name; the two optional W&B settings below are left disabled.
os.environ["WANDB_PROJECT"] = "mscft_ner"
# os.environ["WANDB_LOG_MODEL"] = "true"
# os.environ["WANDB_WATCH"] = "none"

import json
import random
from glob import glob

import pandas as pd
import torch
from gliner import GLiNER

In [2]:
def select_checkpoints(checkpoint_dirs, max_eval_loss):
    """Return the checkpoint directories whose latest eval loss is low enough.

    Args:
        checkpoint_dirs: Iterable of directory paths, each expected to contain
            a ``trainer_state.json`` file with a ``"log_history"`` list.
        max_eval_loss: Inclusive upper bound on the most recent ``"eval_loss"``.

    Returns:
        The directories (in input order) whose most recent log entry carrying
        an ``"eval_loss"`` key has a value ``<= max_eval_loss``. Directories
        whose log history contains no such entry are skipped.

    Raises:
        FileNotFoundError: If a directory has no ``trainer_state.json``.
    """
    selected = []
    for checkpoint_dir in checkpoint_dirs:
        state_file = os.path.join(checkpoint_dir, "trainer_state.json")
        # Context manager so the file handle is closed right after parsing.
        with open(state_file, "r") as handle:
            log_history = json.load(handle)["log_history"]

        # Walk the log backwards to find the most recent evaluation entry.
        last_eval = next(
            (entry for entry in reversed(log_history) if "eval_loss" in entry),
            None,
        )
        if last_eval is None:
            # No evaluation was logged for this checkpoint: nothing to compare.
            continue
        if last_eval["eval_loss"] <= max_eval_loss:
            selected.append(checkpoint_dir)
    return selected


# glob returns matches in arbitrary order; sort so the selection is reproducible.
models_path = sorted(glob("models/checkpoint-*"))

loss_max = 21.5
paths = select_checkpoints(models_path, loss_max)

paths

['models/checkpoint-1150', 'models/checkpoint-1840', 'models/checkpoint-920']

In [3]:
# paths = [
#     "models/checkpoint-920",
#     "models/checkpoint-1150",
#     "models/checkpoint-1380",
#     "models/checkpoint-1840",
#     "models/checkpoint-2070",
#     "models/checkpoint-2315",
#     "models/checkpoint-2300"
# ]

# Load each selected checkpoint and move the resulting model to the GPU.
models = []
for checkpoint_dir in paths:
    models.append(GLiNER.from_pretrained(checkpoint_dir).cuda())

config.json not found in /data/home/eak/learning/zindi_challenge/micro_rec/models/checkpoint-1150
config.json not found in /data/home/eak/learning/zindi_challenge/micro_rec/models/checkpoint-1840
config.json not found in /data/home/eak/learning/zindi_challenge/micro_rec/models/checkpoint-920


In [4]:
# Read the cleaned test split. The context manager closes the file handle as
# soon as parsing is done, and the explicit encoding keeps decoding independent
# of the platform default.
with open("data/accepted_data/TestCleaned.json", encoding="utf-8") as test_file:
    raw_test = json.load(test_file)

# Overwrite the last field of every gold span with the single label used here.
location_name = "disaster related location"
for example in raw_test:
    for span in example["ner"]:
        span[-1] = location_name

# Rebuild each text by joining its tokens with single spaces.
texts = [" ".join(example["tokenized_text"]) for example in raw_test]

def get_expected(raw):
    """Build the reference string for one example.

    Args:
        raw: Mapping with a "tokenized_text" token list and a "ner" list whose
            items start with (first token index, last token index).

    Returns:
        The space-joined text of every span, sorted as strings and joined with
        single spaces (an empty string when there are no spans).
    """
    span_texts = []
    for span in raw["ner"]:
        first, last = span[0], span[1]
        # The end index is inclusive, hence the +1 on the slice bound.
        span_texts.append(" ".join(raw["tokenized_text"][first: last + 1]))
    span_texts.sort()
    return " ".join(span_texts)
# Reference string for every test example, in the same order as `texts`.
ners = list(map(get_expected, raw_test))

# Pair each text with its reference string in a two-column frame.
rows = list(zip(texts, ners))
test_data = pd.DataFrame(rows, columns=["text", "location"])
print(len(test_data))

test_data.sample(5)

1645


Unnamed: 0,text,location
748,Hurricane Matthew kills 10 across Caribbean ; ...,
937,HurricaneHarvey spun deeper into Texas unloadi...,Texas
1290,"Ecuador earthquake : 10 , 000 troops deployed ...",Ecuador
887,The track of Hurricane # Florence may have shi...,VA Virginia
907,Foreign Aid Received by India in the Last Ten ...,India


In [5]:
from tqdm import tqdm

label_name = ["disaster related location"]


def make_predictions(batch: list[str]):
    """Run every model in `models` on `batch` and pool the results per text.

    Args:
        batch: Texts to run through each model.

    Returns:
        One list per position of the per-model outputs, holding the
        concatenation of what each model returned at that position, in model
        order. Assumes `batch_predict_entities` yields one list of entities
        per input text -- TODO confirm against the gliner documentation.
    """
    per_model = [
        model.batch_predict_entities(batch, label_name, threshold=0.05)
        for model in models
    ]

    pooled = []
    # zip(*per_model) lines up the outputs of all models for the same position.
    for outputs_for_text in zip(*per_model):
        merged = []
        for entities in outputs_for_text:
            merged = merged + entities
        pooled.append(merged)
    return pooled


# Take the texts back out of the frame and predict in fixed-size batches.
texts = test_data["text"].tolist()
bsize = 256

batch_starts = range(0, len(texts), bsize)
predictions = []
for start in tqdm(batch_starts):
    predictions.append(make_predictions(texts[start : start + bsize]))

len(predictions)

  0%|          | 0/7 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 7/7 [00:26<00:00,  3.83s/it]


7

In [6]:
thr = .05

# Un-batch: one entity list per text, in the order the texts were predicted.
# A comprehension is used rather than sum(..., start=[]), which re-copies the
# growing list on every addition.
per_text_predictions = [
    entities for batch in predictions for entities in batch
]

# Single pass: keep entities scoring at least `thr`, tag each with the position
# of its text (raw_id), and collect them into one flat list.
all_predictions = []
for raw_id, entities in enumerate(per_text_predictions):
    for entity in entities:
        if entity["score"] >= thr:
            entity["raw_id"] = raw_id
            all_predictions.append(entity)

all_predictions[:5]

[{'start': 9,
  'end': 27,
  'text': 'Wright - Patterson',
  'label': 'disaster related location',
  'score': 0.1267768144607544,
  'raw_id': 0},
 {'start': 60,
  'end': 65,
  'text': 'Texas',
  'label': 'disaster related location',
  'score': 0.9987847208976746,
  'raw_id': 0},
 {'start': 9,
  'end': 27,
  'text': 'Wright - Patterson',
  'label': 'disaster related location',
  'score': 0.13000044226646423,
  'raw_id': 0},
 {'start': 60,
  'end': 65,
  'text': 'Texas',
  'label': 'disaster related location',
  'score': 0.9987105131149292,
  'raw_id': 0},
 {'start': 9,
  'end': 27,
  'text': 'Wright - Patterson',
  'label': 'disaster related location',
  'score': 0.2057553082704544,
  'raw_id': 0}]

In [7]:
# One row per kept entity prediction, built from the flat list of entity dicts.
preds = pd.DataFrame(all_predictions)

# Random peek at the frame; no random_state is passed, so rows differ between runs.
preds.sample(10)

Unnamed: 0,start,end,text,label,score,raw_id
1010,28,36,Florence,disaster related location,0.537928,238
2531,54,62,Florence,disaster related location,0.536101,630
413,37,45,Carolina,disaster related location,0.856449,100
4072,2,7,Haiti,disaster related location,0.999413,997
3611,82,88,Mexico,disaster related location,0.996668,885
1577,0,14,Bonita Springs,disaster related location,0.912909,390
3546,90,103,Fort McMurray,disaster related location,0.850488,869
3575,0,5,JAMMU,disaster related location,0.933096,877
180,98,106,SriLanka,disaster related location,0.997957,38
4382,33,50,Palm Beach County,disaster related location,0.83572,1085


In [8]:
# Number of predictions per label value.
preds["label"].value_counts()

label
disaster related location    6543
Name: count, dtype: int64

In [9]:
def create_predictions(raw: pd.DataFrame):
    """Deduplicate the entity rows of one frame and return them ordered by text.

    Args:
        raw: Frame in which each row is one predicted entity; the columns
            "label", "text" and "score" are read.

    Returns:
        A list of row dicts (from ``to_dict("records")``) with one entry per
        distinct (label, text) pair: the row with the highest score, the
        earliest row winning ties. The list is sorted by the "text" value.
    """
    best_by_key = {}
    for candidate in raw.to_dict("records"):
        key = (candidate['label'], candidate['text'])
        incumbent = best_by_key.get(key)
        # Strict comparison: an equal score never displaces the row seen first.
        if incumbent is None or candidate['score'] > incumbent['score']:
            best_by_key[key] = candidate

    return sorted(best_by_key.values(), key=lambda entity: entity['text'])

# Run create_predictions once per raw_id group; the result is indexed by raw_id
# and holds one deduplicated, text-sorted list of entity dicts per group.
structured_preds = preds.groupby("raw_id")[preds.columns].apply(create_predictions)

In [10]:
# Random peek at the grouped predictions (unseeded, so it varies between runs).
structured_preds.sample(10)

raw_id
729     [{'start': 0, 'end': 6, 'text': 'Punjab', 'lab...
285     [{'start': 79, 'end': 84, 'text': 'Miami', 'la...
1124    [{'start': 0, 'end': 11, 'text': 'New Zealand'...
115     [{'start': 83, 'end': 92, 'text': 'Aranayake',...
1492    [{'start': 105, 'end': 107, 'text': 'NC', 'lab...
746     [{'start': 23, 'end': 33, 'text': 'SouthTexas'...
1416    [{'start': 39, 'end': 46, 'text': 'Ecuador', '...
270     [{'start': 57, 'end': 59, 'text': 'NC', 'label...
523     [{'start': 58, 'end': 68, 'text': 'California'...
687     [{'start': 0, 'end': 10, 'text': 'California',...
dtype: object

In [11]:
# Inspect the grouped predictions stored under raw_id 0.
structured_preds.loc[0]

[{'start': 60,
  'end': 65,
  'text': 'Texas',
  'label': 'disaster related location',
  'score': 0.9987847208976746,
  'raw_id': 0},
 {'start': 9,
  'end': 27,
  'text': 'Wright - Patterson',
  'label': 'disaster related location',
  'score': 0.2057553082704544,
  'raw_id': 0}]

In [12]:
# Inspect the grouped predictions stored under raw_id 912.
structured_preds.loc[912]

[{'start': 148,
  'end': 158,
  'text': 'Mozambique',
  'label': 'disaster related location',
  'score': 0.9995207786560059,
  'raw_id': 912}]

In [13]:
import numpy as np

# Give the frame a fresh positional index, dropping the old one.
test_data.reset_index(drop=True, inplace=True)


def _count_words(value):
    """Number of whitespace-separated pieces in the string form of `value`."""
    return len(str(value).split())


# Word count of every reference string, then its summary statistics.
n_position = test_data["location"].apply(_count_words)

n_position.describe()

count    1645.000000
mean        1.316717
std         1.415931
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max        19.000000
Name: location, dtype: float64

In [14]:
# Grouped predictions as a plain Python list.
prediction_list = structured_preds.tolist()

# Reference strings, with "@" standing in for any falsy (e.g. empty) value.
expected = []
for location in test_data["location"]:
    expected.append(location if location else "@")

len(expected), len(prediction_list)

(1645, 1316)

In [15]:
from evaluate import load

# Load the "wer" metric via evaluate.load; its compute() is called by extract_predictions.
wer = load("wer")

def extract_predictions(thr = .5, pred_lists=None, references=None, metric=None):
    """Score the predictions kept at score threshold `thr` against the references.

    Args:
        thr: Entities are kept only when their "score" is strictly greater
            than this value.
        pred_lists: Iterable of entity-dict lists, one per text that has
            predictions; every dict needs "text", "score" and "raw_id".
            Defaults to the notebook-level `prediction_list`.
        references: Reference strings, indexed by raw_id. Defaults to the
            notebook-level `expected`.
        metric: Object exposing ``compute(predictions=..., references=...)``.
            Defaults to the notebook-level `wer`.

    Returns:
        Whatever ``metric.compute`` returns for the assembled prediction
        strings and `references`.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing calls such as extract_predictions(thr) behave as before.
    if pred_lists is None:
        pred_lists = prediction_list
    if references is None:
        references = expected
    if metric is None:
        metric = wer

    text_by_raw_id = {}
    for entities in pred_lists:
        kept = sorted(
            (entity for entity in entities if entity["score"] > thr),
            key=lambda entity: entity["text"],
        )
        if kept:
            # All entities of one list are keyed under the raw_id of the first.
            text_by_raw_id[kept[0]["raw_id"]] = " ".join(
                entity["text"] for entity in kept
            )

    # "@" is the placeholder for texts left without any kept prediction.
    hypotheses = [
        text_by_raw_id.get(idx, "@") for idx in range(len(references))
    ]
    return metric.compute(predictions=hypotheses, references=references)

# Score with the function's default threshold.
extract_predictions()

0.19946707270650932

In [16]:
# Sweep candidate score thresholds, remembering the one for which
# extract_predictions returns the lowest value and printing every new best.
best_score, best_thr = 1000, 0
candidate_thresholds = np.linspace(0.1, 1, 50)
for thr in candidate_thresholds:
    score = extract_predictions(thr)
    if not score < best_score:
        continue
    best_score, best_thr = score, thr
    print(score, thr)

0.2805481537875904 0.1
0.2763608679101637 0.11836734693877551
0.2695089455652836 0.13673469387755102
0.2657023220403502 0.15510204081632656
0.2577084126379901 0.17346938775510207
0.24743052912066996 0.19183673469387758
0.24095926912828322 0.21020408163265308
0.2360106585458698 0.2285714285714286
0.2268747620860297 0.2469387755102041
0.22230681385610962 0.2653061224489796
0.21964217738865627 0.2836734693877551
0.2131709173962695 0.3020408163265306
0.20860296916634946 0.3387755102040817
0.20822230681385612 0.3571428571428572
0.20784164446136277 0.3755102040816327
0.20555767034640274 0.3938775510204082
0.2032736962314427 0.41224489795918373
0.19946707270650932 0.43061224489795924
0.19870574800152266 0.46734693877551026
0.19756376094404263 0.5775510204081633
