In [None]:
%pip install seqeval

In [None]:
import numpy as np 
import pandas as pd 

import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the notebook, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Progress marker printed before the train/validation CSVs are read.
print("\nSTART LOADING DATA\n")

In [None]:
# Only the leading rows of each split are kept (1000 for training, 500 for
# validation); the files are still read in full before being sliced.
train_df = pd.read_csv("/kaggle/input/ass4-smalldf/train.csv").iloc[:1000]
val_df = pd.read_csv("/kaggle/input/ass4-smalldf/val.csv").iloc[:500]

In [None]:
# Progress marker printed once both CSVs have been read.
print("\nEND LOADING DATA\n")

In [None]:
from datasets import Dataset

def array_to_str(x):
    """Convert a stringified integer array back into a list of ints.

    Despite the name, the conversion goes from string to list: a CSV
    round-trip turns array-valued cells into text such as ``"[1 2 3]"``
    (numpy style) or ``"[1, 2, 3]"`` (Python list style). Both forms are
    parsed here.

    Args:
        x: A cell value. Lists are returned unchanged, numpy arrays are
            converted with ``tolist()``, bracketed strings are parsed, and
            anything else is passed through.

    Returns:
        A list of ints when ``x`` is a bracketed string of integers (``[]``
        for empty brackets), a list for list/ndarray input, and otherwise
        ``x`` itself. A bracketed string with a non-integer token is returned
        unchanged.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, np.ndarray):
        return x.tolist()
    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            # Commas count as separators so "[1, 2]" parses like "[1 2]";
            # split() with no argument also absorbs newlines from wrapped
            # array reprs.
            parts = s[1:-1].replace(",", " ").split()
            try:
                return [int(p) for p in parts]
            except ValueError:
                # Not a pure integer array: leave the cell as it was.
                return x
    return x

# Turn stringified array columns back into lists of ints. A column is
# converted only when it has object dtype and its first value is a bracketed
# string; the assignment updates train_df / val_df in place.
for df in (train_df, val_df):
    for col in df.columns:
        if df[col].dtype != "object":
            continue
        first_val = df[col].iloc[0]
        if not isinstance(first_val, str):
            continue
        fs = first_val.strip()
        if fs.startswith("[") and fs.endswith("]"):
            df[col] = df[col].apply(array_to_str)

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# Keep only the columns the model and collator consume; everything else is dropped.
wanted_cols = [c for c in ["input_ids", "attention_mask", "token_type_ids", "labels"] if c in train_dataset.column_names]
extra_cols = [c for c in train_dataset.column_names if c not in wanted_cols]
if extra_cols:
    train_dataset = train_dataset.remove_columns(extra_cols)

# The validation split is pruned against its own column list: reusing the
# training split's extras would fail if a column is missing from val.csv and
# would leave behind any column that exists only there.
val_extra_cols = [c for c in val_dataset.column_names if c not in wanted_cols]
if val_extra_cols:
    val_dataset = val_dataset.remove_columns(val_extra_cols)

In [None]:
# Checkpoint and tag inventory.
model_name = "microsoft/mdeberta-v3-base"  # mDeBERTa-v3
label_list = ["O", "S-LOC", "I-LOC"]
num_labels = len(label_list)

# Two-way lookup between class indices and tag strings.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Training hyperparameters.
lr = 3e-5
epochs = 2
batch_size = 4

# Sequence handling.
max_len = 512
padding = -100  # label id given to padded positions by the data collator

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the `model_name` checkpoint; it is handed to the data collator
# and the Trainer, and later used to encode the test texts.
tokenizer = AutoTokenizer.from_pretrained(model_name) 

In [None]:
from transformers import AutoModelForTokenClassification

# Token-classification model built from the `model_name` checkpoint.
# `num_labels` is len(label_list); the two label maps are passed alongside it
# so the model carries the same id <-> tag mapping used elsewhere in the notebook.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification. Padded label positions receive
# `padding` (-100), the same value compute_metrics skips when scoring.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    label_pad_token_id=padding,
)

In [None]:
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Score predictions with seqeval's entity-level precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The arg-max over the last axis
            of ``logits`` is taken as the predicted class id per position.

    Returns:
        dict with keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, gold_ids = eval_pred
    best_ids = np.argmax(scores, axis=-1)

    true_tags = []
    pred_tags = []
    for pred_row, gold_row in zip(best_ids, gold_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept = [(gold, pred) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_tags.append([id2label[gold] for gold, _ in kept])
        pred_tags.append([id2label[pred] for _, pred in kept])

    return {
        "precision": precision_score(true_tags, pred_tags),
        "recall": recall_score(true_tags, pred_tags),
        "f1": f1_score(true_tags, pred_tags),
    }

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters (lr, batch_size, epochs) come from the configuration cell.
training_args = TrainingArguments(
    output_dir="./ner-mdeberta-v3",
    # Optimisation.
    num_train_epochs=epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpointing share the same "epoch" schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Logging.
    logging_steps=100,
    report_to="none",
)

In [None]:
# Progress marker printed before the Trainer is built and run.
print("\nSTART TRAINING MODEL\n")

In [None]:
# Wire model, data, batching and metrics together, then fine-tune.
# NOTE(review): recent transformers releases appear to rename the `tokenizer`
# argument to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

In [None]:
# Progress marker printed after training; wording mirrors the START marker.
print("\nEND TRAINING MODEL\n")

# Testing

In [None]:
# Test split; the cells below read its "text" and "text_id" columns.
test = pd.read_csv("/kaggle/input/kse-ua-location-extraction-2025/test.csv")

In [None]:
from datasets import Dataset

texts = test["text"].tolist()

# Tokenize every test text in one call. Character offsets are requested so
# predicted tokens can later be mapped back to substrings of the raw text.
encoded = tokenizer(
    texts,
    max_length=max_len,
    truncation=True,
    padding=True,
    return_offsets_mapping=True,
)

# Only the two model inputs go into the dataset; the offsets stay in `encoded`.
test_dataset = Dataset.from_dict(
    {key: encoded[key] for key in ("input_ids", "attention_mask")}
)

pred_output = trainer.predict(test_dataset)

# Highest-scoring class id per token position.
logits = pred_output.predictions
pred_ids = logits.argmax(axis=-1)

# Array views used by the span-decoding loop.
attn = np.array(encoded["attention_mask"])
offsets = np.array(encoded["offset_mapping"])

In [None]:
# One comma-separated string of predicted locations per test text.
all_locations_strings = []

for row, text in enumerate(texts):
    spans = []
    open_span = None  # [char_start, char_end] of the location being built

    for tag_id, (start, end), mask in zip(pred_ids[row], offsets[row], attn[row]):
        # Ignore masked-out positions and tokens whose offset is (0, 0).
        if mask == 0 or (start == 0 and end == 0):
            continue

        label = id2label[int(tag_id)]
        starts_location = label.startswith("S-LOC")

        # An I-LOC tag stretches the span that is currently open.
        if not starts_location and open_span is not None and label.startswith("I-LOC"):
            open_span[1] = end
            continue

        # Every other tag closes the open span (if any). S-LOC then opens a
        # new one; an I-LOC with nothing open, or any other tag, opens none.
        if open_span is not None:
            spans.append(open_span)
        open_span = [start, end] if starts_location else None

    if open_span is not None:
        spans.append(open_span)

    # Slice the raw text, discard blank results, de-duplicate in first-seen order.
    candidates = [text[span_start:span_end].strip() for span_start, span_end in spans]
    unique_locs = list(dict.fromkeys(loc for loc in candidates if loc))

    all_locations_strings.append(", ".join(unique_locs))

In [None]:
# Attach the decoded locations to a copy of the test rows, then write the
# two submission columns to baseline.csv without the index.
predictions = test[["text_id", "text"]].assign(locations=all_locations_strings)

predictions[["text_id", "locations"]].to_csv("baseline.csv", index=False)

---
# Code Ends