In [None]:
# Install seqeval into the kernel's environment (no version is pinned).
# seqeval is not imported in the cells visible here; presumably it is used by a
# later evaluation step — confirm before removing.
%pip install seqeval

In [None]:
import numpy as np 
import pandas as pd 

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

## Work With Data

In [None]:
# Progress marker for the run log; "\n" (not "/n") pads it with blank lines.
print("\nSTART LOADING\n")

In [None]:
# Read both language-specific CSV files first, then tag every row with the
# code of the dataset it came from ('ua' or 'ru') in a new 'geo_is' column.
ua_df = pd.read_csv("/kaggle/input/kse-ua-location-extraction-2025/uk_geo_dataset.csv")
ru_df = pd.read_csv("/kaggle/input/kse-ua-location-extraction-2025/ru_geo_dataset.csv")

for _frame, _lang_code in ((ua_df, 'ua'), (ru_df, 'ru')):
    _frame['geo_is'] = _lang_code

In [None]:
# Progress marker for the run log; "\n" (not "/n") pads it with blank lines.
print("\nEND LOADING\n")

In [None]:
# A row counts as "has locations" when its 'loc_markers' value is anything
# other than the literal string '[]'; the complement is the "no locations" set.
ua_has_loc = ua_df['loc_markers'] != '[]'
ru_has_loc = ru_df['loc_markers'] != '[]'

ua_df_loc = ua_df[ua_has_loc]    # max 233,421
ua_df_nloc = ua_df[~ua_has_loc]  # max 776,579

ru_df_loc = ru_df[ru_has_loc]    # max 2,464,436
ru_df_nloc = ru_df[~ru_has_loc]  # max 5,564,404

In [None]:
# Take the leading rows of each subset (positional order) and stack them:
# positives = 200k ua + 600k ru, negatives = 600k ua + 600k ru.
df_loc = pd.concat([ua_df_loc.head(200000), ru_df_loc.head(600000)])
df_nloc = pd.concat([ua_df_nloc.head(600000), ru_df_nloc.head(600000)])

In [None]:
def oversample_ua(df, factor, random_state=42):
    """Oversample the ``geo_is == "ua"`` rows of ``df`` and return a shuffled frame.

    Every original row is kept once. Extra ``"ua"`` rows are drawn with
    replacement so that the ``"ua"`` subset ends up about ``factor`` times its
    original size. The combined frame is then shuffled and re-indexed from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``geo_is`` column.
    factor : int or float
        Target multiple of the original ``"ua"`` row count. Must be >= 1.
        For fractional factors the number of extra rows is rounded to the
        nearest integer.
    random_state : int, default 42
        Seed used both for drawing the extra rows and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``factor`` is smaller than 1.
    """
    if factor < 1:
        raise ValueError(f"factor must be >= 1, got {factor!r}")

    minority_df = df[df["geo_is"] == "ua"]
    majority_df = df[df["geo_is"] != "ua"]

    # DataFrame.sample only accepts an integer row count, so fractional
    # factors are rounded here instead of failing inside pandas.
    n_extra = int(round(len(minority_df) * (factor - 1)))

    parts = [majority_df, minority_df]
    if n_extra > 0:
        parts.append(
            minority_df.sample(n=n_extra, replace=True, random_state=random_state)
        )

    df_oversampled = pd.concat(parts, ignore_index=True)
    # Shuffle so the duplicated rows are not clustered at the end.
    return df_oversampled.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
# Triple the 'ua' share of the positive class; the negative class is left
# un-oversampled. Afterwards only the two columns used downstream are kept
# ('geo_is' was needed solely for oversampling).
model_columns = ["text", "loc_markers"]

df_loc = oversample_ua(df_loc, 3)[model_columns]
df_nloc = df_nloc[model_columns]

In [None]:
# Per-class hold-out split: the first 1,080,000 rows of each class frame go to
# training and the remainder to validation (90% / 10% when each class frame
# holds 1,200,000 rows).
split_at = 1080000

# df_nloc is a plain ua-then-ru concatenation, so without shuffling its tail
# (the validation negatives) would consist of 'ru' rows only. df_loc already
# comes back shuffled from oversample_ua.
df_nloc_shuffled = df_nloc.sample(frac=1, random_state=42)

# NOTE(review): df_loc was oversampled with replacement before this split, so
# duplicated 'ua' rows can land in both train and validation.
# The index is intentionally not reset here: a later cell removes the
# '__index_level_0__' column that the non-default index produces.
train_df = pd.concat([df_loc.iloc[:split_at], df_nloc_shuffled.iloc[:split_at]], axis=0)
val_df = pd.concat([df_loc.iloc[split_at:], df_nloc_shuffled.iloc[split_at:]], axis=0)

In [None]:
### TEMP ###
### train_df, val_df = train_df[:1000], val_df[:500] 
### TEMP ###

## Tokenization

In [None]:
# Pretrained checkpoint whose tokenizer is loaded below.
model_name = "microsoft/mdeberta-v3-base"  # mDeBERTa-v3

# Tag set for token classification: outside, first token of a location,
# continuation token of a location.
label_list = ["O", "S-LOC", "I-LOC"]
num_labels = len(label_list)

# Bidirectional lookups between label names and their integer ids.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Training hyper-parameters.
batch_size, epochs, lr = 8, 2, 3e-5

# Not referenced in the cells visible here; presumably the label id that the
# loss should ignore on padded positions — TODO confirm.
padding = -100

In [None]:
# Progress marker for the run log; "\n" (not "/n") pads it with blank lines.
print("\nSTART TOKENIZE\n")

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the `model_name` checkpoint.
# NOTE(review): to_token_labels requests offset mappings, which presumably
# requires this to resolve to a "fast" tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name) 

In [None]:
# Progress marker for the run log; "\n" (not "/n") pads it with blank lines.
print("\nEND TOKENIZE\n")

In [None]:
import ast

def char_markers_label(char_start, char_end, spans):
    """Classify the character range ``[char_start, char_end)`` against ``spans``.

    ``spans`` is an iterable of ``(span_start, span_end)`` pairs. The first
    span that overlaps the range decides the result:

    * ``"S-LOC"`` if the range starts exactly at that span's start,
    * ``"I-LOC"`` otherwise.

    Returns ``None`` when no span overlaps the range.
    """
    for span_start, span_end in spans:
        overlaps = span_start < char_end and char_start < span_end
        if overlaps:
            return "S-LOC" if char_start == span_start else "I-LOC"
    return None



def _parse_loc_spans(loc_markers):
    """Normalise ``loc_markers`` into a list of ``(start, end)`` integer pairs.

    Accepts a list/tuple of 2-item sequences or a string holding such a Python
    literal. Anything unparsable, and any individual malformed entry, is
    dropped rather than raising.
    """
    if isinstance(loc_markers, str):
        try:
            loc_markers = ast.literal_eval(loc_markers)
        except (SyntaxError, ValueError):
            return []
    if not isinstance(loc_markers, (list, tuple)):
        return []

    spans = []
    for span in loc_markers:
        if not isinstance(span, (list, tuple)) or len(span) != 2:
            continue
        try:
            spans.append((int(span[0]), int(span[1])))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric bounds are skipped like any other malformed entry.
            continue
    return spans


def _first_overlapping_span(start, end, spans):
    """Return the index of the first span overlapping ``[start, end)``, or None."""
    for idx, (span_start, span_end) in enumerate(spans):
        if start < span_end and end > span_start:
            return idx
    return None


def to_token_labels(text, loc_markers, tokenizer, label2id, max_length=512):
    """Tokenize ``text`` and attach one location label id per token.

    Parameters
    ----------
    text : str
        Text passed to ``tokenizer``.
    loc_markers : list, tuple, str or None
        Character spans ``(start, end)`` of locations, either as a sequence or
        as its string literal (e.g. ``"[(0, 4)]"``). Invalid input is treated
        as "no spans".
    tokenizer : callable
        Called as ``tokenizer(text, return_offsets_mapping=True,
        truncation=True, max_length=max_length)``; its result must expose
        ``"offset_mapping"`` and support item assignment.
    label2id : mapping
        Maps ``"O"``, ``"S-LOC"`` and ``"I-LOC"`` to label ids.
    max_length : int, default 512
        Truncation length forwarded to the tokenizer.

    Returns
    -------
    The tokenizer output with an added ``"labels"`` list, aligned with
    ``"offset_mapping"``.

    Labelling rules
    ---------------
    * Tokens with an empty offset (``start == end``) and tokens overlapping no
      span get ``"O"``.
    * The first token of each span gets ``"S-LOC"``; following tokens of the
      same span get ``"I-LOC"``. "First" is decided per span, so two different
      spans covered by consecutive tokens are never merged into one entity,
      even when a token's offset begins before its span does.
    """
    spans = _parse_loc_spans(loc_markers)

    encoded = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    outside_id = label2id["O"]
    labels = []
    prev_span_idx = None  # span matched by the previous token, if any

    for start, end in encoded["offset_mapping"]:
        if start == end:
            span_idx = None
        else:
            span_idx = _first_overlapping_span(start, end, spans)

        if span_idx is None:
            labels.append(outside_id)
        elif span_idx != prev_span_idx or start == spans[span_idx][0]:
            # Either a new span begins here, or the token sits exactly on the
            # span start.
            labels.append(label2id["S-LOC"])
        else:
            labels.append(label2id["I-LOC"])

        prev_span_idx = span_idx

    encoded["labels"] = labels

    return encoded

In [None]:
from datasets import Dataset

# Wrap the two pandas splits as datasets.Dataset objects, keeping only the
# text and its location markers.
train_dataset, val_dataset = (
    Dataset.from_pandas(split_df[["text", "loc_markers"]])
    for split_df in (train_df, val_df)
)


In [None]:
def encode(df):
    """Tokenize one example and attach its token-level location labels.

    Despite its name, ``df`` only needs to support ``df["text"]`` and
    ``df["loc_markers"]``. The notebook-level ``tokenizer`` and ``label2id``
    are used for the conversion.
    """
    return to_token_labels(
        text=df["text"],
        loc_markers=df["loc_markers"],
        tokenizer=tokenizer,
        label2id=label2id,
    )
    
# Run `encode` over both splits one example at a time (batched=False), adding
# the tokenizer outputs and the "labels" column to each dataset.
train_dataset = train_dataset.map(encode, batched=False)
val_dataset = val_dataset.map(encode, batched=False)

In [None]:
# Drop the columns that are not needed in the saved datasets: token offsets,
# the raw span markers, and the leftover pandas index column.
train_dataset = train_dataset.remove_columns(
    ['offset_mapping', 'loc_markers', '__index_level_0__']
)
val_dataset = val_dataset.remove_columns(
    ['offset_mapping', 'loc_markers', '__index_level_0__']
)

In [None]:
# Progress marker for the run log; "\n" (not "/n") pads it with blank lines.
print("\nSTART SAVING\n")

In [None]:
# Write both encoded splits to CSV files in the current working directory.
# NOTE(review): list-valued columns (token ids, labels, ...) will presumably be
# stored as their string representation in CSV — confirm the consumer parses
# them back, or consider a typed format.
train_dataset.to_csv("train.csv", index=False)
val_dataset.to_csv("val.csv", index=False)

In [None]:
# Progress marker for the run log; "\n" (not "/n") pads it with blank lines.
print("\nEND SAVING\n")

---
# Code Ends