# Week 40 - Classifying Span containing Answer

In [None]:
import os
import polars as pl
import torch
import numpy as np

from bert_utils import (
    predict,
    prepare_data,
    tokenize_function,
    train_mbert,
)

# Huggingface imports
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
from datasets import load_dataset

In [43]:
# Select device for training
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
if torch.cuda.is_available():
  device = torch.device("cuda")

print(f'Using device: {device}')

Using device: cuda


In [44]:
# Load dataset
dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train = dataset["train"].to_polars()
df_val = dataset["validation"].to_polars()

# Arabic, Telegu and Korean
df_ar_train = df_train.filter(pl.col("lang") == "ar")
df_ar_val = df_val.filter(pl.col("lang") == "ar")
df_te_train = df_train.filter(pl.col("lang") == "te")
df_te_val = df_val.filter(pl.col("lang") == "te")
df_ko_train = df_train.filter((pl.col("lang") == "ko") & (pl.col("answerable") == True))
df_ko_val = df_val.filter(pl.col("lang") == "ko")

# Make a dict
data = {
    "arabic": {"train": df_ar_train, "val": df_ar_val},
    "telegu": {"train": df_te_train, "val": df_te_val},
    "korean": {"train": df_ko_train, "val": df_ko_val},
}
assert df_ko_train.height == sum(df_ko_train["answerable"]), "All answers should be answerable"

In [45]:
df_ko_train["answer"][1]

'Wilhelm Röntgen'

In [61]:
tst_cont = df_ko_train["context"][1]
tst_answer = df_ko_train["answer"][1]
print("CONTEXT: ", tst_cont)
print("ANSWER: ", tst_answer)

mbert_checkpoint = "bert-base-multilingual-uncased"
mbert_tokenizer = AutoTokenizer.from_pretrained(mbert_checkpoint)
print("CONTEXT TOKEN: ", mbert_tokenizer(tst_cont, return_offsets_mapping=True)["input_ids"])
print("ANSWER TOKEN: ", mbert_tokenizer(tst_answer, return_offsets_mapping=True)["offset_mapping"])

CONTEXT:  X-rays make up X-radiation, a form of electromagnetic radiation. Most X-rays have a wavelength ranging from 0.01 to 10 nanometers, corresponding to frequencies in the range 30 petahertz to 30 exahertz (3×10<sup>16</sup> Hz to 3×10<sup>19</sup> Hz) and energies in the range 100 eV to 100 keV. X-ray wavelengths are shorter than those of UV rays and typically longer than those of gamma rays. In many languages, X-radiation is referred to with terms meaning Röntgen radiation, after the German scientist Wilhelm Röntgen who discovered these on November 8, 1895, who usually is credited as its discoverer, and who named it "X-radiation" to signify an unknown type of radiation. Spelling of "X-ray(s)" in the English language includes the variants "x-ray(s)", "xray(s)", and "X ray(s)".
ANSWER:  Wilhelm Röntgen
CONTEXT TOKEN:  [101, 166, 118, 62793, 12696, 10700, 166, 118, 41949, 117, 143, 11857, 10108, 51900, 10380, 50418, 11183, 41949, 119, 10889, 166, 118, 62793, 10574, 143, 21560, 1161

In [47]:
num2bio = { 0: "O", 1: "B-ANS", 2: "I-ANS"}
bio2num = {"O": 0, "B-ANS": 1, "I-ANS": 2}

In [None]:
def sequence_labeler_robust(
    context: str, 
    answer_start: int, 
    answer_text: str,
) -> list:
    """Enhanced version with validation."""
    
    if answer_start == -1:  # Unanswerable
        encoding = mbert_tokenizer(context, return_offsets_mapping=True)
        return [0] * len(encoding["input_ids"])
    
    answer_end = answer_start + len(answer_text) if answer_text else answer_start + 1
    
    encoding = mbert_tokenizer(
        context,
        return_offsets_mapping=True,
        add_special_tokens=True,
        truncation=True,
        max_length=512  # Adjust based on your model
    )
    
    tokens = encoding["input_ids"]
    offset_mapping = encoding["offset_mapping"]
    labels = [0] * len(tokens)
    
    answer_token_indices = []
    for idx, (token_start, token_end) in enumerate(offset_mapping):
        if token_start == 0 and token_end == 0:
            continue
        if token_start < answer_end and token_end > answer_start:
            answer_token_indices.append(idx)
    
    if answer_token_indices:
        labels[answer_token_indices[0]] = 1
        for idx in answer_token_indices[1:]:
            labels[idx] = 2
    elif answer_text:  # Validation mode
        print(f"WARNING: No tokens found for answer '{answer_text}' at position {answer_start}")
        print(f"Context: {context[max(0, answer_start-20):answer_start+len(answer_text)+20]}")
    
    return labels

In [None]:
# Takes a series and returns a list
def sequence_labeler(context: str, answer: str) -> np.ndarray:
    context_tokens = mbert_tokenizer(context)["input_ids"]
    answer_tokens = mbert_tokenizer(answer)["input_ids"]
    answer_seq = answer_tokens[1:-1]
    labels = np.zeros(len(context_tokens), dtype=np.int8)
    window_size = len(answer_seq)
    found = False
    for i in range(len(context_tokens) - window_size + 1):
        tmp = context_tokens[i: i + window_size]
        if tmp == answer_seq:
            found = True
            labels[i] = 1
            for j in range(1, window_size):
                labels[i + j] = 2
    if not found:
        print("WARNING: Answer not found in context!")
        print("CONTEXT: ", context)
        print("ANSWER: ", answer)
        print("CONTEXT TOKENS: ", context_tokens)
        print("ANSWER TOKENS: ", answer_tokens)
        print("LABELS: ", labels)
        
    return labels

In [None]:
df_ko_train = df_ko_train.with_columns(
    pl.struct(["context", "answer"]).map_elements(
        lambda x: sequence_labeler(x["context"], x["answer"]), return_dtype=pl.List(pl.Int8)
    ).alias("labels")
)

CONTEXT:  The Eye of Horus is an ancient Egyptian symbol of protection and royal power from deities, in this case from Horus or Ra. The symbol is seen on images of Horus' mother, Isis, and on other deities associated with her. In the Egyptian language, the word for this symbol was "wedjat" ("wɟt"). It was the eye of one of the earliest of Egyptian deities, Wadjet, who later became associated with Bastet, Mut, and Hathor as well. Wadjet was a solar deity and this symbol began as her all-seeing eye. In early artwork, Hathor is also depicted with this eye. Funerary amulets were often made in the shape of the Eye of Horus. The Wedjat or Eye of Horus is "the central element" of seven "gold, faience, carnelian and lapis lazuli" bracelets found on the mummy of Shoshenq II. The Wedjat "was intended to protect the king [here] in the afterlife" and to ward off evil. Egyptian and Near Eastern sailors would frequently paint the symbol on the bow of their vessel to ensure safe sea travel.
ANSWER:  

In [None]:
all_correct_labels = df_ko_train.map_elements(
    lambda row: sequence_labeler(row["context"], row["answer"]),
    return_dtype=pl.List(pl.Int8),
    new_column_name="correct_labels",
)

AttributeError: 'DataFrame' object has no attribute 'map_elements'

In [None]:
# Train the models
all_classifiers = {}
all_tokenizers = {} # they're all the same
mbert_checkpoint = "bert-base-multilingual-uncased"
for lang in ["telugu"]:
    cap_lang = lang.capitalize()
    print(f"\n--- Processing language: {cap_lang} ---")
    trained = False
    classifiers_dir = "./mbert_span_classifiers"
    save_path = f"{lang}_mbert_span_classifier"
    full_save_path = os.path.join(classifiers_dir, save_path)
    # Check if model exists
    if not os.path.exists(classifiers_dir):
        print(f"No classifiers folder found, creating {classifiers_dir}...")
        os.makedirs(classifiers_dir)
    if os.path.exists(full_save_path):
        print(f"Found existing model for {cap_lang}, loading...")
        all_classifiers[lang] = AutoModelForSequenceClassification.from_pretrained(full_save_path)
        all_tokenizers[lang] = AutoTokenizer.from_pretrained(full_save_path) # all the same, we don't train tokenizer
        trained = True
        print(f"Model for {cap_lang} loaded.")

    # If model doesn't exist, train it
    if not trained:
        print("Model not found, training new mBERT model...")
        mbert_tokenizer = AutoTokenizer.from_pretrained(mbert_checkpoint)
        all_tokenizers[lang] = mbert_tokenizer # all the same, we don't train tokenizer
        mbert_classifier = AutoModelForSequenceClassification.from_pretrained(
            mbert_checkpoint,
            num_labels=2,
        )
        # Prepare datasets
        train_dataset = prepare_data(data[lang]["train"])
        val_dataset = prepare_data(data[lang]["val"])

        # Tokenize datasets - fix the function call
        tokenized_train = train_dataset.map(lambda examples: tokenize_function(examples, mbert_tokenizer), batched=True)
        tokenized_val = val_dataset.map(lambda examples: tokenize_function(examples, mbert_tokenizer), batched=True)
        # Train
        classifier, tokenizer = train_mbert(
            tokenized_train,
            tokenized_val,
            model_checkpoint = mbert_checkpoint,
            device=device,
        ) # type: ignore
        print("Saving model...")
        classifier.save_pretrained(full_save_path) # type: ignore
        tokenizer.save_pretrained(full_save_path) # type: ignore
        print(f"Model trained and saved to {full_save_path}.")
        # Store the trained model in notebook variable
        all_classifiers[lang] = AutoModelForSequenceClassification.from_pretrained(full_save_path)