In [1]:
import os
import random
import numpy as np
from dataclasses import dataclass
from typing import Optional, Tuple, List, Dict

import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import evaluate
from datasets import load_dataset, DatasetDict, concatenate_datasets


In [2]:
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 384
DOC_STRIDE = 128
PAD_ON_RIGHT = True
SEED = 42

LANGS = ["en", "es", "hi", "de", "ar", "ru", "vi", "zh", "tr", "th"]

SPLIT_RATIO = 0.9



In [3]:
# Utils
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

def load_xquad_multilingual(langs):
    pieces = []
    for lang in langs:
        cfg = f"xquad.{lang}"
        ds = load_dataset("xquad", cfg)
        ds = ds["validation"].train_test_split(test_size=0.1, seed=42)
        ds = DatasetDict(train=ds["train"], validation=ds["test"])
        pieces.append(ds)

    train_all = pieces[0]["train"]
    valid_all = pieces[0]["validation"]

    for part in pieces[1:]:
        train_all = concatenate_datasets([train_all, part["train"]])
        valid_all = concatenate_datasets([valid_all, part["validation"]])

    return DatasetDict(train=train_all, validation=valid_all)

raw_datasets = load_xquad_multilingual(LANGS)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Tokenizer / Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Preprocessing
def prepare_train_features(examples):
    questions = [q.lstrip() for q in examples["question"]]

    tokenized = tokenizer(
        questions if PAD_ON_RIGHT else examples["context"],
        examples["context"] if PAD_ON_RIGHT else questions,
        truncation="only_second" if PAD_ON_RIGHT else "only_first",
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sample_idx = sample_mapping[i]
        answers = examples["answers"][sample_idx]
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        answer_text = answers["text"][0]
        end_char = start_char + len(answer_text)

        sequence_ids = tokenized.sequence_ids(i)

        context_index = 1 if PAD_ON_RIGHT else 0

        token_start_index = 0
        while sequence_ids[token_start_index] != context_index:
            token_start_index += 1
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != context_index:
            token_end_index -= 1

        if not (offsets[token_start_index][0] <= start_char and
                offsets[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized



In [7]:
def prepare_validation_features(examples):
    questions = [q.lstrip() for q in examples["question"]]
    tokenized = tokenizer(
        questions if PAD_ON_RIGHT else examples["context"],
        examples["context"] if PAD_ON_RIGHT else questions,
        truncation="only_second" if PAD_ON_RIGHT else "only_first",
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    example_ids = []

    tokenized["example_id"] = []
    for i in range(len(tokenized["input_ids"])):
        sequence_ids = tokenized.sequence_ids(i)
        context_index = 1 if PAD_ON_RIGHT else 0

        sample_idx = sample_mapping[i]
        example_ids.append(examples["id"][sample_idx])

        offsets = tokenized["offset_mapping"][i]
        tokenized["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(offsets)
        ]
        tokenized["example_id"].append(examples["id"][sample_idx])

    return tokenized

train_dataset = raw_datasets["train"].map(
    prepare_train_features,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

eval_examples = raw_datasets["validation"]
eval_dataset = eval_examples.map(
    prepare_validation_features,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

# Metrics (SQuAD-style EM/F1)
squad_metric = evaluate.load("squad")

def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    n_best_size: int = 20,
    max_answer_length: int = 30,
):
    all_start_logits, all_end_logits = predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = {}
    for i, f in enumerate(features):
        example_id = f["example_id"]
        features_per_example.setdefault(example_id, []).append(i)

    predictions = {}
    for example_id, feature_indices in features_per_example.items():
        context = examples[example_id_to_index[example_id]]["context"]

        min_null_score = None
        valid_answers = []

        for fi in feature_indices:
            start_logits = all_start_logits[fi]
            end_logits = all_end_logits[fi]
            offsets = features[fi]["offset_mapping"]

            start_indexes = np.argsort(start_logits)[-1:-n_best_size-1:-1].tolist()
            end_indexes = np.argsort(end_logits)[-1:-n_best_size-1:-1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(offsets) or end_index >= len(offsets):
                        continue
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue

                    start_char = offsets[start_index][0]
                    end_char = offsets[end_index][1]
                    text = context[start_char:end_char]
                    score = start_logits[start_index] + end_logits[end_index]
                    valid_answers.append({"text": text, "score": score})

        if valid_answers:
            best_answer = max(valid_answers, key=lambda x: x["score"])
            predictions[example_id] = best_answer["text"]
        else:
            predictions[example_id] = ""

    return predictions

def compute_metrics(p):
    preds = postprocess_qa_predictions(
        eval_examples, eval_dataset, p.predictions
    )
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in eval_examples]
    return squad_metric.compute(predictions=[{"id": k, "prediction_text": v} for k, v in preds.items()],
                                references=references)




Map:   0%|          | 0/1190 [00:00<?, ? examples/s]

In [9]:
# Training
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./mbert-xquad",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=200,
    learning_rate=3e-5,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_ratio=0.1,
    gradient_accumulation_steps=1,
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
)

data_collator = default_data_collator



In [10]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
metrics = trainer.evaluate()
print("Evaluation:", metrics)

trainer.save_model("./mbert-xquad/final")
tokenizer.save_pretrained("./mbert-xquad/final")

  trainer = Trainer(


Step,Training Loss
200,4.3912
400,2.1416
600,1.7375
800,1.5989
1000,1.5263
1200,1.4157
1400,1.2739
1600,1.2038
1800,0.8671
2000,0.7705


Evaluation: {'eval_runtime': 11.2154, 'eval_samples_per_second': 127.147, 'eval_steps_per_second': 15.96, 'epoch': 2.0}


('./mbert-xquad/final/tokenizer_config.json',
 './mbert-xquad/final/special_tokens_map.json',
 './mbert-xquad/final/vocab.txt',
 './mbert-xquad/final/added_tokens.json',
 './mbert-xquad/final/tokenizer.json')

In [11]:
from google.colab import drive
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/mbert_xquad_model"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"✅ Model saved permanently to Google Drive: {SAVE_DIR}")

Mounted at /content/drive
✅ Model saved permanently to Google Drive: /content/drive/MyDrive/mbert_xquad_model


In [12]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(SAVE_DIR)
tokenizer = AutoTokenizer.from_pretrained(SAVE_DIR)


In [13]:
from transformers import pipeline

qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

context = "El Taj Mahal está en la ciudad de Agra, India."
question = "¿En qué ciudad está el Taj Mahal?"
print(qa({"question": question, "context": context}))


Device set to use cuda:0


{'score': 0.9206821701664012, 'start': 34, 'end': 38, 'answer': 'Agra'}


