In [1]:
! pip install datasets pandas transformers numpy accelerate
import pandas as pd



In [2]:
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np

# Load GermanQuAD dataset
germanquad = load_dataset("deepset/germanquad", trust_remote_code=True)

def split_dataset(dataset: DatasetDict, train_prop, val_prop, test_prop):
    """
    Splits the train dataset into train and validation sets,
    maintaining the ratio of short and long answers.
    Leaves the original test set untouched.
    """
    assert np.isclose(train_prop + val_prop + test_prop, 1.0), "Proportions must sum to 1"

    train_data = dataset["train"]

    def is_short_answer(example):
        return len(example['answers']['text'][0]) <= 50 if example['answers']['text'] else False

    # Calculate the ratio of short to long answers in the original train set
    short_answers = train_data.filter(is_short_answer)
    long_answers = train_data.filter(lambda x: not is_short_answer(x))
    short_ratio = len(short_answers) / len(train_data)

    # Calculate sizes for new train and validation sets
    total_size = len(train_data)
    new_train_size = int(total_size * (train_prop / (train_prop + val_prop)))
    val_size = total_size - new_train_size

    # Create new train set
    new_train_short = short_answers.select(range(int(new_train_size * short_ratio)))
    new_train_long = long_answers.select(range(int(new_train_size * (1 - short_ratio))))
    new_train = Dataset.from_dict({k: new_train_short[k] + new_train_long[k] for k in new_train_short.features}).shuffle(seed=42)

    # Create validation set
    val_short = short_answers.select(range(int(new_train_size * short_ratio), len(short_answers)))
    val_long = long_answers.select(range(int(new_train_size * (1 - short_ratio)), len(long_answers)))
    validation = Dataset.from_dict({k: val_short[k] + val_long[k] for k in val_short.features}).shuffle(seed=42)

    return DatasetDict({
        "train": new_train,
        "validation": validation,
        "test": dataset["test"]
    })

# Calculate target proportions
target_train_prop = 0.7
target_val_prop = 0.14
target_test_prop = 0.16

# Split the dataset
germanquad_split = split_dataset(germanquad, target_train_prop, target_val_prop, target_test_prop)

# Print the sizes of the splits
print("Train:", len(germanquad_split["train"]))
print("Validation:", len(germanquad_split["validation"]))
print("Test:", len(germanquad_split["test"]))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train: 9597
Validation: 1921
Test: 2204


In [3]:
from tqdm.auto import tqdm
import collections
import numpy as np
from datasets import load_metric
from nltk.translate.bleu_score import sentence_bleu

squad_metric = load_metric("squad", trust_remote_code=True)
n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []

    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Calculate BLEU score
    bleu_scores = []
    for pred, ref in zip(predicted_answers, theoretical_answers):
        pred_text = pred["prediction_text"]
        ref_texts = ref["answers"]["text"]  # GermanQuAD has a list of texts
        if pred_text and ref_texts:  # Only calculate BLEU if we have a prediction and reference
            bleu_score = sentence_bleu([text.split() for text in ref_texts], pred_text.split())
            bleu_scores.append(bleu_score)

    # Compute SQuAD metrics
    squad_results = squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)

    # Add BLEU score to the results
    if bleu_scores:
        squad_results["bleu"] = sum(bleu_scores) / len(bleu_scores)  # Average BLEU score
    else:
        squad_results["bleu"] = 0.0

    return squad_results

  squad_metric = load_metric("squad", trust_remote_code=True)


# gelectra-base

In [4]:
from transformers import AutoTokenizer

gelectra_model_checkpoint = "deepset/gelectra-base"
gelectra_tokenizer = AutoTokenizer.from_pretrained(gelectra_model_checkpoint)
gelectra_tokenizer.is_fast

tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

True

In [5]:
import torch

max_length = 512
stride = 128

def gelectra_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_inputs = gelectra_tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_inputs.pop("offset_mapping")
    sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized_inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    return tokenized_inputs


In [6]:
# You might want to adjust these based on your dataset analysis
max_length = 512
stride = 128

def gelectra_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_inputs = gelectra_tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(tokenized_inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = tokenized_inputs.sequence_ids(i)
        offset = tokenized_inputs["offset_mapping"][i]
        tokenized_inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    tokenized_inputs["example_id"] = example_ids
    return tokenized_inputs

In [7]:
gelectra_train_dataset = germanquad_split["train"].map(
    gelectra_preprocess_training_examples,
    batched=True,
    remove_columns=germanquad_split["train"].column_names,
)
len(germanquad_split["train"]), len(gelectra_train_dataset)

Map:   0%|          | 0/9597 [00:00<?, ? examples/s]

(9597, 10175)

In [8]:
gelectra_validation_dataset = germanquad_split["validation"].map(
    gelectra_preprocess_eval_examples,
    batched=True,
    remove_columns=germanquad_split["validation"].column_names,
)
len(germanquad_split["validation"]), len(gelectra_validation_dataset)

Map:   0%|          | 0/1921 [00:00<?, ? examples/s]

(1921, 1987)

In [9]:
gelectra_test_dataset = germanquad_split["test"].map(
    gelectra_preprocess_eval_examples,
    batched=True,
    remove_columns=germanquad_split["test"].column_names,
)
len(germanquad_split["test"]), len(gelectra_test_dataset)

Map:   0%|          | 0/2204 [00:00<?, ? examples/s]

(2204, 3786)

In [10]:
from transformers import AutoModelForQuestionAnswering
gelectra_model = AutoModelForQuestionAnswering.from_pretrained(gelectra_model_checkpoint)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at deepset/gelectra-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=gelectra_model,
    args=args,
    train_dataset=gelectra_train_dataset,
    eval_dataset=gelectra_validation_dataset,
    tokenizer=gelectra_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,3.5115,No log
2,1.9395,No log
3,1.5594,No log


Training took 1225.70 seconds


In [12]:
predictions, _, _ = trainer.predict(gelectra_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, gelectra_test_dataset, germanquad_split["test"])
print(squad_results)

  0%|          | 0/2204 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'exact_match': 52.67695099818512, 'f1': 70.7212168910412, 'bleu': 0.33033887146158947}


# gbert-base

In [13]:
from transformers import AutoTokenizer

gbert_base_model_checkpoint = "deepset/gbert-base"
gbert_base_tokenizer = AutoTokenizer.from_pretrained(gbert_base_model_checkpoint)
gbert_base_tokenizer.is_fast

tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

True

In [14]:
import torch

max_length = 512
stride = 128

def gbert_base_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_inputs = gbert_base_tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_inputs.pop("offset_mapping")
    sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized_inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    return tokenized_inputs

In [15]:
# You might want to adjust these based on your dataset analysis
max_length = 512
stride = 128

def gbert_base_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_inputs = gbert_base_tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(tokenized_inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = tokenized_inputs.sequence_ids(i)
        offset = tokenized_inputs["offset_mapping"][i]
        tokenized_inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    tokenized_inputs["example_id"] = example_ids
    return tokenized_inputs

In [16]:
gbert_base_train_dataset = germanquad_split["train"].map(
    gbert_base_preprocess_training_examples,
    batched=True,
    remove_columns=germanquad_split["train"].column_names,
)
len(germanquad_split["train"]), len(gbert_base_train_dataset)

Map:   0%|          | 0/9597 [00:00<?, ? examples/s]

(9597, 10175)

In [17]:
gbert_base_validation_dataset = germanquad_split["validation"].map(
    gbert_base_preprocess_eval_examples,
    batched=True,
    remove_columns=germanquad_split["validation"].column_names,
)
len(germanquad_split["validation"]), len(gbert_base_validation_dataset)

Map:   0%|          | 0/1921 [00:00<?, ? examples/s]

(1921, 1987)

In [18]:
gbert_base_test_dataset = germanquad_split["test"].map(
    gbert_base_preprocess_eval_examples,
    batched=True,
    remove_columns=germanquad_split["test"].column_names,
)
len(germanquad_split["test"]), len(gbert_base_test_dataset)

Map:   0%|          | 0/2204 [00:00<?, ? examples/s]

(2204, 3786)

In [19]:
from transformers import AutoModelForQuestionAnswering
gbert_base_model = AutoModelForQuestionAnswering.from_pretrained(gbert_base_model_checkpoint)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=gbert_base_model,
    args=args,
    train_dataset=gbert_base_train_dataset,
    eval_dataset=gbert_base_validation_dataset,
    tokenizer=gbert_base_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
1,2.8192,No log
2,1.6143,No log
3,1.1977,No log


Training took 879.52 seconds


In [21]:
predictions, _, _ = trainer.predict(gbert_base_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, gbert_base_test_dataset, germanquad_split["test"])
print(squad_results)

  0%|          | 0/2204 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'exact_match': 52.85843920145191, 'f1': 71.40125860950285, 'bleu': 0.32366833990475935}


# gbert-large

In [22]:
from transformers import AutoTokenizer

gbert_large_model_checkpoint = "deepset/gbert-large"
gbert_large_tokenizer = AutoTokenizer.from_pretrained(gbert_large_model_checkpoint)
gbert_large_tokenizer.is_fast

True

In [23]:
import torch

max_length = 512
stride = 128

def gbert_large_preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_inputs = gbert_large_tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = tokenized_inputs.pop("offset_mapping")
    sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized_inputs.sequence_ids(i)

        # Find the start and end of the context
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    return tokenized_inputs

In [24]:
# You might want to adjust these based on your dataset analysis
max_length = 512
stride = 128

def gbert_large_preprocess_eval_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]

    tokenized_inputs = gbert_large_tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = tokenized_inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(tokenized_inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = tokenized_inputs.sequence_ids(i)
        offset = tokenized_inputs["offset_mapping"][i]
        tokenized_inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    tokenized_inputs["example_id"] = example_ids
    return tokenized_inputs

In [25]:
gbert_large_train_dataset = germanquad_split["train"].map(
    gbert_large_preprocess_training_examples,
    batched=True,
    remove_columns=germanquad_split["train"].column_names,
)
len(germanquad_split["train"]), len(gbert_large_train_dataset)

Map:   0%|          | 0/9597 [00:00<?, ? examples/s]

(9597, 10175)

In [26]:
gbert_large_validation_dataset = germanquad_split["validation"].map(
    gbert_large_preprocess_eval_examples,
    batched=True,
    remove_columns=germanquad_split["validation"].column_names,
)
len(germanquad_split["validation"]), len(gbert_large_validation_dataset)

Map:   0%|          | 0/1921 [00:00<?, ? examples/s]

(1921, 1987)

In [27]:
gbert_large_test_dataset = germanquad_split["test"].map(
    gbert_large_preprocess_eval_examples,
    batched=True,
    remove_columns=germanquad_split["test"].column_names,
)
len(germanquad_split["test"]), len(gbert_large_test_dataset)

(2204, 3786)

In [28]:
from transformers import AutoModelForQuestionAnswering
gbert_large_model = AutoModelForQuestionAnswering.from_pretrained(gbert_large_model_checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from transformers import TrainingArguments
from transformers import Trainer
import time

args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    output_dir="my_awesome_qa_model",
    per_device_train_batch_size=1,  # Reduced from 8
    per_device_eval_batch_size=1,   # Reduced from 8
    report_to="none",
    gradient_accumulation_steps=64,
    fp16=True,  # Enable mixed precision training for fair comparison
    optim="adamw_torch",  # Explicitly use AdamW
)

trainer = Trainer(
    model=gbert_large_model,
    args=args,
    train_dataset=gbert_large_train_dataset,
    eval_dataset=gbert_large_validation_dataset,
    tokenizer=gbert_large_tokenizer,
    compute_metrics=compute_metrics
)

start_time = time.time()

trainer.train()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Training took {elapsed_time:.2f} seconds")



Epoch,Training Loss,Validation Loss
0,No log,No log
1,No log,No log
2,No log,No log


Training took 3742.05 seconds


In [30]:
predictions, _, _ = trainer.predict(gbert_large_test_dataset)
start_logits, end_logits = predictions
squad_results = compute_metrics(start_logits, end_logits, gbert_large_test_dataset, germanquad_split["test"])
print(squad_results)

  0%|          | 0/2204 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'exact_match': 62.43194192377496, 'f1': 82.07206036541679, 'bleu': 0.394001756699121}
