# 1. Install packages
Please make sure the following packages are installed before running the notebook.

In [1]:

#! pip install transformers datasets
#! pip install accelerate -U
#! pip install opendatasets
#! pip install pandas
#! pip install datasets
#! pip install requests



In [2]:
import requests

def summery(title):
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
  response = requests.get(url)
  response.raise_for_status()
  return response.json()['extract']

# 2. Dataset Preparation
Here, we read in the dataset and tokenize it.

In [3]:
# prepare dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import math


# and saved it locally to your google drive.
file = ("concap-en-nl.csv")
df = pd.read_csv(file)
# 1. Prepare the dataset

def preprocess_row(row):
    # Construct the question and context
    question = f"What is the capital of {row['CountryName']}?"
    capital = row['CapitalName']
    context = summery(capital)
    answer = capital
    if type(capital) == float and math.isnan(capital):
      answer = "None"
      context = f"{row['CountryName']} doesn't have a capital"
    return pd.Series({"question": question, "context": context, "answer": answer})

formatted_df = df.apply(preprocess_row, axis=1)
formatted_df.to_csv("formatted_dataset.csv", index=False)

formatted_dataset = Dataset.from_pandas(formatted_df)

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def process_data_to_model_inputs(batch):
    # Tokenize the context/questions pairs
    inputs = tokenizer(batch['question'], batch['context'], padding='max_length', truncation=True, max_length=512, return_offsets_mapping=True,
)
    # Get the start and end position of the answer in the tokenized context
    start_positions = []
    end_positions = []
    offset_mapping = inputs.pop("offset_mapping")
    answers = batch["answer"]
    contexts = batch["context"]

    for i, offset in enumerate(offset_mapping):
      answer = answers[i]
      context = contexts[i]
      start_char = context.find(answer)
      end_char = start_char + len(answer)
      sequence_ids = inputs.sequence_ids(i)
      input_ids = inputs["input_ids"][i]
      cls_index = input_ids.index(tokenizer.cls_token_id)

      if start_char == -1:
          # If answer not found in context, just use first token
          start_positions.append(cls_index)
          end_positions.append(cls_index)
          continue

      idx = 0
      while sequence_ids[idx] != 1:
          idx += 1
      context_start = idx
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1

      if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
          start_positions.append(0)
          end_positions.append(0)
      else:
          # Otherwise it's the start and end token positions
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:
              idx += 1
          start_positions.append(idx - 1)

          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:
              idx -= 1
          end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


# Prepare the dataset
tokenized_datasets = formatted_dataset.map(process_data_to_model_inputs, batched=True)
# Assuming 90% for training and 10% for evaluation
train_dataset = tokenized_datasets.train_test_split(test_size=0.1)["train"]
eval_dataset = tokenized_datasets.train_test_split(test_size=0.1)["test"]

Map:   0%|          | 0/245 [00:00<?, ? examples/s]

# 3. Model training
We now load in the pre-trained cased multilingual BERT model, and train it using the hyperparameters specified in the report.

In [4]:
# 2. Initialize model and training
import logging
logging.basicConfig(level=logging.INFO)

model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased')
args = TrainingArguments(
    output_dir="./qa_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=None,
)


trainer.train()

# Save the model
trainer.save_model("./fine_tuned_mbert_cased_qa")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,4.9221
20,2.4597
30,0.9571
40,0.557
50,0.5643
60,0.3996
70,0.3734
80,0.4602
90,0.3384
100,0.3366


In [5]:
from transformers import pipeline
model_base = BertForQuestionAnswering.from_pretrained("bert-base-multilingual-cased")
base_qa = pipeline("question-answering", model=model_base, tokenizer=tokenizer)
#q = "Wat is de hoofdstad van China?"
#print("Without finetuning: ", base_qa(question=q, context='Peking is de hoofdstad van China'))
#
model_finetuned = BertForQuestionAnswering.from_pretrained("./fine_tuned_mbert_cased_qa")
finetuned_qa = pipeline("question-answering", model=model_finetuned, tokenizer=tokenizer)
#q = "Wat is de hoofdstad van China?"
#print("With finetuning: ", model_finetuned(question=q, context='Peking is de hoofdstad van China'))

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 4. Evaluation
Finally, we evaluate the fine-tuned model against the base BERT model, and save the results.

In [6]:
def preprocess(row):
  capital = row['CapitalName']
  country = row['CountryName']
  question_en = f"What's the of {country}?"
  # context_en = f"{capital} is the capital of {country}"
  context_en = f"The capital of the {country} is {capital}"
  base_ans_en = base_qa(question=question_en, context=context_en)['answer']
  fine_ans_en = finetuned_qa(question=question_en, context=context_en)['answer']

  country_nl = row['CountryNameDutch']
  capital_nl = row['CapitalNameDutch']
  question_nl = f"Wat is de hoofdstad van {country_nl}?"
  # context_nl = f"{capital_nl} is de hoofdstad van {country_nl}"
  context_nl = f"De hoofdstad van {country_nl} is {capital_nl}"
  base_ans_nl = base_qa(question=question_nl, context=context_nl)['answer']
  fine_ans_nl = finetuned_qa(question=question_nl, context=context_nl)['answer']
  return pd.Series({"country_en": country, "answer_en": capital, "answer_nl": capital_nl, "base_answer_en": base_ans_en, "finetune_answer_en": fine_ans_en , "base_answer_nl": base_ans_nl, "finetune_answer_nl": fine_ans_nl})


ans_df = df.apply(preprocess, axis=1)
ans_df.to_csv("result.csv", index=False)

In [7]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def accuracy(lan, ans_df):
  base_correct = 0
  fine_correct = 0
  total = 0
  for index, row in ans_df.iterrows():
    ans = row[f"answer_{lan}"]
    base = row[f"base_answer_{lan}"]
    fine = row[f"finetune_answer_{lan}"]


    if type(ans) == float and math.isnan(ans):
      ans = ""
    base_right = False
    fine_right = False
    if similar(ans.lower(), base.lower()) > .95:
      base_right = True
      base_correct+=1
    if similar(ans.lower(), fine.lower()) > .95:
      fine_right = True
      fine_correct+=1

    # if not (fine_right and base_right):
    #   print(f"lan: {lan} Country:{country} Actual: {ans} Base: {base} Finetuned: {fine}")

    total += 1
  print(f"lan={lan} base accuracy: {base_correct/total}, fine accuracy {fine_correct/total}")

accuracy("en", ans_df)
accuracy("nl", ans_df)



lan=en base accuracy: 0.2612244897959184, fine accuracy 0.9428571428571428
lan=nl base accuracy: 0.23265306122448978, fine accuracy 0.8979591836734694
