##### Week 4

We now move from binary classification to span-based QA, i.e., identifying the span in the document that answers the question, when it is answerable.
Let k be the number of members in your group. Using the training data, implement k different sequence labellers for each of the three languages, which predict which tokens in a document are part of the answer to the correspond- ing question. Evaluate the sequence labellers on the respective validation sets, report and analyse the performance for each language and compare the scores across languages.

In [1]:
!pip install bpemb
!pip install gensim
!python -m spacy download en_core_web_sm
!pip install fasttext
!pip install datasets
!pip install sklearn

/Users/emmastoklundlee/opt/anaconda3/envs/gensim_update/bin/python: No module named spacy


In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import io
from math import log
from numpy import array
from numpy import argmax
import torch
import random
from math import log
from numpy import array
from numpy import argmax
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq

In [5]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

In [6]:
# Preamble 
import sys 
sys.path.append('..')

In [59]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

print(len(df_train))
print(len(df_val))

df_train.head()


Found cached dataset parquet (/Users/emmastoklundlee/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

116067
13325


Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [97]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']


# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']


In [98]:
from transformers import AutoTokenizer
mbert_tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def tokenize(df, key, transformer_model):
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]


def answer_text(df):
    # create new column with 1 if answerable, 0 if not answerable
    df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
    # drop all rows with answerable = 0
    df = df[df['answerable'] == 1]
    # return answer_text from annotations
    df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
    # create new column with answer_start converted to int
    df['answer_start_int'] = df['annotations'].apply(lambda x: int(x['answer_start'][0]))
    
    return df


In [99]:
df_train_english = answer_text(df_train_english)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_start_int'] = df['annotatio

In [100]:
def all_tokenize(df):
    tokenize(df, 'answer_text', mbert_tokeniser)
    tokenize(df, 'document_plaintext', mbert_tokeniser)
    return df

In [115]:
def split_words_to_characters(df):
    """
    Split tokenized words in a DataFrame into individual characters and save them in a new column.

    Args:
        df (pandas.DataFrame): The DataFrame containing tokenized words.
        word_column_name (str): The name of the column containing tokenized words.
        new_column_name (str): The name of the new column to store individual characters.

    Returns:
        pandas.DataFrame: The DataFrame with the new column added.
    """
    answer_text_char = []
    document_text_char = []

    for index, row in df.iterrows():
        answer_text = row['answer_text']
        document_text = row['document_plaintext']
        chars_ans = []
        chars_doc = []

        for word in answer_text:
            chars_ans.extend(list(word))  # Split word into individual characters and extend the list
        for word in document_text:
            chars_doc.extend(list(word))
        
        answer_text_char.append(chars_ans)
        document_text_char.append(chars_doc)

    df['answer_text_char'] = answer_text_char
    df['document_text_char'] = document_text_char
    return df




In [118]:
df_train_english = split_words_to_characters(df_train_english)

In [122]:
# return length of answer_text_tokenized
def answer_length(df):
    df['answer_length'] = df['answer_text_char'].apply(lambda x: len(x))
    return df

In [123]:
df_train_english = answer_length(df_train_english)

In [124]:
# create bio tags for document_plaintext_tokenized where B is index of answer_start_int and I is index of answer_start_int + answer_length, and other are 0
def bio_tags(df):
    df['bio_tags'] = df.apply(lambda x: ['O' if i < x['answer_start_int'] or i >= x['answer_start_int'] + x['answer_length'] else 'B' if i == x['answer_start_int'] else 'I' for i in range(len(x['document_text_char']))], axis=1)
    return df


In [125]:
df_train_english = bio_tags(df_train_english)

In [136]:
df_train_english.head()

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,answerable,answer_text,answer_start_int,answer_text_tokenized,document_plaintext_tokenized,answer_length,bio_tags,answer_text_tokenized_split,answer_text_char,document_text_char
26,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,1,1920s,159,[1920s],"[quantum, field, theory, naturally, began, wit...",5,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",[1920s],"[1, 9, 2, 0, s]","[Q, u, a, n, t, u, m, , f, i, e, l, d, , t, ..."
43,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,1,Sully Prudhomme,610,"[sull, ##y, pr, ##ud, ##hom, ##me]","[the, nobel, prize, in, literature, (, swedish...",15,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[sull, ##y, pr, ##ud, ##hom, ##me]","[S, u, l, l, y, , P, r, u, d, h, o, m, m, e]","[T, h, e, , N, o, b, e, l, , P, r, i, z, e, ..."
112,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,1,discourse between two or more people holding d...,129,"[discourse, between, two, or, more, people, ho...","[dialect, ##ic, or, dialect, ##ics, (, greek, ...",147,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[discourse, between, two, or, more, people, ho...","[d, i, s, c, o, u, r, s, e, , b, e, t, w, e, ...","[D, i, a, l, e, c, t, i, c, , o, r, , d, i, ..."
123,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,1,Sejong the Great,88,"[se, ##jong, the, great]","[hangul, was, personally, created, and, promu,...",16,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[se, ##jong, the, great]","[S, e, j, o, n, g, , t, h, e, , G, r, e, a, t]","[H, a, n, g, u, l, , w, a, s, , p, e, r, s, ..."
125,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,1,"Grasshoppers are plant-eaters, with a few spec...",0,"[grasshoppers, are, plant, -, eat, ##ers, ,, w...","[grasshoppers, are, plant, -, eat, ##ers, ,, w...",207,"[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ...","[grasshoppers, are, plant, -, eat, ##ers, ,, w...","[G, r, a, s, s, h, o, p, p, e, r, s, , a, r, ...","[G, r, a, s, s, h, o, p, p, e, r, s, , a, r, ..."
