##### Week 4

We now move from binary classification to span-based QA, i.e., identifying the span in the document that answers the question, when it is answerable.
Let k be the number of members in your group. Using the training data, implement k different sequence labellers for each of the three languages, which predict which tokens in a document are part of the answer to the correspond- ing question. Evaluate the sequence labellers on the respective validation sets, report and analyse the performance for each language and compare the scores across languages.

In [2]:
!pip install bpemb
!pip install gensim
!python -m spacy download en_core_web_sm
!pip install fasttext
!pip install datasets
!pip install sklearn

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
import io
from math import log
from numpy import array
from numpy import argmax
import torch
import random
from math import log
from numpy import array
from numpy import argmax
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq

In [6]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

In [7]:
# Preamble 
import sys 
sys.path.append('..')

In [8]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

print(len(df_train))
print(len(df_val))

df_train.head()


Found cached dataset parquet (/Users/emmastoklundlee/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

116067
13325


Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [280]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']


# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']


In [281]:
from transformers import AutoTokenizer
mbert_tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def tokenize(df, key, transformer_model):
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]


def answer_text(df):
    # create new column with 1 if answerable, 0 if not answerable
    df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
    # drop all rows with answerable = 0
    # df = df[df['answerable'] == 1]
    # return answer_text from annotations
    df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
    # create new column with answer_start converted to int
    df['answer_start_int'] = df['annotations'].apply(lambda x: int(x['answer_start'][0]))
    
    return df


In [282]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Load spaCy model (you can choose a different model if needed)
nlp = spacy.load("en_core_web_sm")


In [291]:
from transformers import AutoTokenizer
import torch


def label(df):
    # Initialize labels with zeros for each document token for each document in the dataframe
    df['labels'] = df['document_plaintext'].apply(lambda x: [0] * len(mbert_tokeniser.tokenize(x)))

    # Tokenize the answer text
    df['answer_text_tokenized'] = df['answer_text'].apply(mbert_tokeniser.tokenize)

    # Tokenize and process the document plaintext
    df['document_plaintext_tokenized'] = df['document_plaintext'].apply(mbert_tokeniser.tokenize)

    # Find the starting index of answer_text in document_plaintext for each document
    df['start_index'] = df.apply(lambda x: x['document_plaintext_tokenized'].index(x['answer_text_tokenized'][0]) if x['answer_text_tokenized'] and x['answer_text_tokenized'][0] in x['document_plaintext_tokenized'] else -1, axis=1)

    # Mark the corresponding tokens in document_plaintext_tokenized with 1
    df['labels'] = df.apply(lambda x: [1 if i >= x['start_index'] and i < x['start_index'] + len(x['answer_text_tokenized']) else 0 for i in range(len(x['document_plaintext_tokenized']))], axis=1)


    return df




In [292]:
df_train_english = label(answer_text(df_train_english))
df_val_english = label(answer_text(df_val_english))
df_train_bengali = label(answer_text(df_train_bengali))
df_val_bengali = label(answer_text(df_val_bengali))
df_train_arabic = label(answer_text(df_train_arabic))
df_val_arabic = label(answer_text(df_val_arabic))
df_train_indonesian = label(answer_text(df_train_indonesian))
df_val_indonesian = label(answer_text(df_val_indonesian))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_start_int'] = df['annotatio

In [293]:
# count values in answerable column
print(df_train_english['answerable'].value_counts())

answerable
1    3696
0    3693
Name: count, dtype: int64


In [146]:
df_train_arabic['answer_text'][6]

'مسألة وجود العثمانيين المسلمين في أوروبا وطردهم منها واستعادة القسطنطينية من العثمانيين بعد سقوطها في 1453 وتهديد مصالح الدول الأوروبية في هذه المنطقة'

In [294]:
import numpy as np
from bpemb import BPEmb

# Load English model with 25k word-pieces
bpemb_id = BPEmb(lang='eng', dim=100, vs=25000)

# Assuming bpemb_id is your pre-trained word embeddings (e.g., fastText, Word2Vec, or GloVe)

# Extract the embeddings
pretrained_embeddings = bpemb_id.emb.vectors

# Define the [PAD] token embedding as all zeros
pad_embedding = np.zeros(shape=(1, 100))

# Concatenate the embeddings with the [PAD] token
pretrained_embeddings_with_pad = np.concatenate([pretrained_embeddings, pad_embedding], axis=0)

# Extract the vocab and add an extra [PAD] token
vocabulary = bpemb_id.emb.index_to_key + ['[PAD]']

# Create a dictionary from the embeddings
embedding_dict = {token: embedding for token, embedding in zip(vocabulary, pretrained_embeddings_with_pad)}

# Ensure that the shape of pretrained_embeddings_with_pad is correct
print(pretrained_embeddings_with_pad.shape)


# Define a function to tokenize and embed text
def embed_text(df, embedding_dict):
    tokenized_text_list = []

    for document_text in df['document_plaintext_tokenized']:
        # Tokenize the document text
        tokens = [token for token in document_text]

        # Map tokens to embeddings using the dictionary
        token_embeddings = [embedding_dict.get(token, embedding_dict['[PAD]']) for token in tokens]

        # Append the token embeddings to the list
        tokenized_text_list.extend(token_embeddings)

    # Return the sequence embeddings as a NumPy array
    sequence_embedding = np.array(tokenized_text_list)
    return sequence_embedding

# Usage:
sequence_embedding = embed_text(df_train_english, embedding_dict)


(25001, 100)


In [295]:
pretrained_embeddings.shape

(25000, 100)

In [296]:
seq_embed_train_english = embed_text(df_train_english, embedding_dict)
seq_embed_val_englihs = embed_text(df_val_english, embedding_dict)
seq_embed_train_bengali = embed_text(df_train_bengali, embedding_dict)
seq_embed_val_bengali = embed_text(df_val_bengali, embedding_dict)
seq_embed_train_arabic = embed_text(df_train_arabic, embedding_dict)
seq_embed_val_arabic = embed_text(df_val_arabic, embedding_dict)
seq_embed_train_indonesian = embed_text(df_train_indonesian, embedding_dict)
seq_embed_val_indonesian = embed_text(df_val_indonesian, embedding_dict)

In [297]:
seq_embed_train_english.shape

(983911, 100)

In [298]:
# reindex dataframe
df_train_english = df_train_english.reset_index(drop=True)
df_val_english = df_val_english.reset_index(drop=True)
df_train_bengali = df_train_bengali.reset_index(drop=True)
df_val_bengali = df_val_bengali.reset_index(drop=True)
df_train_arabic = df_train_arabic.reset_index(drop=True)
df_val_arabic = df_val_arabic.reset_index(drop=True)
df_train_indonesian = df_train_indonesian.reset_index(drop=True)
df_val_indonesian = df_val_indonesian.reset_index(drop=True)


In [299]:
# create one long list of the lists in df_train_english['labels']

def get_labels(df):
    labels = []
    for i in range(len(df)):
        labels.extend(df['labels'][i])
    return np.array(labels)

In [300]:
english_train_labels = get_labels(df_train_english)
english_val_labels = get_labels(df_val_english)
bengali_train_labels = get_labels(df_train_bengali)
bengali_val_labels = get_labels(df_val_bengali)
arabic_train_labels = get_labels(df_train_arabic)
arabic_val_labels = get_labels(df_val_arabic)
indonesian_train_labels = get_labels(df_train_indonesian)
indonesian_val_labels = get_labels(df_val_indonesian)

In [304]:
df_train_english

Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url,answerable,answer_text,answer_start_int,labels,answer_text_tokenized,document_plaintext_tokenized,start_index
0,When was quantum field theory developed?,Quantum field theory,english,"{'answer_start': [159], 'answer_text': ['1920s']}",Quantum field theory naturally began with the ...,https://en.wikipedia.org/wiki/Quantum%20field%...,1,1920s,159,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[1920s],"[quantum, field, theory, naturally, began, wit...",31
1,Who was the first Nobel prize winner for Liter...,List of Nobel laureates in Literature,english,"{'answer_start': [610], 'answer_text': ['Sully...",The Nobel Prize in Literature (Swedish: Nobelp...,https://en.wikipedia.org/wiki/List%20of%20Nobe...,1,Sully Prudhomme,610,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[sull, ##y, pr, ##ud, ##hom, ##me]","[the, nobel, prize, in, literature, (, swedish...",120
2,When is the dialectical method used?,Dialectic,english,"{'answer_start': [129], 'answer_text': ['disco...","Dialectic or dialectics (Greek: διαλεκτική, di...",https://en.wikipedia.org/wiki/Dialectic,1,discourse between two or more people holding d...,129,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[discourse, between, two, or, more, people, ho...","[dialect, ##ic, or, dialect, ##ics, (, greek, ...",33
3,Who invented Hangul?,Origin of Hangul,english,"{'answer_start': [88], 'answer_text': ['Sejong...",Hangul was personally created and promulgated ...,https://en.wikipedia.org/wiki/Origin%20of%20Ha...,1,Sejong the Great,88,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[se, ##jong, the, great]","[hangul, was, personally, created, and, promu,...",18
4,What do Grasshoppers eat?,Grasshopper,english,"{'answer_start': [0], 'answer_text': ['Grassho...","Grasshoppers are plant-eaters, with a few spec...",https://en.wikipedia.org/wiki/Grasshopper,1,"Grasshoppers are plant-eaters, with a few spec...",0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[grasshoppers, are, plant, -, eat, ##ers, ,, w...","[grasshoppers, are, plant, -, eat, ##ers, ,, w...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7384,What was Neil Brooks' fastest recorded time?,Swimming at the 1980 Summer Olympics – Men's 4...,english,"{'answer_start': [-1], 'answer_text': ['']}",The medley relay was scheduled in the Olympisk...,https://en.wikipedia.org/wiki/Swimming%20at%20...,0,,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],"[the, medley, relay, was, scheduled, in, the, ...",-1
7385,Who are the three most important eastern philo...,Eastern philosophy,english,"{'answer_start': [-1], 'answer_text': ['']}",Sāmkhya is a dualist philosophical tradition b...,https://en.wikipedia.org/wiki/Eastern%20philos...,0,,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],"[sam, ##kh, ##ya, is, a, dual, ##ist, philosop...",-1
7386,Who was costume designer for the first Star Wa...,John Mollo,english,"{'answer_start': [-1], 'answer_text': ['']}",Mollo was surprised by the success of Star War...,https://en.wikipedia.org/wiki/John%20Mollo,0,,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],"[moll, ##o, was, surprise, ##d, by, the, succe...",-1
7387,Who developed the first thermonuclear weapon?,History of nuclear weapons,english,"{'answer_start': [-1], 'answer_text': ['']}","In the end, President Truman made the final de...",https://en.wikipedia.org/wiki/History%20of%20n...,0,,-1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[],"[in, the, end, ,, president, truman, made, the...",-1


In [301]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# define parameters

model_english = LogisticRegression()
model_english.fit(seq_embed_train_english, english_train_labels)

# English
y_pred_english = model_english.predict(seq_embed_val_englihs)
print()
print("ENGLISH - Logistic Regression")
print("Accuracy:", accuracy_score(english_val_labels, y_pred_english))
print("Precision:", precision_score(english_val_labels, y_pred_english))
print("Recall:", recall_score(english_val_labels, y_pred_english))
print("F1:", f1_score(english_val_labels, y_pred_english))

# model_bengali = LogisticRegression()
# model_bengali.fit(seq_embed_train_bengali, bengali_train_labels)

# # Bengali
# y_pred_bengali = model_bengali.predict(seq_embed_val_bengali)
# print()
# print("BENGALI - Logistic Regression")
# print("Accuracy:", accuracy_score(bengali_val_labels, y_pred_bengali))
# print("Precision:", precision_score(bengali_val_labels, y_pred_bengali))
# print("Recall:", recall_score(bengali_val_labels, y_pred_bengali))
# print("F1:", f1_score(bengali_val_labels, y_pred_bengali))

# model_arabic = LogisticRegression()
# model_arabic.fit(seq_embed_train_arabic, arabic_train_labels)

# # Arabic
# y_pred_arabic = model_arabic.predict(seq_embed_val_arabic)
# print()
# print("ARABIC - Logistic Regression")
# print("Accuracy:", accuracy_score(arabic_val_labels, y_pred_arabic))
# print("Precision:", precision_score(arabic_val_labels, y_pred_arabic))
# print("Recall:", recall_score(arabic_val_labels, y_pred_arabic))
# print("F1:", f1_score(arabic_val_labels, y_pred_arabic))

# model_indonesian = LogisticRegression()
# model_indonesian.fit(seq_embed_train_indonesian, indonesian_train_labels)

# # Indonesian
# y_pred_indonesian = model_indonesian.predict(seq_embed_val_indonesian)
# print()
# print("INDONESIAN - Logistic Regression")
# print("Accuracy:", accuracy_score(indonesian_val_labels, y_pred_indonesian))
# print("Precision:", precision_score(indonesian_val_labels, y_pred_indonesian))
# print("Recall:", recall_score(indonesian_val_labels, y_pred_indonesian))
# print("F1:", f1_score(indonesian_val_labels, y_pred_indonesian))




ENGLISH - Logistic Regression
Accuracy: 0.974860459262988
Precision: 0.0
Recall: 0.0
F1: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [303]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
# model_indonesian = RandomForestClassifier()
# model_bengali = RandomForestClassifier()
# model_arabic = RandomForestClassifier()
model_english = RandomForestClassifier()

# Fit the model to the data
model_english.fit(seq_embed_train_english, english_train_labels)
# model_indonesian.fit(seq_embed_train_indonesian, indonesian_train_labels)
# model_bengali.fit(seq_embed_train_bengali, bengali_train_labels)
# model_arabic.fit(seq_embed_train_arabic, arabic_train_labels)


# Evaluate the model
# # Indonesian
# y_pred_indonesian = model_indonesian.predict(seq_embed_val_indonesian)
# print()
# print("INDONESIAN - Random Forest")
# print("Accuracy:", accuracy_score(indonesian_val_labels, y_pred_indonesian))
# print("Precision:", precision_score(indonesian_val_labels, y_pred_indonesian))
# print("Recall:", recall_score(indonesian_val_labels, y_pred_indonesian))
# print("F1:", f1_score(indonesian_val_labels, y_pred_indonesian))

# # Bengali
# y_pred_bengali = model_bengali.predict(seq_embed_val_bengali)
# print()
# print("BENGALI - Random Forest")
# print("Accuracy:", accuracy_score(bengali_val_labels, y_pred_bengali))
# print("Precision:", precision_score(bengali_val_labels, y_pred_bengali))
# print("Recall:", recall_score(bengali_val_labels, y_pred_bengali))
# print("F1:", f1_score(bengali_val_labels, y_pred_bengali))


# # Arabic
# y_pred_arabic = model_arabic.predict(seq_embed_val_arabic)
# print()
# print("ARABIC - Random Forest")
# print("Accuracy:", accuracy_score(arabic_val_labels, y_pred_arabic))
# print("Precision:", precision_score(arabic_val_labels, y_pred_arabic))
# print("Recall:", recall_score(arabic_val_labels, y_pred_arabic))
# print("F1:", f1_score(arabic_val_labels, y_pred_arabic))

# English
y_pred_english = model_english.predict(seq_embed_val_englihs)
print()
print("ENGLISH - Random Forest")
print("Accuracy:", accuracy_score(english_val_labels, y_pred_english))
print("Precision:", precision_score(english_val_labels, y_pred_english))
print("Recall:", recall_score(english_val_labels, y_pred_english))
print("F1:", f1_score(english_val_labels, y_pred_english))


KeyboardInterrupt: 

In [None]:
! pip install transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)


# Create a tqdm progress bar to track the processing
with tqdm(total=len(ner_results)) as pbar:
    for result in ner_results:
        # Process each NER result here if needed
        print(result)
        # Update the progress bar
        pbar.update(1)



In [None]:
# version using the larger model
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp_large = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# create new column in df applying nlp to document_plaintext_tokenized
def ner(df):
    df['ner'] = df['document_plaintext'].apply(lambda x: nlp(x))
    return df


In [None]:
# create new column in df applying nlp to document_plaintext_tokenized
def ner_large(df):
    df['ner_large'] = df['document_plaintext'].apply(lambda x: nlp_large(x))
    return df