##### Week 4

We now move from binary classification to span-based QA, i.e., identifying the span in the document that answers the question, when it is answerable.
Let k be the number of members in your group. Using the training data, implement k different sequence labellers for each of the three languages, which predict which tokens in a document are part of the answer to the correspond- ing question. Evaluate the sequence labellers on the respective validation sets, report and analyse the performance for each language and compare the scores across languages.

In [33]:
# !pip install bpemb
# !pip install gensim
# !python -m spacy download en_core_web_sm
# !pip install fasttext
# !pip install datasets
# !pip install sklearn

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import io
from math import log
from numpy import array
from numpy import argmax
import torch
import random
from math import log
from numpy import array
from numpy import argmax
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq


In [5]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

In [6]:
# Preamble 
import sys 
sys.path.append('..')

In [7]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

print(len(df_train))
print(len(df_val))

df_train.head()


Found cached dataset parquet (/Users/emmastoklundlee/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

116067
13325


Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [8]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']


# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']


In [9]:
from transformers import AutoTokenizer
mbert_tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def tokenize(df, key, transformer_model):
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]


def answer_text(df):
    # create new column with 1 if answerable, 0 if not answerable
    df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
    # drop all rows with answerable = 0
    # df = df[df['answerable'] == 1]
    # return answer_text from annotations
    df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
    # create new column with answer_start converted to int
    df['answer_start_int'] = df['annotations'].apply(lambda x: int(x['answer_start'][0]))
    
    return df


In [10]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Load spaCy model (you can choose a different model if needed)
nlp = spacy.load("en_core_web_sm")


In [11]:
from transformers import AutoTokenizer
import torch


def label(df):
    # Initialize labels with zeros for each document token for each document in the dataframe
    df['labels'] = df['document_plaintext'].apply(lambda x: [0] * len(mbert_tokeniser.tokenize(x)))

    # Tokenize the answer text
    df['answer_text_tokenized'] = df['answer_text'].apply(mbert_tokeniser.tokenize)

    # Tokenize and process the document plaintext
    df['document_plaintext_tokenized'] = df['document_plaintext'].apply(mbert_tokeniser.tokenize)

    # Find the starting index of answer_text in document_plaintext for each document
    df['start_index'] = df.apply(lambda x: x['document_plaintext_tokenized'].index(x['answer_text_tokenized'][0]) if x['answer_text_tokenized'] and x['answer_text_tokenized'][0] in x['document_plaintext_tokenized'] else -1, axis=1)

    # Mark the corresponding tokens in document_plaintext_tokenized with 1
    df['labels'] = df.apply(lambda x: [1 if i >= x['start_index'] and i < x['start_index'] + len(x['answer_text_tokenized']) else 0 for i in range(len(x['document_plaintext_tokenized']))], axis=1)
    df['iob_tags'] = df.apply(lambda x: ['O' if i < x['start_index'] or i >= x['start_index'] + len(x['answer_text_tokenized']) else 'B' if i == x['start_index'] else 'I' for i in range(len(x['document_plaintext_tokenized']))], axis=1)


    return df




In [12]:
df_train_english = label(answer_text(df_train_english))
df_val_english = label(answer_text(df_val_english))
df_train_bengali = label(answer_text(df_train_bengali))
df_val_bengali = label(answer_text(df_val_bengali))
df_train_arabic = label(answer_text(df_train_arabic))
df_val_arabic = label(answer_text(df_val_arabic))
df_train_indonesian = label(answer_text(df_train_indonesian))
df_val_indonesian = label(answer_text(df_val_indonesian))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_start_int'] = df['annotatio

In [45]:
# import numpy as np
# from bpemb import BPEmb

# import numpy as np
# from bpemb import BPEmb

# def load_pretrained_embeddings(lang_code, dim=100, vs=25000):
#     # Initialize BPEmb for the specified language
#     bpemb_lang = BPEmb(lang=lang_code, dim=dim, vs=vs)

#     # Extract the embeddings
#     pretrained_embeddings = bpemb_lang.emb.vectors

#     # Define the [PAD] token embedding as all zeros
#     pad_embedding = np.zeros(shape=(1, dim))

#     # Concatenate the embeddings with the [PAD] token
#     pretrained_embeddings_with_pad = np.concatenate([pretrained_embeddings, pad_embedding], axis=0)

#     # Extract the vocab and add an extra [PAD] token
#     vocabulary = bpemb_lang.emb.index_to_key + ['[PAD]']

#     # Create a dictionary from the embeddings
#     embedding_dict = {token: embedding for token, embedding in zip(vocabulary, pretrained_embeddings_with_pad)}

#     # Ensure that the shape of pretrained_embeddings_with_pad is correct
#     print(pretrained_embeddings_with_pad.shape)

#     return embedding_dict

# # Example usage:
# # Load pretrained embeddings for Indonesian
# indonesian_embeddings = load_pretrained_embeddings(lang_code='id')

# # Load pretrained embeddings for Arabic
# arabic_embeddings = load_pretrained_embeddings(lang_code='ar')

# # Load pretrained embeddings for Bengali
# bengali_embeddings = load_pretrained_embeddings(lang_code='bn')

# # Load pretrained embeddings for English
# english_embeddings = load_pretrained_embeddings(lang_code='en')



# # Define a function to tokenize and embed text
# def embed_text(df, embedding_dict):
#     tokenized_text_list = []

#     for document_text in df['document_plaintext_tokenized']:
#         # Tokenize the document text
#         tokens = [token for token in document_text]

#         # Map tokens to embeddings using the dictionary
#         token_embeddings = [embedding_dict.get(token, embedding_dict['[PAD]']) for token in tokens]

#         # Append the token embeddings to the list
#         tokenized_text_list.extend(token_embeddings)

#     # Return the sequence embeddings as a NumPy array
#     sequence_embedding = np.array(tokenized_text_list)
#     return sequence_embedding




In [46]:
# import torch
# from transformers import BertTokenizer, BertModel

# # Load the pre-trained BERT model and tokenizer
# model_name = "bert-base-multilingual-uncased"  # You can choose a different BERT variant
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertModel.from_pretrained(model_name)

# def bert_embeddings(df):
#     # Assuming you have a DataFrame 'df' with a column 'document_plaintext_tokenized'

#     # Tokenize the text in your DataFrame
#     tokenized_texts = [tokenizer.encode(doc, add_special_tokens=True) for doc in df['document_plaintext_tokenized']]

#     # Truncate or pad the sequences to a fixed length
#     max_seq_length = 128  # Set your desired maximum sequence length
#     truncated_padded_tokenized_texts = []
#     for tokens in tokenized_texts:
#         if len(tokens) > max_seq_length:
#             tokens = tokens[:max_seq_length]  # Truncate if longer than max_seq_length
#         else:
#             padding_length = max_seq_length - len(tokens)
#             tokens = tokens + [tokenizer.pad_token_id] * padding_length
#         truncated_padded_tokenized_texts.append(tokens)

#     # Convert tokenized texts to tensors
#     input_ids = torch.tensor(truncated_padded_tokenized_texts)

#     # Generate BERT embeddings
#     with torch.no_grad():
#         outputs = model(input_ids)
#         embeddings = outputs.last_hidden_state  # Use the embeddings from the last layer

#     # The 'embeddings' variable now contains the BERT embeddings for your tokenized text
#     return embeddings



In [13]:
# import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Tokenize the text
vectorizer = CountVectorizer()

# Indonesian
X_train_indonesian_tokenized = vectorizer.fit_transform(df_train_indonesian['document_plaintext'].ravel())
X_val_tokenized_indonesian = vectorizer.transform(df_val_indonesian['document_plaintext'].ravel())
Y_train_indonesian = df_train_indonesian['iob_tags'].ravel()
Y_val_indonesian = df_val_indonesian['iob_tags'].ravel()
# X_val_tokenized_indonesian = vectorizer.transform(X_val_indonesian.ravel())

# # Bengali
# X_train_bengali_tokenized = vectorizer.fit_transform(X_train_bengali.ravel())
# X_val_tokenized_bengali = vectorizer.transform(X_val_bengali.ravel())

# # Arabic
# X_train_arabic_tokenized = vectorizer.fit_transform(X_train_arabic.ravel())
# X_val_tokenized_arabic = vectorizer.transform(X_val_arabic.ravel())

# # English
# X_train_english_tokenized = vectorizer.fit_transform(X_train_english.ravel())
# X_val_tokenized_english = vectorizer.transform(X_val_english.ravel())


In [None]:
# add padding of Os to the end of each list in Y_train_indonesian and Y_val_indonesian
max_length = X_train_indonesian_tokenized.shape[1]
Y_train_indonesian = [label + ["O"] * (max_length - len(label)) for label in Y_train_indonesian]
Y_val_indonesian = [label + ["O"] * (max_length - len(label)) for label in Y_val_indonesian]

In [21]:
# fit random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Indonesian
rf_indonesian = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_indonesian.fit(X_train_indonesian_tokenized, Y_train_indonesian)
Y_pred_indonesian = rf_indonesian.predict(X_val_tokenized_indonesian)


ValueError: Unknown label type: 'unknown'

In [48]:
# seq_embed_train_english = tfidf_embeddings(df_train_english)
# seq_embed_val_english = tfidf_embeddings(df_val_english)
# seq_embed_train_bengali = tfidf_embeddings(df_train_bengali)
# seq_embed_val_bengali = tfidf_embeddings(df_val_bengali)
# seq_embed_train_arabic = tfidf_embeddings(df_train_arabic)
# seq_embed_val_arabic = tfidf_embeddings(df_val_arabic)
# seq_embed_train_indonesian = tfidf_embeddings(df_train_indonesian)
# seq_embed_val_indonesian = tfidf_embeddings(df_val_indonesian)



In [49]:
# # Determine the maximum length of TF-IDF vectors
# max_tfidf_length = seq_embed_train_english.shape[1]

# # Add padding ("O" tags) to the end of each IOB tag sequence to match the maximum length
# padded_iob_tags = [tags + ["O"] * (max_tfidf_length - len(tags)) for tags in english_train_iob_labels]

In [50]:
len(padded_iob_tags[0])

50046

In [51]:
seq_embed_train_english.shape

(7389, 50046)

In [52]:
# reindex dataframe
df_train_english = df_train_english.reset_index(drop=True)
df_val_english = df_val_english.reset_index(drop=True)
df_train_bengali = df_train_bengali.reset_index(drop=True)
df_val_bengali = df_val_bengali.reset_index(drop=True)
df_train_arabic = df_train_arabic.reset_index(drop=True)
df_val_arabic = df_val_arabic.reset_index(drop=True)
df_train_indonesian = df_train_indonesian.reset_index(drop=True)
df_val_indonesian = df_val_indonesian.reset_index(drop=True)


In [53]:
# create one long list of the lists in df_train_english['labels']

def get_labels(df):
    labels = []
    for i in range(len(df)):
        labels.append(df['labels'][i])
    return labels

In [54]:
def get_iob_labels(df):
    labels = []
    for i in range(len(df)):
        labels.append(df['iob_tags'][i])
    return labels

In [55]:
english_train_labels = get_labels(df_train_english)
english_val_labels = get_labels(df_val_english)
bengali_train_labels = get_labels(df_train_bengali)
bengali_val_labels = get_labels(df_val_bengali)
arabic_train_labels = get_labels(df_train_arabic)
arabic_val_labels = get_labels(df_val_arabic)
indonesian_train_labels = get_labels(df_train_indonesian)
indonesian_val_labels = get_labels(df_val_indonesian)

In [56]:
english_train_iob_labels = get_iob_labels(df_train_english)
english_val_iob_labels = get_iob_labels(df_val_english)
bengali_train_iob_labels = get_iob_labels(df_train_bengali)
bengali_val_iob_labels = get_iob_labels(df_val_bengali)
arabic_train_iob_labels = get_iob_labels(df_train_arabic)
arabic_val_iob_labels = get_iob_labels(df_val_arabic)
indonesian_train_iob_labels = get_iob_labels(df_train_indonesian)
indonesian_val_iob_labels = get_iob_labels(df_val_indonesian)


In [57]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Convert the CSR matrix to a dense NumPy array
seq_embed_train_english_dense = seq_embed_train_english.toarray()

# Define the model
model_english = LogisticRegression()

# Convert the list of labels to a NumPy array
# Flatten the list of lists into a 1D list
english_train_iob_labels = [label for sublist in english_train_iob_labels for label in sublist]

# Now, convert it to a NumPy array
english_train_iob_labels = np.array(english_train_iob_labels)


# Fit the model
model_english.fit(seq_embed_train_english_dense, english_train_iob_labels)

# Now, you should be able to use the fitted model



ValueError: Found input variables with inconsistent numbers of samples: [7389, 983911]

In [None]:
print(seq_embed_train_english.shape)
print(len(padded_iob_tags))


(7389, 50046)
7389


In [None]:
type(seq_embed_train_english)

scipy.sparse._csr.csr_matrix

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# define parameters

# model_english = LogisticRegression()
# model_english.fit(seq_embed_train_english, padded_iob_tags)

# # English
# y_pred_english = model_english.predict(seq_embed_val_english)
# print()
# print("ENGLISH - Logistic Regression")
# print("Accuracy:", accuracy_score(padded_iob_tags, y_pred_english))
# print("Precision:", precision_score(padded_iob_tags, y_pred_english, average='weighted'))
# print("Recall:", recall_score(padded_iob_tags, y_pred_english, average='weighted'))
# print("F1:", f1_score(padded_iob_tags, y_pred_english, average='weighted'))

# model_bengali = LogisticRegression()
# model_bengali.fit(seq_embed_train_bengali, bengali_train_iob_labels)

# # Bengali
# y_pred_bengali = model_bengali.predict(seq_embed_val_bengali)
# print()
# print("BENGALI - Logistic Regression")
# print("Accuracy:", accuracy_score(bengali_val_iob_labels, y_pred_bengali))
# print("Precision:", precision_score(bengali_val_iob_labels, y_pred_bengali, average='weighted'))
# print("Recall:", recall_score(bengali_val_iob_labels, y_pred_bengali, average='weighted'))
# print("F1:", f1_score(bengali_val_iob_labels, y_pred_bengali, average='weighted'))

# model_arabic = LogisticRegression()
# model_arabic.fit(seq_embed_train_arabic, arabic_train_labels)

# # Arabic
# y_pred_arabic = model_arabic.predict(seq_embed_val_arabic)
# print()
# print("ARABIC - Logistic Regression")
# print("Accuracy:", accuracy_score(arabic_val_labels, y_pred_arabic))
# print("Precision:", precision_score(arabic_val_labels, y_pred_arabic))
# print("Recall:", recall_score(arabic_val_labels, y_pred_arabic))
# print("F1:", f1_score(arabic_val_labels, y_pred_arabic))

model_indonesian = LogisticRegression()
model_indonesian.fit(X_train_indonesian_tokenized, Y_train_indonesian)

# Indonesian
y_pred_indonesian = model_indonesian.predict(X_val_tokenized_indonesian)
print()
print("INDONESIAN - Logistic Regression")
print("Accuracy:", accuracy_score(Y_val_indonesian, y_pred_indonesian, average='weighted'))
print("Precision:", precision_score(Y_val_indonesian, y_pred_indonesian, average='weighted'))
print("Recall:", recall_score(Y_val_indonesian, y_pred_indonesian, average='weighted'))
print("F1:", f1_score(Y_val_indonesian, y_pred_indonesian, average='weighted'))



NameError: name 'X_train_indonesian_tokenized' is not defined

In [25]:
# Assuming you have a list of tokens
token_list = seq_embed_val_english

# Create an empty list to store the classifications
classifications = []

# Loop through each token and classify it using your scikit-learn model
for token in token_list:
    # Perform classification using your model (replace with your actual model)
    classification = model_english.predict([token])  # Assuming your_model is trained and can classify a single token
    classifications.append(classification)

# Now, 'classifications' contains the classification results for each token
print(classifications[:5])


[array(['O'], dtype='<U1'), array(['O'], dtype='<U1'), array(['O'], dtype='<U1'), array(['O'], dtype='<U1'), array(['O'], dtype='<U1')]


In [40]:
model_english.predict(seq_embed_val_english[5].reshape(1, -1))

  proba /= len(self.estimators_)


AttributeError: 'list' object has no attribute 'take'

In [30]:

# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
# model_indonesian = RandomForestClassifier()
# model_bengali = RandomForestClassifier()
# model_arabic = RandomForestClassifier()
# model_english = RandomForestClassifier()

# # Fit the model to the data
# model_english.fit(seq_embed_train_english, english_train_iob_labels)
# model_indonesian.fit(seq_embed_train_indonesian, indonesian_train_labels)
# model_bengali.fit(seq_embed_train_bengali, bengali_train_labels)
# model_arabic.fit(seq_embed_train_arabic, arabic_train_labels)


# Evaluate the model
# # Indonesian
model_indonesian = RandomForestClassifier()
model_indonesian.fit(X_train_indonesian_tokenized, Y_train_indonesian)

# Indonesian
y_pred_indonesian = model_indonesian.predict(X_val_tokenized_indonesian)
print()
print("INDONESIAN - Logistic Regression")
print("Accuracy:", accuracy_score(Y_val_indonesian, y_pred_indonesian))
print("Precision:", precision_score(Y_val_indonesian, y_pred_indonesian))
print("Recall:", recall_score(Y_val_indonesian, y_pred_indonesian))
print("F1:", f1_score(Y_val_indonesian, y_pred_indonesian))

# # Bengali
# y_pred_bengali = model_bengali.predict(seq_embed_val_bengali)
# print()
# print("BENGALI - Random Forest")
# print("Accuracy:", accuracy_score(bengali_val_labels, y_pred_bengali))
# print("Precision:", precision_score(bengali_val_labels, y_pred_bengali))
# print("Recall:", recall_score(bengali_val_labels, y_pred_bengali))
# print("F1:", f1_score(bengali_val_labels, y_pred_bengali))


# # Arabic
# y_pred_arabic = model_arabic.predict(seq_embed_val_arabic)
# print()
# print("ARABIC - Random Forest")
# print("Accuracy:", accuracy_score(arabic_val_labels, y_pred_arabic))
# print("Precision:", precision_score(arabic_val_labels, y_pred_arabic))
# print("Recall:", recall_score(arabic_val_labels, y_pred_arabic))
# print("F1:", f1_score(arabic_val_labels, y_pred_arabic))

# English
# y_pred_english = model_english.predict(seq_embed_val_english)
# print()
# print("ENGLISH - Random Forest")
# print("Accuracy:", accuracy_score(english_val_iob_labels, y_pred_english))
# print("Precision:", precision_score(english_val_iob_labels, y_pred_english))
# print("Recall:", recall_score(english_val_iob_labels, y_pred_english))
# print("F1:", f1_score(english_val_iob_labels, y_pred_english))


: 

: 

In [29]:
# add padding of Os to the end of each list in Y_train_indonesian and Y_val_indonesian
max_length = X_train_indonesian_tokenized.shape[1]
Y_train_indonesian = [label + ["O"] * (max_length - len(label)) for label in Y_train_indonesian]
Y_val_indonesian = [label + ["O"] * (max_length - len(label)) for label in Y_val_indonesian]


In [None]:
! pip install transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)


# Create a tqdm progress bar to track the processing
with tqdm(total=len(ner_results)) as pbar:
    for result in ner_results:
        # Process each NER result here if needed
        print(result)
        # Update the progress bar
        pbar.update(1)



In [None]:
# version using the larger model
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp_large = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# create new column in df applying nlp to document_plaintext_tokenized
def ner(df):
    df['ner'] = df['document_plaintext'].apply(lambda x: nlp(x))
    return df


In [None]:
# create new column in df applying nlp to document_plaintext_tokenized
def ner_large(df):
    df['ner_large'] = df['document_plaintext'].apply(lambda x: nlp_large(x))
    return df