##### Week 4

We now move from binary classification to span-based QA, i.e., identifying the span in the document that answers the question, when it is answerable.
Let k be the number of members in your group. Using the training data, implement k different sequence labellers for each of the three languages, which predict which tokens in a document are part of the answer to the correspond- ing question. Evaluate the sequence labellers on the respective validation sets, report and analyse the performance for each language and compare the scores across languages.

In [187]:
!pip install bpemb
!pip install gensim
!python -m spacy download en_core_web_sm
!pip install fasttext
!pip install datasets
!pip install sklearn

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [188]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [189]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [190]:
import io
from math import log
from numpy import array
from numpy import argmax
import torch
import random
from math import log
from numpy import array
from numpy import argmax
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq

In [191]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

In [192]:
# Preamble 
import sys 
sys.path.append('..')

In [193]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

print(len(df_train))
print(len(df_val))

df_train.head()


Found cached dataset parquet (/Users/emmastoklundlee/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-cceecfb5416d988a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

116067
13325


Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [194]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']


# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']


In [195]:
from transformers import AutoTokenizer
mbert_tokeniser = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def tokenize(df, key, transformer_model):
  df.loc[:, f'{key}_tokenized'] = [transformer_model.tokenize(row) for row in df[key]]


def answer_text(df):
    # create new column with 1 if answerable, 0 if not answerable
    df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
    # drop all rows with answerable = 0
    df = df[df['answerable'] == 1]
    # return answer_text from annotations
    df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
    # create new column with answer_start converted to int
    df['answer_start_int'] = df['annotations'].apply(lambda x: int(x['answer_start'][0]))
    
    return df


In [196]:
df_train_english = answer_text(df_train_english)
df_val_english = answer_text(df_val_english)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answerable'] = df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_text'] = df['annotations'].apply(lambda x: x['answer_text'][0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer_start_int'] = df['annotatio

In [197]:
def all_tokenize(df):
    tokenize(df, 'answer_text', mbert_tokeniser)
    tokenize(df, 'document_plaintext', mbert_tokeniser)
    return df

In [198]:
def split_words_to_characters(df):
    """
    Split tokenized words in a DataFrame into individual characters and save them in a new column.

    Args:
        df (pandas.DataFrame): The DataFrame containing tokenized words.
        word_column_name (str): The name of the column containing tokenized words.
        new_column_name (str): The name of the new column to store individual characters.

    Returns:
        pandas.DataFrame: The DataFrame with the new column added.
    """
    answer_text_char = []
    document_text_char = []

    for index, row in df.iterrows():
        answer_text = row['answer_text']
        document_text = row['document_plaintext']
        chars_ans = []
        chars_doc = []

        for word in answer_text:
            chars_ans.extend(list(word))  # Split word into individual characters and extend the list
        for word in document_text:
            chars_doc.extend(list(word))
        
        answer_text_char.append(chars_ans)
        document_text_char.append(chars_doc)

    df['answer_text_char'] = answer_text_char
    df['document_text_char'] = document_text_char
    return df




In [199]:
df_train_english = split_words_to_characters(df_train_english)
df_val_english = split_words_to_characters(df_val_english)

In [200]:
# return length of answer_text_tokenized
def answer_length(df):
    df['answer_length'] = df['answer_text_char'].apply(lambda x: len(x))
    return df

In [201]:
df_train_english = answer_length(df_train_english)
df_val_english = answer_length(df_val_english)

In [202]:
# create bio tags for document_plaintext_tokenized where B is index of answer_start_int and I is index of answer_start_int + answer_length, and other are 0
def iob_tags(df):
    df['iob_tags'] = df.apply(lambda x: ['0' if i < x['answer_start_int'] or i >= x['answer_start_int'] + x['answer_length'] else '1' if i == x['answer_start_int'] else '1' for i in range(len(x['document_text_char']))], axis=1)
    return df


In [203]:
df_train_english = iob_tags(df_train_english)
df_val_english = iob_tags(df_val_english)

In [225]:
# display first row
df_train_english['iob_tags'][26]

['0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0'

In [205]:
import numpy as np
from bpemb import BPEmb

# Load english model with 25k word-pieces
bpemb_id= BPEmb(lang='eng', dim=100, vs=25000)

# Assuming bpemb_id is your pre-trained word embeddings (e.g., fastText, Word2Vec, or GloVe)

# Extract the embeddings and add a randomly initialized embedding for our extra [PAD] token
pretrained_embeddings = np.concatenate([bpemb_id.emb.vectors, np.zeros(shape=(1, 100))], axis=0)

# Extract the vocab and add an extra [PAD] token
vocabulary = bpemb_id.emb.index_to_key + ['[PAD]']


In [206]:
# Create a dictionary from the embeddings
embedding_dict = {token: embedding for token, embedding in zip(vocabulary, pretrained_embeddings)}

# Define a function to tokenize and embed text
def tokenize_and_embed_text(df, embedding_dict):
    tokenized_text_list = []

    for document_text in df['document_text_char']:
        # Tokenize the document text
        tokens = [token for token in document_text]

        # Map tokens to embeddings using the dictionary
        token_embeddings = [embedding_dict.get(token, embedding_dict['[PAD]']) for token in tokens]

        # Append the token embeddings to the list
        tokenized_text_list.extend(token_embeddings)

    # Return the sequence embeddings as a NumPy array
    sequence_embedding = np.array(tokenized_text_list)
    return sequence_embedding

# Usage:
sequence_embedding = tokenize_and_embed_text(df_train_english, embedding_dict)


In [207]:
import torch
import torch.nn as nn

class BiLSTMCharacterLabeler(nn.Module):
    def __init__(
            self,
            sequence_embedding: torch.tensor,  # Replace with sequence_embedding
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2
    ):
        super(BiLSTMCharacterLabeler, self).__init__()

        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(sequence_embedding, padding_idx=sequence_embedding.shape[0] - 1),
            'bilstm': nn.LSTM(
                sequence_embedding.shape[1],
                lstm_dim,
                1,
                batch_first=True,
                dropout=dropout_prob,
                bidirectional=True),
            'cls': nn.Linear(2 * lstm_dim, n_classes)
        })

        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=dropout_prob)

        self._init_weights()

    def _init_weights(self):
        all_params = list(self.model['bilstm'].named_parameters()) + list(self.model['cls'].named_parameters())
        for n, p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens, labels=None):
        embeds = self.model['embeddings'](inputs)

        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        lstm_out, _ = self.model['bilstm'](lstm_in)

        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        ff_in = self.dropout(torch.max(lstm_out, 1)[0])

        logits = self.model['cls'](ff_in).view(-1, self.n_classes)

        outputs = (logits,)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs


In [208]:
device = torch.device("cpu")
if torch.cuda.is_available():
  print("cuda available")
  device = torch.device("cuda")

In [226]:
# Create the model
model = BiLSTMCharacterLabeler(
    sequence_embedding=torch.FloatTensor(sequence_embedding),  # Use sequence_embedding instead of pretrained_embeddings
    lstm_dim=100,
    dropout_prob=0.1,
    n_classes=2
).to(device)





In [210]:
def accuracy(logits, labels):
  logits = np.asarray(logits).reshape(-1, len(logits[0]))
  labels = np.asarray(labels).reshape(-1)
  return np.sum(np.argmax(logits, axis=-1) == labels).astype(np.float32) / float(labels.shape[0])

In [211]:
from tqdm import tqdm_notebook as tqdm

In [227]:
def evaluate(model: nn.Module, valid_dl: DataLoader):
  """
  Evaluates the model on the given dataset
  :param model: The model under evaluation
  :param valid_dl: A `DataLoader` reading validation data
  :return: The accuracy of the model on the dataset
  """
  # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like
  # layer normalization and dropout
  model.eval()
  labels_all = []
  logits_all = []

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(valid_dl, desc='Evaluation'):
      batch = tuple(t.to(device) for t in batch)
      input_ids = batch[0]
      seq_lens = batch[1]
      labels = batch[2]

      _, logits = model(input_ids, seq_lens, labels=labels)
      labels_all.extend(list(labels.detach().cpu().numpy()))
      logits_all.extend(list(logits.detach().cpu().numpy()))
    acc = accuracy(logits_all, labels_all)

    return acc,labels_all,logits_all

In [228]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    patience: int = 10
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param valid_dl: A validation dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  :return: (model, losses) The best model and the losses per iteration
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = tuple(t.to(device) for t in batch)
      input_ids = batch[0]
      seq_lens = batch[1]
      labels = batch[2]

      # Pass the inputs through the model, get the current loss and logits
      loss, logits = model(input_ids, seq_lens, labels=labels)
      losses.append(loss.item())
      loss_epoch.append(loss.item())

      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model
      optimizer.step()
      #gc.collect()

    # Perform inline evaluation at the end of the epoch
    acc,_,_ = evaluate(model, valid_dl)
    print(f'Validation accuracy: {acc}, train loss: {sum(loss_epoch) / len(loss_epoch)}')

    # Keep track of the best model based on the accuracy
    if acc > best_acc:
      torch.save(model.state_dict(), 'best_model')
      best_acc = acc
      pcounter = 0
    else:
      pcounter += 1
      if pcounter == patience:
        break
        #gc.collect()

  model.load_state_dict(torch.load('best_model'))
  return model, losses

In [229]:
from torch.optim import Adam

In [230]:
# Define some hyperparameters
batch_size = 32
lr = 3e-4
n_epochs = 100

In [231]:
# def text_to_batch_bilstm(text: List, tokenizer, max_len=512) -> Tuple[List, List]:
#     """
#     Creates a tokenized batch for input to a bilstm model
#     :param text: A list of sentences to tokenize
#     :param tokenizer: A tokenization function to use (i.e. fasttext)
#     :return: Tokenized text as well as the length of the input sequence
#     """
#     # Some light preprocessing
#     input_ids = [tokenizer.encode_ids_with_eos(t)[:max_len] for t in text]

#     return input_ids, [len(ids) for ids in input_ids]

In [238]:
import torch

def collate_batch_bilstm(batch):
    """
    Combines multiple data samples into a single batch
    :param batch: List of data samples, where each sample is a dictionary with 'input' and 'label' keys
    :return: A tuple of tensors (input_ids, seq_lens, labels)
    """
    input_data = [sample['input'] for sample in batch]
    labels = [sample['label'] for sample in batch]

    max_length = max([len(seq) for seq in input_data])

    # Pad all of the input samples to the max length (25000 is the ID of the [PAD] token)
    input_ids = [seq + [25000] * (max_length - len(seq)) for seq in input_data]

    # Make sure each sample is max_length long
    assert all(len(seq) == max_length for seq in input_ids)

    return torch.tensor(input_ids), torch.tensor(labels)



In [233]:

# # This will load the dataset and process it lazily in the __getitem__ function
# class ClassificationDatasetReader(Dataset):
#   def __init__(self, df, tokenizer):
#     self.df = df
#     self.tokenizer = tokenizer

#   def __len__(self):
#     return len(self.df)

#   def __getitem__(self, idx):
#     row = self.df.values[idx]
#     # Calls the text_to_batch function
#     input_ids,seq_lens = text_to_batch_bilstm([row[0]], self.tokenizer)
#     label = row[1]
#     return input_ids, seq_lens, label

In [234]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, df, text_column_name, label_column_name):
        self.data = df[text_column_name].tolist()
        self.labels = df[label_column_name].tolist()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = {
            'input': self.data[idx],  # Replace with your input data (sequence_embeddings)
            'label': self.labels[idx]  # Replace with your labels
        }
        return sample

In [239]:
# Define your model, optimizer, and other hyperparameters
# Create the model
model = BiLSTMCharacterLabeler(
    sequence_embedding=torch.FloatTensor(pretrained_embeddings),  # Use sequence_embedding instead of pretrained_embeddings
    lstm_dim=100,
    dropout_prob=0.1,
    n_classes=2
).to(device)

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 10  # Define the number of training epochs
batch_size = 32  # Define your batch size

# Create the dataset readers
# Assuming 'document_text_char' and 'iob_tags' are columns in your DataFrame
# Assuming 'document_text_char' is your text data and 'iob_tags' is your label data
train_dataset = CustomDataset(df_train_english[:1000], 'document_text_char', 'iob_tags')

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm)

# Assuming 'document_text_char' and 'iob_tags' are columns in your DataFrame
val_dataset = CustomDataset(df_val_english[:1000], 'document_text_char', 'iob_tags')
valid_dl = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_batch_bilstm)

# Train
model, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(train_dl):


  0%|          | 0/32 [00:00<?, ?it/s]

ValueError: too many dimensions 'str'

In [155]:
# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)

NameError: name 'train_dl' is not defined

In [107]:
# split document_plaintext and answer text into words 
def split_words(df):
    df['document_plaintext'] = df['document_plaintext'].apply(lambda x: x.split())
    df['answer_text'] = df['answer_text'].apply(lambda x: x.split())
    return df

In [122]:
# # iterate through each row of df and create a dictionary for each row with document_text_char as key and iob_tags as value
# def char_to_iob(df):
#     df['char_to_iob'] = df.apply(lambda x: dict(zip(x['document_text_char'], x['iob_tags'])), axis=1)
#     return df

# convert iob_tags list to string
def iob_tags_to_string(df):
    df['iob_tags'] = df['iob_tags'].apply(lambda x: ' '.join(x))
    return df

def iob_to_char(df):
    # Initialize an empty dictionary
    result_dict = {}
    
    # Iterate through each row of the DataFrame
    for _, row in df.iterrows():
        iob_tags = row['iob_tags']
        char = row['document_text_char']
        
        # Check if iob_tags is already a key in the dictionary
        if iob_tags not in result_dict:
            result_dict[iob_tags] = []
        
        # Append the character to the list associated with the iob_tags key
        result_dict[iob_tags].append(char)
    
    # Add the dictionary as a new column in the DataFrame
    df['iob_to_char'] = df['iob_tags'].map(result_dict)
    
    return df

# def char_to_iob(df):
#     # Initialize an empty dictionary
#     result_dict = {}
    
#     # Iterate through each row of the DataFrame
#     for _, row in df.iterrows():
#         iob_tags = row['iob_tags']
#         char = row['document_text_char']
        
#         # Check if iob_tags is already a key in the dictionary
#         if char not in result_dict:
#             result_dict[char] = []
        
#         # Append the character to the list associated with the iob_tags key
#         result_dict[char].append(iob_tags)
    
#     # Add the dictionary as a new column in the DataFrame
#     df['char_to_iob'] = df['document_text_char'].map(result_dict)
    
#     return df




In [101]:
# # iterate through each row of df and create a dictionary for each row with iob_tags as key and document_text_char as value
# def iob_to_char(df):
#     df['iob_to_char'] = df.apply(lambda x: dict(zip(x['iob_tags'], x['document_text_char'])), axis=1)
#     return df

In [123]:
# df_train_english = char_to_iob(df_train_english)
df_train_english = iob_to_char(df_train_english)

TypeError: unhashable type: 'list'

In [125]:
df_train_english['char_to_iob'][26]


{'Q': 'O',
 'u': 'O',
 'a': 'O',
 'n': 'O',
 't': 'O',
 'm': 'O',
 ' ': 'O',
 'f': 'O',
 'i': 'O',
 'e': 'O',
 'l': 'O',
 'd': 'O',
 'h': 'O',
 'o': 'O',
 'r': 'O',
 'y': 'O',
 'b': 'O',
 'g': 'O',
 'w': 'O',
 's': 'I',
 'c': 'O',
 ',': 'O',
 'k': 'O',
 '1': 'O',
 '9': 'I',
 '2': 'I',
 '0': 'I',
 '.': 'O',
 '[': 'O',
 '8': 'O',
 ']': 'O',
 ':': 'O'}

In [61]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
     "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
 )

NameError: name 'id2label' is not defined

In [104]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)


Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [110]:
# create new column in df applying nlp to document_plaintext_tokenized
def ner(df):
    df['ner'] = df['document_plaintext'].apply(lambda x: nlp(x))
    return df


In [108]:
df_train_english = split_words(df_train_english)

In [111]:
# df_train_english = ner(df_train_english)

KeyboardInterrupt: 