In [27]:
import numpy as np
from transformers import BertTokenizer
import torch.nn as nn
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, TensorDataset
import torch.optim as optim
import torch

# Preprocessing

In [19]:
class NERDataset(Dataset):
    def __init__(self, file_path):
        # Read data from file
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.readlines()

        self.sentences = []
        current_sentence = []
        for line in data:
            if line.startswith('# sent_id'):
                if current_sentence:
                    self.sentences.append(current_sentence)
                current_sentence = []
            elif line.strip() and not line.startswith('#'):
                parts = line.split('\t')
                current_sentence.append((parts[1], parts[2]))

        if current_sentence:
            self.sentences.append(current_sentence)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

def tokenize_text(sentences, tokenizer):
    tokenized_texts = []
    labels = []
    for sentence in sentences:
        tokens = [token for token, _ in sentence]
        labels_per_sentence = [label for _, label in sentence]
        tokenized_sentence = tokenizer.encode(tokens, add_special_tokens=True)
        tokenized_texts.append(tokenized_sentence)
        labels.append(labels_per_sentence)
    return tokenized_texts, labels

def encode_labels(labels, label_map):
    encoded_labels = [[label_map[label] for label in sentence] for sentence in labels]
    return encoded_labels

def pad_sequences(sequences, max_length):
    padded_sequences = np.zeros((len(sequences), max_length), dtype=np.int32)
    for i, seq in enumerate(sequences):
        padded_sequences[i, :len(seq)] = seq
    return padded_sequences

In [45]:
train_dataset = NERDataset('en_ewt-ud-train.iob2')
val_dataset = NERDataset('en_ewt-ud-dev.iob2')
test_dataset = NERDataset('en_ewt-ud-test-masked.iob2')

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenized_texts, labels = tokenize_text(train_dataset, tokenizer)

label_map = {label: i for i, label in enumerate(set(label for sublist in labels for label in sublist))}
num_labels = len(label_map)

max_length = max(len(seq) for seq in tokenized_texts)

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def collate_fn(batch):
    sentences = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    tokenized_texts, _ = tokenize_text(sentences, tokenizer)

    encoded_labels = encode_labels(labels, label_map)

    padded_tokenized_texts = pad_sequences(tokenized_texts, max_length)
    padded_encoded_labels = pad_sequences(encoded_labels, max_length)

    return padded_tokenized_texts, padded_encoded_labels

# Create DataLoader instances with custom collate function
batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [44]:
# Inspect DataLoader batches
for batch_idx, batch in enumerate(train_dataloader):
    print(f"Batch {batch_idx + 1}:")
    print("Type of batch:", type(batch))
    print("Length of batch:", len(batch))
    print()

    try:
        input_ids, labels = batch
        print(f"Type of input_ids:", type(input_ids))
        print(f"Shape of input_ids:", input_ids.shape)
        print()
        print(f"Type of labels:", type(labels))
        print(f"Shape of labels:", labels.shape)
    except Exception as e:
        print("Error:", e)
    
    print()


IndexError: list index out of range

### Further preprocessing ideas
- Converting to lowercase
- Other normalization

# Fine tuning

In [29]:
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids = batch[0]
        b_labels = batch[1]

        optimizer.zero_grad()

        outputs = model(b_input_ids, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    model.eval()
    val_loss = 0
    for batch in val_dataloader:
        with torch.no_grad():
            b_input_ids = batch[0]
            b_labels = batch[1]

            outputs = model(b_input_ids, labels=b_labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f'Epoch {epoch + 1}:')
    print(f'Training Loss: {avg_train_loss}')
    print(f'Validation Loss: {avg_val_loss}')

model.save_pretrained('fine_tuned_mbert')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: list index out of range

# Evaluation