In [5]:
import torch
import datasets

In [6]:
# load the dataset
dataset = datasets.load_dataset(path="universal_dependencies", name="de_gsd", trust_remote_code=True)
print(dataset)
train_dataset = dataset["train"]
valid_dataset = dataset["validation"]
test_dataset = dataset["test"]

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 13814
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 799
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 977
    })
})


In [7]:
# read the first 10 examples
print(dataset)
print(train_dataset["text"][:10])
print(train_dataset["tokens"][:10])
print(train_dataset["upos"][:10])

upos_mapping = dataset["train"].features["upos"].feature

# store the possible pos tags
pos_list = upos_mapping.names
print(pos_list)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 13814
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 799
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 977
    })
})
['Sehr gute Beratung, schnelle Behebung der Probleme, so stelle ich mir Kundenservice vor.', 'Die Kosten sind definitiv auch im Rahmen.', 'Nette Gespräche, klasse Ergebnis', 'Ich bin seit längerer Zeit zur Behandlung verschiedenster "Leiden" in der Physiotherapieraxis "Gaby Montag" im Vital Center und kann ausschließlich Positives berichten!', 'Ob bei der Terminvergabe, den Behandlungsräumen oder den individuell zugeschnittenen Trainingsplänen sind alle Mitarbeiter äußerst kompete

In [8]:
def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=False, skip_index=-100):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=200
    )
    labels = []

    for i, label in enumerate(examples["upos"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids: list[int] = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(skip_index)

            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])

            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else skip_index)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
# convert numerical labels to string labels
def upos_id_to_label(upos_mapping, i):
    return upos_mapping.int2str(i)

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# test tokenization
tokenized_inputs = tokenize_and_align_labels(train_dataset[:5], tokenizer)
print(tokenized_inputs)



{'input_ids': [[0, 93404, 25989, 58860, 6, 4, 17230, 13, 873, 195285, 122, 39344, 6, 4, 221, 91151, 654, 2296, 14829, 22584, 1248, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 622, 30882, 1276, 44836, 921, 566, 23, 745, 36070, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [11]:
# test if the tokenization and alignment worked
for i in range(len(tokenized_inputs["labels"])):
    print("text", i, len(tokenized_inputs["input_ids"][i]))
    print(tokenizer.decode(tokenized_inputs["input_ids"][i]))
    print([upos_id_to_label(upos_mapping, x) for x in tokenized_inputs["labels"][i] if x != -100])
    print([upos_id_to_label(upos_mapping, x) for x in train_dataset["upos"][i]])

text 0 200
<s> Sehr gute Beratung, schnelle Behebung der Probleme, so stelle ich mir Kundenservice vor.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
['ADV', 'ADJ

In [12]:
# convert the dataset to dataloader
train_dataset = train_dataset.map(lambda examples: tokenize_and_align_labels(examples, tokenizer), batched=True)
valid_dataset = valid_dataset.map(lambda examples: tokenize_and_align_labels(examples, tokenizer), batched=True)
test_datatset = test_dataset.map(lambda examples: tokenize_and_align_labels(examples, tokenizer), batched=True)
print(train_dataset)

Dataset({
    features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13814
})


In [20]:
# test if huggingface dataset is converted to torch dataset
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32)
test_datatset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
next(iter(train_dataloader))

{'input_ids': tensor([[     0,  93404,  25989,  ...,      1,      1,      1],
         [     0,    622,  30882,  ...,      1,      1,      1],
         [     0,  43268,     13,  ...,      1,      1,      1],
         ...,
         [     0,  43268,      6,  ...,      1,      1,      1],
         [     0,    563, 109833,  ...,      1,      1,      1],
         [     0,  35302,   1649,  ...,      1,      1,      1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[-100,   14,    6,  ..., -100, -100, -100],
         [-100,    8,    0,  ..., -100, -100, -100],
         [-100,    6, -100,  ..., -100, -100, -100],
         ...,
         [-100,    6,    1,  ..., -100, -100, -100],
         [-100,   16, -100,  ..., -100, -100, -100],
         [-100,    7,   14,  ..., -100, -100, -1

In [15]:
from transformers import XLMRobertaModel


class POSTaggingModel(torch.nn.Module):
    def __init__(self, num_labels=18):
        super(POSTaggingModel, self).__init__()

        # load pre-trained XLM-RoBERTa model
        self.roberta = XLMRobertaModel.from_pretrained("xlm-roberta-base")

        # freeze RoBERTa parameters
        for param in self.roberta.parameters():
            param.requires_grad = False

        # project 768 hidden states to 0-17 POS tags
        self.ffn = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Shape: (batch_size, seq_length, hidden_size:768)
        hidden_states = outputs.last_hidden_state

        # feed forward layer
        # Shape: (batch_size, seq_length, num_labels:18)
        logits = self.ffn(hidden_states)

        return logits


# Extra: a more complex FFN model
class POSTaggingProModel(torch.nn.Module):
    def __init__(self, num_labels=18):
        super(POSTaggingProModel, self).__init__()

        # load pre-trained XLM-RoBERTa model
        self.roberta = XLMRobertaModel.from_pretrained("xlm-roberta-base")

        # freeze RoBERTa parameters
        for param in self.roberta.parameters():
            param.requires_grad = False

        # feed forward layers with activation functions
        self.ffn = torch.nn.Sequential(
            torch.nn.Linear(768, 256),
            torch.nn.GELU(),
            torch.nn.Linear(256, 128),
            torch.nn.GELU(),
            torch.nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Shape: (batch_size, seq_length, hidden_size:768)
        hidden_states = outputs.last_hidden_state

        # feed forward layer
        # Shape: (batch_size, seq_length, num_labels:18)
        logits = self.ffn(hidden_states)

        return logits

In [16]:
model = POSTaggingModel()
data = next(iter(train_dataloader))
x = data["input_ids"]
mask = data["attention_mask"]

print(model(x, mask))

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[[-0.7207, -0.6867, -0.7943,  ..., -0.9311, -0.2526,  0.3514],
         [-0.4489, -0.3172, -0.6395,  ..., -0.2470, -0.2027,  0.1096],
         [-0.4852, -0.2170, -0.6565,  ..., -0.2056, -0.0989,  0.0149],
         ...,
         [-0.4165, -0.3430, -0.5566,  ..., -0.2985, -0.1887,  0.1986],
         [-0.4165, -0.3430, -0.5566,  ..., -0.2985, -0.1887,  0.1986],
         [-0.4165, -0.3430, -0.5566,  ..., -0.2985, -0.1887,  0.1986]],

        [[-0.6672, -0.6549, -0.6787,  ..., -0.8625, -0.2844,  0.3617],
         [-0.4161, -0.2460, -0.6047,  ..., -0.3318, -0.2200,  0.0841],
         [-0.5245, -0.3537, -0.5550,  ..., -0.2673, -0.1963,  0.1286],
         ...,
         [-0.7250, -0.6882, -0.4199,  ..., -0.7705, -0.3405,  0.4433],
         [-0.7250, -0.6882, -0.4199,  ..., -0.7705, -0.3405,  0.4433],
         [-0.7250, -0.6882, -0.4199,  ..., -0.7705, -0.3405,  0.4433]],

        [[-0.6184, -0.5650, -0.6222,  ..., -0.6214, -0.2944,  0.3046],
         [-0.3953, -0.4430, -0.6173,  ..., -0

In [12]:
from torch.utils.tensorboard import SummaryWriter


# train the model
def train(model, train_dataloader, valid_dataset, test_dataset, device, num_epochs=3, lr=1e-4, weight_decay=1e-2):
    # init tensorBoard path: base-base model pro-pro model
    writer = SummaryWriter("./runs/base")

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # ignore the padding tokens
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)

    model.to(device)
    model.train()

    print("batch num", len(train_dataloader))

    for epoch in range(num_epochs):
        total_loss = 0.0

        for i, data in enumerate(train_dataloader):
            x = data["input_ids"].to(device)
            mask = data["attention_mask"].to(device)
            y = data["labels"].to(device)

            optimizer.zero_grad()
            logits = model(x, mask)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()

            # log the loss curve
            total_loss += loss.item()
            print(f"Epoch {epoch}, Iteration {i}, Loss: {loss.item()}")
            writer.add_scalar("Training Loss Iter", loss.item(), epoch * len(train_dataloader) + i)

            if i > 0 and i % 50 == 0:
                # test the model
                model.eval()
                with torch.no_grad():
                    x = test_dataset["input_ids"].to(device)
                    mask = test_dataset["attention_mask"].to(device)
                    y = test_dataset["labels"].to(device)

                    logits = model(x, mask)
                    loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
                    print(f"Epoch {epoch}, Iteration {i}, Test Loss: {loss}")
                    writer.add_scalar("Test Loss", loss, epoch * len(train_dataloader) + i)
                model.train()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

        # evaluation
        model.eval()
        with torch.no_grad():
            total_count = 0
            accurate_count = 0
            for data in valid_dataset:
                x = data["input_ids"].to(device)
                mask = data["attention_mask"].to(device)
                y = data["labels"].to(device)

                logits = model(x, mask)
                predictions = torch.argmax(logits, dim=-1)

                # flatten predictions and labels for comparison
                predictions = predictions.view(-1)
                y = y.view(-1)

                for prediction, label in zip(predictions, y):
                    # ignore following and padding tokens
                    if label == -100:
                        continue

                    # count label tokens
                    total_count += 1

                    # compare the prediction with the ground truth and count
                    if prediction == label:
                        accurate_count += 1

            # compute average accuracy
            avg_acc = accurate_count / total_count

            print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {avg_acc}")
            writer.add_scalar("Validation Accuracy", avg_acc, epoch)
        model.train()

    torch.save(model.state_dict(), "pos_tagging_model_0.pth")
    writer.close()

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train(model, train_dataloader, valid_dataloader, test_datatset[:32], device, lr=1e-3, num_epochs=5)

# Extra: train the model with a more complex feed forward network
# model = POSTaggingProModel()
# train(model, train_dataloader, valid_dataloader, test_datatset[:32], device, lr=3e-4, num_epochs=15)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


batch num 432
Epoch 0, Iteration 0, Loss: 2.9078338146209717
Epoch 0, Iteration 1, Loss: 2.8946533203125
Epoch 0, Iteration 2, Loss: 2.8790156841278076
Epoch 0, Iteration 3, Loss: 2.859837055206299
Epoch 0, Iteration 4, Loss: 2.856666088104248
Epoch 0, Iteration 5, Loss: 2.8485870361328125
Epoch 0, Iteration 6, Loss: 2.8326451778411865
Epoch 0, Iteration 7, Loss: 2.8118059635162354
Epoch 0, Iteration 8, Loss: 2.8001186847686768
Epoch 0, Iteration 9, Loss: 2.7950963973999023
Epoch 0, Iteration 10, Loss: 2.7803945541381836
Epoch 0, Iteration 11, Loss: 2.7646584510803223
Epoch 0, Iteration 12, Loss: 2.737338066101074
Epoch 0, Iteration 13, Loss: 2.7233240604400635
Epoch 0, Iteration 14, Loss: 2.7123448848724365
Epoch 0, Iteration 15, Loss: 2.6781132221221924
Epoch 0, Iteration 16, Loss: 2.668684244155884
Epoch 0, Iteration 17, Loss: 2.6365914344787598
Epoch 0, Iteration 18, Loss: 2.6187744140625
Epoch 0, Iteration 19, Loss: 2.6043052673339844
Epoch 0, Iteration 20, Loss: 2.614780902862549

In [23]:
# evaluate the trained model
# to run this, please ignore the two training cells above
device = "cuda" if torch.cuda.is_available() else "cpu"

model = POSTaggingProModel().to(device)

model.load_state_dict(torch.load("pos_tagging_model_0.pth"))

model.eval()
with torch.no_grad():
    total_count = 0
    accurate_count = 0
    for data in valid_dataloader:
        x = data["input_ids"].to(device)
        mask = data["attention_mask"].to(device)
        y = data["labels"].to(device)

        logits = model(x, mask)
        predictions = torch.argmax(logits, dim=-1)

        # flatten predictions and labels for comparison
        predictions = predictions.view(-1)
        y = y.view(-1)

        for prediction, label in zip(predictions, y):
            # ignore following and padding tokens
            if label == -100:
                continue

            # count label tokens
            total_count += 1

            # compare the prediction with the ground truth and count
            if prediction == label:
                accurate_count += 1

    # compute average accuracy
    avg_acc = accurate_count / total_count

print(f"Validation Accuracy: {avg_acc}")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: 

# Observation
I trained 4 neural network models with different hyperparameters and architectures. 
The models are:
- Base Model: RoBERTa + Linear Layer
    - Learning Rate: 1e-3
    - Epochs: 5     
    - Final Validation Accuracy: 0.9356
- Pro Model1: RoBERTa + 3 Linear Layers with GELU activation functions
    - Learning Rate: 1e-3
    - Epochs: 5
    - Final Validation Accuracy: 0.9377
- Pro Model2: RoBERTa + 3 Linear Layers with GELU activation functions
    - Learning Rate: 6e-4
    - Epochs: 7
    - Final Validation Accuracy: 0.9382
- Pro Model3: RoBERTa + 3 Linear Layers with GELU activation functions
    - Learning Rate: 3e-4
    - Epochs: 15
    - Final Validation Accuracy: 0.9435

All of the models are trained with a 32 batch size. Compared to the HMM model whose valid accuracy is 91%, all the neural network models have a higher accuracy rate. The best model is Pro Model3 with a 94.35% validation accuracy. The model has a more complex feed forward network with 3 linear layers and GELU activation functions.

Compared to the HMM model, the neural network model has a higher accuracy score. However, it also takes a longer time to be trained. And the inference time is also longer than the HMM model. 
In summary, the neural network model is more complex and requires more computational resources. The HMM model is simpler and faster but less accurate.  

