In [22]:
import torch
import datasets
from jupyterlab.semver import valid

In [23]:
# Fetch the "de_gsd" configuration of the Universal Dependencies dataset.
dataset = datasets.load_dataset(
    path="universal_dependencies",
    name="de_gsd",
    trust_remote_code=True,
)
print(dataset)

# Bind one module-level handle per split.
train_dataset, valid_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 13814
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 799
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 977
    })
})


In [24]:
# Print the split overview, then the first 10 entries of the columns inspected here.
print(dataset)
for column_name in ("text", "tokens", "upos"):
    print(train_dataset[column_name][:10])

# Feature object describing a single "upos" entry; reused later to turn ids into tag names.
upos_mapping = dataset["train"].features["upos"].feature

# store the possible pos tags
pos_list = upos_mapping.names
print(pos_list)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 13814
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 799
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 977
    })
})
['Sehr gute Beratung, schnelle Behebung der Probleme, so stelle ich mir Kundenservice vor.', 'Die Kosten sind definitiv auch im Rahmen.', 'Nette Gespräche, klasse Ergebnis', 'Ich bin seit längerer Zeit zur Behandlung verschiedenster "Leiden" in der Physiotherapieraxis "Gaby Montag" im Vital Center und kann ausschließlich Positives berichten!', 'Ob bei der Terminvergabe, den Behandlungsräumen oder den individuell zugeschnittenen Trainingsplänen sind alle Mitarbeiter äußerst kompete

In [25]:
def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=False, skip_index=-100, max_length=200):
    """Tokenize pre-split sentences and align their word-level labels to the produced tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per sentence)
            and a "upos" entry (one list of label ids per sentence, indexed by word).
        tokenizer: Callable tokenizer. Its result must offer ``word_ids(batch_index=i)``
            and accept item assignment (``result["labels"] = ...``).
        label_all_tokens: If True, every token of a word receives the word's label;
            otherwise only the first token of each word does and the remaining tokens
            receive ``skip_index``.
        skip_index: Label assigned to tokens that must not be scored, i.e. tokens
            whose word id is None and (unless ``label_all_tokens``) non-initial
            tokens of a word.
        max_length: Length every sequence is padded/truncated to. Defaults to 200,
            the value that used to be hard-coded.

    Returns:
        The tokenizer result with an additional "labels" entry holding one list of
        label ids per sentence, one id per token.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
    )
    labels = []

    for i, label in enumerate(examples["upos"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids: list[int] = []
        for word_idx in word_ids:
            # Tokens without a word id (special/padding tokens) get skip_index so the
            # loss function can ignore them.
            if word_idx is None:
                label_ids.append(skip_index)

            # First token of a word carries the word's label.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])

            # Remaining tokens of the same word: label or skip_index, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else skip_index)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [26]:
# convert numerical labels to string labels
def upos_id_to_label(upos_mapping, i):
    """Return whatever ``upos_mapping.int2str`` yields for the numerical label ``i``."""
    to_text = upos_mapping.int2str
    return to_text(i)

In [27]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "FacebookAI/xlm-roberta-base" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# test tokenization: run the alignment helper on the first 5 training rows and print
# the raw result (input ids, attention mask, aligned labels) for manual inspection
tokenized_inputs = tokenize_and_align_labels(train_dataset[:5], tokenizer)
print(tokenized_inputs)



{'input_ids': [[0, 93404, 25989, 58860, 6, 4, 17230, 13, 873, 195285, 122, 39344, 6, 4, 221, 91151, 654, 2296, 14829, 22584, 1248, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 622, 30882, 1276, 44836, 921, 566, 23, 745, 36070, 6, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [28]:
# Visual check of the alignment: for every tokenized sample print its length, the decoded
# text, the tags that survived alignment (-100 entries dropped) and the original tags.
sample_pairs = zip(tokenized_inputs["input_ids"], tokenized_inputs["labels"])
for i, (token_ids, aligned_labels) in enumerate(sample_pairs):
    print("text", i, len(token_ids))
    print(tokenizer.decode(token_ids))
    kept_labels = [x for x in aligned_labels if x != -100]
    print([upos_id_to_label(upos_mapping, x) for x in kept_labels])
    print([upos_id_to_label(upos_mapping, x) for x in train_dataset["upos"][i]])

text 0 200
<s> Sehr gute Beratung, schnelle Behebung der Probleme, so stelle ich mir Kundenservice vor.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
['ADV', 'ADJ

In [29]:
# convert the dataset to dataloader
def _encode_batch(examples):
    """Tokenize one batch of rows and attach aligned labels, using the notebook's tokenizer."""
    return tokenize_and_align_labels(examples, tokenizer)


# Tokenize every split in batched mode.
train_dataset = train_dataset.map(_encode_batch, batched=True)
valid_dataset = valid_dataset.map(_encode_batch, batched=True)
# NOTE(review): "test_datatset" is misspelled, but later cells read this exact name,
# so the spelling is kept here.
test_datatset = test_dataset.map(_encode_batch, batched=True)
print(train_dataset)

Map: 100%|██████████| 799/799 [00:00<00:00, 4761.59 examples/s]


Dataset({
    features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13814
})


In [30]:
# test if huggingface dataset is converted to torch dataset
# Expose only the model inputs and the labels, as torch tensors, on every tokenized split.
for tokenized_split in (train_dataset, valid_dataset, test_datatset):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)

# Pull a single batch as a smoke test of the conversion; the batch itself is discarded.
next(iter(train_dataloader))

valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32)

In [31]:
from transformers import XLMRobertaModel


class POSTaggingModel(torch.nn.Module):
    """Token-level POS tagger: frozen XLM-RoBERTa encoder followed by one linear layer.

    All encoder parameters are frozen in ``__init__``, so only the linear
    projection (``self.ffn``) has trainable parameters.
    """

    def __init__(self, num_labels=18):
        """Load the pre-trained encoder, freeze it and create the projection head.

        Args:
            num_labels: Number of output classes per token (default 18).
        """
        super().__init__()

        # load pre-trained XLM-RoBERTa model
        self.roberta = XLMRobertaModel.from_pretrained("xlm-roberta-base")

        # freeze RoBERTa parameters
        self.roberta.requires_grad_(False)

        # Project the encoder's hidden states to the POS tag logits. The input width
        # is read from the loaded config rather than hard-coded to 768, so the head
        # always matches the encoder that was actually loaded.
        self.ffn = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Compute per-token logits.

        Args:
            input_ids: Token ids, passed to the encoder as ``input_ids``.
            attention_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            Logits of shape (batch_size, seq_length, num_labels).
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Shape: (batch_size, seq_length, hidden_size)
        hidden_states = outputs.last_hidden_state

        # feed forward layer
        # Shape: (batch_size, seq_length, num_labels)
        logits = self.ffn(hidden_states)

        return logits


class POSTaggingProModel(torch.nn.Module):
    """Token-level POS tagger: XLM-RoBERTa encoder followed by a three-layer GELU MLP.

    Unlike a frozen-encoder setup, nothing is frozen here: the encoder parameters
    keep their default ``requires_grad`` state.
    """

    def __init__(self, num_labels=18):
        """Load the pre-trained encoder and build the classification head.

        Args:
            num_labels: Number of output classes per token (default 18).
        """
        super().__init__()

        # load pre-trained XLM-RoBERTa model
        self.roberta = XLMRobertaModel.from_pretrained("xlm-roberta-base")

        # Input width of the head comes from the loaded config rather than a
        # hard-coded 768, so the head always matches the encoder actually loaded.
        hidden_size = self.roberta.config.hidden_size

        # feed forward layers with activation functions
        self.ffn = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, 256),
            torch.nn.GELU(),
            torch.nn.Linear(256, 128),
            torch.nn.GELU(),
            torch.nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        """Compute per-token logits.

        Args:
            input_ids: Token ids, passed to the encoder as ``input_ids``.
            attention_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            Logits of shape (batch_size, seq_length, num_labels).
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Shape: (batch_size, seq_length, hidden_size)
        hidden_states = outputs.last_hidden_state

        # feed forward layers
        # Shape: (batch_size, seq_length, num_labels)
        logits = self.ffn(hidden_states)

        return logits

In [32]:
# Smoke test: instantiate the frozen-encoder tagger and push the first training batch
# through it to confirm the forward pass runs and to eyeball the logits.
model = POSTaggingModel()
data = next(iter(train_dataloader))
x = data["input_ids"]
mask = data["attention_mask"]

# NOTE(review): this call is not wrapped in torch.no_grad() and model.eval() is not
# called first, so the printed values come from a freshly constructed module as-is.
print(model(x, mask))

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[[-0.7052,  0.4490, -0.2810,  ..., -0.5439,  0.6579, -0.6866],
         [-0.4819,  0.6817, -0.0558,  ..., -0.3724,  0.4584, -0.4302],
         [-0.3715,  0.7065,  0.0167,  ..., -0.3403,  0.5151, -0.4281],
         ...,
         [-0.5224,  0.8041, -0.0966,  ..., -0.5087,  0.4484, -0.4302],
         [-0.5224,  0.8041, -0.0966,  ..., -0.5087,  0.4484, -0.4302],
         [-0.5224,  0.8041, -0.0966,  ..., -0.5087,  0.4484, -0.4302]],

        [[-0.7328,  0.4769, -0.2220,  ..., -0.5567,  0.6482, -0.6443],
         [-0.5163,  0.6134, -0.0052,  ..., -0.4056,  0.5138, -0.3669],
         [-0.4789,  0.6031,  0.0403,  ..., -0.4588,  0.5077, -0.4225],
         ...,
         [-0.7404,  0.5278, -0.1200,  ..., -0.5236,  0.5083, -0.5747],
         [-0.7404,  0.5277, -0.1200,  ..., -0.5236,  0.5083, -0.5747],
         [-0.7404,  0.5277, -0.1200,  ..., -0.5236,  0.5083, -0.5747]],

        [[-0.7035,  0.6186, -0.1961,  ..., -0.5132,  0.6089, -0.6345],
         [-0.5267,  0.7035,  0.0530,  ..., -0

In [33]:
from torch.utils.tensorboard import SummaryWriter


# train the model
def train(model, train_dataloader, valid_dataset, test_dataset, device, num_epochs=3, lr=1e-4, weight_decay=1e-2):
    """Train ``model`` with AdamW and cross-entropy, logging progress to TensorBoard.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            logits with the class dimension last.
        train_dataloader: Sized iterable of batches; every batch is a mapping with
            "input_ids", "attention_mask" and "labels" tensors.
        valid_dataset: Iterable of validation items with the same three keys. Any
            iterable works, e.g. a DataLoader yielding batches.
        test_dataset: Mapping with "input_ids", "attention_mask" and "labels"
            tensors forming one fixed batch; its loss is logged every 50 iterations.
        device: Device the model and all tensors are moved to.
        num_epochs: Number of passes over ``train_dataloader``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Side effects:
        Prints per-iteration and per-epoch progress, writes TensorBoard events to
        "./pos_tagging_RoBERTa" and, after the last epoch, saves the model's
        state dict to "pos_tagging_model.pth".
    """
    # Label value excluded from the loss and from the accuracy count.
    ignore_index = -100

    # init tensorBoard
    writer = SummaryWriter("./pos_tagging_RoBERTa")

    # try/finally: close the writer even when training aborts with an exception.
    try:
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

        # positions labelled ignore_index do not contribute to the loss
        criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)

        model.to(device)
        model.train()

        num_batches = len(train_dataloader)
        print("batch num", num_batches)

        for epoch in range(num_epochs):
            total_loss = 0.0

            for i, data in enumerate(train_dataloader):
                global_step = epoch * num_batches + i

                x = data["input_ids"].to(device)
                mask = data["attention_mask"].to(device)
                y = data["labels"].to(device)

                optimizer.zero_grad()
                logits = model(x, mask)
                loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
                loss.backward()
                optimizer.step()

                # log the loss curve
                batch_loss = loss.item()
                total_loss += batch_loss
                print(f"Epoch {epoch}, Iteration {i}, Loss: {batch_loss}")
                writer.add_scalar("Training Loss Iter", batch_loss, global_step)

                if i > 0 and i % 50 == 0:
                    # Periodic loss on the fixed test batch; separate names keep the
                    # training tensors of this iteration untouched.
                    model.eval()
                    with torch.no_grad():
                        test_x = test_dataset["input_ids"].to(device)
                        test_mask = test_dataset["attention_mask"].to(device)
                        test_y = test_dataset["labels"].to(device)

                        test_logits = model(test_x, test_mask)
                        # .item() so a plain float (not a tensor) is printed and logged
                        test_loss = criterion(
                            test_logits.view(-1, test_logits.size(-1)), test_y.view(-1)
                        ).item()
                    print(f"Epoch {epoch}, Iteration {i}, Test Loss: {test_loss}")
                    writer.add_scalar("Test Loss", test_loss, global_step)
                    model.train()

            # An empty dataloader yields NaN instead of a ZeroDivisionError.
            avg_loss = total_loss / num_batches if num_batches else float("nan")
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}")

            # evaluation
            model.eval()
            with torch.no_grad():
                total_count = 0
                accurate_count = 0
                for data in valid_dataset:
                    x = data["input_ids"].to(device)
                    mask = data["attention_mask"].to(device)
                    y = data["labels"].to(device).reshape(-1)

                    logits = model(x, mask)
                    predictions = torch.argmax(logits, dim=-1).reshape(-1)

                    # Count on tensors instead of looping over tokens in Python:
                    # only positions whose label is not ignore_index are scored.
                    scored = y != ignore_index
                    total_count += int(scored.sum().item())
                    accurate_count += int(((predictions == y) & scored).sum().item())

                # compute average accuracy; NaN when no token was scored at all
                avg_acc = accurate_count / total_count if total_count else float("nan")

                print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {avg_acc}")
                writer.add_scalar("Validation Accuracy", avg_acc, epoch)
            model.train()

        torch.save(model.state_dict(), "pos_tagging_model.pth")
    finally:
        writer.close()

In [34]:
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the third argument is the DataLoader `valid_dataloader`, although the
# parameter of train() is named `valid_dataset`; the fourth is a slice of the first 32
# rows of the tokenized test split (note the misspelled global `test_datatset`).
train(model, train_dataloader, valid_dataloader, test_datatset[:32], device, lr=1e-3, num_epochs=5)

batch num 432
Epoch 0, Iteration 0, Loss: 2.95466947555542
Epoch 0, Iteration 1, Loss: 2.9056386947631836
Epoch 0, Iteration 2, Loss: 2.8444619178771973
Epoch 0, Iteration 3, Loss: 2.8100311756134033
Epoch 0, Iteration 4, Loss: 2.7696523666381836
Epoch 0, Iteration 5, Loss: 2.6911494731903076
Epoch 0, Iteration 6, Loss: 2.696136474609375
Epoch 0, Iteration 7, Loss: 2.6482436656951904
Epoch 0, Iteration 8, Loss: 2.6043989658355713
Epoch 0, Iteration 9, Loss: 2.604863405227661
Epoch 0, Iteration 10, Loss: 2.5598111152648926
Epoch 0, Iteration 11, Loss: 2.4841158390045166
Epoch 0, Iteration 12, Loss: 2.425588369369507
Epoch 0, Iteration 13, Loss: 2.424830198287964
Epoch 0, Iteration 14, Loss: 2.375490427017212
Epoch 0, Iteration 15, Loss: 2.375823974609375
Epoch 0, Iteration 16, Loss: 2.3214871883392334
Epoch 0, Iteration 17, Loss: 2.3005247116088867
Epoch 0, Iteration 18, Loss: 2.282163619995117
Epoch 0, Iteration 19, Loss: 2.2967851161956787
Epoch 0, Iteration 20, Loss: 2.33836078643798