In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import torch
from ipywidgets import IntProgress
import ast
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, PreTrainedModel
import itertools
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import string
import csv
import collections
from datasets import load_from_disk

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import wandb
wandb.login(key="427ae538d18eb2b96015789f43aba80680ce5294")

[34m[1mwandb[0m: Currently logged in as: [33mrensongyursy[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/songyuren/.netrc


True

## Import data

In [4]:
dataset = 'updated_tags.csv'

In [5]:
 def read_csv_to_df(csv_file):
    #dataframe = pd.read_csv(csv_file, sep = ";")
    dataframe = pd.read_csv(csv_file, sep = ",", converters={'tokens': eval, 'new_srl_tags': eval})
    return dataframe

## I commented out the original line and made a new line with added converters for the tokens and srl_tags columns.
## Although it looks like the values in these columns are lists, they are not. Eval transforms them to lists.
## When reading in the dataframe we get a SyntaxError: EOL while scanning string literal.
## This error is caused by double quotes '' at the end of line 35, 53 and 56 in the srl_tags column.
## (Temporary) solution: I created a new csv (_cleaned) where the double quotes are manually removed.

In [6]:
tagged_file = dataset
dataframe = read_csv_to_df(tagged_file)
df_to_train = dataframe[['sentence_id', 'tokens', 'new_srl_tags']]
print(df_to_train)

                                          sentence_id  \
0                    326-Wet_rechterlijke_organisatie   
1                                 536-Politiewet_2012   
2                           400-Vreemdelingenwet_2000   
3                             406-Wet_op_de_jeugdzorg   
4                               1594-aanbestedingswet   
..                                                ...   
93  107-Reglement_verkeersregels_en_verkeerstekens...   
94                            359-Wet_op_de_jeugdzorg   
95                          561-Wet_natuurbescherming   
96                      220-Vreemdelingenbesluit_2000   
97                               637-Werkloosheidswet   

                                               tokens  \
0   [Degene, die, zitting, heeft, in, de, enkelvou...   
1   [De, examinatoren, verstrekken, de, examencomm...   
2   [Indien, de, aanvraag, tot, het, verlenen, van...   
3   [De, griffier, zendt, ,, onverminderd, ,, een,...   
4   [Indien, een, voorgenomen,

In [7]:
df_to_train = df_to_train.rename(columns = {'srl_tags': 'old_srl_tags', 'new_srl_tags': 'srl_tags'})


## Checking and cleaning data

In [8]:
 # checking if number of tokens is equal to number of srl_tags
print("FOR THESE ROWS, THE NUMBER OF TOKENS IS --NOT-- EQUAL TO THE NUMBER OF SRL_TAGS")
print()

indices_to_remove = []
for index, row in df_to_train.iterrows():
    if len(row['tokens']) != len(row['srl_tags']):
        print(index)
        print(row['tokens'])
        print(row['srl_tags'])
        print()
        indices_to_remove.append(index)

FOR THESE ROWS, THE NUMBER OF TOKENS IS --NOT-- EQUAL TO THE NUMBER OF SRL_TAGS



In [9]:
 # removing rows where number of tokens != number of srl_tags
df_to_train = df_to_train.drop(indices_to_remove)

## Changing undesired tags

In [10]:
 # Checking if there are any other srl_tags apart from the desired ones
set_of_tags = set(itertools.chain.from_iterable(df_to_train['srl_tags']))
set_of_desired_tags = {'Recipient', 'O', 'Object', 'Actor', 'Action', 'Precondition'}

if set_of_tags != set_of_desired_tags:
    print(set_of_tags)
else:
    print("set_of_tags is equal to set_of_desired_tags")

{'.', 'O', 'Precondition', ':', ';', 'Object', 'Recipient', 'Actor', ',', '%', '/', 'Action'}


In [11]:
## 1. creating a list with the new tags
new_srl_tags = []
for index, row in df_to_train.iterrows():
    #print(row['srl_tags_name'])
    #print(['O' if x not in set_of_desired_tags else x for x in row['srl_tags_name']])
    #print()
    new_srl_tags.append(['O' if x not in set_of_desired_tags else x for x in row['srl_tags']])

## 2. removing column with the old srl tags and creating a new column with the updated srl tags
df_to_train = df_to_train.drop(columns=['srl_tags'])
df_to_train['srl_tags'] = new_srl_tags
#print(df_to_train)



In [12]:
 # Checking again
set_of_tags = set(itertools.chain.from_iterable(df_to_train['srl_tags']))
set_of_desired_tags = {'Recipient', 'O', 'Object', 'Actor', 'Action', 'Precondition'}

if set_of_tags != set_of_desired_tags:
    print(set_of_tags)
else:
    print("set_of_tags is equal to set_of_desired_tags")

set_of_tags is equal to set_of_desired_tags


## Add column with number for each tag

In [13]:
# creating new column with numbers for srl tags instead of text

## 1. create a dict to store translation srl_tags --> numbers
srl_keys={"O": 0,
          "Action": 1,
          "Actor": 2,
          "Object": 3,
          "Recipient": 4,
          "Precondition": 5}

## 2. rename the srl_tags column to srl_tags_name
df_to_train.rename(columns={'srl_tags': 'srl_tags_name'}, inplace=True)

## 3. map the srl tags to numbers
srl_tags_numbers = []
for index, row in df_to_train.iterrows():
    srl_tags_numbers.append([srl_keys.get(item) for item in row['srl_tags_name']])

## 4. create a new column with the numbers
df_to_train['srl_tags'] = srl_tags_numbers

In [14]:
 # print(df_to_train)

# count role distribution in training data
role_counts = dict()
occurence_count = 0
for sentence in df_to_train['srl_tags_name']:
    occurences = collections.Counter(sentence)
    occurence_count = occurence_count + 1
    if occurences['Actor'] > 0:
        role_counts['actor'] = role_counts.setdefault('actor', 0) + 1
    if occurences['Object'] > 0:
        role_counts['object'] = role_counts.setdefault('object', 0) + 1
    if occurences['Recipient'] > 0:
        role_counts['recipient'] = role_counts.setdefault('recipient', 0) + 1
    if occurences['Action'] > 0:
        role_counts['action'] = role_counts.setdefault('action', 0) + 1
    if occurences['Precondition'] > 0:
        role_counts['precondition'] = role_counts.setdefault('precondition', 0) + 1

print(role_counts)

{'actor': 47, 'object': 94, 'action': 90, 'recipient': 19, 'precondition': 37}


## Creating Dataset

In [15]:
dataset = Dataset.from_pandas(df_to_train)


In [16]:
 ## SPLITTING main dataset into train, validation, test as DatasetDict
## source 1: https://discuss.huggingface.co/t/how-to-split-main-dataset-into-train-dev-test-as-datasetdict/1090
## source 2: https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.train_test_split

# 90% train, 10% test + validation
train_testvalid = dataset.train_test_split(test_size=0.1)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# Collect the two into a single DatasetDict
datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
        num_rows: 88
    })
    test: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['sentence_id', 'tokens', 'srl_tags_name', 'srl_tags'],
        num_rows: 5
    })
})


## Tokenize Data

In [17]:
# tokenizer
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

In [18]:
def tokenize_and_align_labels(examples, label_all_tokens = True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["srl_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [19]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

This saves the data to your disk and then reloades it again, does not work on my (Jan) computer, not sure why. But it's not a necessary step.

In [20]:
# tokenized_datasets.save_to_disk("dataset")
# reloaded_encoded_dataset = load_from_disk("dataset")

In [21]:

reloaded_encoded_dataset = tokenized_datasets

## Finetuning the model

In [22]:
srl_tags_set = set(itertools.chain.from_iterable(reloaded_encoded_dataset['train']['srl_tags_name']))

# import model
model = AutoModelForTokenClassification.from_pretrained("GroNLP/bert-base-dutch-cased", num_labels=len(srl_tags_set))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
batch_size = 8

args = TrainingArguments(
    output_dir=".",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01
)

In [24]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [25]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [26]:
label_list = ["O", "Action", "Actor", "Object", "Recipient", "Precondition"]
labels = [label_list[i] for i in reloaded_encoded_dataset['train']['srl_tags'][0]]
metric.compute(predictions=[labels], references=[labels])



{'ction': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'recondition': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [27]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Flatten the predictions and labels
    true_predictions = [
        label_list[p] for prediction, label in zip(predictions, labels) 
        for p, l in zip(prediction, label) if l != -100
    ]
    true_labels = [
        label_list[l] for prediction, label in zip(predictions, labels) 
        for p, l in zip(prediction, label) if l != -100
    ]

    # Use sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, labels=label_list, zero_division=0
    )

    accuracy = sum(t == l for t, l in zip(true_predictions, true_labels)) / len(true_predictions)

    results = {
        "precision": precision.mean(),
        "recall": recall.mean(),
        "f1": f1.mean(),
        "accuracy": accuracy,
    }

    return results

In [28]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=reloaded_encoded_dataset["train"],
    eval_dataset=reloaded_encoded_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [29]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.789763,0.528302,0.280071,0.255189,0.598958
2,No log,0.677304,0.597529,0.632886,0.56697,0.692708
3,No log,0.59136,0.571452,0.657239,0.57556,0.713542
4,No log,0.573861,0.589706,0.667239,0.582506,0.729167


TrainOutput(global_step=44, training_loss=0.4952379573475231, metrics={'train_runtime': 15.1263, 'train_samples_per_second': 23.271, 'train_steps_per_second': 2.909, 'total_flos': 12346718243328.0, 'train_loss': 0.4952379573475231, 'epoch': 4.0})