In [None]:
import csv
import torch
import random
import pandas as pd
import numpy as np
from transformers import XLMRobertaTokenizerFast
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForTokenClassification

## DATA PREPARATION

In [None]:
csv_file_path = 'Labeled_dataset_de.csv'

# Group the token-level rows (sentence id, token, label) by sentence id.
# Dicts keep insertion order, so sentences stay in the order in which their
# first row appears in the file.
sentences = {}
# The encoding is explicit because the German tokens contain non-ASCII
# characters (e.g. 'städten', 'straße') and the platform default encoding is
# not guaranteed to decode them; newline='' is what the csv module requires
# for correct handling of quoted line breaks.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # Skip the header row
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; skip them instead of
            # crashing on the three-way unpack below.
            continue
        sentence_id, token, label = row
        entry = sentences.setdefault(sentence_id, {'tokens': [], 'labels': []})
        entry['tokens'].append(token)
        entry['labels'].append(label)

# One (words, labels) pair per sentence; the two lists are index-aligned.
result = [(data['tokens'], data['labels']) for data in sentences.values()]

for sentence in result:
    print(sentence)

(['es', 'gibt', 'leider', 'einen', 'haufen', 'perverse', 'wixer', 'auf', 'dieser', 'welt', 'und', 'in', 'unseren', 'städten', ',', 'dass', 'man', 'sich', 'dann', 'so', 'über', 'männer', 'aufregen', 'muss', 'die', 'breiter', 'sitzen', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['hier', 'macht', 'sich', 'deutschalnd', 'in', 'verfassungswidriger', 'weise', 'zum', 'mittäter', 'an', 'solche', 'taten', '!'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['das', 'ist', 'so', 'dumm', '!'], ['n', 'n', 'n', 'B-B', 'n'])
(['mein', 'freund', 'hat', 'ein', 'tolles', 'bild', 'von', 'meinem', 'auto', 'gemacht', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['der', 'witz', 'liegt', 'mir', 'auf', 'der', 'zunge', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['wenn', 'ich', 'einen', 'asiaten', 'auf', 'der', 'straße', 'oder', 'auf', 'dem', 'schulhof', 'als

In [None]:
# Hold out 20% of the sentences, then cut that hold-out in half to obtain the
# validation and test sets (roughly 80/10/10 overall). Both splits use a fixed
# random_state so the partition is repeatable.
train_data, holdout_data = train_test_split(result, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

print(f"Training set length: {len(train_data)}")
print(f"Validation set length: {len(val_data)}")
print(f"Test set length: {len(test_data)}")

Training set length: 468
Validation set length: 58
Test set length: 59


In [None]:
# Gold-standard terms of the validation split.
#
# Terms are built span-wise so they are directly comparable with what
# extract_terms() produces from model predictions: a term starts at a token
# tagged 'B-B' and absorbs the directly following tokens tagged 'B', joined by
# single spaces. Collecting every 'B-B'/'B' token as its own term would split a
# multi-word expression into fragments that a span-level prediction can never
# match. Terms are lower-cased because the metric functions lower-case the
# extracted terms before comparing.
# A 'B' token that is not preceded by a 'B-B' span does not start a term here,
# mirroring extract_terms().
gold_set_for_validation = set()
for gold_words, gold_tags in val_data:
    for span_start, gold_tag in enumerate(gold_tags):
        if gold_tag != 'B-B':
            continue
        span_end = span_start + 1
        while span_end < len(gold_tags) and gold_tags[span_end] == 'B':
            span_end += 1
        gold_set_for_validation.add(' '.join(gold_words[span_start:span_end]).lower())

# Sorted so the printed table is identical from run to run (set iteration
# order of strings is not stable across interpreter sessions).
df_gold_val = pd.DataFrame({'terms': sorted(gold_set_for_validation)})
print(df_gold_val)

            terms
0          kaffer
1          nippel
2      scheißkerl
3            dich
4       arschloch
5    rattenscharf
6        feigling
7       flittchen
8          dummen
9     vollscheißt
10          fotze
11            ins
12     hackfresse
13         ficken
14    hodenbeißer
15           fick
16     beschissen
17   fotzenlecker
18    dumpfbacken
19       mistkerl
20        schwein
21        scheiße
22     hirnrissig
23           möse
24    arschficker
25      vollidiot
26  hosenscheißer
27         bumsen
28    volltrottel
29        rechten
30          nutte
31   vollscheißen
32      scheissen
33         transe
34       abschaum
35    wurmwichser
36          pöbel
37     blödmänner
38            kuh
39       blödmann
40  arschkriecher
41      arschfick
42       anschiss
43   dreckskerlen
44            sau
45          blöde
46      shitstorm
47           knie


In [None]:
# Gold-standard terms of the test split.
#
# Terms are built span-wise so they are directly comparable with what
# extract_terms() produces from model predictions: a term starts at a token
# tagged 'B-B' and absorbs the directly following tokens tagged 'B', joined by
# single spaces. Collecting every 'B-B'/'B' token as its own term would split a
# multi-word expression into fragments that a span-level prediction can never
# match. Terms are lower-cased because the metric functions lower-case the
# extracted terms before comparing.
# A 'B' token that is not preceded by a 'B-B' span does not start a term here,
# mirroring extract_terms().
gold_set_for_test = set()
for gold_words, gold_tags in test_data:
    for span_start, gold_tag in enumerate(gold_tags):
        if gold_tag != 'B-B':
            continue
        span_end = span_start + 1
        while span_end < len(gold_tags) and gold_tags[span_end] == 'B':
            span_end += 1
        gold_set_for_test.add(' '.join(gold_words[span_start:span_end]).lower())

# Sorted so the printed table is identical from run to run (set iteration
# order of strings is not stable across interpreter sessions).
df_gold_test = pd.DataFrame({'terms': sorted(gold_set_for_test)})
print(df_gold_test)

              terms
0      schweinehund
1        arschgeige
2            kaffer
3             arsch
4        volldeppen
5          großmaul
6         arschloch
7            deppen
8              dumm
9             fotze
10       hackfresse
11           pimmel
12      vollidioten
13  schwanzlutscher
14             fick
15       scheißegal
16       beschissen
17      morgenlatte
18               im
19       hirnrissig
20          scheiße
21      arschficker
22           pisser
23           lecken
24      volltrottel
25           bumsen
26        drecksack
27            idiot
28     gottverdammt
29            stück
30         asoziale
31     vollscheißen
32           muschi
33             homo
34          trottel
35         dummkopf
36        scheissen
37     flachwichser
38             depp
39       fickfehler
40            dumme
41              sau
42         schlampe
43          schwanz
44           kanake
45       schwuchtel
46        hurensohn
47      drecksfotze


In [None]:
# Unzip every split's (words, labels) pairs into two parallel lists:
# *_texts holds the word lists, *_tags the matching label lists.
train_texts = [words for words, _ in train_data]
train_tags = [tags for _, tags in train_data]

val_texts = [words for words, _ in val_data]
val_tags = [tags for _, tags in val_data]

test_texts = [words for words, _ in test_data]
test_tags = [tags for _, tags in test_data]

In [None]:
# Load the fast tokenizer of the "xlm-roberta-base" checkpoint; the same
# checkpoint name is used further down when the model is instantiated.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

In [None]:
# Tag inventory: a tag's position in this list is its integer class id.
# extract_terms() treats "B-B" as the start of a term and "B" as its
# continuation; "n" is every other token.
label_list = ["n", "B-B", "B"]
num_labels = len(label_list)
label_to_id = dict(zip(label_list, range(num_labels)))

def tokenize_and_align_labels(texts, tags):
  """Tokenize pre-split sentences and project word-level tags onto sub-tokens.

  Args:
    texts: list of sentences, each a list of words.
    tags: list of tag lists, index-aligned with ``texts``; every tag must be
      a key of the module-level ``label_to_id``.

  Returns:
    The tokenizer output for the whole batch (padded to its longest sequence,
    truncated if necessary) with an added ``"labels"`` entry: one list of ids
    per sentence. Only the first sub-token of each word carries the word's
    label id; special/padding positions and later sub-tokens of a word get
    -100, the value the metric functions in this notebook filter out (and the
    default ``ignore_index`` of PyTorch's cross-entropy loss).
  """
  encodings = tokenizer(
      texts,
      padding=True,
      truncation=True,
      is_split_into_words=True,
  )
  aligned_labels = []
  for row, word_tags in enumerate(tags):
    row_labels = []
    last_word_id = None
    # word_ids() gives, per sub-token, the index of the word it came from
    # (None for positions that belong to no word).
    for word_id in encodings.word_ids(batch_index=row):
      starts_new_word = word_id is not None and word_id != last_word_id
      row_labels.append(label_to_id[word_tags[word_id]] if starts_new_word else -100)
      last_word_id = word_id
    aligned_labels.append(row_labels)

  encodings["labels"] = aligned_labels
  return encodings

# Tokenize each split separately; every result carries the tokenizer fields
# plus the aligned "labels" entry added by tokenize_and_align_labels.
# Because padding=True is applied per call, each split is padded to the
# longest sequence of that split.
train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)

test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over column-oriented encodings plus label lists.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a per-example sequence of label-id lists. Indexing returns a
    dict with one tensor per encoding field and a ``'labels'`` tensor taken
    from ``labels`` (overriding any ``'labels'`` field present in
    ``encodings``).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        # Written last so it wins over a 'labels' field from the encodings.
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each tokenized split in a Dataset. The label lists are passed
# explicitly although the encodings already contain the same "labels" entry
# (added by tokenize_and_align_labels); Dataset.__getitem__ uses the explicit
# argument for the final 'labels' tensor.
train_dataset = Dataset(train_input_and_labels, train_input_and_labels["labels"])

val_dataset = Dataset(val_input_and_labels, val_input_and_labels["labels"])

test_dataset = Dataset(test_input_and_labels, test_input_and_labels["labels"])

In [None]:
def extract_terms(token_predictions, val_texts):
  """Collect the distinct terms encoded by word-level tag sequences.

  Args:
    token_predictions: list of tag sequences, one per sentence.
    val_texts: list of word lists, index-aligned with ``token_predictions``
      (despite its name it is used for any split).

  Returns:
    A set of strings. Each term begins at a word tagged "B-B" and is extended
    by every directly following word tagged "B"; the words are joined with
    single spaces. A "B" that does not follow a "B-B" span starts nothing.
  """
  terms = set()
  for sent_idx, tags in enumerate(token_predictions):
    words = val_texts[sent_idx]
    for start, tag in enumerate(tags):
      if tag != "B-B":
        continue
      pieces = [words[start]]
      nxt = start + 1
      # Absorb the uninterrupted run of continuation tags.
      while nxt < len(tags) and tags[nxt] == "B":
        pieces.append(words[nxt])
        nxt += 1
      terms.add(" ".join(pieces))
  return terms

In [None]:
#VAL
def compute_metrics(p):
    """Term-level precision/recall/F1 on the validation split.

    Passed to the Trainer as ``compute_metrics``. Relies on the module-level
    ``label_list``, ``val_texts``, ``gold_set_for_validation`` and
    ``extract_terms``.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            gold label ids with -100 on positions to ignore.

    Returns:
        dict with ``"precision"``, ``"recall"`` and ``"f1"``, computed over
        the set of distinct lower-cased terms. A ratio whose denominator is
        zero (no term extracted, empty gold set, or precision + recall == 0)
        is reported as 0.0 instead of raising ZeroDivisionError, so an early
        evaluation step where the model predicts no term cannot abort
        training.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Keep only positions with a real gold label (one per word) and map the
    # predicted ids back to tag strings.
    true_predictions = [
        [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms = {term.lower() for term in extract_terms(true_predictions, val_texts)}
    gold_set = gold_set_for_validation

    true_pos = extracted_terms.intersection(gold_set)
    recall = len(true_pos) / len(gold_set) if gold_set else 0.0
    precision = len(true_pos) / len(extracted_terms) if extracted_terms else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

## FINE-TUNING

In [None]:
# training arguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=0,
    weight_decay=7.2913700560979e-05,
    learning_rate=1.2939119486050953e-05,
    logging_dir='./logs',
    eval_strategy="steps",
    eval_steps=100,
    # Checkpoint on the same schedule as evaluation. load_best_model_at_end
    # can only restore a checkpoint that was actually written to disk; with
    # the default save interval (500 steps) a run of this length would write
    # a single checkpoint, so the "best" model could never be one of the
    # earlier evaluation steps.
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# Instantiate XLM-RoBERTa for token classification from the "xlm-roberta-base"
# checkpoint, with one output class per entry of label_list.
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)

In [None]:
# Fine-tuning Trainer: trains on train_dataset and, at every evaluation step,
# scores val_dataset with compute_metrics (term-level precision/recall/F1).
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
# Run fine-tuning; as the last expression of the cell, the returned
# TrainOutput is displayed below the progress table.
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1
100,No log,0.139503,0.848485,0.583333,0.691358
200,No log,0.097908,0.883721,0.791667,0.835165
300,No log,0.079122,0.891304,0.854167,0.87234
400,No log,0.095392,0.926829,0.791667,0.853933
500,0.115800,0.069853,0.913043,0.875,0.893617


TrainOutput(global_step=585, training_loss=0.10583935843573676, metrics={'train_runtime': 153.3084, 'train_samples_per_second': 15.263, 'train_steps_per_second': 3.816, 'total_flos': 82401085489320.0, 'train_loss': 0.10583935843573676, 'epoch': 5.0})

## TEST SET

In [None]:
def compute_metrics_test(p):
    """Term-level precision/recall/F1 on the test split.

    Passed to the evaluation Trainer as ``compute_metrics``. Relies on the
    module-level ``label_list``, ``test_texts``, ``gold_set_for_test`` and
    ``extract_terms``.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            gold label ids with -100 on positions to ignore.

    Returns:
        dict with ``"precision"``, ``"recall"`` and ``"f1"``, computed over
        the set of distinct lower-cased terms. A ratio whose denominator is
        zero (no term extracted, empty gold set, or precision + recall == 0)
        is reported as 0.0 instead of raising ZeroDivisionError.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Keep only positions with a real gold label (one per word) and map the
    # predicted ids back to tag strings.
    true_predictions = [
        [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms = {term.lower() for term in extract_terms(true_predictions, test_texts)}
    gold_set = gold_set_for_test

    true_pos = extracted_terms.intersection(gold_set)
    recall = len(true_pos) / len(gold_set) if gold_set else 0.0
    precision = len(true_pos) / len(extracted_terms) if extracted_terms else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Evaluation-only arguments. This rebinds the name `training_args`, so the
# fine-tuning configuration defined earlier is no longer reachable under that
# name after this cell has run.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [None]:
# Evaluation-only Trainer over the test set. It rebinds the name `trainer`
# and wraps the same `model` object that was fine-tuned above; metrics come
# from compute_metrics_test, which scores against the test gold set.
trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_test,
    )

In [None]:
# Evaluate on the Trainer's eval_dataset (the test set) and print the
# resulting metrics.
eval_results = trainer.evaluate()
print(f"evaluation results: {eval_results}")

In [None]:
# Run inference on the test set; predict() returns a
# (predictions, label ids, metrics) triple.
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
# Reduce the class dimension (axis 2) to the id of the highest-scoring label.
test_predictions = np.argmax(test_predictions, axis=2)

# Decode ids to tag strings, keeping only positions whose gold label is not
# -100, which leaves one predicted tag per labelled word.
true_test_predictions = [
    [label_list[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(test_predictions, test_labels)
]

In [None]:
# Side-by-side view of words, gold tags and predicted tags for one test
# sentence (index i into the test split).
i = 49
row_template = '{:>10}  {:>10}  {:>10}'
print(row_template.format("Text", "Label", "Prediction"))
for j, predicted_tag in enumerate(true_test_predictions[i]):
  print(row_template.format(test_texts[i][j], test_tags[i][j], predicted_tag))

      Text       Label  Prediction
       ich           n           n
  wünschte           n           n
         ,           n           n
       ich           n           n
      wäre           n           n
     nicht           n           n
        so           n           n
       ein           n           n
   krasser           n           n
 drecksack         B-B         B-B
         .           n           n


In [None]:
def computeTermEvalMetrics(extracted_terms, gold_df):
  """Print term-level recall, precision and F1 of extracted vs. gold terms.

  Args:
    extracted_terms: iterable of extracted term strings; they are lower-cased
      and de-duplicated before comparison.
    gold_df: iterable of gold term strings. Despite the name it is only
      passed through ``set()``, so a plain set or list of terms works (a
      DataFrame would contribute its column labels, not its values).

  Returns:
    None; the counts and scores are printed. A ratio whose denominator is
    zero (nothing extracted, empty gold set, or precision + recall == 0) is
    reported as 0.0 instead of raising ZeroDivisionError.
  """
  extracted_terms = set([item.lower() for item in extracted_terms])
  gold_set = set(gold_df)
  true_pos = extracted_terms.intersection(gold_set)
  recall = len(true_pos) / len(gold_set) if gold_set else 0.0
  precision = len(true_pos) / len(extracted_terms) if extracted_terms else 0.0
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

  print("Intersection", len(true_pos))
  print("Gold", len(gold_set))
  print("Extracted", len(extracted_terms))
  print("Recall:", recall)
  print("Precision:", precision)
  print("F1:", f1)

In [None]:
# Distinct terms the model predicted on the test set, rebuilt from the
# word-level predicted tags and the test sentences.
test_extracted_terms = extract_terms(true_test_predictions, test_texts)

In [None]:
# Print the term-level scores for the test set. gold_set_for_test is a set;
# it is passed as the `gold_df` parameter, which the function only wraps in
# set().
computeTermEvalMetrics(test_extracted_terms, gold_set_for_test)

Intersection 39
Gold 48
Extracted 47
Recall: 0.8125
Precision: 0.8297872340425532
F1: 0.8210526315789474
