In [424]:
import csv
import torch
import pandas as pd
import numpy as np
from transformers import XLMRobertaTokenizerFast
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForTokenClassification

## DATA PREPARATION

In [629]:
csv_file_path = 'Labeled_dataset_ocd.csv'

sentences = {}
with open(csv_file_path, 'r') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # Skip the header row
    for row in csv_reader:
        sentence_id, token, label = row
        if sentence_id not in sentences:
            sentences[sentence_id] = {'tokens': [], 'labels': []}
        sentences[sentence_id]['tokens'].append(token)
        sentences[sentence_id]['labels'].append(label)

result = []
for sentence_id, data in sentences.items():
    words = data['tokens']
    labels = data['labels']
    result.append((words, labels))

for sentence in result:
    print(sentence)

(['i', 'hope', 'you', 'are', 'doing', 'gut', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['i', 'need', 'to', 'get', 'festgeschnallt', 'any', 'mommies', 'volunteer', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['sometimes', 'i', 'think', 'are', 'there', 'any', 'people', 'who', 'see', 'my', 'arts', 'and', 'just', 'oh', 'no', 'is', 'that', 'the', 'one', 'that', 'does', 'those', 'scheiße', 'seafire', 'stuff', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n'])
(['you', 'just', 'start', 'at', 'the', 'base', 'and', 'work', 'your', 'way', 'hoch', 'then', 'give', 'it', 'a', 'good', 'tug', 'once', 'you', 'get', 'to', 'the', 'top', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['wenn', 'dein', 'ex', 'sein', 'new', 'thang', 'besser', 'behandelt', ',', 'als', 'er', 'es', 'jemals', 'behandelt', 'hat', ',', 'forget', 'tha

In [630]:
data = result
print(f"OCD set length: {len(data)}")

OCD set length: 250


In [631]:
train_data, test_data = train_test_split(result, test_size=0.2, random_state=42)

val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

print(f"Training set length: {len(train_data)}")
print(f"Validation set length: {len(val_data)}")
print(f"Test set length: {len(test_data)}")

Training set length: 200
Validation set length: 25
Test set length: 25


In [632]:
gold_set_for_validation = set(word for (words, labels) in val_data for word, label in zip(words, labels) if label == 'B-B' or label == 'B')

df_gold_val = pd.DataFrame({'terms': list(gold_set_for_validation)})
print(df_gold_val)

           terms
0        fucking
1         faggot
2           blöd
3       verdammt
4   stuhlscheiße
5       retarded
6         fucked
7         ficken
8          arsch
9     verdammtes
10          dumm
11      schlampe
12        nigger
13    sandnigger
14         kikes
15            up
16   sandniggern
17     schlampen


In [633]:
gold_set_for_test = set(word for (words, labels) in test_data for word, label in zip(words, labels) if label == 'B-B' or label == 'B')

df_gold_test = pd.DataFrame({'terms': list(gold_set_for_test)})
print(df_gold_test)

                             terms
0                       verdammter
1                          niggers
2                             fick
3                            bitch
4                         verdammt
5                          verpiss
6   ghettoabschaum-gangsta-pussies
7                             dumm
8                           hündin
9                      sandniggern
10                         faggots
11                       verdammte
12                            dich
13                            dumb
14                           coons
15                            over
16                   fettrückstand
17                          nigger
18                     schwuchteln
19                       niggersau


In [634]:
train_tags=[tup[1] for tup in train_data]
train_texts=[tup[0] for tup in train_data]

val_tags=[tup[1] for tup in val_data]
val_texts=[tup[0] for tup in val_data]

test_tags=[tup[1] for tup in test_data]
test_texts=[tup[0] for tup in test_data]

In [635]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

In [636]:
label_list=["n", "B-B", "B"]
label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels=len(label_list)

def tokenize_and_align_labels(texts, tags):
  tokenized_inputs = tokenizer(
      texts,
      padding=True,
      truncation=True,
      is_split_into_words=True,
  )
  labels = []
  for i, label in enumerate(tags):
      word_ids = tokenized_inputs.word_ids(batch_index=i)
      previous_word_idx = None
      label_ids = []
      for word_idx in word_ids:
          if word_idx is None:
              label_ids.append(-100)
          elif word_idx != previous_word_idx:
              label_ids.append(label_to_id[label[word_idx]])
          else:
              label_ids.append(-100)
          previous_word_idx = word_idx

      labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs

train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)

test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)

In [637]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_input_and_labels, train_input_and_labels["labels"])

val_dataset = Dataset(val_input_and_labels, val_input_and_labels["labels"])

test_dataset = Dataset(test_input_and_labels, test_input_and_labels["labels"])

In [638]:
def extract_terms(token_predictions, val_texts):
  extracted_terms = set()
  for i in range(len(token_predictions)):
    pred = token_predictions[i]
    txt  = val_texts[i]
    for j in range(len(pred)):
      if pred[j]=="B-B":
        term=txt[j]
        for k in range(j+1,len(pred)):
          if pred[k]=="B": term+=" "+txt[k]
          else: break
        extracted_terms.add(term)
  return extracted_terms

In [639]:
def compute_metrics_eval(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms=extract_terms(true_predictions, val_texts)
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=gold_set_for_validation

    true_pos=extracted_terms.intersection(gold_set)
    recall=len(true_pos)/len(gold_set)
    precision=len(true_pos)/len(extracted_terms)

    return {
        "precision": precision,
        "recall": recall,
        "f1": 2*(precision*recall)/(precision+recall),
    }

## FINE-TUNING

In [585]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=14,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=0,
    weight_decay=0.0009255754769953695,
    learning_rate=2.121035136134197e-05,
    logging_dir='./logs',
    eval_strategy= "steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

In [586]:
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [587]:
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_eval,
    )

  trainer = Trainer(


In [588]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1
100,No log,0.100351,0.823529,0.777778,0.8
200,No log,0.073508,0.882353,0.833333,0.857143
300,No log,0.059995,1.0,0.833333,0.909091


TrainOutput(global_step=350, training_loss=0.09774974278041294, metrics={'train_runtime': 240.7811, 'train_samples_per_second': 11.629, 'train_steps_per_second': 1.454, 'total_flos': 100028569032000.0, 'train_loss': 0.09774974278041294, 'epoch': 14.0})

## TEST SET

In [645]:
def compute_metrics_test(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms=extract_terms(true_predictions, test_texts)
    extracted_terms = set([item.lower() for item in extracted_terms])
    gold_set=gold_set_for_test

    true_pos=extracted_terms.intersection(gold_set)
    recall=len(true_pos)/len(gold_set)
    precision=len(true_pos)/len(extracted_terms)

    return {
        "precision": precision,
        "recall": recall,
        "f1": 2*(precision*recall)/(precision+recall),
    }

In [646]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [647]:
trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_test,
    )

In [648]:
eval_results = trainer.evaluate()
print(f"evaluation results: {eval_results}")

evaluation results: {'eval_loss': 0.0874839723110199, 'eval_model_preparation_time': 0.0035, 'eval_precision': 1.0, 'eval_recall': 0.8, 'eval_f1': 0.888888888888889, 'eval_runtime': 0.1259, 'eval_samples_per_second': 198.579, 'eval_steps_per_second': 31.773}


In [649]:
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_predictions = np.argmax(test_predictions, axis=2)

true_test_predictions = [
    [label_list[p] for (p, l) in zip(test_prediction, test_label) if l != -100]
    for test_prediction, test_label in zip(test_predictions, test_labels)
]

In [650]:
i=16
print('{:>10}  {:>10}  {:>10}'.format("Text", "Label", "Prediction"))
for j in range(len(true_test_predictions[i])):
  print('{:>10}  {:>10}  {:>10}'.format(test_texts[i][j], test_tags[i][j], true_test_predictions[i][j]))

      Text       Label  Prediction
     jesus           n           n
verdammter         B-B         B-B
    christ           n           n
       its           n           n
      like           n           n
         i           n           n
         m           n           n
      back           n           n
        in           n           n
     saudi           n           n
    arabia           n           n
     again           n           n
         .           n           n


In [651]:
def computeTermEvalMetrics(extracted_terms, gold_df):
  extracted_terms = set([item.lower() for item in extracted_terms])
  gold_set=set(gold_df)
  true_pos=extracted_terms.intersection(gold_set)
  recall=len(true_pos)/len(gold_set)
  precision=len(true_pos)/len(extracted_terms)

  print("Intersection",len(true_pos))
  print("Gold",len(gold_set))
  print("Extracted",len(extracted_terms))
  print("Recall:", recall)
  print("Precision:", precision)
  print("F1:", 2*(precision*recall)/(precision+recall))

In [652]:
test_extracted_terms = extract_terms(true_test_predictions, test_texts)

In [655]:
print(test_extracted_terms)

{'bitch', 'fettrückstand', 'verdammte', 'verdammter', 'verdammt', 'niggers', 'dich', 'verpiss', 'dumb', 'dumm', 'nigger', 'schwuchteln', 'sandniggern', 'niggersau', 'fick', 'faggots'}


In [653]:
computeTermEvalMetrics(test_extracted_terms, gold_set_for_test)

Intersection 16
Gold 20
Extracted 16
Recall: 0.8
Precision: 1.0
F1: 0.888888888888889
