In [1]:
import csv
import torch
import zipfile
import numpy as np
import pandas as pd

from transformers import XLMRobertaTokenizerFast
from transformers import XLMRobertaForTokenClassification

from transformers import Trainer, TrainingArguments

# DE only Fine-Tuned Model (DFT) vs. DE-EN Bilingual Fine-Tuned Model (DEFT) vs. Code-Mixed Model (CMM) on the Rap Dataset (RAP)

# RAP Dataset Preparation

In [51]:
csv_file_path = 'Labeled_dataset_rap.csv'

# Group the one-token-per-row CSV into sentences, keyed by sentence id.
# Dicts keep insertion order, so sentences stay in file order.
sentences = {}
# The tokens contain non-ASCII characters (e.g. umlauts), so decode explicitly
# as UTF-8 instead of relying on the platform default encoding. newline='' is
# what the csv module asks for so that quoted fields are parsed correctly.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # skip the first row (presumably the column header)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # rather than failing on the 3-way unpack below.
            continue
        sentence_id, token, label = row
        entry = sentences.setdefault(sentence_id, {'tokens': [], 'labels': []})
        entry['tokens'].append(token)
        entry['labels'].append(label)

# One (tokens, labels) pair per sentence.
result_rap = [(data['tokens'], data['labels']) for data in sentences.values()]

for sentence in result_rap:
    print(sentence)

(['immer', 'mit', 'der', 'family', ',', 'du', 'denkst', ',', 'ich', 'komm', 'aus', 'quahog'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['sie', 'wollen', 'den', 'cumshot', 'nicht', 'teilen', ',', 'also', 'spritz', 'ich', 'straight', 'in', 'den', 'deckenventilator', '.'], ['n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['fick', 'ich', 'sie', 'in', 'den', 'arsch', 'bin', 'ich', 'bei', '7,80', 'euro', '.'], ['B-B', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n'])
(['und', 'alle', 'bitches', 'tanzen', ',', 'ja', ',', 'ich', 'lass', 'die', 'bitches', 'tanzen', 'uff', '.'], ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n'])
(['guck', ',', 'jetzt', 'mach', 'ich', 'money', ',', 'viel', 'patte', ',', 'geb', 'es', 'papa', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['renn', 'weg', ',', 'bevor', 'sich', 'bullets', 'in', 'deinen', 'schädel', 'bohrn', '.'], ['n', 'n', 'n', 'n', 

In [52]:
rap_data = result_rap
print(f"RAP set length: {len(rap_data)}")

RAP set length: 131


In [53]:
# Distinct tokens of the RAP set whose gold label is 'B-B' or 'B'.
gold_set_for_rap = {
    token
    for sentence_tokens, sentence_labels in rap_data
    for token, gold_label in zip(sentence_tokens, sentence_labels)
    if gold_label in ('B-B', 'B')
}

# Wrap the gold terms in a one-column frame for inspection.
df_gold_rap = pd.DataFrame({'terms': list(gold_set_for_rap)})
print(df_gold_rap)

           terms
0   hurentochter
1      boss-cock
2    scheißdreck
3          nutte
4         fotzen
..           ...
61       punanis
62      schlampe
63          slut
64    schwuchtel
65     bitchbart

[66 rows x 1 columns]


In [54]:
rap_tags=[tup[1] for tup in rap_data]
rap_texts=[tup[0] for tup in rap_data]

In [55]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

In [56]:
# Tag vocabulary; a tag's position in this list is its integer class id.
label_list = ["n", "B-B", "B"]
num_labels = len(label_list)
# Reverse lookup: tag string -> class id.
label_to_id = dict(zip(label_list, range(num_labels)))

def tokenize_and_align_labels(texts, tags):
  """Tokenize pre-split sentences and attach sub-token level label ids.

  Args:
      texts: list of sentences, each a list of word strings.
      tags: list of tag-string lists, parallel to ``texts``.

  Returns:
      The tokenizer output with an added "labels" entry holding one list of
      ids per sentence. The first sub-token of each word gets the id of that
      word's tag (via ``label_to_id``); special/padding positions and all
      further sub-tokens of a word get -100 (the default ``ignore_index`` of
      ``torch.nn.CrossEntropyLoss``).
  """
  encoded = tokenizer(
      texts,
      is_split_into_words=True,
      truncation=True,
      padding=True,
  )

  aligned_labels = []
  for row, word_tags in enumerate(tags):
      row_label_ids = []
      last_word = None
      for word_pos in encoded.word_ids(batch_index=row):
          # Only the first sub-token of a real word carries a label.
          starts_new_word = word_pos is not None and word_pos != last_word
          if starts_new_word:
              row_label_ids.append(label_to_id[word_tags[word_pos]])
          else:
              row_label_ids.append(-100)
          last_word = word_pos
      aligned_labels.append(row_label_ids)

  encoded["labels"] = aligned_labels
  return encoded

In [57]:
rap_input_and_labels = tokenize_and_align_labels(rap_texts, rap_tags)

In [58]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and their label ids.

    ``encodings`` is a mapping from feature name to a per-example sequence;
    ``labels`` is a per-example sequence of label-id lists. The dataset length
    is the number of label rows.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every feature of example ``idx`` to a tensor ...
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # ... and set/overwrite the 'labels' entry from the separate label store.
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [59]:
rap_dataset = Dataset(rap_input_and_labels, rap_input_and_labels["labels"])

In [60]:
def extract_terms(token_predictions, val_tokens):
    """Collect the distinct tokens that were tagged as terms.

    Args:
        token_predictions: one sequence of tag strings per sentence.
        val_tokens: one sequence of tokens per sentence, parallel to
            ``token_predictions``; ``val_tokens[i][j]`` is the token that
            ``token_predictions[i][j]`` refers to. A sentence may have more
            tokens than predictions; the surplus tokens are ignored.

    Returns:
        The set of every token whose tag is "B-B" or "B". There is no
        continuation tag, so each tagged token is a term of its own, even
        when two tagged tokens are adjacent. Tags other than "B-B"/"B" never
        contribute a term.

    Raises:
        IndexError: if a sentence has fewer tokens than predictions, or
            ``val_tokens`` has fewer sentences than ``token_predictions``.
    """
    term_tags = ("B-B", "B")
    extracted_terms = set()
    for i, pred in enumerate(token_predictions):
        txt = val_tokens[i]
        # Index txt (rather than zip) so misaligned inputs fail loudly
        # instead of being silently truncated.
        for j, tag in enumerate(pred):
            if tag in term_tags:
                extracted_terms.add(txt[j])
    return extracted_terms

In [61]:
def compute_metrics_rap(p):
    """Term-level precision/recall/F1 on the RAP set.

    ``p`` unpacks into (predictions, label ids). Predictions are reduced with
    argmax over axis 2, mapped to tag strings at every position whose label
    id is not -100, and turned into a set of lower-cased terms via
    ``extract_terms`` against ``rap_texts``. That set is compared with the
    set of gold-tagged tokens in ``rap_data``.

    Returns:
        dict with "precision", "recall" and "f1"; each is 0 when its
        denominator would be zero.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions that carry a real label.
    tag_sequences = [
        [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
        for id_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    predicted_terms = {term.lower() for term in extract_terms(tag_sequences, rap_texts)}

    reference_terms = {
        token
        for sentence_tokens, sentence_tags in rap_data
        for token, tag in zip(sentence_tokens, sentence_tags)
        if tag in ('B-B', 'B')
    }

    hits = predicted_terms & reference_terms
    recall = len(hits) / len(reference_terms) if reference_terms else 0
    precision = len(hits) / len(predicted_terms) if predicted_terms else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# DFT

In [41]:
with zipfile.ZipFile('best_model_de.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/best_model_dft')

In [42]:
model = XLMRobertaForTokenClassification.from_pretrained("/content/best_model_dft")

In [43]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [44]:
trainer_dft = Trainer(
        model=model,
        args=training_args,
        eval_dataset=rap_dataset,
        compute_metrics=compute_metrics_rap,
    )

In [45]:
eval_results_dft = trainer_dft.evaluate()
print(f"evaluation results: {eval_results_dft}")

evaluation results: {'eval_loss': 0.20088475942611694, 'eval_model_preparation_time': 0.0028, 'eval_precision': 0.6395348837209303, 'eval_recall': 0.8333333333333334, 'eval_f1': 0.7236842105263159, 'eval_runtime': 0.3423, 'eval_samples_per_second': 382.712, 'eval_steps_per_second': 26.293}


In [46]:
# Run the DFT model over the RAP set.
dft_predictions, dft_labels, dft_metrics = trainer_dft.predict(rap_dataset)

# Reduce the scores to one class id per position.
dft_predictions = np.argmax(dft_predictions, axis=2)

# Class ids -> tag strings, dropping positions whose gold id is -100.
true_dft_predictions = [
    [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
    for id_row, gold_row in zip(dft_predictions, dft_labels)
]

# Side-by-side view of predicted and gold tags for the first five sentences.
for i in range(5):
    print(f"Predicted: {true_dft_predictions[i]}")
    gold_tags = [label_list[gold_id] for gold_id in dft_labels[i] if gold_id != -100]
    print(f"True: {gold_tags}")

Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['B-B', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']


In [47]:
dft_extracted_terms = extract_terms(true_dft_predictions, rap_texts)

In [48]:
print(dft_extracted_terms)

{'fotzen', 'krepier', 'zerhau', 'broke-ass-bitch', 'motherfucker', 'kaffer', 'nuttensohn', 'vollidioten', 'bitchrap', 'hoe', 'nigga', 'chick', 'flöten', 'nix', 'broke', 'diamonds', 'huren', 'arsch', 'schwuchtel', 'scheiße', 'blowjob', 'muschibattle', 'ficken', 'crewmaskottchen', 'mom', 'beef', 'sonny', 'spit', 'mutterficker', 'dreckstoys', 'punanis', 'schlampe', 'slut', 'mbeezy', 'suck', 'möse', 'gebumst', 'kanackenfreestyle', 'bitches', 'bullets', 'bitch', 'ficker', 'drecksbulle', 'freesy-stylie', 'vater', 'schrotteile', 'bratan', 'piss', 'scheiß', 'nacken', 'deep', 'blasen', 'krüppel', 'mighty', 'wack', 'hurenschleimer', 'pisser', 'fickst', 'whack', 'bitchbart', 'hurentochter', 'boss-cock', 'nutte', 'prishtina', 'steißbein', 'enissa', 'bitchtits', 'wichser', 'hurensohn', 'snitch', 'hurensohnköpfen', 'mauer', 'dummen', 'fickt', 'schwanz', 'fick', 'fly', 'ficke', 'schwanzlutscher', 'torte', 'hurensöhne', 'stoned', 'bastard', 'patte', 'miese', 'scheißdreck'}


In [49]:
def computeTermEvalMetrics(extracted_terms, gold_df):
  """Print set-based precision, recall and F1 of extracted vs. gold terms.

  Args:
      extracted_terms: iterable of term strings; lower-cased before comparison.
      gold_df: iterable of gold term strings (e.g. a set), used as-is.
          Despite the name, do not pass a DataFrame: iterating a DataFrame
          yields its column labels, not its values.

  Returns:
      None. The counts and metrics are printed. A metric whose denominator
      is zero (nothing extracted, empty gold set, or no true positives) is
      reported as 0.0 instead of raising ZeroDivisionError.
  """
  extracted_terms = {item.lower() for item in extracted_terms}
  gold_set = set(gold_df)
  true_pos = extracted_terms.intersection(gold_set)

  recall = len(true_pos) / len(gold_set) if gold_set else 0.0
  precision = len(true_pos) / len(extracted_terms) if extracted_terms else 0.0
  # precision + recall is 0 when there is no true positive at all.
  f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

  print("Intersection", len(true_pos))
  print("Gold", len(gold_set))
  print("Extracted", len(extracted_terms))
  print("Recall:", recall)
  print("Precision:", precision)
  print("F1:", f1)

In [50]:
computeTermEvalMetrics(dft_extracted_terms, gold_set_for_rap)

Intersection 55
Gold 66
Extracted 86
Recall: 0.8333333333333334
Precision: 0.6395348837209303
F1: 0.7236842105263159


# DEFT

In [62]:
with zipfile.ZipFile('best_model_de-en.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/best_model_deft')

In [63]:
model = XLMRobertaForTokenClassification.from_pretrained("/content/best_model_deft")

In [64]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [65]:
trainer_deft = Trainer(
        model=model,
        args=training_args,
        eval_dataset=rap_dataset,
        compute_metrics=compute_metrics_rap,
    )

In [66]:
eval_results_deft = trainer_deft.evaluate()
print(f"evaluation results: {eval_results_deft}")

evaluation results: {'eval_loss': 0.09253334999084473, 'eval_model_preparation_time': 0.0056, 'eval_precision': 0.7380952380952381, 'eval_recall': 0.9393939393939394, 'eval_f1': 0.8266666666666667, 'eval_runtime': 0.3818, 'eval_samples_per_second': 343.088, 'eval_steps_per_second': 23.571}


In [67]:
# Run the DEFT model over the RAP set.
deft_predictions, deft_labels, deft_metrics = trainer_deft.predict(rap_dataset)

# Reduce the scores to one class id per position.
deft_predictions = np.argmax(deft_predictions, axis=2)

# Class ids -> tag strings, dropping positions whose gold id is -100.
true_deft_predictions = [
    [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
    for id_row, gold_row in zip(deft_predictions, deft_labels)
]

# Side-by-side view of predicted and gold tags for the first five sentences.
for i in range(5):
    print(f"Predicted: {true_deft_predictions[i]}")
    gold_tags = [label_list[gold_id] for gold_id in deft_labels[i] if gold_id != -100]
    print(f"True: {gold_tags}")

Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['B-B', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['B-B', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n']
True: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']


In [68]:
deft_extracted_terms = extract_terms(true_deft_predictions, rap_texts)

In [69]:
print(deft_extracted_terms)

{'fotzen', 'zerhau', 'ganja-verkäufer', 'broke-ass-bitch', 'motherfucker', 'kaffer', 'nuttensohn', 'vollidioten', 'kugel', 'bitchrap', 'hoe', 'nigga', 'chick', 'nix', 'huren', 'arsch', 'schwuchtel', 'scheiße', 'blowjob', 'homo', 'muschibattle', 'cock', 'ficken', 'dich', 'fotze', 'bras', 'spit', 'mutterficker', 'blowt', 'dreckstoys', 'punanis', 'schlampe', 'slut', 'mbeezy', 'möse', 'kanackenfreestyle', 'fuck', 'bitches', 'bullets', 'blacks', 'bitch', 'ficker', 'drecksbulle', 'freesy-stylie', 'bratan', 'piss', 'scheiß', 'bad-boys', 'nacken', 'blasen', 'geil', 'krüppel', 'pussy', 'mighty', 'hurenschleimer', 'pisser', 'fickst', 'lärm', 'whack', 'bitchbart', 'hurentochter', 'boss-cock', 'nutte', 'prishtina', 'steißbein', 'mich', 'bitchtits', 'wichser', 'hurensohn', 'cumshot', 'snitch', 'hurensohnköpfen', 'dummen', 'fickt', 'schwanz', 'motherfucking', 'fick', 'ficke', 'shit', 'schwanzlutscher', 'hurensöhne', 'bastard', 'patte', 'scheißdreck'}


In [70]:
computeTermEvalMetrics(deft_extracted_terms, gold_set_for_rap)

Intersection 62
Gold 66
Extracted 84
Recall: 0.9393939393939394
Precision: 0.7380952380952381
F1: 0.8266666666666667


# CMM

In [15]:
with zipfile.ZipFile('best_model_cmm.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/best_model_cmm')

In [16]:
model = XLMRobertaForTokenClassification.from_pretrained("/content/best_model_cmm")

In [17]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [18]:
trainer_cmm = Trainer(
        model=model,
        args=training_args,
        eval_dataset=rap_dataset,
        compute_metrics=compute_metrics_rap,
    )

In [20]:
eval_results_cmm = trainer_cmm.evaluate()
print(f"evaluation results: {eval_results_cmm}")

evaluation results: {'eval_loss': 0.24947714805603027, 'eval_model_preparation_time': 0.0053, 'eval_precision': 0.5526315789473685, 'eval_recall': 0.6363636363636364, 'eval_f1': 0.5915492957746479, 'eval_runtime': 0.3445, 'eval_samples_per_second': 380.231, 'eval_steps_per_second': 26.123}


In [21]:
# Run the CMM model over the RAP set.
cmm_predictions, cmm_labels, cmm_metrics = trainer_cmm.predict(rap_dataset)

# Reduce the scores to one class id per position.
cmm_predictions = np.argmax(cmm_predictions, axis=2)

# Class ids -> tag strings, dropping positions whose gold id is -100.
true_cmm_predictions = [
    [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
    for id_row, gold_row in zip(cmm_predictions, cmm_labels)
]

# Side-by-side view of predicted and gold tags for the first five sentences.
for i in range(5):
    print(f"Predicted: {true_cmm_predictions[i]}")
    gold_tags = [label_list[gold_id] for gold_id in cmm_labels[i] if gold_id != -100]
    print(f"True: {gold_tags}")

Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['B-B', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'B-B', 'n', 'n']
True: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']


In [24]:
cmm_extracted_terms = extract_terms(true_cmm_predictions, rap_texts)

In [25]:
print(cmm_extracted_terms)

{'tanzen', 'boss-cock', 'fotzen', 'scheiße', 'kleine', 'zerhau', 'krasseste', 'suck', 'fuck', 'bitches', 'steißbein', 'broke-ass-bitch', 'muschibattle', 'cock', 'motherfucker', 'krass', 'bitchtits', 'bullets', 'ficken', 'bitch', 'ficker', 'gouda', 'hurensohn', 'dich', 'snitch', 'nuttensohn', 'hurensohnköpfen', 'freesy-stylie', 'rapper', 'drecksbulle', 'whiskey-coke', 'schrotteile', 'kugel', 'bratan', 'bitchrap', 'dummen', 'piss', 'rauche', 'fickt', 'fotze', 'scheiß', 'ross', 'motherfucking', 'fick', 'nigga', 'facedrive', 'bras', 'flöten', 'deep', 'nix', 'boss', 'porsche-jeep', 'loch', 'broke', 'ficke', 'krüppel', 'halbfinale', 'shit', 'whack', 'pussy', 'diamonds', 'dreckstoys', 'torte', 'mighty', 'wack', 'hurenschleimer', 'bastard', 'pisser', 'arsch', 'schlampe', 'patte', 'slut', 'kickdown', 'brett', 'scheißdreck', 'bitchbart'}


In [28]:
computeTermEvalMetrics(cmm_extracted_terms, gold_set_for_rap)

Intersection 42
Gold 66
Extracted 76
Recall: 0.6363636363636364
Precision: 0.5526315789473685
F1: 0.5915492957746479


# RESULTS:

## DE Only Fine-Tuned Model (DFT):
Recall: 0.8333333333333334
Precision: 0.6395348837209303
**F1: 0.7236842105263159**

## DE-EN Bilingual Fine-Tuned Model (DEFT):
Recall: 0.9393939393939394
Precision: 0.7380952380952381
**F1: 0.8266666666666667**

## Code-Mixed Model (CMM):
Recall: 0.6363636363636364
Precision: 0.5526315789473685
**F1: 0.5915492957746479**



# DE only Fine-Tuned Model (DFT) vs. DE-EN Bilingual Fine-Tuned Model (DEFT) vs. Code-Mixed Model (CMM) on the Cross-Domain Dataset (CDD)

# CDD Preparation

In [71]:
csv_file_path = 'Labeled_dataset_cdd.csv'

# Group the one-token-per-row CSV into sentences, keyed by sentence id.
# Dicts keep insertion order, so sentences stay in file order.
sentences = {}
# The tokens contain non-ASCII characters (e.g. umlauts), so decode explicitly
# as UTF-8 instead of relying on the platform default encoding. newline='' is
# what the csv module asks for so that quoted fields are parsed correctly.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # skip the first row (presumably the column header)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # rather than failing on the 3-way unpack below.
            continue
        sentence_id, token, label = row
        entry = sentences.setdefault(sentence_id, {'tokens': [], 'labels': []})
        entry['tokens'].append(token)
        entry['labels'].append(label)

# One (tokens, labels) pair per sentence.
result_cdd = [(data['tokens'], data['labels']) for data in sentences.values()]

for sentence in result_cdd:
    print(sentence)

(['yeah', ',', 'zieh', 'den', 'carlo', 'aus', ',', 'du', 'nuttensohn', ',', 'sonny', 'bounct', 'den', 'beat', '–', 'mission', 'complete', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['politiker', 'sind', 'alles', 'abschaum', '!'], ['n', 'n', 'n', 'B-B', 'n'])
(['haiti', 'like', 'ghetto-scheißlöcher', 'across', 'america', 'is', 'an', 'open', 'finanzkanalisation', 'throw', 'billions', 'at', 'the', 'place', 'and', 'you', 'will', 'not', 'even', 'see', 'where', 'the', 'geld', 'ging', 'weiß', 'do', 'not', 'have', 'the', 'stomach', 'to', 'rule', 'these', 'places', 'like', 'they', 'need', 'to', 'be', 'ruled', 'with', 'strongmen', '.'], ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n'])
(['weswegen', 'hab', 'ich', 'immer', 'wieder', 'trouble', 'mit', 'den', 'cops', ',', 'ra

In [72]:
cdd_data = result_cdd
print(f"CDD set length: {len(cdd_data)}")

CDD set length: 150


In [73]:
# Distinct tokens of the CDD set whose gold label is 'B-B' or 'B'.
gold_set_for_cdd = {
    token
    for sentence_tokens, sentence_labels in cdd_data
    for token, gold_label in zip(sentence_tokens, sentence_labels)
    if gold_label in ('B-B', 'B')
}

# Wrap the gold terms in a one-column frame for inspection.
df_gold_cdd = pd.DataFrame({'terms': list(gold_set_for_cdd)})
print(df_gold_cdd)

                   terms
0                 stupid
1              subhumans
2                  sluts
3              sub-human
4                  filth
..                   ...
144             fagtards
145            behindert
146  ghetto-scheißlöcher
147                kackt
148              apeshit

[149 rows x 1 columns]


In [74]:
cdd_tags=[tup[1] for tup in cdd_data]
cdd_texts=[tup[0] for tup in cdd_data]

In [75]:
cdd_input_and_labels = tokenize_and_align_labels(cdd_texts, cdd_tags)

In [76]:
cdd_dataset = Dataset(cdd_input_and_labels, cdd_input_and_labels["labels"])

In [77]:
def compute_metrics_cdd(p):
    """Term-level precision/recall/F1 on the CDD set.

    ``p`` unpacks into (predictions, label ids). Predictions are reduced with
    argmax over axis 2, mapped to tag strings at every position whose label
    id is not -100, and turned into a set of lower-cased terms via
    ``extract_terms`` against ``cdd_texts``. That set is compared with the
    set of gold-tagged tokens in ``cdd_data``.

    Returns:
        dict with "precision", "recall" and "f1"; each is 0 when its
        denominator would be zero.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions that carry a real label.
    tag_sequences = [
        [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
        for id_row, gold_row in zip(predicted_ids, gold_ids)
    ]

    predicted_terms = {term.lower() for term in extract_terms(tag_sequences, cdd_texts)}

    reference_terms = {
        token
        for sentence_tokens, sentence_tags in cdd_data
        for token, tag in zip(sentence_tokens, sentence_tags)
        if tag in ('B-B', 'B')
    }

    hits = predicted_terms & reference_terms
    recall = len(hits) / len(reference_terms) if reference_terms else 0
    precision = len(hits) / len(predicted_terms) if predicted_terms else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# DFT

In [78]:
model = XLMRobertaForTokenClassification.from_pretrained("/content/best_model_dft")

In [79]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [80]:
trainer_dft = Trainer(
        model=model,
        args=training_args,
        eval_dataset=cdd_dataset,
        compute_metrics=compute_metrics_cdd,
    )

In [81]:
eval_results_dft = trainer_dft.evaluate()
print(f"evaluation results: {eval_results_dft}")

evaluation results: {'eval_loss': 0.2319251000881195, 'eval_model_preparation_time': 0.0034, 'eval_precision': 0.7142857142857143, 'eval_recall': 0.6040268456375839, 'eval_f1': 0.6545454545454547, 'eval_runtime': 0.6563, 'eval_samples_per_second': 228.554, 'eval_steps_per_second': 15.237}


In [82]:
# Run the DFT model over the CDD set.
dft_predictions, dft_labels, dft_metrics = trainer_dft.predict(cdd_dataset)

# Reduce the scores to one class id per position.
dft_predictions = np.argmax(dft_predictions, axis=2)

# Class ids -> tag strings, dropping positions whose gold id is -100.
true_dft_predictions = [
    [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
    for id_row, gold_row in zip(dft_predictions, dft_labels)
]

# Side-by-side view of predicted and gold tags for the first five sentences.
for i in range(5):
    print(f"Predicted: {true_dft_predictions[i]}")
    gold_tags = [label_list[gold_id] for gold_id in dft_labels[i] if gold_id != -100]
    print(f"True: {gold_tags}")

Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'B-B', 'n']
True: ['n', 'n', 'n', 'B-B', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', '

In [83]:
dft_extracted_terms = extract_terms(true_dft_predictions, cdd_texts)

In [84]:
print(dft_extracted_terms)

{'stupid', 'ärsche', 'deppen', 'motherfucker', 'toter', 'schwein', 'nuttensohn', 'gucci', 'sandniggers', 'cumslut', 'hater', 'versace-nutten', 'shook', 'armani', 'freestyln', 'scheiß-show', 'amerikaner', 'chick', 'schwuchtel-muzzie', 'flava', 'kaukasier', 'scheiß-faker', 'thick', 'dumm', 'chimp', 'fettarsch', 'schwanzsaugen', 'klugscheisser', 'abspritz', 'arsch', 'booty', 'raghead', 'kopf', 'recorden', 'blöd', 'scheiss', 'scheiße', 'spic', 'niggers', 'tripe', 'mistkind', 'ficken', 'mohr', 'dumme', 'idioten', 'scheißhaut', 'fotze', 'gangster', 'crack-bitch', 'brioni', 'vollpfosten', 'bonzen-slut', 'depp', 'muzzie', 'volldeppen', 'whore', 'trottel', 'pimmel', 'stinkender', 'groupie', 'schlampe', 'idiot', 'handschellen', 'shithole', 'fresher', 'käppi', 'bitches', 'zähne', 'geese', 'kike', 'verdammte', 'bitch', 'spinner', 'mannsweib', 'drecksratten', 'kalkarsch', 'behinderte', 'scheiß', 'faggot', 'bounct', 'rolies', 'neger', 'bullshitting', 'bender', 'kikes', 'ghetto-niggerh-affe', 'drecks

In [85]:
computeTermEvalMetrics(dft_extracted_terms, gold_set_for_cdd)

Intersection 90
Gold 149
Extracted 126
Recall: 0.6040268456375839
Precision: 0.7142857142857143
F1: 0.6545454545454547


# DEFT

In [86]:
model = XLMRobertaForTokenClassification.from_pretrained("/content/best_model_deft")

In [87]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [88]:
trainer_deft = Trainer(
        model=model,
        args=training_args,
        eval_dataset=cdd_dataset,
        compute_metrics=compute_metrics_cdd,
    )

In [89]:
eval_results_deft = trainer_deft.evaluate()
print(f"evaluation results: {eval_results_deft}")

evaluation results: {'eval_loss': 0.13920938968658447, 'eval_model_preparation_time': 0.003, 'eval_precision': 0.8702290076335878, 'eval_recall': 0.7651006711409396, 'eval_f1': 0.8142857142857143, 'eval_runtime': 0.6433, 'eval_samples_per_second': 233.157, 'eval_steps_per_second': 15.544}


In [90]:
# Run the DEFT model over the CDD set.
deft_predictions, deft_labels, deft_metrics = trainer_deft.predict(cdd_dataset)

# Reduce the scores to one class id per position.
deft_predictions = np.argmax(deft_predictions, axis=2)

# Class ids -> tag strings, dropping positions whose gold id is -100.
true_deft_predictions = [
    [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
    for id_row, gold_row in zip(deft_predictions, deft_labels)
]

# Side-by-side view of predicted and gold tags for the first five sentences.
for i in range(5):
    print(f"Predicted: {true_deft_predictions[i]}")
    gold_tags = [label_list[gold_id] for gold_id in deft_labels[i] if gold_id != -100]
    print(f"True: {gold_tags}")

Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'B-B', 'n']
True: ['n', 'n', 'n', 'B-B', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'B-B', 'n', '

In [91]:
deft_extracted_terms = extract_terms(true_deft_predictions, cdd_texts)

In [92]:
print(deft_extracted_terms)

{'stupid', 'sluts', 'filth', 'deppen', 'hillbilly', 'motherfucker', 'schwein', 'shitting', 'damn', 'nuttensohn', 'cumslut', 'versace-nutten', 'dick', 'armani', 'freestyln', 'crap', 'scheiß-show', 'chick', 'cum', 'nix', 'schwuchtel-muzzie', 'fuck-ups', 'scheiß-faker', 'thick', 'dumm', 'spitte', 'fettarsch', 'chimp', 'schwanzsaugen', 'klugscheisser', 'abspritz', 'arsch', 'booty', 'of', 'blöd', 'scheiss', 'retarded', 'scheiße', 'blowjob', 'spic', 'niggers', 'piece', 'mistkind', 'ficken', 'fuckers', 'dumme', 'idioten', 'scheißhaut', 'fotze', 'crack-bitch', 'bums', 'vollpfosten', 'bonzen-slut', 'depp', 'fast-life', 'smart-ass', 'volldeppen', 'faggots', 'whore', 'trottel', 'abschaum-herrscher', 'groupie', 'schlampe', 'idiot', 'slut', 'hookers', 'handschellen', 'shithole', 'käppi', 'blowjobs', 'son', 'fuck', 'bitches', 'kike', 'verdammte', 'bitch', 'spinner', 'mannsweib', 'fucking', 'drecksratten', 'kalkarsch', 'behinderte', 'thug', 'cocksuckers', 'scheiß', 'faggot', 'neger', 'fettfrauen', 'b

In [93]:
computeTermEvalMetrics(deft_extracted_terms, gold_set_for_cdd)

Intersection 114
Gold 149
Extracted 131
Recall: 0.7651006711409396
Precision: 0.8702290076335878
F1: 0.8142857142857143


# CMM



In [94]:
model = XLMRobertaForTokenClassification.from_pretrained("/content/best_model_cmm")

In [95]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True
)

In [96]:
trainer_cmm = Trainer(
        model=model,
        args=training_args,
        eval_dataset=cdd_dataset,
        compute_metrics=compute_metrics_cdd,
    )

In [97]:
eval_results_cmm = trainer_cmm.evaluate()
print(f"evaluation results: {eval_results_cmm}")

evaluation results: {'eval_loss': 0.22946569323539734, 'eval_model_preparation_time': 0.0029, 'eval_precision': 0.6854838709677419, 'eval_recall': 0.5704697986577181, 'eval_f1': 0.6227106227106227, 'eval_runtime': 0.6398, 'eval_samples_per_second': 234.453, 'eval_steps_per_second': 15.63}


In [98]:
# Run the CMM model over the CDD set.
cmm_predictions, cmm_labels, cmm_metrics = trainer_cmm.predict(cdd_dataset)

# Reduce the scores to one class id per position.
cmm_predictions = np.argmax(cmm_predictions, axis=2)

# Class ids -> tag strings, dropping positions whose gold id is -100.
true_cmm_predictions = [
    [label_list[tag_id] for tag_id, gold_id in zip(id_row, gold_row) if gold_id != -100]
    for id_row, gold_row in zip(cmm_predictions, cmm_labels)
]

# Side-by-side view of predicted and gold tags for the first five sentences.
for i in range(5):
    print(f"Predicted: {true_cmm_predictions[i]}")
    gold_tags = [label_list[gold_id] for gold_id in cmm_labels[i] if gold_id != -100]
    print(f"True: {gold_tags}")

Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'B-B', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
True: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-B', 'n', 'n', 'n', 'n', 'n', 'n', 'n']
Predicted: ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-

In [99]:
cmm_extracted_terms = extract_terms(true_cmm_predictions, cdd_texts)

In [100]:
print(cmm_extracted_terms)

{'stupid', 'sluts', 'filth', 'steglitz', 'ging', 'hillbilly', 'motherfucker', 'toter', 'damn', 'nuttensohn', 'gucci', 'sandniggers', 'cumslut', 'shuddered', 'versace-nutten', 'disgusting', 'shook', 'dick', 'crap', 'scheiß-show', 'halblang', 'nix', 'pädophile', 'weißer', 'schwuchtel-muzzie', 'flava', 'fuck-ups', 'scheiß-faker', 'thick', 'dumm', 'spitte', 'fettarsch', 'chimp', 'arsch', 'raghead', 'blöd', 'retarded', 'scheiße', 'spic', 'niggers', 'schwarze', 'piece', 'lied', 'ficken', 'rapper', 'fuckers', 'dumme', 'scheißhaut', 'gangster', 'mohammed', 'kick', 'crack-bitch', 'bonzen-slut', 'muzzie', 'dunkle', 'fm', 'zervixabstrichtests', 'faggots', 'whore', 'trottel', 'schlampe', 'slut', 'handschellen', 'washington', 'gladbach', 'pumpgun', 'shithole', 'fuck', 'bitches', 'geese', 'cuckservative', 'kike', 'verdammte', 'bitch', 'spinner', 'fucking', 'drecksratten', 'kalkarsch', 'thug', 'cocksuckers', 'scheiß', 'faggot', 'neger', 'fettfrauen', 'bullshitting', 'fuckloads', 'kikes', 'dreckskerl'

In [101]:
computeTermEvalMetrics(cmm_extracted_terms, gold_set_for_cdd)

Intersection 85
Gold 149
Extracted 124
Recall: 0.5704697986577181
Precision: 0.6854838709677419
F1: 0.6227106227106227


# RESULTS

## DE Only Fine-Tuned Model (DFT):
Recall: 0.6040268456375839
Precision: 0.7142857142857143
**F1: 0.6545454545454547**

## DE-EN Bilingual Fine-Tuned Model (DEFT):
Recall: 0.7651006711409396
Precision: 0.8702290076335878
**F1: 0.8142857142857143**

## Code-Mixed Model (CMM):
Recall: 0.5704697986577181
Precision: 0.6854838709677419
**F1: 0.6227106227106227**



In [104]:
# Summary table of term-level F1 per model and dataset.
# NOTE(review): the numbers are hard-coded literals, not read from the
# eval_results_* dicts; they match the F1 values printed in the RESULTS
# sections, expressed in percent and rounded to two decimals. They must be
# updated by hand if the evaluations are re-run with different results.
data = {
    "Model": [
        "DE Only Fine-Tuned Model (DFT)",
        "DE-EN Bilingual Fine-Tuned Model (DEFT)",
        "Code-Mixed Model (CMM)"
    ],
    # Same model order as the "Model" column above.
    "F1 Score (RAP Dataset) %": [
        72.37, 82.67, 59.15
    ],
    "F1 Score (CDD Dataset) %": [
        65.45, 81.43, 62.27
    ]
}

df = pd.DataFrame(data)
# to_markdown renders the frame as a text table ("grid" format).
print(df.to_markdown(index=False, tablefmt="grid"))


+-----------------------------------------+----------------------------+----------------------------+
| Model                                   |   F1 Score (RAP Dataset) % |   F1 Score (CDD Dataset) % |
| DE Only Fine-Tuned Model (DFT)          |                      72.37 |                      65.45 |
+-----------------------------------------+----------------------------+----------------------------+
| DE-EN Bilingual Fine-Tuned Model (DEFT) |                      82.67 |                      81.43 |
+-----------------------------------------+----------------------------+----------------------------+
| Code-Mixed Model (CMM)                  |                      59.15 |                      62.27 |
+-----------------------------------------+----------------------------+----------------------------+


# NEOLOGISMS

In [105]:
# Terms treated as neologisms for this comparison.
neologisms = [
    "bitchbart", "bitchrap", "bitchtits", "boss-cock", "broke-ass-bitch",
    "disco-hoes", "dreckstoys", "facedrive", "hurenschleimer",
    "hurensohnköpfen", "kanackenfreestyle", "muschibattle"
]

# NOTE(review): the three detection lists below are hard-coded, presumably
# transcribed by hand from the RAP extraction outputs. They cannot be derived
# from dft/deft/cmm_extracted_terms at this point, because those names were
# reassigned to the CDD results in the CDD section.
detection_dft = [
    "bitchbart", "bitchrap", "bitchtits", "boss-cock", "broke-ass-bitch",
    "dreckstoys", "hurenschleimer", "hurensohnköpfen", "kanackenfreestyle",
    "muschibattle"
]

detection_cmm = [
    "bitchbart", "bitchrap", "bitchtits", "boss-cock", "broke-ass-bitch",
    "dreckstoys", "facedrive", "hurenschleimer", "hurensohnköpfen", "muschibattle"
]

detection_deft = [
    "bitchbart", "bitchrap", "bitchtits", "boss-cock", "broke-ass-bitch",
    "dreckstoys", "hurenschleimer", "hurensohnköpfen", "kanackenfreestyle",
    "muschibattle"
]

# One row per neologism; 1 if the term appears in that model's detection
# list, else 0. Note this rebinds `df`, which held the F1 table before.
df = pd.DataFrame({
    "Neologism": neologisms,
    "Detected by DFT": [1 if n in detection_dft else 0 for n in neologisms],
    "Detected by CMM": [1 if n in detection_cmm else 0 for n in neologisms],
    "Detected by DEFT": [1 if n in detection_deft else 0 for n in neologisms]
})

print(df.to_markdown(index=False))

| Neologism         |   Detected by DFT |   Detected by CMM |   Detected by DEFT |
|:------------------|------------------:|------------------:|-------------------:|
| bitchbart         |                 1 |                 1 |                  1 |
| bitchrap          |                 1 |                 1 |                  1 |
| bitchtits         |                 1 |                 1 |                  1 |
| boss-cock         |                 1 |                 1 |                  1 |
| broke-ass-bitch   |                 1 |                 1 |                  1 |
| disco-hoes        |                 0 |                 0 |                  0 |
| dreckstoys        |                 1 |                 1 |                  1 |
| facedrive         |                 0 |                 1 |                  0 |
| hurenschleimer    |                 1 |                 1 |                  1 |
| hurensohnköpfen   |                 1 |                 1 |                  1 |
| ka