# Key Problem III: Model Performance Plateaus
# Training on Confident Examples

## Data Preparation

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Load the tweets, each of which carries several labels for its
# argumentation strategy. The file uses ";" as the field separator.
df = pd.read_csv("data/multiple_labels.csv", sep=";")
df.head()

Unnamed: 0,label_1,label_2,label_3,label_4,text
0,sarc,sarc,sarc,sarc,@realvibes @CoolMaas Also müssen wir erstmal a...
1,construct,opin,opin,construct,@Max_Muller Jetzt: Unsere Schwierigkeiten sind...
2,opin,construct,opin,opin,@user123 @newsflashxyz @daily_report @global_w...
3,opin,opin,opin,opin,". @LenaMeyer Wer Europa festigen will, darf da..."
4,other,other,sarc,other,"@maxmiller @beauty.sight @newsflash Jo, aber e..."


### Select Confident Examples

In [3]:
def calculate_confidence(row):
    """
    Classify how strongly the four raters of one example agree.

    Parameters
    ----------
    row : row of a data frame that provides the entries "label_1" to
        "label_4".

    Returns
    -------
    str
        "unanimous" (one distinct label), "majority" (three raters agree),
        "split" (two labels, two raters each), "weak-majority" (three
        distinct labels, i.e. exactly two raters agree) or "disagreement"
        (four distinct labels).
    """
    rater_columns = [f"label_{k}" for k in range(1, 5)]
    ratings = row[rater_columns].tolist()
    n_distinct = len(set(ratings))

    # cases that are fully determined by the number of distinct labels
    level_by_distinct = {
        1: "unanimous",
        3: "weak-majority",
        4: "disagreement",
    }
    if n_distinct in level_by_distinct:
        return level_by_distinct[n_distinct]

    # two distinct labels: a 2-2 split if the first label occurs exactly twice
    if n_distinct == 2 and ratings.count(ratings[0]) == 2:
        return "split"

    # otherwise three out of four raters agree
    return "majority"

In [4]:
# determine the agreement level of every example (one call per row) and
# store it in the data frame as the additional column "rater_confidence"
rater_confidence = df.apply(calculate_confidence, axis=1)
df["rater_confidence"] = rater_confidence
df.head()

Unnamed: 0,label_1,label_2,label_3,label_4,text,rater_confidence
0,sarc,sarc,sarc,sarc,@realvibes @CoolMaas Also müssen wir erstmal a...,unanimous
1,construct,opin,opin,construct,@Max_Muller Jetzt: Unsere Schwierigkeiten sind...,split
2,opin,construct,opin,opin,@user123 @newsflashxyz @daily_report @global_w...,majority
3,opin,opin,opin,opin,". @LenaMeyer Wer Europa festigen will, darf da...",unanimous
4,other,other,sarc,other,"@maxmiller @beauty.sight @newsflash Jo, aber e...",majority


In [5]:
# Number of examples per agreement level. We will drop all examples where
# the rater confidence is "split" or "disagreement".
confidence_counts = df["rater_confidence"].value_counts()
confidence_counts

rater_confidence
majority         308
unanimous        286
weak-majority    180
split            124
disagreement      19
Name: count, dtype: int64

In [6]:
# select only examples where two or more raters agree (no split decisions!)
confident_levels = ["unanimous", "majority", "weak-majority"]
# .copy() makes the filtered frame independent of the unfiltered one, so that
# columns added to it later do not raise a SettingWithCopyWarning on pandas
# versions without copy-on-write
df = df[df["rater_confidence"].isin(confident_levels)].copy()
df.head()

Unnamed: 0,label_1,label_2,label_3,label_4,text,rater_confidence
0,sarc,sarc,sarc,sarc,@realvibes @CoolMaas Also müssen wir erstmal a...,unanimous
2,opin,construct,opin,opin,@user123 @newsflashxyz @daily_report @global_w...,majority
3,opin,opin,opin,opin,". @LenaMeyer Wer Europa festigen will, darf da...",unanimous
4,other,other,sarc,other,"@maxmiller @beauty.sight @newsflash Jo, aber e...",majority
5,leave_fact,other,other,opin,@xyzauthor @frauexample seitdem unsere Politik...,weak-majority


In [7]:
# create a new column in the data frame with the majority label
label_columns = [f"label_{i}" for i in range(1, 5)]

# value_counts() orders the labels of a row by how often they occur, so the
# first index entry is the label chosen by most raters. The column is computed
# in one pass and attached with assign(), which returns a new frame instead of
# writing cell by cell into a filtered slice of the original data.
df = df.assign(
    label=df[label_columns].apply(
        lambda ratings: ratings.value_counts().index[0], axis=1
    )
)
df.head()

Unnamed: 0,label_1,label_2,label_3,label_4,text,rater_confidence,label
0,sarc,sarc,sarc,sarc,@realvibes @CoolMaas Also müssen wir erstmal a...,unanimous,sarc
2,opin,construct,opin,opin,@user123 @newsflashxyz @daily_report @global_w...,majority,opin
3,opin,opin,opin,opin,". @LenaMeyer Wer Europa festigen will, darf da...",unanimous,opin
4,other,other,sarc,other,"@maxmiller @beauty.sight @newsflash Jo, aber e...",majority,other
5,leave_fact,other,other,opin,@xyzauthor @frauexample seitdem unsere Politik...,weak-majority,other


In [8]:
# From here on only the tweet text and the majority label are needed, so the
# individual labels of the four raters are dropped. The rows are renumbered
# starting from 0.
df = df.reset_index(drop=True)[["text", "label"]]
df.head()

Unnamed: 0,text,label
0,@realvibes @CoolMaas Also müssen wir erstmal a...,sarc
1,@user123 @newsflashxyz @daily_report @global_w...,opin
2,". @LenaMeyer Wer Europa festigen will, darf da...",opin
3,"@maxmiller @beauty.sight @newsflash Jo, aber e...",other
4,@xyzauthor @frauexample seitdem unsere Politik...,other


### Prepare Data for Training

In [9]:
# The BERT model is trained on numerical class labels, so every class name
# is translated into its integer id.
label_to_id = dict(
    construct=0,
    opin=1,
    sarc=2,
    leave_fact=3,
    other=4,
)
numeric_labels = df["label"].map(label_to_id)
df["label"] = numeric_labels
df.head()

Unnamed: 0,text,label
0,@realvibes @CoolMaas Also müssen wir erstmal a...,2
1,@user123 @newsflashxyz @daily_report @global_w...,1
2,". @LenaMeyer Wer Europa festigen will, darf da...",1
3,"@maxmiller @beauty.sight @newsflash Jo, aber e...",4
4,@xyzauthor @frauexample seitdem unsere Politik...,4


### Create Data Splits

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit
import os

In [11]:
# By splitting the data in five different ways, we are able to train five
# different models, letting us choose the best out of five.
dst = "data"

# fraction of the full data set that is set aside for evaluation and testing
test_frac = 0.15
eval_frac = 0.15

# create the output folder once, before any split is written
split_dir = Path(dst, "confident_examples")
split_dir.mkdir(parents=True, exist_ok=True)

# one fixed seed per split makes the splits reproducible
for s, seed in enumerate([42, 43, 44, 45, 46]):

    # get the eval split
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=eval_frac,
        random_state=seed
    )
    tmp_index, eval_index = next(sss.split(df["text"], df["label"]))

    # the splitter yields positional row numbers, so the rows are selected
    # with .iloc; this is correct regardless of the index labels of df
    tmp = df[["text", "label"]].iloc[tmp_index].reset_index(drop=True)
    evaldata = df[["text", "label"]].iloc[eval_index]

    # get the test split from the remaining data; the fraction is rescaled so
    # that the test set is test_frac of the full data set. This second stage
    # uses its own fixed seed (10 + split counter).
    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_frac / (1 - eval_frac),
        random_state=s + 10
    )
    train_index, test_index = next(sss.split(tmp["text"], tmp["label"]))
    traindata = tmp.iloc[train_index]
    testdata = tmp.iloc[test_index]

    # save the splits
    traindata.to_csv(split_dir / f"train_split_{s+1}.csv", index=False, sep=";")
    testdata.to_csv(split_dir / f"test_split_{s+1}.csv", index=False, sep=";")
    evaldata.to_csv(split_dir / f"eval_split_{s+1}.csv", index=False, sep=";")

## Train a BERT Classifier on Confident Examples

In [12]:
from transformers import AutoTokenizer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch import cuda
from sklearn.metrics import classification_report
import evaluate
import sys

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Training parameters
# Note: training for 10 epochs and 5 splits takes about 1 hour on a relatively
# powerful laptop. On a GPU, training takes only a few minutes. The number of
# epochs and splits has been set to 1 here for illustration purposes.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
EPOCHS = 1  # 10
BATCH_SIZE = 128  # 256, reduce if running out of memory
EVAL_STEPS = 5
DATA_SPLITS = 1  # 5
SAVE_MODEL = True

# folder that contains the train / eval / test files of every split
DATA_SRC = "data/confident_examples/"

# container for the data: one (initially empty) entry per split number
datatypes = ["train", "eval", "test"]
splits = list(range(1, 6))
dataset_dict = {split_id: {} for split_id in splits}

In [14]:
# read every saved split from disk and store its texts and labels in the
# nested dictionary as dataset_dict[split number][data type]
for split_id in splits:
    for datatype in datatypes:
        frame = pd.read_csv(
            Path(DATA_SRC + f"{datatype}_split_{split_id}.csv"),
            dtype={"text": str, "label": int},
            delimiter=";",
        )
        dataset_dict[split_id][datatype] = {
            "text": list(frame["text"].values),
            "labels": list(frame["label"].values),
        }

In [15]:
class MyDataset(torch.utils.data.Dataset):
    """Torch data set that pairs encoded inputs with their labels.

    `encodings` is a mapping from a field name to a sequence with one entry
    per example; `labels` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # the number of examples is given by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # collect the idx-th entry of every encoding field as a tensor ...
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # ... and add the matching label under the key "labels"
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [16]:
# F1 metric object used by compute_metrics below
metric = evaluate.load("f1")

def compute_metrics(eval_preds):
    """Return the micro-averaged F1 score for a (logits, labels) pair.

    The predicted class of each example is the position of its largest logit
    along the last axis.
    """
    logits, labels = eval_preds
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
        average="micro",
    )

In [17]:
# Iterate over the selected data splits. For each split, the model is trained
# on the training data and evaluated on the evaluation data during training.
# The test data set is used at the very end to test the performance of the
# model on unseen data. The report with different performance metrics is saved
# to the "results" folder. If SAVE_MODEL is set, the trained model is saved to
# the "finetuned_models" folder.

# the tokenizer is the same for every split, so it is loaded only once
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

# make sure the folder for the performance reports exists before writing to it
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

for split in splits[0:DATA_SPLITS]:
    # encode the train, eval and test data sets of the given split
    train_encodings = tokenizer(dataset_dict[split]['train']['text'],
                                truncation=True, padding=True, max_length=180)
    eval_encodings = tokenizer(dataset_dict[split]['eval']['text'],
                               truncation=True, padding=True, max_length=180)
    test_encodings = tokenizer(dataset_dict[split]['test']['text'],
                               truncation=True, padding=True, max_length=180)

    # package the encoded data sets into a dataset class
    train_dataset = MyDataset(train_encodings, dataset_dict[split]['train']['labels'])
    eval_dataset = MyDataset(eval_encodings, dataset_dict[split]['eval']['labels'])
    test_dataset = MyDataset(test_encodings, dataset_dict[split]['test']['labels'])

    # set the training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=EPOCHS,                  # total number of training epochs
        per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
        per_device_eval_batch_size=BATCH_SIZE,    # batch size for evaluation
        warmup_steps=100,                         # number of warmup steps for learning rate scheduler
        weight_decay=0.0025,                      # strength of weight decay
        label_smoothing_factor=0.2,
        logging_dir='./logs',                     # directory for storing logs
        logging_steps=10,                         # when to print log
        load_best_model_at_end=True,              # load or not best model at the end
        save_strategy='steps',
        metric_for_best_model="eval_loss",
        eval_strategy="steps",
        eval_steps=EVAL_STEPS,
        seed=42
    )

    # load the pretrained model with one output per class in the training data
    num_labels = len(set(dataset_dict[split]["train"]["labels"]))
    model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

    # initialize the trainer with the model, training arguments, training data
    # set, evaluation data set, metric for evaluations, and the early stopping
    # callback. The patience is counted in evaluation calls: training stops
    # if the monitored metric does not improve over five consecutive
    # evaluations (one evaluation every EVAL_STEPS training steps).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=5,
                early_stopping_threshold=0
            )
        ]
    )

    # execute the training and, if requested, save the trained model
    trainer.train()
    if SAVE_MODEL:
        model_savename = f"{MODEL.split('/')[-1]}_split-{split}"
        trainer.save_model(Path("finetuned_models", model_savename))  # save best model

    # calculate the performance of the trained model on the held-back
    # test dataset
    test_preds_raw, test_labels, _ = trainer.predict(test_dataset)
    test_preds = np.argmax(test_preds_raw, axis=-1)
    report = classification_report(test_labels, test_preds, digits=3, output_dict=True, zero_division=0)
    print(classification_report(test_labels, test_preds, digits=3, zero_division=0))

    # save the model performance report for later inspection. The index is
    # written as well: it holds the class ids and the names of the aggregate
    # rows, without which the rows of the file cannot be told apart.
    report_df = pd.DataFrame(report).transpose()
    report_savename = f"report_{MODEL.split('/')[-1]}_epochs-{EPOCHS}_bs-{BATCH_SIZE}_split-{split}.csv"
    report_df.to_csv(results_dir / report_savename, index_label="class")

# Warning notes:
# Some weights were not initialized: This is expected, since we are training the model on our custom downstream task.
# Insufficient memory: Reduce the batch size.
# For Mac users, pin_memory not supported: Raised due to different architecture of Apple GPUs, can be ignored.
# Precision is zero for some steps: Happens if the model never predicts at least one of the classes.

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
5,No log,1.596713,0.264957


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


              precision    recall  f1-score   support

           0      0.000     0.000     0.000        13
           1      0.000     0.000     0.000        35
           2      0.000     0.000     0.000        10
           3      0.267     1.000     0.422        31
           4      0.000     0.000     0.000        27

    accuracy                          0.267       116
   macro avg      0.053     0.200     0.084       116
weighted avg      0.071     0.267     0.113       116



## Identify More Confident Examples

### Data Preparation

In [18]:
# The preliminary BERT model trained above can now be used to identify new
# confident examples in data that carries only a single human label. To do
# so, we first load this additional single-label data.
one_label = pd.read_csv("data/one_label.csv", sep=";")
one_label.head()

Unnamed: 0,label,text
0,construct,"Die Idee is wohl, dass Anschläge von den Täter..."
1,sarc,"O, schaut mal, wie viele PKK-Fans hier in Deut..."
2,leave_fact,"Ehrlich sein zum Türken, aber dann Doppelmoral..."
3,other,Ihr seid voll durchgedreht und total verrückt ...
4,construct,Es hat einen Angriff auf ne Moschee gegeben.


### Reload Preliminary Classifier

In [19]:
# We can load the preliminary classifier from the checkpoint saved in "finetuned_models" ...
tokenizer = AutoTokenizer.from_pretrained(MODEL)
inputs = tokenizer(one_label.text.to_list(), truncation=True, padding=True, max_length=180, return_tensors="pt")

# the checkpoint name is built from MODEL in the same way as when the model of
# split 1 was saved, so both stay in sync if MODEL is changed
checkpoint = Path("finetuned_models", f"{MODEL.split('/')[-1]}_split-1")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model.eval()  # inference only

# ... and use it to classify our data with only a single label. The data is
# passed through the model in chunks of BATCH_SIZE examples so that memory use
# does not grow with the size of the data set.
with torch.no_grad():
    logits = torch.cat([
        model(**{name: tensor[start:start + BATCH_SIZE] for name, tensor in inputs.items()}).logits
        for start in range(0, len(one_label), BATCH_SIZE)
    ])

In [20]:
# add the label inferred by the preliminary classifier as the new column
# "model": take the class id with the largest logit for every tweet and
# translate it back into the class name
id_to_label = {class_id: name for name, class_id in label_to_id.items()}
predicted_ids = logits.argmax(dim=1).numpy()
one_label["model"] = pd.Series(predicted_ids, index=one_label.index).replace(id_to_label)

one_label.head()

Unnamed: 0,label,text,model
0,construct,"Die Idee is wohl, dass Anschläge von den Täter...",leave_fact
1,sarc,"O, schaut mal, wie viele PKK-Fans hier in Deut...",leave_fact
2,leave_fact,"Ehrlich sein zum Türken, aber dann Doppelmoral...",leave_fact
3,other,Ihr seid voll durchgedreht und total verrückt ...,leave_fact
4,construct,Es hat einen Angriff auf ne Moschee gegeben.,leave_fact


In [21]:
# keep only the tweets on which the human rater and the preliminary model agree
labels_match = one_label["label"] == one_label["model"]
new_confident_examples = (
    one_label.loc[labels_match]
    .drop(columns="model")
    .reset_index(drop=True)
)

print(f"Using the prelimiary BERT classifier, we identified {len(new_confident_examples)} new confident examples.")
new_confident_examples.head()

# These newly identified confident examples can now be used in a second round of training for the BERT model.

Using the prelimiary BERT classifier, we identified 138 new confident examples.


Unnamed: 0,label,text
0,leave_fact,"Ehrlich sein zum Türken, aber dann Doppelmoral..."
1,leave_fact,@KonzernKing Wenn man was von BWL verstehen wü...
2,leave_fact,"@NewsTalker Mit ""Wir"" meint der Typ die ganze ..."
3,leave_fact,@xyzparty/@abccrew/@lmnclub/@qwertzfun/@hijgro...
4,leave_fact,"Oh Mann, du bist echt armselig. Immer austeile..."
