# Legalis BERT Predictor

#### Notebook used to create BERT transformer model for text classification based prediction of outcomes

## importing packages and dataset
#### install via pip (needed especially for only jupyter notebook services)

In [None]:
import huggingface_hub
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install huggingface_hub
!pip install accelerate -U
!pip install transformers[torch]
!apt install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collec

#### imports for model fune tuning and dataset loading

In [1]:
# standard imports
import numpy as np
import os

#imports from huggingface
import datasets as ds #for loading datasets
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding # for text classification
import huggingface_hub as hg_hub #for model upload with hub
import evaluate #for accuracy evaluation

#### login to huggingface hub for model upload (only needed for model upload)

In [None]:
#login to huggingface hub
hg_hub.login(os.environ['hub_token'])

#### loading dataset from huggingface hub or via csv

In [2]:
#loading dataset from csv (reproduction way)
#dataset=ds.load_dataset("csv", data_files="legalis-dataset.csv", split="train")
#dataset=dataset.train_test_split(test_size=0.2, shuffle=True)

#loading dataset from huggingface hub (production way)
dataset=ds.load_dataset("LennardZuendorf/legalis")

#renaming columns for easier access
dataset=dataset.rename_column("facts", "text")

#printing dataset features
print(dataset)

Found cached dataset csv (C:/Users/lenna/.cache/huggingface/datasets/csv/default-9d276a5be705aa8e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


DatasetDict({
    train: Dataset({
        features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
        num_rows: 2240
    })
    test: Dataset({
        features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
        num_rows: 561
    })
})


## data preprocessing & eval prep
#### tokenizing data with tokenizer used by German BERT model

In [None]:
#grabbing tokenizer from pretrained german BERT
tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-base")

#creating function to tokenize data
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

#tokenizing data
tokenized_data = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

Map:   0%|          | 0/2660 [00:00<?, ? examples/s]

Map:   0%|          | 0/141 [00:00<?, ? examples/s]

#### creating function to compute metrics using huggingface evaluations

In [None]:
#creating accuracy evaluation class
accuracy = evaluate.load("accuracy")

#creating function to compute metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

#### transformers.trainer setup for training with BERT

In [None]:
#setting up labels
id2label = {0: "Verklagte*r", 1: "Klaeger*in"}
label2id = {"Verklagte*r": 0, "Klaeger*in": 1}

#setting up model from huggingface with labels
model = AutoModelForSequenceClassification.from_pretrained(
    "deepset/gbert-base", num_labels=2, id2label=id2label, label2id=label2id
)

#training arguments and trainer setup
training_args = TrainingArguments(
    output_dir="legalis-BERT",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

#trainer setup with model, training args, datasets, tokenizer, data collator and compute metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly

#### training using the transformers trainer

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.667719,0.58156
2,No log,0.666185,0.602837


TrainOutput(global_step=334, training_loss=0.6516124245649326, metrics={'train_runtime': 155.6689, 'train_samples_per_second': 34.175, 'train_steps_per_second': 2.146, 'total_flos': 1399750814515200.0, 'train_loss': 0.6516124245649326, 'epoch': 2.0})

#### pushing model to huggingface hub (only needed for model upload)

In [None]:
trainer.push_to_hub("legalis-BERT")