# Legalis Extractor

Notebook used to extract features and create binary values to predict

## importing packages and dataset

In [18]:
#imports from huggingface
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding
import huggingface_hub as hg_hub
import evaluate

#other imports
import numpy as np
import os

In [11]:
#loading dataset
dataset=load_dataset("LennardZuendorf/legalis")

#printing dataset features
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
        num_rows: 2686
    })
    test: Dataset({
        features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
        num_rows: 142
    })
})


Found cached dataset parquet (/home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-ac1f3638890b8970/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


## data preprocessing & eval prep

In [22]:
#dataset=dataset.remove_columns(['file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'winner'])
#dataset=dataset.rename_column("facts", "text")

print(dataset['train'][10]['facts'])
print(dataset['train'][10]['text'])

KeyError: KeyError: 'facts'

In [15]:
#grabbing tokenizer from pretrained german BERT
tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-base")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

#tokenizing data
tokenized_data = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
#creating accuracy evaluation class
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [20]:

#setting up labels
id2label = {0: "Verklagter", 1: "Klaeger"}
label2id = {"Verklagter": 0, "Klaeger": 1}

#setting up model from huggingface with labels
model = AutoModelForSequenceClassification.from_pretrained(
    "deepset/gbert-base", num_labels=2, id2label=id2label, label2id=label2id
)

#training arguments and trainer setup
training_args = TrainingArguments(
    output_dir="legalis_predictor_BERT",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [21]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,9.7e-05,1.0
2,No log,6.8e-05,1.0


TrainOutput(global_step=336, training_loss=0.004092007520652953, metrics={'train_runtime': 536.7887, 'train_samples_per_second': 10.008, 'train_steps_per_second': 0.626, 'total_flos': 1413432589393920.0, 'train_loss': 0.004092007520652953, 'epoch': 2.0})

## final report and pushing to hub

In [7]:
hg_hub.login(token=os.getenv("hub_token"))
trainer.push_to_hub("LennardZuendorf/legalis")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/datalore/.cache/huggingface/token
Login successful


OSError: OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).