# Legalis Data Set

Notebook used to edit the existing dataset for usage in the legalis project.

## Importing Libraries and Dataset

In [1]:
from datasets import load_dataset
import datasets
from bs4 import BeautifulSoup
import re
from datetime import date
import time
import openai
import tiktoken

In [2]:
raw_dataset = load_dataset("LennardZuendorf/openlegaldata-bulk-data", split='train')
print(raw_dataset)

In [2]:
synthetic_dataset=load_dataset("LennardZuendorf/legalis", split='train')
natural_dataset=load_dataset("LennardZuendorf/legalis", split='test')

Downloading and preparing dataset None/None to /home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-8bae77942a3363b8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...
Dataset parquet downloaded and prepared to /home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-8bae77942a3363b8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Found cached dataset parquet (/home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-8bae77942a3363b8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


## Creating the Basic Dataset for Training and Testing
### Cleaning both Datasets

In [3]:
# renaming and removing columns
raw_dataset = raw_dataset.remove_columns(['slug', 'ecli', 'updated_date', 'created_date'])


#updating columns in nested dict ('court')
def cleaning_courts(data):
    del data['court']['slug']
    del data['court']['city']

    return data


raw_dataset = raw_dataset.map(cleaning_courts)

In [4]:
# cleaning date information into datetime.date object, throwing out data younger than 1 week
def clean_date(data):
    data['date'] = data['date'].date()

    return data


cleaned_dataset = raw_dataset.map(clean_date)
cleaned_dataset = cleaned_dataset.filter(lambda x: x['date'].date() < date.fromtimestamp(time.time()))
cleaned_dataset = cleaned_dataset.filter(
    lambda x: x['type'].lower() == 'urteil' or x['type'].lower() == 'abschlussurteil')

print(cleaned_dataset)

## Creating Synthetic Dataset
#### splitting synthetic dataset content into clean tenor and reasoning

In [5]:
#splitting content into tenor and reasoning
def splitting_content_synthetic(data):
    #cleaning entire content of html, newlines and whitespaces
    content_soup = BeautifulSoup(data['content'])
    content = content_soup.get_text().strip()
    content = re.sub('\n', ' ', content)
    content = ' '.join(content.split())

    data['content'] = content

    #splitting content into tenor and reasoning
    split_content = data['content'].rsplit(sep="<h2>Gründe</h2>")

    #alternative splitting if first split was not possible
    if len(split_content) != 2:
        split_content = data['content'].rsplit(sep="<!--hlIgnoreOn-->Gründe<!--hlIgnoreOff-->")

    if len(split_content) != 2:
        split_content = data['content'].rsplit(sep="Gründe:")

    if len(split_content) != 2:
        #settting tenor and reasoning to None if no split was possible
        data['tenor'] = None
        data['reasoning'] = None

    if len(split_content) == 2:
        #cleaning tenor of html, newlines and whitespaces
        tenor_soup = BeautifulSoup(split_content[0])
        tenor = tenor_soup.get_text().strip()
        tenor = re.sub('\n', ' ', tenor)
        tenor = ' '.join(tenor.split())
        data['tenor'] = tenor

        #cleaning reasoning of html, newlines and whitespaces
        reason_soup = BeautifulSoup(split_content[1])
        reason = reason_soup.get_text().strip()
        reason = re.sub('\n', ' ', reason)
        data['reasoning'] = ' '.join(reason.split())

    return data


#splitting content into tenor and reasoning, after filtering for content containing "Gründe"
synthetic_dataset = raw_dataset.filter(lambda x: x['content'].lower().find("gründe") != -1
                                                 and x['content'].lower().find("tenor") != -1)
synthetic_dataset = synthetic_dataset.map(splitting_content_synthetic)
print(synthetic_dataset)

### Extracting facts from reasoning using ChatGPT API

## creating natural dataset by splitting datasets into tenor, facts and reasoning

In [6]:
#splitting content into tenor and reasoning
def splitting_content_natural(data):
    #cleaning entire content of html, newlines and whitespaces
    content_soup = BeautifulSoup(data['content'])
    content = content_soup.get_text().strip()
    content = re.sub('\n', ' ', content)
    content = ' '.join(content.split())

    data['content'] = content

    try:
        #splitting into tenor, facts and reasoning
        split_content1 = content.rsplit(sep="Tatbestand:")
        split_content2 = split_content1[1].rsplit(sep="Entscheidungsgründe:")

        if len(split_content2) != 2:
            split_content1 = content.rsplit(sep="Entscheidungsgründe:")
            split_content2 = split_content1[1].rsplit(sep="Tatbestand:")

        data['tenor'] = split_content1[0]
        data['facts'] = split_content2[0]
        data['reasoning'] = split_content2[1]

    except IndexError:
        data['tenor'] = None
        data['facts'] = None
        data['reasoning'] = None

    return data


#splitting content into tenor and reasoning, after filtering for content containing needed words
natural_dataset = raw_dataset.filter(lambda x: x['content'].lower().find("tenor") != -1
                                               and x['content'].lower().find("tatbestand:") != -1
                                               and x['content'].lower().find("entscheidungsgründe:") != -1)
natural_dataset = natural_dataset.map(splitting_content_natural)

In [7]:
def dummy_facts(data):
    data['facts'] = "dummy facts"

    return data


synthetic_dataset = synthetic_dataset.map(dummy_facts)

In [14]:
def filter_empty(data):
    if data['tenor'] is None or str(data['tenor']).isspace() or data['tenor']=="":
        return False
    elif data['facts'] is None or str(data['facts']).isspace() or data['facts']=="":
        return False
    elif data['reasoning'] is None or str(data['reasoning']).isspace() or data['reasoning']=="":
        return False
    else:
        return True


final_synthetic_dataset = synthetic_dataset.filter(filter_empty)
final_natural_dataset = natural_dataset.filter(filter_empty)


## Using Summarization Model to update Synthetic Dataset

Accessing Legalis Extractor model via Interference API to extract facts from reasoning.

In [None]:
def extract_facts(data):
    

    return data

## Creating Splits, Uploading the new Dataset to HuggingFace Datasets

In [24]:
dataset = datasets.DatasetDict({"train": final_synthetic_dataset, "test": final_natural_dataset})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'facts'],
        num_rows: 32809
    })
    test: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'facts'],
        num_rows: 2847
    })
})


In [28]:
#uploading new dataset into different repository
dataset.push_to_hub("LennardZuendorf/legalis")

Pushing split train to the Hub.
Pushing split test to the Hub.
