# Legalis Data Set

Notebook that cleans and restructures the existing Open Legal Data bulk dataset for use in the legalis project.

## Importing Libraries and Dataset

In [None]:
from datasets import load_dataset
import datasets
from bs4 import BeautifulSoup
import re
from datetime import date
import time
import os
import openai
import tiktoken

In [None]:
# pulling the bulk export from the Hugging Face Hub; only the 'train' split is requested
raw_dataset = load_dataset(
    path="LennardZuendorf/openlegaldata-bulk-data",
    split="train",
)
print(raw_dataset)

Downloading and preparing dataset json/LennardZuendorf--openlegaldata-bulk-data to /home/datalore/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...
Dataset json downloaded and prepared to /home/datalore/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.
Dataset({
    features: ['id', 'slug', 'court', 'file_number', 'date', 'created_date', 'updated_date', 'type', 'ecli', 'content'],
    num_rows: 251038
})


## Creating the Basic Dataset for Training and Testing
### Cleaning both Datasets

In [None]:
# removing columns that are not needed downstream
# (only columns that are still present are dropped, so this cell can be re-run without an error)
unused_columns = ['slug', 'ecli', 'updated_date', 'created_date']
raw_dataset = raw_dataset.remove_columns(
    [column for column in unused_columns if column in raw_dataset.column_names])


#updating columns in nested dict ('court')
def cleaning_courts(data):
    """Drop the 'slug' and 'city' entries from the nested 'court' dict of one row.

    The nested dict is modified in place and the same row object is returned.
    Entries that are already missing, and rows whose 'court' value is None,
    are left untouched instead of raising, so the function can be applied
    more than once.

    :param data: one dataset row with a 'court' entry (dict or None)
    :return: the same row, with the two court entries removed
    """
    court = data['court']

    if court is not None:
        for unused_key in ('slug', 'city'):
            # pop with a default instead of del: a missing key is not an error
            court.pop(unused_key, None)

    return data


# applying the court clean-up to every row of the dataset
raw_dataset = raw_dataset.map(function=cleaning_courts)

In [None]:
# cleaning date information into datetime.date object
def clean_date(data):
    """Reduce the 'date' value of one row to a plain calendar date.

    Values that offer a callable `.date()` (such as `datetime.datetime`) are
    converted through it. Values without it - plain `datetime.date` objects
    or None - are kept unchanged, so applying the function twice does not raise.

    :param data: one dataset row with a 'date' entry
    :return: the same row, with 'date' converted where possible
    """
    value = data['date']
    to_date = getattr(value, 'date', None)

    if callable(to_date):
        data['date'] = to_date()

    return data


# the cut-off day is computed once, so that every row is compared against the same date
TODAY = date.today()
JUDGMENT_TYPES = ('urteil', 'abschlussurteil')


def _decision_day(value):
    """Return `value` as a plain date, whether it is stored as a datetime or already as a date."""
    return value.date() if callable(getattr(value, 'date', None)) else value


def _is_dated_before_today(row):
    """True for rows that carry a decision date lying strictly before today."""
    decision_day = _decision_day(row['date'])
    return decision_day is not None and decision_day < TODAY


def _is_judgment(row):
    """True for rows whose type is 'urteil' or 'abschlussurteil', ignoring case."""
    return (row['type'] or '').lower() in JUDGMENT_TYPES


# NOTE(review): the later cells of this notebook start from `raw_dataset` again, so the rows
# removed here are still part of the datasets built there - confirm that this is intended.
cleaned_dataset = raw_dataset.map(clean_date)
cleaned_dataset = cleaned_dataset.filter(_is_dated_before_today)
cleaned_dataset = cleaned_dataset.filter(_is_judgment)

print(cleaned_dataset)

Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content'],
    num_rows: 137271
})


## Preprocessing Datasets
#### Splitting the content of the natural dataset into a clean tenor and reasoning

In [None]:
def _markup_to_text(markup):
    """Strip HTML from `markup` and collapse every whitespace run (newlines included) to one space."""
    return ' '.join(BeautifulSoup(markup).get_text().split())


#splitting content into tenor and reasoning
def splitting_content_twotext(data):
    """Split one decision into 'tenor' and 'reasoning' at its "Gründe" heading.

    'content' is replaced by its HTML-free, whitespace-normalised text. The
    two HTML separators are looked up in the ORIGINAL markup, because they
    contain tags/comments that no longer exist once the markup is stripped;
    the plain "Gründe:" separator is looked up in the cleaned text. A
    separator is only accepted when it occurs exactly once.

    On success 'tenor' and 'reasoning' hold the cleaned, lower-cased parts and
    'facts' is set to "". If the lower-cased tenor part contains the word
    "tenor" exactly once, only the text after it is kept; otherwise the text
    before its first occurrence (the whole text when it is absent) is kept.
    When no separator fits, 'tenor', 'reasoning' and 'facts' are set to None.

    :param data: one dataset row with an HTML 'content' string
    :return: the same row with 'content', 'tenor', 'reasoning', 'facts' set
    """
    html = data['content']
    text = _markup_to_text(html)
    data['content'] = text

    #separators in order of preference, each paired with the text it can occur in
    candidates = (
        (html, "<h2>Gründe</h2>"),
        (html, "<!--hlIgnoreOn-->Gründe<!--hlIgnoreOff-->"),
        (text, "Gründe:"),
    )

    split_content = []
    for haystack, separator in candidates:
        split_content = haystack.split(separator)
        if len(split_content) == 2:
            break

    if len(split_content) != 2:
        #setting tenor and reasoning to None if no split was possible
        data['tenor'] = None
        data['reasoning'] = None
        data['facts'] = None
        return data

    #cleaning tenor of html, newlines and whitespaces
    tenor = _markup_to_text(split_content[0]).lower()
    split_tenor = tenor.rsplit(sep="tenor")

    if len(split_tenor) == 2:
        data['tenor'] = split_tenor[1]
    else:
        data['tenor'] = split_tenor[0]

    #cleaning reasoning of html, newlines and whitespaces
    data['reasoning'] = _markup_to_text(split_content[1]).lower()

    data['facts'] = ""

    return data


#keeping only rows whose lower-cased content mentions both "gründe" and "tenor",
#then splitting their content into tenor and reasoning
# NOTE(review): this starts from `raw_dataset`, not from `cleaned_dataset` - confirm that the
# date and type filters of the cleaning step are meant to be skipped here.
natural_dataset = raw_dataset.filter(
    lambda row: all(keyword in row['content'].lower() for keyword in ("gründe", "tenor")))
natural_dataset = natural_dataset.map(function=splitting_content_twotext)
print(natural_dataset)

Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'facts'],
    num_rows: 86095
})


Loading cached processed dataset at /home/datalore/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-96c9c66ff9d224fa.arrow
  tenor_soup = BeautifulSoup(split_content[0])
  reason_soup = BeautifulSoup(split_content[1])


## Creating the enhanced dataset by splitting the content into tenor, facts and reasoning

In [None]:
#splitting content into tenor, facts and reasoning
def splitting_content_threetext(data):
    """Split one decision into 'tenor', 'facts' and 'reasoning'.

    'content' is replaced by its HTML-free, whitespace-normalised text, which
    is then cut at the case-sensitive markers "Tatbestand:" (facts) and
    "Entscheidungsgründe:" (reasoning). Two layouts are tried:

    1. tenor - facts - reasoning: the text after the first "Tatbestand:" must
       contain "Entscheidungsgründe:" exactly once.
    2. tenor - reasoning - facts: the text after the first
       "Entscheidungsgründe:" must contain "Tatbestand:" exactly once. Here
       the part before the inner marker is the reasoning and the part after
       it are the facts.

    All three fields are lower-cased. If the lower-cased tenor part contains
    the word "tenor" exactly once, only the text after it is kept; otherwise
    the text before its first occurrence (the whole text when it is absent)
    is kept. When neither layout fits, all three fields are set to None.

    :param data: one dataset row with an HTML 'content' string
    :return: the same row with 'content', 'tenor', 'facts', 'reasoning' set
    """
    #cleaning entire content of html, newlines and whitespaces
    content = ' '.join(BeautifulSoup(data['content']).get_text().split())
    data['content'] = content

    facts_marker = "Tatbestand:"
    reasons_marker = "Entscheidungsgründe:"

    tenor_part = facts = reasoning = None

    #usual layout: facts come before the reasoning
    outer = content.split(facts_marker)
    if len(outer) > 1:
        inner = outer[1].split(reasons_marker)
        if len(inner) == 2:
            tenor_part = outer[0]
            facts, reasoning = inner

    #alternative layout: reasoning comes before the facts, so the inner parts swap roles
    if tenor_part is None:
        outer = content.split(reasons_marker)
        if len(outer) > 1:
            inner = outer[1].split(facts_marker)
            if len(inner) == 2:
                tenor_part = outer[0]
                reasoning, facts = inner

    if tenor_part is None:
        data['tenor'] = None
        data['facts'] = None
        data['reasoning'] = None
        return data

    split_tenor = tenor_part.lower().rsplit(sep="tenor")

    if len(split_tenor) == 2:
        data['tenor'] = split_tenor[1]
    else:
        data['tenor'] = split_tenor[0]

    data['facts'] = facts.lower()
    data['reasoning'] = reasoning.lower()

    return data


#keeping only rows whose lower-cased content mentions all three section markers,
#then splitting their content into tenor, facts and reasoning
enhanced_dataset = raw_dataset.filter(
    lambda row: all(marker in row['content'].lower()
                    for marker in ("tenor", "tatbestand:", "entscheidungsgründe:")))
enhanced_dataset = enhanced_dataset.map(function=splitting_content_threetext)

In [None]:
def filter_empty(data):
    """Tell whether a row has usable 'tenor' and 'reasoning' values.

    A value counts as missing when it is None, when its string form consists
    of whitespace only, or when it equals the empty string. 'tenor' is checked
    first, then 'reasoning'.

    :param data: one dataset row with 'tenor' and 'reasoning' entries
    :return: False as soon as one of the two values is missing, True otherwise
    """
    def is_missing(value):
        return value is None or str(value).isspace() or value == ""

    return not any(is_missing(data[field]) for field in ('tenor', 'reasoning'))


# dropping rows without a usable tenor or reasoning from both datasets
natural_dataset = natural_dataset.filter(function=filter_empty)
enhanced_dataset = enhanced_dataset.filter(function=filter_empty)

In [None]:
def text_cleaner(data):
    """Lower-case the text fields of one row and cut the tenor at the word "tenor".

    If the lower-cased tenor contains "tenor" exactly once, only the text
    after it is kept; otherwise the text before its first occurrence (the
    whole text when it is absent) is kept. 'reasoning' is lower-cased, and
    'facts' is lower-cased only when it is neither None nor empty.

    NOTE(review): the splitting functions in this notebook already lower-case
    their output and apply the same "tenor" cut, so this is a second pass over
    the same text - confirm that it is still needed.

    :param data: one dataset row with 'tenor', 'reasoning' and 'facts' entries
    :return: the same row with the three fields normalised
    """
    lowered_tenor = data['tenor'].lower()
    before, _, after = lowered_tenor.partition("tenor")
    # exactly one occurrence -> text after it; none or several -> text before the first one
    data['tenor'] = after if lowered_tenor.count("tenor") == 1 else before

    data['reasoning'] = data['reasoning'].lower()

    facts = data['facts']
    if facts:
        data['facts'] = facts.lower()

    return data

# normalising the text fields of both datasets
natural_dataset = natural_dataset.map(function=text_cleaner)
enhanced_dataset = enhanced_dataset.map(function=text_cleaner)

## Creating Splits, Uploading the new Dataset to HuggingFace Datasets

In [None]:
# bundling both variants: "three" holds the enhanced dataset, "two" the natural dataset
openlegaldata_dataset = datasets.DatasetDict({
    "three": enhanced_dataset,
    "two": natural_dataset,
})
print(openlegaldata_dataset)

DatasetDict({
    three: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
        num_rows: 2828
    })
    two: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'facts'],
        num_rows: 4954
    })
})


In [None]:
# a fixed seed keeps the shuffled 95/5 train/test split identical across notebook runs
legalis_dataset = enhanced_dataset.train_test_split(test_size=0.05, seed=42)
print(legalis_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
        num_rows: 2686
    })
    test: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
        num_rows: 142
    })
})


In [None]:
#uploading new dataset into different repository
openlegaldata_dataset.push_to_hub("LennardZuendorf/openlegaldata-processed", token=os.environ['hub_token'])
legalis_dataset.push_to_hub("LennardZuendorf/legalis", token=os.environ['hub_token'])

Pushing split three to the Hub.
Pushing split two to the Hub.
Pushing split train to the Hub.
Pushing split test to the Hub.
