# Legalis Data Set

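Notebook used to prepare the existing dataset for use in the legalis project. The following cells load the raw bulk data, remove unused columns, split each entry's content into tenor and reasoning, create a train/test split, and upload the result to the Hugging Face Hub.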

In [2]:
from datasets import load_dataset
from bs4 import BeautifulSoup
import re

In [5]:
#fetching the train split of the raw bulk dataset from the Hugging Face Hub
raw_dataset = load_dataset(
    "LennardZuendorf/openlegaldata-bulk-data",
    split="train",
)

Downloading and preparing dataset json/LennardZuendorf--openlegaldata-bulk-data to /home/datalore/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...
Dataset json downloaded and prepared to /home/datalore/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


In [19]:
#loading the train split of the legalis dataset from the Hub for inspection
#NOTE: the name `dataset` is re-bound from raw_dataset in the cleaning cell below
dataset = load_dataset(
    "LennardZuendorf/legalis",
    split="train",
)

print(dataset)

Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning'],
    num_rows: 41791
})


Found cached dataset parquet (/home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-0a81af9488284743/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [6]:
# removing the top-level columns that are not needed downstream
dataset = raw_dataset.remove_columns(
    ['slug', 'ecli', 'updated_date', 'created_date']
)

#trimming the nested 'court' dict of a single row
def cleaning_courts(court_data):
    """Remove the 'slug' and 'city' entries from the nested ``court_data['court']`` dict.

    The row is modified in place and the same object is returned.
    Raises KeyError if 'court', 'slug' or 'city' is missing.
    """
    court_info = court_data['court']
    for obsolete_key in ('slug', 'city'):
        del court_info[obsolete_key]

    return court_data

#dropping the unused court fields from every row
dataset = dataset.map(function=cleaning_courts)

#NOTE(review): error_count is never read or updated in the visible cells
error_count = 0

#markers that separate the tenor from the reasoning, tried in this order
_REASONING_MARKERS = (
    "<h2>Gründe</h2>",
    "<!--hlIgnoreOn-->Gründe<!--hlIgnoreOff-->",
)


def _html_to_text(fragment):
    """Strip the HTML markup from ``fragment`` and collapse every whitespace run to one space."""
    #NOTE(review): no parser is passed, so bs4 chooses among the installed parsers;
    #consider pinning one (e.g. "html.parser") so the output is the same everywhere
    text = BeautifulSoup(fragment).get_text()
    #str.split() without arguments splits on any whitespace (newlines included) and
    #ignores leading/trailing blanks, so no separate strip()/re.sub() pass is needed
    return ' '.join(text.split())


#splitting content into tenor and reasoning
def splitting_content(data):
    """Add the plain-text columns 'tenor' and 'reasoning' to one dataset row.

    ``data['content']`` is split at the first marker in ``_REASONING_MARKERS`` that
    occurs exactly once; the part before it becomes the tenor, the part after it the
    reasoning, both cleaned of HTML and surplus whitespace.

    Both columns are set to None when no marker yields exactly two parts (marker
    absent or present more than once) or when the content is not a string (e.g. None).

    The row is modified in place and returned. Raises KeyError if 'content' is missing.
    """
    content = data['content']

    parts = []
    #a missing (None) content cannot be split and is treated like "no marker found"
    if isinstance(content, str):
        for marker in _REASONING_MARKERS:
            #NOTE(review): rsplit without maxsplit behaves like split, so content that
            #contains the marker more than once yields >2 parts and is not used
            parts = content.rsplit(sep=marker)
            if len(parts) == 2:
                break

    if len(parts) == 2:
        data['tenor'] = _html_to_text(parts[0])
        data['reasoning'] = _html_to_text(parts[1])
    else:
        #setting tenor and reasoning to None if no split was possible
        data['tenor'] = None
        data['reasoning'] = None

    return data

#deriving the tenor and reasoning columns for every row
dataset = dataset.map(function=splitting_content)

#rows for which no split was found carry tenor=None and are dropped here
dataset = dataset.filter(lambda row: row['tenor'] is not None)

In [29]:
def filter_strafe(data):
    """Return True if the row's tenor contains 'strafe', ignoring case."""
    tenor_lowercase = data['tenor'].lower()
    return 'strafe' in tenor_lowercase

#keeping only the rows whose tenor mentions "strafe"
dataset_strafe = dataset.filter(function=filter_strafe)
print(dataset_strafe)

Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning'],
    num_rows: 1379
})


Loading cached processed dataset at /home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-0a81af9488284743/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-75f877ff0af1181e.arrow


In [7]:
#creating train and test split (90% train / 10% test)
#a fixed seed makes the shuffle deterministic, so a re-run of the notebook
#reproduces exactly the same partition
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning'],
        num_rows: 41791
    })
    test: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning'],
        num_rows: 4644
    })
})


In [8]:
#publishing the processed splits to a separate Hub repository
dataset.push_to_hub(repo_id="LennardZuendorf/legalis")

Pushing split train to the Hub.
Pushing split test to the Hub.
