# Legalis Data Set

Notebook used to edit the existing dataset for usage in the legalis project.

## Importing Libraries and Dataset

#### install libaries with pip (needed for google colab)

In [None]:
!pip install datasets
!pip install bs4

#### importing libaries

In [4]:
#standart imports
import os

import datasets as ds #for dataset management and processing
from bs4 import BeautifulSoup #for html processing
import re #for content processing
from datetime import date #for date filtering
import time #for date filtering

#### import of split but not enhanced dataset from huggingface hub or csv, commented out because only used once for initial upload to the hub

In [7]:
#loading dataset from csv (reproduction/inital way)
#raw_dataset=ds.load_dataset("csv", data_files="openlegaldata-bulk-data.csv")

Creating CSV from Arrow format:   0%|          | 0/252 [00:00<?, ?ba/s]

7220836525

#### pulling raw dataset from huggingface hub (production way) or csv, commented out because only used once for initial upload to the hub

In [6]:
#loading dataset from huggingface hub (production way)
raw_dataset = ds.load_dataset("LennardZuendorf/openlegaldata-bulk-data", split='train')
print(raw_dataset)

Found cached dataset json (C:/Users/lenna/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


Dataset({
    features: ['id', 'slug', 'court', 'file_number', 'date', 'created_date', 'updated_date', 'type', 'ecli', 'content'],
    num_rows: 251038
})


#### import of already processed dataset for updates (testing/edit way)

In [5]:
#loading dataset from huggingface hub (production way)
#three_split_dataset=ds.load_dataset("LennardZuendorf/openlegaldata-processed", split="three")
#two_split_dataset=ds.load_dataset("LennardZuendorf/openlegaldata-processed", split="two")
#print(three_split_dataset)
#print(two_split_dataset)

Downloading readme:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to C:/Users/lenna/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--openlegaldata-processed-8fa5f5fb35684e99/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/82.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/89.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating three split:   0%|          | 0/2828 [00:00<?, ? examples/s]

Generating two split:   0%|          | 0/4954 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/lenna/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--openlegaldata-processed-8fa5f5fb35684e99/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Found cached dataset parquet (C:/Users/lenna/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--openlegaldata-processed-8fa5f5fb35684e99/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
    num_rows: 2828
})
Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
    num_rows: 4954
})


## Creating the Basic Dataset for Training and Testing
#### Cleaning raw dataset

In [7]:
# removing unnecessary columns
raw_dataset = raw_dataset.remove_columns(['slug', 'ecli', 'updated_date', 'created_date'])

#updating columns in nested dict ('court')
def cleaning_courts(data):
    del data['court']['slug']
    del data['court']['city']
    return data

#applying function to clean courts
raw_dataset = raw_dataset.map(cleaning_courts)

#### Filtering raw dataset for relevant data

In [8]:
# function for cleaning date information into datetime.date object, throwing out data younger than 1 week
def clean_date(data):
    data['date'] = data['date'].date()

    return data

#applying function to clean dates
cleaned_dataset = raw_dataset.map(clean_date)

#filtering for dates older than 1 week
cleaned_dataset = cleaned_dataset.filter(lambda x: x['date'].date() < date.fromtimestamp(time.time()))
#filtering dataset for special decision types
cleaned_dataset = cleaned_dataset.filter(lambda x: x['type'].lower() == 'urteil' or x['type'].lower() == 'abschlussurteil')

print(cleaned_dataset)

Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content'],
    num_rows: 137271
})


## Preprocessing Datasets
#### splitting two text dataset content into clean tenor and reasoning

In [11]:
#splitting content into tenor and reasoning
def splitting_content_twotext(data):
    #cleaning entire content of html, newlines and whitespaces
    content_soup = BeautifulSoup(data['content'])
    content = content_soup.get_text().strip()
    content = re.sub('\n', ' ', content)
    content = ' '.join(content.split())
    data['content'] = content

    #splitting content into tenor and reasoning
    split_content = data['content'].rsplit(sep="<h2>Gründe</h2>")

    #alternative splitting if first split was not possible
    if len(split_content) != 2:
        split_content = data['content'].rsplit(sep="<!--hlIgnoreOn-->Gründe<!--hlIgnoreOff-->")

    if len(split_content) != 2:
        split_content = data['content'].rsplit(sep="Gründe:")

    if len(split_content) != 2:
        #settting tenor and reasoning to None if no split was possible
        data['tenor'] = None
        data['reasoning'] = None
        data['facts'] = None

    if len(split_content) == 2:
        #cleaning tenor of html, newlines and whitespaces
        tenor_soup = BeautifulSoup(split_content[0])
        tenor = tenor_soup.get_text().strip()
        tenor = re.sub('\n', ' ', tenor)
        tenor = ' '.join(tenor.split()).lower()
        split_tenor = tenor.rsplit(sep="tenor")

        #setting tenor
        if len(split_tenor) ==2:
            data['tenor'] = split_tenor[1]
        else:
            data['tenor'] = split_tenor[0]

        #cleaning reasoning of html, newlines and whitespaces
        reason_soup = BeautifulSoup(split_content[1])
        reason = reason_soup.get_text().strip()
        reason = re.sub('\n', ' ', reason)
        data['reasoning'] = ' '.join(reason.split()).lower()

        #setting facts with dummy data for equal dataset structure
        data['facts'] = ""

    return data


#filtering for content containing needed words (tenor and reasoning)
two_split_dataset = raw_dataset.filter(lambda x: x['content'].lower().find("gründe") != -1
                                                 and x['content'].lower().find("tenor") != -1)
#applying function to split content into tenor and reasoning
two_split_dataset = two_split_dataset.map(splitting_content_twotext)
print(two_split_dataset)

Dataset({
    features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'facts'],
    num_rows: 86095
})


Loading cached processed dataset at /home/datalore/.cache/huggingface/datasets/LennardZuendorf___json/LennardZuendorf--openlegaldata-bulk-data-0d18a44c8ee5464a/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-96c9c66ff9d224fa.arrow
  tenor_soup = BeautifulSoup(split_content[0])
  reason_soup = BeautifulSoup(split_content[1])


## creating based three split dataset for enhancing and usage in models
#### splitting three text dataset content into clean tenor, reasoning and facts

In [12]:
#splitting content into tenor and reasoning
def splitting_content_threetext(data):
    #cleaning entire content of html, newlines and whitespaces
    content_soup = BeautifulSoup(data['content'])
    content = content_soup.get_text().strip()
    content = re.sub('\n', ' ', content)
    content = ' '.join(content.split())

    #setting content
    data['content'] = content

    try:
        #splitting into tenor, facts and reasoning
        split_content1 = content.rsplit(sep="Tatbestand:")
        split_content2 = split_content1[1].rsplit(sep="Entscheidungsgründe:")

        #alternative splitting if first split was not possible
        if len(split_content2) != 2:
            split_content1 = content.rsplit(sep="Entscheidungsgründe:")
            split_content2 = split_content1[1].rsplit(sep="Tatbestand:")

        #setting tenor
        tenor = split_content1[0].lower()
        split_tenor = tenor.rsplit(sep="tenor")

        #setting tenor
        if len(split_tenor) ==2:
            data['tenor'] = split_tenor[1]
        else:
            data['tenor'] = split_tenor[0]

        #setting facts and reasoning
        data['facts'] = split_content2[0].lower()
        data['reasoning'] = split_content2[1].lower()

    #settting tenor, reasoning and facts to None if no split was possible
    except IndexError:
        data['tenor'] = None
        data['facts'] = None
        data['reasoning'] = None

    return data


#filtering dataset for content containing needed words
three_split_dataset = raw_dataset.filter(lambda x: x['content'].lower().find("tenor") != -1
                                               and x['content'].lower().find("tatbestand:") != -1
                                               and x['content'].lower().find("entscheidungsgründe:") != -1)
#applying function to split content into tenor, facts and reasoning
three_split_dataset = three_split_dataset.map(splitting_content_threetext)

#### filtering failed splitting

In [13]:
#function to filter empty tenor and reasoning
def filter_empty(data):
    if data['tenor'] is None or str(data['tenor']).isspace() or data['tenor']=="":
        return False
    elif data['reasoning'] is None or str(data['reasoning']).isspace() or data['reasoning']=="":
        return False
    else:
        return True

#applying function to filter empty tenor and reasoning
two_split_dataset = two_split_dataset.filter(filter_empty)
three_split_dataset = three_split_dataset.filter(filter_empty)

#### cleaning the text of tenor, reasoning and facts

In [14]:
#function to clean text into lowercase for better matching
def text_cleaner(data):

    #setting tenor to lowercase and splitting after "tenor"
    tenor = data['tenor']
    tenor = tenor.lower()
    split_tenor = tenor.rsplit(sep="tenor")

    #setting tenor after split
    if len(split_tenor) ==2:
        data['tenor'] = split_tenor[1]
    else:
        data['tenor'] = split_tenor[0]

    #setting reasoning and facts to lowercase
    data['reasoning'] = data['reasoning'].lower()

    if data['facts']:
        data['facts'] = data['facts'].lower()

    return data

#applying function to clean text
two_split_dataset = two_split_dataset.map(text_cleaner)
three_split_dataset = three_split_dataset.map(text_cleaner)

## Creating Splits, Uploading the new Dataset to HuggingFace Datasets

#### saving datasets as csv for reproducibility without huggingface

In [9]:
three_split_dataset.to_csv("three-split-dataset.csv")
two_split_dataset.to_csv("two-split-dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

184067167

#### creating splits and uploaded to huggingface datasets for persistency (never used this again)

In [19]:
openlegaldata_dataset = ds.DatasetDict({"three": three_split_dataset, "two": two_split_dataset})
print(openlegaldata_dataset)

DatasetDict({
    three: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
        num_rows: 2828
    })
    two: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'reasoning', 'facts'],
        num_rows: 4954
    })
})


#### creating test train split and upload to huggingface for usage in prediction

In [18]:
legalis_dataset = three_split_dataset.train_test_split(test_size=0.05)
print(legalis_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
        num_rows: 2686
    })
    test: Dataset({
        features: ['id', 'court', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning'],
        num_rows: 142
    })
})


#### uploading new dataset into different repository

In [20]:
#uploading new dataset into different repository
openlegaldata_dataset.push_to_hub("LennardZuendorf/openlegaldata-processed", token=os.environ['hub_token'])
legalis_dataset.push_to_hub("LennardZuendorf/legalis", token=os.environ['hub_token'])

Pushing split three to the Hub.
Pushing split two to the Hub.
Pushing split train to the Hub.
Pushing split test to the Hub.
