# Legalis Extractor

Notebook used to extract the winner of each court decision from its tenor and to turn that winner into a binary label that can be predicted.

## Importing packages and dataset

In [4]:
import os
import re
import time

import datasets as ds
import openai

In [6]:
# Fetch both published splits (train first, then test) and merge them into a
# single dataset so every row can be processed in one pass.
train, test = (
    ds.load_dataset("LennardZuendorf/legalis", split=split_name)
    for split_name in ("train", "test")
)
dataset = ds.concatenate_datasets([train, test])

print(dataset)

Dataset({
    features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner'],
    num_rows: 2828
})


Found cached dataset parquet (/home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-9651dcdeb9af3571/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
Found cached dataset parquet (/home/datalore/.cache/huggingface/datasets/LennardZuendorf___parquet/LennardZuendorf--legalis-9651dcdeb9af3571/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


## Enhancing Dataset using OpenAI ChatGPT API
Extracting the winner of each decision from its tenor with the ChatGPT API, as the basis for a binary label to predict.

In [106]:
# The API key comes from the `openai_token` environment variable (None when unset).
openai.api_key = os.environ.get("openai_token")

#function to extract a label 
def winner_extract(data, max_retries=None, retry_delay=15):
  """Ask the OpenAI chat model who won a decision and store its raw answer.

  The tenor of the decision (``data['tenor']``) is embedded in a German prompt
  and sent to ``gpt-3.5-turbo``. The text of the first returned choice is
  written to ``data['winner']``.

  Rate-limit errors are retried in a loop rather than by recursion, so a long
  outage cannot exhaust the interpreter's call stack.

  Args:
    data: A single dataset row (dict-like) providing a ``'tenor'`` entry.
    max_retries: Maximum number of retries after a rate-limit error. ``None``
      (the default) keeps retrying until the request goes through.
    retry_delay: Seconds to wait before each retry.

  Returns:
    The same row, with ``'winner'`` set to the model's answer, or to
    ``"error"`` if the API reported an error or the retry budget was used up.
  """
  # The prompt only depends on the row, so build it once outside the retry loop.
  messages = [
    {"role": "system", "content": "You are a legal scholar."},
    {"role": "user", "content":
                        "Extrahiere aus dem folgenden Tenor einer Gerichtsentscheidung den Gewinner der Entscheidung:"
                        +str(data['tenor'])+"Gib den Gewinner so aus: Gewinner: mit 1 für Kläger und 0 für Verklagter."
    }
  ]

  retries = 0
  while True:
    try:
      completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
      )
      break
    except openai.error.RateLimitError as e:
      print(f"OpenAI API request exceeded rate limit: {e}")
      retries += 1
      if max_retries is not None and retries > max_retries:
        # Give up on this row; it is marked the same way as any other API failure.
        data['winner'] = "error"
        return data
      time.sleep(retry_delay)
    except openai.error.APIError:
      data['winner'] = "error"
      return data

  #extracting the answer
  data['winner'] = completion.choices[0].message['content']
  return data

# Latest partial result of batch_runner. It is refreshed after every finished
# batch so already-labelled rows survive an interrupted run.
safe_dataset = None

#batching dataset and running gpt batch by batch
def batch_runner(data, batch_size, first_batch_size=27):
  """Run ``winner_extract`` over ``data`` in consecutive batches.

  After each batch the rows processed so far are stored in the module-level
  ``safe_dataset``, so an interrupted run leaves the finished rows available.

  Args:
    data: Dataset to process; it must support ``len()`` and slicing.
    batch_size: Number of rows in every batch after the first one.
    first_batch_size: Number of rows in the first batch (defaults to 27, the
      value that used to be hard-coded).

  Returns:
    A dataset holding all processed rows in their original order.

  Raises:
    ValueError: If ``batch_size`` or ``first_batch_size`` is not positive
      (a non-positive step would never reach the end of the data).
  """
  global safe_dataset

  if batch_size <= 0 or first_batch_size <= 0:
    raise ValueError("batch_size and first_batch_size must be positive")

  total = len(data)

  # First batch; the reported upper bound is clamped to the data length.
  stop = min(first_batch_size, total)
  print("running batch "+str(0)+" to "+str(stop))
  output = ds.Dataset.from_dict(data[:stop])
  output = output.map(winner_extract)
  safe_dataset = output

  start = stop
  while start < total:
    stop = min(start + batch_size, total)
    print("running batch "+str(start)+" to "+str(stop))

    chunk = ds.Dataset.from_dict(data[start:stop])
    chunk = chunk.map(winner_extract)
    output = ds.concatenate_datasets([output, chunk])
    # Checkpoint: needs the `global` declaration above to reach module scope.
    safe_dataset = output

    start = stop

  return output

# Label the complete dataset, sending it through the API in chunks of 50 rows.
enhanced_dataset = batch_runner(data=dataset, batch_size=50)
print(enhanced_dataset)

running batch 0 to 27
OpenAI API request exceeded rate limit: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID f65dbf696ccbf04dd964f199db27a4bf in your message.)
running batch 27 to 77
OpenAI API request exceeded rate limit: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 7e5ffcb3fcc36abe50955a48505259ab in your message.)
OpenAI API request exceeded rate limit: The server had an error while processing your request. Sorry about that!
OpenAI API request exceeded rate limit: The server had an error while processing your request. Sorry about that!
running batch 77 to 127
running batch 127 to 177
running batch 177 to 227
running batch 227 to 277
running batch 277 to 327
running batch 327 to 377
OpenAI A

In [15]:
#function to create binary label from the winner label
def winner_binary(data):
    """Turn the free-text ``'winner'`` answer of a row into a numeric label.

    Resolution order:
      1. An explicit ``Gewinner: 1`` / ``Gewinner: 0`` (the format requested
         in the prompt) decides directly.
      2. Otherwise a ``1`` or the word "Kläger" (also spelled "Klaeger", the
         normalised value this function writes itself) means the claimant.
      3. Otherwise a ``0`` or the word "Verklagter" means the defendant.
      4. Anything else, including non-string values, is unresolved.

    Args:
        data: A single dataset row (dict-like) with a ``'winner'`` entry.

    Returns:
        The same row with ``'label'`` set to 1 (claimant), 0 (defendant) or
        2 (unresolved). For labels 1 and 0 ``'winner'`` is normalised to
        "Klaeger" / "Verklagter"; unresolved rows keep their original value.
    """
    winner = data['winner']
    text = winner if isinstance(winner, str) else ""

    # Membership tests are required here: str.find() returns -1 (truthy) on a
    # miss and 0 (falsy) on a match at index 0, so it cannot be used as a bool.
    explicit = re.search(r"Gewinner:\s*([01])(?!\d)", text)
    if explicit:
        label = int(explicit.group(1))
    elif "1" in text or "Kläger" in text or "Klaeger" in text:
        label = 1
    elif "0" in text or "Verklagter" in text:
        label = 0
    else:
        label = 2

    data['label'] = label
    if label == 1:
        data['winner'] = "Klaeger"
    elif label == 0:
        data['winner'] = "Verklagter"

    return data

def filter_binary(data):
    """Return True for rows to keep, i.e. rows whose ``'label'`` is not 2."""
    return data['label'] != 2

# Derive the numeric label for every row, then drop the rows that could not be
# resolved to either party (label 2).
dataset = dataset.map(winner_binary).filter(filter_binary)
print(dataset)

Dataset({
    features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
    num_rows: 2828
})


## Creating Splits, Uploading the new Dataset to HuggingFace Datasets

In [19]:
# Re-split the labelled data (5% held out for testing) and push both splits to
# the Hub. NOTE: the target is the same repository the data was loaded from, so
# this updates the published dataset rather than creating a separate one.
# A fixed seed keeps the train/test assignment reproducible across runs.
dataset = dataset.train_test_split(test_size=0.05, seed=42)
print(dataset)
dataset.push_to_hub("LennardZuendorf/legalis", token=os.environ['hub_token'])

DatasetDict({
    train: Dataset({
        features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
        num_rows: 2686
    })
    test: Dataset({
        features: ['id', 'file_number', 'date', 'type', 'content', 'tenor', 'facts', 'reasoning', 'winner', 'label'],
        num_rows: 142
    })
})


Pushing split train to the Hub.
Pushing split test to the Hub.
