# Information extraction using an LLM

## Table of Contents

In [None]:
# todo

## Imports

In [5]:
import json
import os
import re

from bs4 import BeautifulSoup

In [20]:
import openai
from progressbar import progressbar

In [6]:
from openai import OpenAI
from pydantic import BaseModel

In [7]:
from typing import List

In [8]:
# Take the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be typed into (or saved with) the notebook. If the variable is
# not set, an empty key is passed and the cell still runs; requests made with
# it will presumably be rejected by the API.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

## JSON schemata & prompts

The schema dictates the output shape. If this is supplied during the API call, we do not need to specify the output through prompting.

In [9]:
# Document-level details that are shared by every sanction entry extracted
# from one document. Field meanings follow the key descriptions given in
# system_message_sanctions.
class SanctionDetails(BaseModel):
    celex: str  # the CELEX number
    law_number: str  # alphanumerical identifier of a referenced law
    publication_date: str  # date of publication of the document
    legal_base: str  # legal justification or decision basis
    decision: str  # text referencing the decision/regulation/annex that is implemented or amended

In [10]:
# One sanctioned person or entity. Inherits the document-level fields from
# SanctionDetails, so every target also carries celex, law_number, etc.
# Field meanings follow the key descriptions given in system_message_sanctions.
class SanctionTarget(SanctionDetails):
    name: str  # name of the person or entity
    alias: list[str]  # alias names, if applicable
    position: list[str]  # positions held, if the target is a person
    dob: str  # date of birth
    pob: str  # place of birth
    address: str  # the address
    nationality: str  # nationality
    identifying_information: list[str]  # further identifying information
    reason: str  # reason for the listing
    un_sanction_date: str  # date of sanctioning through the UN, if applicable
    date_of_listing: str  # date of listing
    # NOTE(review): the system prompt asks for null when a boolean is unknown,
    # but this field is a plain (non-optional) bool - confirm which is intended.
    is_un: bool  # whether it is a UN sanction
    entity_type: str  # type of entity (person, organisation)

In [11]:
# Top-level response shape for sanction documents and notices: all targets
# found in one document, wrapped under the key "sanctions".
class SanctionDoc(BaseModel):
    sanctions: List[SanctionTarget]  # one entry per sanctioned person or entity

In [12]:
# System prompt for extracting sanction targets. It is used for both the
# general sanction documents and the notices (see the API call cells).
# The backslash at the end of each line continues the string literal, so the
# prompt is sent as one single line without line breaks.
# NOTE(review): the prompt text contains typos ("The are multiple", "bolean").
# They are part of the text sent to the model and are left untouched here;
# fixing them changes the prompt and may change the extraction results.
# NOTE(review): the prompt asks for null for unknown booleans, while
# SanctionTarget.is_un is declared as a plain bool - confirm which is intended.
system_message_sanctions = "You are a diligent research assistant. \
Your purpose is to support researchers with extracting \
information about individual sanctions from unstructured text as well as from \
tables. The required data is to be structured as JSON for the output; the JSON \
keys are explained below. \
The are multiple sanction targets in each document, and you must give the details on every target. \
If the information is not available, return either an empty string \
(if the variable is of the type str), or an empty list of strings, if that \
was requested; in case of a boolean, return null. If you are not sure if \
you extracted the correct information, prefix it with: unsure: \
\
Here is the description for the JSON keys: \
celex: the CELEX number; \
law_number: an alphanumerical identifier for a referenced law (can be referenced inside the HTML block of '<p class='oj-hd-oj'>'); \
publication_date: date of publication of a given document; \
legal_base: legal justification or decision basis (mentioned as 'regard* to/amending'); \
decision: the text referencing the decision or regulation or annex that is implemented or amended; \
name: name of the person or entity; \
alias: a list of alias names, if applicable; \
position: if it is about a person, these are the positions held; \
dob: date of birth; \
pob: place of birth; \
address: the address; \
nationality: nationality; \
identifying_information: a list of further identifying information; \
reason: reason for the listing; \
un_sanction_date: date of sanctioning through the UN, if applicable; \
date_of_listing: date of listing; \
is_un: bolean to indicate if it is an UN sanction or not; \
entity_type: type of entity (person, organisation)." 

In [13]:
# Response shape for court-decision documents. Field meanings follow the key
# descriptions given in system_message_decisions.
class CourtDecision(BaseModel):
    entity_name: str  # name of the entity in question
    delisting: bool  # whether it is a delisting
    delisting_case: bool  # whether the case is about a delisting
    url: str  # the url
    celex: str  # the CELEX number
    publication_date: str  # date of publication
    date: str  # date of decision
    title: str  # title of the decision
    law: str  # name of the law applied
    case: str  # name of the court case
    legal_base: str  # the legal base

In [14]:
# System prompt for extracting court decisions. The backslash at the end of
# each line continues the string literal, so the prompt is sent as one single
# line without line breaks.
# NOTE(review): the prompt asks the model to extract all decisions of a
# document, but the court-decision API call passes CourtDecision (a single
# object, not a list) as response format - confirm which is intended.
# NOTE(review): the prompt asks for null for unknown booleans, while the
# boolean fields of CourtDecision are declared as plain bool.
system_message_decisions = "You are a diligent research assistant. \
Your purpose is to support researchers with extracting \
information about court decisions regarding sanctions from unstructured text as well as from \
tables. The required data is to be structured as JSON for the output; the JSON \
keys are explained below. \
There may be multiple decisions per document, and you must extract them all. \
If the information is not available, return either an empty string \
(if the variable is of the type str), or an empty list of strings, if that \
was requested; in case of a boolean, return null. If you are not sure if \
you extracted the correct information, prefix it with: unsure: \
\
Here is the description for the JSON keys: \
entity_name: name of the entity in question; \
delisting: boolean indicating if it is a delisting or not; \
delisting_case: boolean indicating if the case is about a delisting; \
url: the url; \
celex: the CELEX number; \
publication_date: date of publication; \
date: date of decision; \
title: title of the decision; \
law: name of the law applied; \
case: name of the court case; \
legal_base: the legal base."

## Data

In [40]:
# Number of entries currently sitting in the unsorted input folder.
relevant_entries = os.listdir("relevant_subset/relevant")
len(relevant_entries)

33

### Sorting by document type

In [18]:
# Regular expressions that identify the document type from the title
# paragraph (<p class="oj-doc-ti" ...>) inside a document's HTML.

# Court decisions: titles such as "Judgment of the General Court ..." and also
# the plain "Judgment of the Court ..." form; the word in front of "Court" is
# optional.
court_decision_pattern = r'<p class="oj-doc-ti".*?>Judgment of the (?:\w* )?Court.*?</p>'

# Notices: the title paragraph may or may not carry further attributes (such
# as an id) after the class, so no closing quote is required right before ">".
notice_pattern = r'<p class="oj-doc-ti".*?>Notice.*?</p>'

In [25]:
# Move every readable .txt document into a folder named after its document
# type. This makes it easier to group API calls with the same system message;
# it saves tokens, and tokens = money.

import shutil

source_dir = "relevant_subset/relevant/"
sorted_root = "relevant_subset/relevant_sorted/"

# shutil.move does not create missing parent folders, so make sure the three
# destination folders exist before the first file is moved.
for doc_type in ("decisions", "notices", "sanctions"):
    os.makedirs(os.path.join(sorted_root, doc_type), exist_ok=True)

for file in progressbar(os.listdir(source_dir)):
    if not file.endswith(".txt"):
        continue

    source_path = os.path.join(source_dir, file)
    try:
        with open(source_path, "r", encoding="utf-8") as infile:
            insa_doc = infile.read()
    except UnicodeDecodeError:
        # this catches all files that are actually PDFs; they remain in the
        # original folder
        continue

    # First matching pattern wins; everything that matches neither pattern is
    # treated as a general sanctions document.
    if re.search(court_decision_pattern, insa_doc) is not None:
        doc_type = "decisions"
    elif re.search(notice_pattern, insa_doc) is not None:
        doc_type = "notices"
    else:
        doc_type = "sanctions"

    # The file handle is already closed here, so the move is safe.
    shutil.move(source_path, os.path.join(sorted_root, doc_type, file))
        

[38;2;0;255;0m100%[39m [38;2;0;255;0m(1786 of 1786)[39m |####################| Elapsed Time: 0:00:01 Time:  0:00:010000


## API calls

### Court decisions

Files that matched the court decision pattern.

In [18]:
# Extract structured data from every court-decision document that has no
# result file yet and store it as json_results/<document name>.json.
# NOTE(review): the system message asks for all decisions of a document, but
# CourtDecision describes a single decision - confirm which is intended.
os.makedirs("json_results/", exist_ok=True)

for file in progressbar(os.listdir("relevant_subset/relevant_sorted/decisions")):
    json_file_name = file.replace(".txt", ".json")
    json_path = os.path.join("json_results/", json_file_name)

    # Skip non-text entries and documents that already have a result, so a
    # re-run only sends the documents that are still missing to the API.
    if not file.endswith(".txt") or os.path.isfile(json_path):
        continue

    try:
        with open(os.path.join("relevant_subset/relevant_sorted/decisions", file), "r", encoding="utf-8") as infile:
            insa_doc = infile.read()
    except UnicodeDecodeError:
        print(f"skipped {file}: not valid UTF-8 text")
        continue

    system_message = system_message_decisions
    response_format = CourtDecision

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": insa_doc},
            ],
            response_format=response_format,
        )
    except (openai.BadRequestError, openai.LengthFinishReasonError) as e:
        # The document stays without a result file and is tried again on the
        # next run; report it instead of dropping it silently.
        print(f"skipped {file}: {type(e).__name__}")
        continue

    sanctions = completion.choices[0].message.parsed
    if sanctions is None:
        # No parsed object came back (for example when the model refused to
        # answer); there is nothing that could be written.
        print(f"skipped {file}: response contained no parsed result")
        continue

    # The parsed model is serialised to a JSON string first and that string is
    # then dumped, so the file holds one JSON-encoded string. The evaluation
    # and inspection cells decode it twice, therefore the format is kept.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(sanctions.json(ensure_ascii=False), f, ensure_ascii=False, indent=4, sort_keys=True)

[38;2;0;255;0m100%[39m [38;2;0;255;0m(81 of 81)[39m |########################| Elapsed Time: 0:04:13 Time:  0:04:130322


### General files

Everything that did not match a regex pattern from above.

In [None]:
# general sanctions
# Extract the sanction targets from every general sanctions document that has
# no result file yet and store them as json_results/<document name>.json.
os.makedirs("json_results/", exist_ok=True)

for file in progressbar(os.listdir("relevant_subset/relevant_sorted/sanctions")):
    json_file_name = file.replace(".txt", ".json")
    json_path = os.path.join("json_results/", json_file_name)

    # Skip non-text entries and documents that already have a result, so a
    # re-run only sends the documents that are still missing to the API.
    if not file.endswith(".txt") or os.path.isfile(json_path):
        continue

    try:
        with open(os.path.join("relevant_subset/relevant_sorted/sanctions", file), "r", encoding="utf-8") as infile:
            insa_doc = infile.read()
    except UnicodeDecodeError:
        print(f"skipped {file}: not valid UTF-8 text")
        continue

    system_message = system_message_sanctions
    response_format = SanctionDoc

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": insa_doc},
            ],
            response_format=response_format,
        )
    except (openai.BadRequestError, openai.LengthFinishReasonError) as e:
        # The document stays without a result file and is tried again on the
        # next run; report it instead of dropping it silently.
        print(f"skipped {file}: {type(e).__name__}")
        continue

    sanctions = completion.choices[0].message.parsed
    if sanctions is None:
        # No parsed object came back (for example when the model refused to
        # answer); there is nothing that could be written.
        print(f"skipped {file}: response contained no parsed result")
        continue

    # The parsed model is serialised to a JSON string first and that string is
    # then dumped, so the file holds one JSON-encoded string. The evaluation
    # and inspection cells decode it twice, therefore the format is kept.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(sanctions.json(ensure_ascii=False), f, ensure_ascii=False, indent=4, sort_keys=True)

### Notices

Files that matched the notice pattern.

In [None]:
# Extract the sanction targets from every notice that has no result file yet
# and store them as json_results/<document name>.json. Notices use the same
# system message and response schema as the general sanctions documents.
os.makedirs("json_results/", exist_ok=True)

for file in progressbar(os.listdir("relevant_subset/relevant_sorted/notices")):
    json_file_name = file.replace(".txt", ".json")
    json_path = os.path.join("json_results/", json_file_name)

    # Skip non-text entries and documents that already have a result, so a
    # re-run only sends the documents that are still missing to the API.
    if not file.endswith(".txt") or os.path.isfile(json_path):
        continue

    try:
        with open(os.path.join("relevant_subset/relevant_sorted/notices", file), "r", encoding="utf-8") as infile:
            insa_doc = infile.read()
    except UnicodeDecodeError:
        print(f"skipped {file}: not valid UTF-8 text")
        continue

    system_message = system_message_sanctions
    response_format = SanctionDoc

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": insa_doc},
            ],
            response_format=response_format,
        )
    except (openai.BadRequestError, openai.LengthFinishReasonError) as e:
        # The document stays without a result file and is tried again on the
        # next run; report it instead of dropping it silently.
        print(f"skipped {file}: {type(e).__name__}")
        continue

    sanctions = completion.choices[0].message.parsed
    if sanctions is None:
        # No parsed object came back (for example when the model refused to
        # answer); there is nothing that could be written.
        print(f"skipped {file}: response contained no parsed result")
        continue

    # The parsed model is serialised to a JSON string first and that string is
    # then dumped, so the file holds one JSON-encoded string. The evaluation
    # and inspection cells decode it twice, therefore the format is kept.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(sanctions.json(ensure_ascii=False), f, ensure_ascii=False, indent=4, sort_keys=True)

## Evaluation

This creates Excel files from a random sample of the JSON result files.

In [28]:
import openpyxl
import pandas as pd
import random

In [33]:
# Export a random sample of up to 100 result files to Excel, one workbook per
# result file, into the folder random_excel/.

# Only real result files take part in the draw.
json_files = [name for name in os.listdir("json_results/") if name.endswith(".json")]

os.makedirs("random_excel/", exist_ok=True)

# random.sample draws without replacement, so no document is exported twice;
# min() keeps the draw valid when fewer than 100 (or no) results exist.
for json_file_name in random.sample(json_files, min(100, len(json_files))):

    # Load JSON data; the results were written as UTF-8, so read them as UTF-8.
    with open(os.path.join("json_results/", json_file_name), encoding="utf-8") as infile:
        data = json.load(infile)
    # The API cells store the model output as a JSON *string* inside the file,
    # so it has to be decoded a second time. Plain JSON objects pass through.
    if isinstance(data, str):
        data = json.loads(data)

    # Convert json data to pandas dataframe: sanction documents hold a list of
    # targets under "sanctions", court decisions are a single flat record.
    if "sanctions" in data:
        df = pd.DataFrame(data["sanctions"])
    else:
        df = pd.DataFrame([data])

    # splitext only strips the final ".json", so names that contain further
    # dots keep their full stem and cannot collide with each other.
    excel_file_name = os.path.splitext(json_file_name)[0] + ".xlsx"
    df.to_excel(os.path.join("random_excel/", excel_file_name), index=False)


## Data inspection

If you want to quickly inspect all result data, create the dataframes here.

In [41]:
# Two separate, empty tables: one for sanction targets, one for court decisions.
sanctions_df, court_decisions_df = pd.DataFrame(), pd.DataFrame()

In [42]:
# Build both inspection tables from every result file in json_results/.
# Records are collected in plain lists and turned into dataframes once at the
# end, which avoids re-copying a growing dataframe for every file.
sanction_records = []
court_decision_records = []

for json_file_name in os.listdir("json_results/"):
    # Entries that are not result files must be skipped before `data` is
    # touched; otherwise the previous file's content would be added again.
    if not json_file_name.endswith(".json"):
        continue

    # Load JSON data; the results were written as UTF-8, so read them as UTF-8.
    with open(os.path.join("json_results/", json_file_name), encoding="utf-8") as infile:
        data = json.load(infile)
    # The API cells store the model output as a JSON *string* inside the file,
    # so it has to be decoded a second time. Plain JSON objects pass through.
    if isinstance(data, str):
        data = json.loads(data)

    # Sanction documents hold a list of targets under "sanctions"; everything
    # else is a single court-decision record.
    if "sanctions" in data:
        sanction_records.extend(data["sanctions"])
    else:
        court_decision_records.append(data)

sanctions_df = pd.DataFrame(sanction_records)
court_decisions_df = pd.DataFrame(court_decision_records)

In [43]:
# Preview the first five extracted sanction targets.
sanctions_df.head(n=5)

Unnamed: 0,celex,law_number,publication_date,legal_base,name,alias,position,dob,pob,address,nationality,identifying_information,reason,un_sanction_date,date_of_listing,is_un,entity_type
0,52023XG0403(03),"2011/235/CFSP, 2023/727, 359/2011, 2023/721",2023-04-03,"Council Decision 2011/235/CFSP, Council Decisi...",,[],[],,,,,[],Restrictive measures in view of the situation ...,,,False,
1,62012TA0578,2012/635/CFSP,1.9.2014,Council Implementing Regulation (EU) No 945/20...,National Iranian Oil Company,[],[],,,,Iranian,[],Restrictive measures against Iran with the aim...,,15 October 2012,False,organisation
2,52022XG1128(03),2014/119/CFSP and 208/2014,28.11.2022,Council Decision 2014/119/CFSP and Council Reg...,Vitalii Yuriyovych Zakharchenko,[],[],,,,,[],Restrictive measures directed against certain ...,,,False,person
3,62022TN0364,2022/582/CFSP,2022-08-08,Council Decision (CFSP) 2022/582 of 8 April 20...,Aleksandr Aleksandrovich Shulgin,[],[],,,"Moscow, Russia",,[],Listed in the annex to the Council Decision 20...,,2022-04-08,False,person
4,62022TN0364,2022/581,2022-08-08,Council Implementing Regulation (EU) 2022/581 ...,Aleksandr Aleksandrovich Shulgin,[],[],,,"Moscow, Russia",,[],Listed in Annex I of the Implementing Regulati...,,2022-04-08,False,person


In [44]:
# Preview the first five extracted court decisions.
court_decisions_df.head(n=5)

Unnamed: 0,entity_name,delisting,delisting_case,url,celex,publication_date,date,title,law,case,legal_base
0,Sharif University of Technology,False,False,https://eur-lex.europa.eu/legal-content/EN/TXT...,62015TA0052,13.6.2016,28.4.2016,Judgment of the General Court of 28 April 2016...,,T-52/15,Council Decision 2010/413/CFSP
1,Tibisay Lucena Ramírez,False,False,https://eur-lex.europa.eu/legal-content/EN/TXT...,62018TA0247,30.8.2021,14 July 2021,Judgment of the General Court of 14 July 2021 ...,,T-247/18,Article 263 TFEU
2,Alexander Dmitrievich Pumpyanskiy,True,False,https://eur-lex.europa.eu/legal-content/EN/TXT...,62022TA0734,29.1.2024,29 November 2023,Judgment of the General Court of 29 November 2...,"Council Decision (CFSP) 2022/1530, Council Imp...",Case T-734/22,"Article 263 TFEU, Article 268 TFEU"
3,Samer Foz,False,False,https://eur-lex.europa.eu/legal-content/EN/TXT...,62019TA0258,2022-01-24,2021-11-24,Judgment of the General Court of 24 November 2...,Council Implementing Decision (CFSP) 2019/87 o...,T-258/19,Article 263 TFEU
4,Évariste Boshab,False,False,https://eur-lex.europa.eu/legal-content/EN/TXT...,62022TA0098,2023-05-02,2023-03-08,Judgment of the General Court of 8 March 2023 ...,Council Decision (CFSP) 2021/2181 and Council ...,T-98/22,Article 263 TFEU
