In [8]:
import re
import os
import PyPDF2
import pypandoc
import pandas as pd
import lxml.etree as ET

from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Utility functions

In [9]:
# Utility functions and constants
default_path = os.getcwd() + "/work/"
DEFAULT_SAVE_DIR = default_path + "documents/"

def write_to_file(filename, content):
    with open(filename, 'w') as f:
        f.write(content)

def read_from_file(filename):
    with open(filename, 'r') as f:
        return f.read()

# Different kind of text extraction from each type of file
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()
    return text    

def extract_text_from_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return ET.tostring(root, encoding='unicode', method='text')

def extract_text_from_rtf(rtf_path):
    return pypandoc.convert_file(rtf_path, 'plain', format='rtf')


def split_text(text, pattern):
    parts = re.split(pattern, text, flags=re.MULTILINE)
    
    parts = [part for part in parts if part]
    
    if not re.match(pattern, parts[0]):
        parts = parts[1:]
    
    return parts

def split_text(text, max_chunk_size=7000, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(separators=[
        "\n\n",
        "\n",
        ".",
    ],
    chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap)
    
    return text_splitter.split_text(text)

## Extraction of Codice Penale (from website of Procura Generale Trento)

In [10]:
CODICE_PENALE_DIR = DEFAULT_SAVE_DIR + "Codice penale.pdf"

text = extract_text_from_pdf(CODICE_PENALE_DIR)
articles = re.findall(r'Articolo n\.(\d+)([\s\S]*?)(?=Articolo n\.|\Z)', text, re.M)

data = []
for article in articles:
    law_number = article[0]
    law_text = article[1].strip()
    if '.' in law_text:
        law_title, law_text = map(str.strip, law_text.split('.', 1))
    else:
        law_title = ''
    data.append({'Law number': law_number, 'Law title': law_title, 'Law text': law_text})

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Law number,Law title,Law text
0,1,Reati e pene: disposizione espressa di legge,1. Nessuno può essere punito per un fatto che ...
1,2,Successione di leggi penali,"1. Nessuno può essere punito per un fatto che,..."
2,3,Obbligatorietà della legge penale,1. La legge penale italiana obbliga tutti colo...
3,4,Cittadino italiano,Territorio dello Stato.\n1. Agli effetti della...
4,5,Ignoranza della legge penale,1. Nessuno può invocare a propria scusa l'igno...


### Check elements of the extracted Codice Penale

In [11]:
os.environ["OPENAI_API_KEY"] = "e6e0bce8d2cf4675b61998097d298113"
# Other GPTs: 2024-02-01
# GPT 4o: 2024-05-13
os.environ["OPENAI_API_VERSION"] = "2024-02-01"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://mg-openai-adv.openai.azure.com/"
os.environ["PDF_PATH"] = "/home/utente/Scaricati/Legislative pdfs"

In [24]:
# Set up the model and the prompt template through AzureChatOpenAI

# GPT3.5: mg-gpt-35-turbo-16k
# GPT4: mg-gpt-4-0613
# GPT4o: gpt-4o 
llm = AzureChatOpenAI(deployment_name="mg-gpt-35-turbo-16k", temperature=0.7)

prompt_template = """
    Tell me if there is something wrong with this text (if something is weird). JUST REPLY WITH TRUE IF SOMETHING IS WRONG, AND FALSE OTHERWISE.
    {text}
    """

messages = [
    ("system", "You are a helpful assistant specialized in analyzing legal texts."),
    ("user", prompt_template)
]

prompt = ChatPromptTemplate.from_messages(messages)

chain = prompt | llm | StrOutputParser()


In [25]:
results_gpt = []

for idx, row in df.iterrows():
    if idx > 10:
        break
    law_text = row['Law text']
    chunks = split_text(law_text, int(16000*0.8))

    for chunk in chunks:
        output = chain.invoke({"text": chunk })
        if len(output) > 5:
            output = "True"
        results_gpt.append(output)
print(results_gpt)
"""
df_results = pd.DataFrame(results_gpt, columns=['Chunk', 'Output', 'Index'])

# apply your parse_llm_JSONoutput function to df_results
parse_llm_JSONoutput(df_results, "GPT4")
"""

['True', 'True', 'TRUE', 'True', 'FALSE', 'True', 'FALSE', 'True', 'True', 'True', 'TRUE']


'\ndf_results = pd.DataFrame(results_gpt, columns=[\'Chunk\', \'Output\', \'Index\'])\n\n# apply your parse_llm_JSONoutput function to df_results\nparse_llm_JSONoutput(df_results, "GPT4")\n'

In [23]:
print(df.iloc[0])

Law number                                                    1
Law title          Reati e pene: disposizione espressa di legge
Law text      1. Nessuno può essere punito per un fatto che ...
Name: 0, dtype: object
