In [1]:
import re
import os
import PyPDF2
import pypandoc
import pandas as pd
import lxml.etree as ET

from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Utility functions

In [2]:
# Utility functions and constants
# Base working folder: the "work" directory under the current working directory.
default_path = f"{os.getcwd()}/work/"
# Folder holding the documents this notebook reads and writes.
DEFAULT_SAVE_DIR = f"{default_path}documents/"

def write_to_file(filename, content, encoding="utf-8"):
    """Write ``content`` to ``filename`` as text, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        content: String to write.
        encoding: Text encoding used on disk. Defaults to UTF-8 so accented
            characters are stored identically on every platform instead of
            depending on the locale's default encoding.
    """
    with open(filename, 'w', encoding=encoding) as f:
        f.write(content)

def read_from_file(filename, encoding="utf-8"):
    """Return the full text content of ``filename``.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding of the file. Defaults to UTF-8, matching
            the default used by ``write_to_file``.

    Returns:
        The file content as a single string.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()

# Text extraction helpers, one per supported file type
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Page texts are joined with no separator between them.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open; the reader works on the stream.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return "".join(page_texts)

def extract_text_from_xml(xml_path):
    """Return the text content of an XML file with all markup stripped.

    The document at ``xml_path`` is parsed and its root element is
    serialised with the ``text`` output method as a ``str``.
    """
    document_root = ET.parse(xml_path).getroot()
    return ET.tostring(document_root, method='text', encoding='unicode')

def extract_text_from_rtf(rtf_path):
    """Convert the RTF file at ``rtf_path`` to plain text with pypandoc and return it."""
    plain_text = pypandoc.convert_file(rtf_path, 'plain', format='rtf')
    return plain_text


def split_text_by_pattern(text, pattern):
    """Split ``text`` at every match of ``pattern`` and drop any leading preamble.

    This function used to be defined as ``split_text`` and was silently
    shadowed by the chunking ``split_text`` defined right after it, which made
    it unreachable. It now has its own name so both helpers are usable.

    Args:
        text: The text to split.
        pattern: Regular expression marking the split points. The split is
            performed with ``re.MULTILINE``.

    Returns:
        The non-empty parts, in order. If the first part does not match
        ``pattern`` at its start it is discarded. An empty list is returned
        when there is nothing to split.
    """
    parts = [part for part in re.split(pattern, text, flags=re.MULTILINE) if part]

    # Guard against empty input: indexing parts[0] on an empty list would raise.
    if parts and not re.match(pattern, parts[0]):
        parts = parts[1:]

    return parts


def split_text(text, max_chunk_size=7000, chunk_overlap=100):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        max_chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        # Separators handed to the splitter: blank line, newline, period.
        separators=[
            "\n\n",
            "\n",
            ".",
        ],
        chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_text(text)

## Extraction of Codice Penale (from website of Procura Generale Trento)

In [6]:
# NOTE: despite the "_DIR" suffix, this is the full path of the source PDF file.
CODICE_PENALE_DIR = f"{DEFAULT_SAVE_DIR}Codice penale well formatted edited.pdf"

def find_articles(text):
    """Extract ``(heading, body)`` pairs for every article found in ``text``.

    Each pair holds the "Articolo n. ..." heading line and the text that
    follows it (starting with a newline) up to the next article heading or
    the end of the text.
    """
    # Drop every span that starts with "LIBRO", up to the next article heading
    # (or the end of the text).
    book_heading = re.compile(r'LIBRO.*?(?=Articolo n\.|$)', re.DOTALL)
    without_books = book_heading.sub('', text)

    article = re.compile(r'(Articolo n\..*?)(\n.*?)(?=\nArticolo n\.|$)', re.DOTALL)
    return [found.groups() for found in article.finditer(without_books)]

# Read the whole Codice Penale PDF and cut it into (heading, body) pairs.
text = extract_text_from_pdf(CODICE_PENALE_DIR)

matches = find_articles(text)

data = []
for article in matches:
    # Everything after "Articolo n." is the article number. Strip it so the
    # column does not carry the whitespace that follows the prefix.
    law_number = article[0].split("Articolo n.")[1].strip()
    law_text = article[1].strip()
    if '.' in law_text:
        # The title is taken as everything up to the first period.
        # NOTE(review): titles that themselves contain a period get cut short
        # and the remainder ends up at the start of "Law text" - confirm
        # whether a different delimiter is wanted.
        law_title, law_text = map(str.strip, law_text.split('.', 1))
    else:
        law_title = ''
    data.append({'Law number': law_number, 'Law title': law_title, 'Law text': law_text})

df = pd.DataFrame(data)
# Use the shared documents folder constant instead of rebuilding the same path by hand.
df.to_csv(DEFAULT_SAVE_DIR + "Codice_Penale.csv", index=False)

df.head()

Unnamed: 0,Law number,Law title,Law text
0,1,Reati e pene: disposizione espressa di legge,1. Nessuno può essere punito per un fatto che ...
1,2,Successione di leggi penali,"1. Nessuno può essere punito per un fatto che,..."
2,3,Obbligatorietà della legge penale,1. La legge penale italiana obbliga tutti colo...
3,4,Cittadino italiano,Territorio dello Stato.\n1. Agli effetti della...
4,5,Ignoranza della legge penale,1. Nessuno può invocare a propria scusa l'igno...


## Extraction of C.P.P. from Normattiva

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# You need to download chromedriver and give the location path.
# Selenium 4 takes the driver path through a Service object; passing it as the
# first positional argument is no longer supported.
driver = webdriver.Chrome(service=Service('/path/to/chromedriver'))

try:
    # URL of the webpage you want to access
    driver.get('https://www.example.com')

    # Find a button by its ID and click it
    button = driver.find_element(By.ID, 'button-id')
    button.click()

    # Find a text input by its name, clear it and type into it
    text_input = driver.find_element(By.NAME, 'input-name')
    text_input.clear()
    text_input.send_keys('Some text')

    # Submit a form by clicking the submit button
    submit_button = driver.find_element(By.XPATH, '//input[@type="submit"]')
    submit_button.click()
finally:
    # Always close the browser, even if one of the steps above raises.
    driver.quit()
