In [8]:
#!pip install langchain pdfplumber langchain-openai langchain-text-splitters > /dev/null

import os
import re
import pdfplumber
import pandas as pd
from langchain import PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter

# RTF -> article extraction
from striprtf.striprtf import rtf_to_text
from collections import defaultdict

# Testing embedding models
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

In [3]:
# Runtime configuration for the Azure OpenAI client and the local PDF folder.
#
# SECURITY: the API key is deliberately NOT stored in the notebook. It is taken from
# the process environment when already set, otherwise it is requested interactively
# (getpass does not echo the value, so it cannot end up in a saved cell output).
# Any key that was previously committed with this notebook should be rotated.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Azure OpenAI API key: ")
os.environ["OPENAI_API_VERSION"] = "2024-02-01"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://mg-openai-adv.openai.azure.com/"
# Folder scanned for *.pdf files; export PDF_PATH before starting the kernel to override
# this machine-specific default.
os.environ.setdefault("PDF_PATH", "/home/utente/Scaricati/Legislative pdfs")

In [4]:
# Build the law-extraction chain: chat prompt -> Azure OpenAI chat model -> plain string.

# Available Azure deployments:
#   GPT-3.5: mg-gpt-35-turbo-16k
#   GPT-4:   mg-gpt-4-0613
# temperature=0 because this is a structured-extraction task: the same document should
# always yield the same JSON, and sampling randomness only adds noise to the answer.
llm = AzureChatOpenAI(deployment_name="mg-gpt-35-turbo-16k", temperature=0)

# User-message template; {text} is the only placeholder and receives the document text.
prompt_template = """
    You are given a text from a PDF that contains sentences about two antinomial laws.
    Your task is to extract the law numbers and determine if they are antinomial.
    
    Text: {text}
    
    Please provide a JSON response with the following keys:
    - first_law: the number of the first law
    - second_law: the number of the second law
    - are_antinomial: 1 if the laws are antinomial, 0 if they are not, 2 if it is unclear
    """

# (role, content) pairs consumed by ChatPromptTemplate.from_messages.
messages = [
    ("system", "You are a helpful assistant specialized in analyzing legal texts."),
    ("user", prompt_template)
]

prompt = ChatPromptTemplate.from_messages(messages)

# StrOutputParser makes chain.invoke(...) return the reply as a plain string.
chain = prompt | llm | StrOutputParser()

In [4]:
# Load an encoder model through Hugging Face transformers and run one forward pass
# as a smoke test.
# NOTE(review): this cell was originally labelled "Italian-Legal-Bert", but the checkpoint
# loaded below is "nlpaueb/legal-bert-base-uncased" -- confirm which model is intended.

from transformers import AutoModel, AutoTokenizer
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

text = "Tell me something about the Italian legal system: "
inputs = tokenizer(text, return_tensors="pt")
# Inference only: disable autograd so the outputs carry no grad_fn and no graph is kept
# in memory.
with torch.no_grad():
    outputs = model(**inputs)

outputs



BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1965, -0.4241, -0.0790,  ..., -0.2015, -0.1456,  0.3158],
         [-0.2645, -0.2345,  0.0482,  ...,  0.6902,  0.1312, -0.4591],
         [-0.2937, -0.1044, -0.3545,  ...,  0.4051,  0.0844,  0.3564],
         ...,
         [ 0.1385,  0.0258,  0.0761,  ..., -0.0405, -0.1984,  0.9173],
         [ 0.2290,  0.0651, -0.4753,  ...,  0.0164,  0.2266,  0.1565],
         [-0.1964, -0.4236, -0.0793,  ..., -0.2017, -0.1453,  0.3159]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-5.1989e-01,  4.3836e-01,  9.4956e-01,  1.8751e-01,  4.7849e-01,
          1.6745e-01, -5.9398e-02, -1.8771e-01,  8.6579e-01,  4.0117e-01,
          6.4372e-01, -1.2341e-01,  5.2440e-01, -8.9162e-01, -5.6513e-01,
          3.0558e-01, -4.1909e-01,  1.4944e-01, -2.8861e-01, -1.4434e-01,
          1.6670e-01, -2.7394e-01,  9.1846e-01,  1.0344e-01,  6.8153e-02,
         -7.6606e-01, -8.3362e-02,  8.0751e-01, -1.5699e-01, -3.132

In [5]:
# Utility functions

# TODO: write a custom regex that extracts the text while skipping abbreviations such as "avv.", "n.", etc.

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Each page's text is followed by a newline. As a side effect the extracted text
    is also written to 'summary.txt' in the current working directory (overwriting
    any previous content).

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The concatenated page texts.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text (e.g. a scanned image) may give None
        # instead of a string; treat it as empty rather than failing on None + '\n'.
        page_texts = [(page.extract_text() or '') + '\n' for page in pdf.pages]
    # Join once instead of growing a string inside the loop.
    text = ''.join(page_texts)
    write_to_file('summary.txt', text)
    return text

def write_to_file(filename, content, encoding='utf-8'):
    """Write ``content`` to ``filename``, replacing any existing file.

    Args:
        filename: Destination path.
        content: Text to write.
        encoding: Text encoding used for the file. Defaults to UTF-8 so that
            accented characters are written the same way on every platform
            instead of depending on the locale's default encoding.
    """
    with open(filename, 'w', encoding=encoding) as f:
        f.write(content)

def read_from_file(filename, encoding='utf-8'):
    """Return the full text content of ``filename``.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the locale's default encoding.

    Returns:
        The decoded file content as a single string.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()
    
def split_text(text, pattern):
    """Split ``text`` on ``pattern`` and drop whatever precedes the first match.

    ``pattern`` is applied with ``re.MULTILINE``. When it contains a capturing
    group, the matched separators are kept in the result (standard ``re.split``
    behaviour), so the output alternates between separator and following text.

    Args:
        text: The string to split.
        pattern: Regular expression used as the separator.

    Returns:
        A list of non-empty parts. It is empty when ``text`` is empty or when
        ``pattern`` does not match anywhere in ``text``.
    """
    # re.split yields '' around adjacent/leading matches and None for capturing
    # groups that did not participate in a match; both are discarded here.
    parts = [part for part in re.split(pattern, text, flags=re.MULTILINE) if part]

    # Drop the leading chunk when it is not itself a separator (text before the first
    # match). The `parts and` guard avoids an IndexError on empty input.
    if parts and not re.match(pattern, parts[0]):
        parts = parts[1:]

    return parts


In [6]:
# Demo input: free text containing two "Law No. <n>" headings.
text2 = "Some introduction text. Law No. 123 is the first law. More text. Law No. 456 is the second law."

# The capturing group makes the matched headings appear in the split result.
pattern = r'(Law No\. \d+)'

# Print each resulting piece with a 1-based position.
for position, piece in enumerate(split_text(text2, pattern), start=1):
    print(f"Part {position}: {piece}")

# If you need to process the parts further, you can do so here.


Part 1: Law No. 123
Part 2:  is the first law. More text. 
Part 3: Law No. 456
Part 4:  is the second law.


In [7]:
# Second check: the text before the first "wow" is dropped, separators are kept.
wow_parts = split_text("something wow something wow asdfsadf wow sdfcsdcfsdcsadcsd", r'(wow)')
print(wow_parts)

['wow', ' something ', 'wow', ' asdfsadf ', 'wow', ' sdfcsdcfsdcsadcsd']


In [None]:
# Print the text of every exported section found in the saved Normattiva HTML page.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

# Read raw bytes and let BeautifulSoup detect the document encoding, instead of decoding
# with the platform's default text encoding.
with open('./Normattiva - Export.html', 'rb') as f:
    html_page = f.read()

soup = BeautifulSoup(html_page, 'html.parser')

# Each <div class="container ContentHtmlExp"> is printed as plain text, followed by a
# separator line.
for tag in soup.find_all('div', attrs={'class': 'container ContentHtmlExp'}):
    print(tag.text)
    print("------------------------------")

In [13]:
# Read the RTF file, convert it to text, and flatten its LIBRO > TITOLO > CAPO > Sezione >
# Art. hierarchy into one row per article (saved as CSV and shown as a DataFrame).

file_name = 'Codice di Procedura Civile'

# Load the raw RTF, strip the markup, and keep a plain-text copy next to the notebook.
rtf = read_from_file('./' + file_name + '.rtf')
text = rtf_to_text(rtf)
write_to_file(file_name + '.txt', text)

# Split by LIBRO (a "LIBRO ..." line plus the line that follows it).
# The pattern is a capturing group, so the two-argument split_text defined earlier in this
# notebook keeps every matched heading as its own list element. The loops below rely on
# that layout: the part at index i-1 is used as the heading of the body at index i.
libri = split_text(text, r'(LIBRO .+\n.+\n)')
# Dump every part (headings and bodies alike) to libro_<n>.txt for manual inspection.
for i, libro in enumerate(libri):
    write_to_file(f'libro_{i+1}.txt', libro)

# One dict per parsed article; converted to a DataFrame at the end of the cell.
data = []

# Number of parts that passed the length filter at each level (printed below).
contLibri = contTitoli = contCapitoli = contSezioni = contArrticoli = 0

# NOTE(review): at every level, split_text returns an empty list when the pattern does not
# match at all (the only part is dropped as "text before the first match"). A body without
# any TITOLO / CAPO / Sezione heading therefore contributes no articles -- verify against
# the source text whether articles are being lost this way.
for l, libro in enumerate(libri):
    # Parts shorter than 100 characters are assumed to be headings and skipped;
    # only the longer parts (bodies) are processed.
    if len(libro) < 100:
        continue
    contLibri+=1

    # Split by TITOLO (roman numeral line plus the following line)
    titoli = split_text(libro, r'(TITOLO [IVXL]+\n.+\n)')

    for t, titolo in enumerate(titoli):
        # Skip headings (same < 100 characters heuristic as above)
        if len(titolo) < 100:
            continue
        contTitoli+=1

        # Split by CAPO
        # NOTE(review): this pattern has a nested optional group "(.+)?". re.split emits
        # every group, so when that inner group matches, its text becomes an extra list
        # element and the capi[c-1] lookup below may no longer point at the full heading.
        capi = split_text(titolo, r'(^(.+)?CAPO [IVXL]+\n.+\n)')

        for c, capo in enumerate(capi):

            # Skip headings (same < 100 characters heuristic as above)
            if len(capo) < 100:
                continue
            contCapitoli+=1

            # Split by SEZIONE
            sezioni = split_text(capo, r'(Sezione [IVXL]+\n.+\n)')

            for s, sezione in enumerate(sezioni):

                # Skip headings (same < 100 characters heuristic as above)
                if len(sezione) < 100:
                    continue
                contSezioni+=1

                # Split by ARTICOLO
                # Each findall result is a tuple (number, text, ''): group 1 is the digits
                # after "Art. ", group 2 is everything up to the next literal "Art." or the
                # end of the section (re.DOTALL lets "." span newlines).
                # NOTE(review): "(\d+)\." only matches purely numeric article numbers, and
                # the lookahead stops at ANY "Art." occurrence, including one inside an
                # article's own text -- check both against the source document.
                #articoli = split_text(text=sezione, pattern=r'(^(.+)?Art\. \d\..+?\n.+\n(.+\n)?)')
                articoli = re.findall(r'Art\. (\d+)\.(.*?)((?=Art\.)|$)', sezione, re.DOTALL)

                for a, articolo in enumerate(articoli):
                    contArrticoli+=1

                    # First line of the article body is taken as its title, the remainder
                    # as its text.
                    articolo_split = articolo[1].strip().split('\n', 1)

                    numero_articolo = articolo[0].strip()
                    titolo_articolo = articolo_split[0].strip()
                    # Empty when the article consists of a single line.
                    testo_articolo = articolo_split[1].strip() if len(articolo_split) > 1 else ''      

                    # The heading of each level is read from the list element just before
                    # the current body (index - 1). Missing or 0/1-character title and text
                    # values are replaced by the literal 'PARSE ERROR'.
                    data.append({
                        'libro': libri[l-1].replace('\n', ' '),
                        'titolo': titoli[t-1].replace('\n', ' '),
                        'capo': capi[c-1].replace('\n', ' '),
                        'sezione': sezioni[s-1].replace('\n', ' '),
                        'numero_articolo': numero_articolo if len(numero_articolo) > 0 else "PARSE ERROR",
                        'titolo_articolo': titolo_articolo.replace('\n', ' ') if len(titolo_articolo) > 1 else 'PARSE ERROR',
                        'testo_articolo': testo_articolo.replace('\n', ' ') if len(testo_articolo) > 1 else "PARSE ERROR",
                    })


# Summary of how many bodies were visited at each level.
print(f"Libri: {contLibri}, Titoli: {contTitoli}, Capitoli: {contCapitoli}, Sezioni: {contSezioni}, Articoli: {contArrticoli}")


# Convert the list of dictionaries into a DataFrame and save it as CSV without the index
df = pd.DataFrame(data)
df.to_csv(file_name + '.csv', index=False)
df.head()


Libri: 4, Titoli: 25, Capitoli: 60, Sezioni: 57, Articoli: 450


Unnamed: 0,libro,titolo,capo,sezione,numero_articolo,titolo_articolo,testo_articolo
0,LIBRO PRIMO DISPOSIZIONI GENERALI,TITOLO I DEGLI ORGANI GIUDIZIARI,CAPO I Del giudice,Sezione I Della giurisdizione e della competen...,1,(Giurisdizione dei giudici ordinari).,"La giurisdizione civile, salvo speciali dispos..."
1,LIBRO PRIMO DISPOSIZIONI GENERALI,TITOLO I DEGLI ORGANI GIUDIZIARI,CAPO I Del giudice,Sezione I Della giurisdizione e della competen...,2,"((ARTICOLO ABROGATO DALLA L. 31 MAGGIO 1995, N...",PARSE ERROR
2,LIBRO PRIMO DISPOSIZIONI GENERALI,TITOLO I DEGLI ORGANI GIUDIZIARI,CAPO I Del giudice,Sezione I Della giurisdizione e della competen...,3,"((ARTICOLO ABROGATO DALLA L. 31 MAGGIO 1995, N...",PARSE ERROR
3,LIBRO PRIMO DISPOSIZIONI GENERALI,TITOLO I DEGLI ORGANI GIUDIZIARI,CAPO I Del giudice,Sezione I Della giurisdizione e della competen...,4,"((ARTICOLO ABROGATO DALLA L. 31 MAGGIO 1995, N...",PARSE ERROR
4,LIBRO PRIMO DISPOSIZIONI GENERALI,TITOLO I DEGLI ORGANI GIUDIZIARI,CAPO I Del giudice,Sezione I Della giurisdizione e della competen...,5,(Momento determinante della giurisdizione e de...,La giurisdizione e la competenza si determinan...


In [11]:
# Extract laws from entire text

def extract_laws(text):
    """Run the law-extraction chain on ``text`` and return its output.

    Args:
        text: Document text substituted for the ``{text}`` placeholder of the prompt,
            e.g. "La corte costituzionale ha dichiarato che la legge 122 e la
            legge 123 sono antinomie."

    Returns:
        Whatever ``chain.invoke`` returns for that input.
    """
    payload = {"text": text}
    return chain.invoke(payload)
    
# Run the extraction on the full text of every PDF in the configured folder.
# NOTE: the whole document is sent in a single request, so long PDFs can exceed the
# model's context window.
pdf_dir = os.environ["PDF_PATH"]
for filename in os.listdir(pdf_dir):
    # Ignore anything that is not a PDF.
    if not filename.endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_dir, filename)
    text = extract_text_from_pdf(pdf_path)
    laws_json = extract_laws(text)

    print(f"File: {filename}")
    print(f"Extracted JSON: {laws_json}")

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16384 tokens. However, your messages resulted in 18004 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [18]:
# Extract laws from summarized text

# Map step: prompt that summarises a single chunk of the document.
map_prompt_template = (
    "Summarize the following text:\n"
    "\n"
    "{text}\n"
    "\n"
    "Summary:"
)
map_prompt = PromptTemplate(input_variables=["text"], template=map_prompt_template)

# Reduce step: prompt that condenses the concatenated chunk summaries into one summary.
reduce_prompt_template = (
    "Summarize the following combined summaries:\n"
    "\n"
    "{text}\n"
    "\n"
    "Final Summary:"
)
reduce_prompt = PromptTemplate(input_variables=["text"], template=reduce_prompt_template)

# Function to split text into chunks
def split_text(text, pattern=None):
    """Split ``text`` either into size-bounded chunks or on a regular expression.

    This definition replaces any earlier ``split_text`` in the kernel. To keep
    two-argument calls working regardless of cell execution order, it supports
    both call styles:

    * ``split_text(text)`` -- chunk the text for summarisation. Returns the
      documents produced by ``RecursiveCharacterTextSplitter.create_documents``
      (chunk size 7000 characters, overlap 100).
    * ``split_text(text, pattern)`` -- regex split: split on ``pattern`` (with
      ``re.MULTILINE``), discard empty parts, and drop the text preceding the
      first match. Returns a list of strings.

    Args:
        text: The string to split.
        pattern: Optional regular expression. When given, regex mode is used.

    Returns:
        A list of documents (chunk mode) or a list of strings (regex mode).
    """
    if pattern is not None:
        # Regex mode: re.split may yield '' and None entries; discard them.
        parts = [part for part in re.split(pattern, text, flags=re.MULTILINE) if part]
        # Drop leading text that is not itself a separator; guard against empty input.
        if parts and not re.match(pattern, parts[0]):
            parts = parts[1:]
        return parts

    # Chunk mode: prefer sentence-final line breaks, but fall back to progressively finer
    # separators so that a passage without ".\n" cannot produce a chunk larger than
    # chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=[".\n", "\n", " ", ""],
        chunk_size=7000,
        chunk_overlap=100,
    )

    return text_splitter.create_documents([text])

# Function to map (summarize each chunk)
def map_summarize_chunks(chunks):
    """Summarise each chunk independently (the "map" step).

    Args:
        chunks: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A list with one summary per chunk, in input order, taken from the
        ``content`` attribute of each model reply.
    """
    map_chain = map_prompt | llm
    chunk_summaries = []
    for chunk in chunks:
        reply = map_chain.invoke({"text": chunk.page_content})
        chunk_summaries.append(reply.content)
    return chunk_summaries

# Function to reduce (summarize the combined summaries)
def reduce_summary(chunk_summaries):
    """Condense the per-chunk summaries into one final summary (the "reduce" step).

    Args:
        chunk_summaries: List of summary strings; they are joined with single spaces
            before being sent to the model.

    Returns:
        The ``content`` attribute of the model reply.
    """
    reduce_chain = reduce_prompt | llm
    reply = reduce_chain.invoke({"text": " ".join(chunk_summaries)})
    return reply.content

def process_pdf(filepath):
    """Extract, chunk and summarise one PDF.

    Every chunk is also written to ``chunk<i>.txt`` in the working directory.

    Args:
        filepath: Path of the PDF file to process.

    Returns:
        The final summary produced by the reduce step.
    """
    chunks = split_text(extract_text_from_pdf(filepath))
    # Persist each chunk's text under a 0-based file name.
    for index, chunk in enumerate(chunks):
        write_to_file(f'chunk{index}.txt', chunk.page_content)
    return reduce_summary(map_summarize_chunks(chunks))


# For every PDF in the configured folder: summarise it, then extract the laws from
# the summary.
pdf_dir = os.environ["PDF_PATH"]
for filename in os.listdir(pdf_dir):
    # Ignore anything that is not a PDF.
    if not filename.endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_dir, filename)
    final_summary = process_pdf(pdf_path)

    laws_json = extract_laws(final_summary)
    print(f"Extracted JSON: {laws_json}")
        

Extracted JSON: {
  "first_law": "",
  "second_law": "",
  "are_antinomial": 2
}


In [None]:
# Two placeholder articles used as input for the embedding models.
articles = [
    "Articolo 1: Testo del primo articolo di legge.",
    "Articolo 2: Testo del secondo articolo di legge.",
]

# Single-column frame with the article texts under 'text'.
df = pd.DataFrame({'text': articles})

def get_embeddings(texts, model_name):
    """Embed ``texts`` with a Hugging Face model using masked mean pooling.

    The tokenizer and model are loaded from ``model_name`` on every call. Inputs
    are padded to a common length and truncated to 512 tokens. The embedding of
    each text is the mean of the last hidden states over its real tokens only.

    Args:
        texts: List of strings to embed.
        model_name: Model identifier or local path passed to ``from_pretrained``.

    Returns:
        A tensor with one row per input text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # padding=True requires a pad token; tokenizers that do not define one reuse the
    # end-of-sequence token so batching does not fail.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Exclude padding positions from the average: a plain mean over dim=1 would mix
        # pad-token states into every text shorter than the longest one in the batch.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = (hidden * mask).sum(dim=1) / token_counts
    return embeddings

# Local model checkpoints to compare.
model_names = [
    '../Models/SaulLM-7B_model',
    '../Models/ChatLaw_model',
    '../Models/LLama3-8B_model',
    '../Models/MPT-7B_model',
    '../Models/Falcon-7B_model'
]

# Embed the same article texts with every model, keyed by model path.
article_texts = df['text'].tolist()
embeddings_dict = {}
for model_name in model_names:
    embeddings_dict[model_name] = get_embeddings(article_texts, model_name)

# Save each model's embeddings as "<model path>_embeddings.npy".
for model_name, embeddings in embeddings_dict.items():
    np.save(f"{model_name}_embeddings.npy", embeddings.numpy())
