In [None]:
import json
import os
import re
from typing import Any

import pandas as pd
import PyPDF2
import torch
from langchain import PromptTemplate
from langchain.chains import Chain, SimpleSequentialChain, SequentialChain
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

In [None]:
# Utility functions

# TODO: write a custom regex that extracts the text while skipping abbreviations such as "avv.", "n.", etc.

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding each page's extracted text, with nothing inserted
        between consecutive pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Joined inside the `with` block, so extraction runs while the file is open.
        return "".join(page.extract_text() for page in pages)

def write_to_file(filename, content, encoding="utf-8"):
    """Write ``content`` to ``filename``, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        content: Text to write.
        encoding: Text encoding of the file. Defaults to UTF-8 so that accented
            characters are written the same way regardless of the platform
            locale.
    """
    with open(filename, 'w', encoding=encoding) as f:
        f.write(content)

def read_from_file(filename, encoding="utf-8"):
    """Return the whole content of a text file.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 so that accented
            characters are decoded the same way regardless of the platform
            locale.

    Returns:
        The file content as a single string.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()
    
def split_text(text, pattern):
    """Split ``text`` on a regex and drop any leading part that does not match it.

    Note: a later cell of this notebook defines another ``split_text`` with a
    different signature (chunk size / overlap); whichever cell ran last wins.

    Args:
        text: The text to split.
        pattern: Regular expression passed to ``re.split`` with
            ``re.MULTILINE``.

    Returns:
        The non-empty parts produced by the split. If the first part does not
        match ``pattern`` at its start it is discarded. An empty list is
        returned when the split yields no non-empty part (e.g. empty text).
    """
    parts = [part for part in re.split(pattern, text, flags=re.MULTILINE) if part]

    # Guard against an empty result before looking at the first part.
    if parts and not re.match(pattern, parts[0]):
        parts = parts[1:]

    return parts

# Candidate models, keyed by a short label. Each entry stores the Hugging Face
# checkpoint id ("model") and the "context_window" value recorded for it.
model_names = {
    "LegalBert": {  # Very fast model trained on legal texts
        "model": "pile-of-law/legalbert-large-1.7M-2",
        "context_window": 512,
    },
    "Saul": {  # Model trained on legal texts
        "model": "Equall/Saul-7B-Instruct-v1",
        "context_window": 1024,
    },
    "Meta-Llama": {
        "model": "meta-llama/Meta-Llama-3-8B",
        "context_window": 2048,
    },
    "Falcon-7B": {
        "model": "tiiuae/falcon-7b",
        "context_window": 512,
    },
    "Mixtral-8x22B": {
        "model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
        "context_window": 1024,
    },
    "Minerva-3B": {  # Italian model from Sapienza
        "model": "sapienzanlp/Minerva-3B-base-v1.0",
        "context_window": 512,
    },
}

In [None]:
# Extract questions from the PDF with quiz

def extract_questions_answers(text):
    """Parse numbered multiple-choice questions out of the quiz text.

    A question is expected in the form::

        <number>. <question text>
        A) <first answer>
        B) <second answer>
        C) <third answer>

    The question and the A/B answers may span several lines; the C answer ends
    at the first newline or at the end of the text, so the last question is
    kept even when the text has no trailing newline.

    Args:
        text: Raw text of the quiz.

    Returns:
        A list of ``[question, answer_a, answer_b, answer_c]`` lists, with
        newlines replaced by spaces and the ``A) `` / ``B) `` / ``C) `` labels
        removed.
    """
    qa_pattern = re.compile(
        r'(\d+)\. (.+?)\n(A\) .+?)\n(B\) .+?)\n(C\) .+?)(?:\n|\Z)',
        re.DOTALL,
    )
    label_length = len('A) ')  # every label ("A) ", "B) ", "C) ") has this length

    def _clean(raw, skip=0):
        # Drop only the leading label: the same characters may legitimately
        # appear again inside the answer text and must be preserved.
        return raw[skip:].replace('\n', ' ')

    data = []
    for _number, question, answer_a, answer_b, answer_c in qa_pattern.findall(text):
        data.append([
            _clean(question),
            _clean(answer_a, label_length),
            _clean(answer_b, label_length),
            _clean(answer_c, label_length),
        ])

    return data


# Location of the quiz PDF. Set the QUIZ_PDF_PATH environment variable to run
# the notebook on another machine without editing this cell.
quiz_pdf_path = os.environ.get(
    "QUIZ_PDF_PATH",
    "/home/utente/Downloads/03_diritto_amministrativo.pdf",
)

text = extract_text_from_pdf(quiz_pdf_path)
qa_data = extract_questions_answers(text)

# One row per parsed question, saved for the later merge step
df_pdf = pd.DataFrame(qa_data, columns=['Domanda', 'Risposta 1', 'Risposta 2', 'Risposta 3'])
df_pdf.to_csv('quiz_pdf.csv', index=False)

df_pdf.head()


: 

In [None]:
# Extract questions from the JSON file

def _quiz_row(item):
    """Flatten one question object into a row of the quiz table.

    ``item`` must provide ``'question'`` and ``'answers'``; each answer provides
    ``'answer'`` and a truthy/falsy ``'right'`` flag. The right answer goes in
    'Risposta 1' and the first two wrong answers in 'Risposta 2' / 'Risposta 3'.
    """
    right_answer = None
    wrong_answers = []
    for answer in item['answers']:
        if answer['right']:
            right_answer = answer['answer']
        else:
            wrong_answers.append(answer['answer'])

    # Pad with None so a question with fewer than two wrong answers still
    # produces a row instead of raising IndexError.
    wrong_answers += [None] * (2 - len(wrong_answers))

    return {
        'Domanda': item['question'],
        'Risposta 1': right_answer,
        'Risposta 2': wrong_answers[0],
        'Risposta 3': wrong_answers[1],
    }


# Location of the quiz JSON. Set the QUIZ_JSON_PATH environment variable to run
# the notebook on another machine without editing this cell.
quiz_json_path = os.environ.get("QUIZ_JSON_PATH", '/home/utente/Downloads/domande.json')

with open(quiz_json_path, 'r', encoding='utf-8') as file:
    quiz_items = json.load(file)

df_json = pd.DataFrame(
    [_quiz_row(item) for item in quiz_items],
    columns=['Domanda', 'Risposta 1', 'Risposta 2', 'Risposta 3'],
)

df_json.to_csv('quiz_json.csv', index=False)

df_json.head()


: 

In [23]:
# Merge the two dataframes and clean up the data

df_merged = pd.concat([df_pdf, df_json], ignore_index=True)
df_merged.to_csv('quiz_merged_orig.csv', index=False)

# Keep only the questions that contain a digit (a law reference).
# na=False treats a missing question as "no digit", so the row is dropped
# instead of raising on a non-string value.
has_digit = df_merged['Domanda'].str.contains(r'\d', regex=True, na=False)
df_merged = df_merged[has_digit].drop_duplicates(subset='Domanda')

# Add an index column at the beginning of the dataframe
df_merged.insert(0, 'Index', range(1, 1 + len(df_merged)))

df_merged.to_csv('quiz_merged.csv', index=False)
df_merged.head()

Unnamed: 0,Domanda,Risposta 1,Risposta 2,Risposta 3
1,Il Capo II della l. n. 241/1990 è riservato al...,"Accerta d'ufficio i fatti, disponendo il compi...",Non è mai competente alla valutazione della su...,É solo competente all'indizione delle conferen...
3,Ai sensi dell'art. 80 C.p.a. come avviene la p...,Deve essere presentata istanza di fissazione d...,Deve essere presentata istanza di fissazione d...,Deve essere presentato nuovamente il ricorso e...
4,Entro quale termine le parti devono proporre r...,Nel termine di sessanta giorni decorrente dall...,Nel termine di novanta giorni decorrente dalla...,Nel termine di centoventi giorni decorrente da...
7,A norma di quanto dispone l'art. 133 del C.p.a...,Alla giurisdizione esclusiva del giudice ammin...,Alla giurisdizione esclusiva del giudice ordin...,Alla giurisdizione esclusiva del TAR del Lazio.
8,Consacrando a livello costituzionale i princip...,"Moneta, tutela del risparmio e mercati finanzi...",Tutela e sicurezza del lavoro.,"Produzione, trasporto e distribuzione nazional..."


In [3]:
# Extract laws from a question (Mixtral-8x22B)
def ask_question(model, question, answers, tools=None, max_new_tokens=20):
    """Ask a causal language model to list the laws mentioned in a quiz question.

    The tokenizer and the model are both loaded from ``model`` on every call.

    Args:
        model: Hugging Face checkpoint id used for the tokenizer and the model.
        question: Text of the quiz question; appended to the instruction prompt.
        answers: Content placed in the assistant turn of the conversation.
            NOTE(review): presumably the answer options of the question;
            confirm that sending them as an assistant turn is intended.
        tools: Optional tool definitions. When given, the tokenizer's
            ``tool_use`` chat template is applied with them; otherwise the
            tokenizer's default chat template is used.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded output sequence (it is also printed).
    """
    basic_prompt = "Sto per mostrarti una domanda a carattere legislativo, ricerca tutte le leggi all'interno della domanda e genera un JSON con i seguenti campi per ogni legge trovata {id_domanda,  numero della legge, testo della legge, link alla pagina web}"

    conversation = [
        # Blank line so the question does not run straight into the instruction.
        {"role": "user", "content": basic_prompt + "\n\n" + question},
        {"role": "assistant", "content": answers}
    ]

    template_options = {"tokenize": False, "add_generation_prompt": True}
    if tools is not None:
        # Only request the tool-use template when there are tools to describe.
        template_options.update(chat_template="tool_use", tools=tools)

    tokenizer = AutoTokenizer.from_pretrained(model)
    prompt = tokenizer.apply_chat_template(conversation, **template_options)

    # Load the checkpoint the caller asked for, matching the tokenizer above.
    llm = AutoModelForCausalLM.from_pretrained(model, device_map="cuda")

    # The inputs must live on the same device as the model weights.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)

    outputs = llm.generate(**inputs, max_new_tokens=max_new_tokens)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(generated)

    return generated


Downloading shards:  20%|██        | 12/59 [2:04:09<7:54:10, 605.34s/it] 

In [3]:
# Extract laws from a question (other models)
def extract_laws_from_question(model, question):
    """Run a question-answering pipeline on ``question`` and return its answer.

    Args:
        model: Hugging Face checkpoint id used for both model and tokenizer.
        question: Text passed as the pipeline's ``question`` argument.

    Returns:
        The ``"answer"`` field of the pipeline result, or the string
        ``"An error occurred: <message>"`` when loading or running the
        pipeline fails. Errors are reported and returned rather than raised so
        that one failing model does not stop a batch over several models.
    """
    try:
        # Load the pipeline for question-answering
        qa_pipeline = pipeline("question-answering", model=model, tokenizer=model)
        # NOTE(review): a question-answering pipeline picks its answer out of
        # `context`; here `context` is the fixed instruction text and the quiz
        # question is sent as `question` — confirm this is the intended way round.
        answer = qa_pipeline(question=question, context="I am going to give you a quiz question about the legislative world, from that extract all the laws and generate a JSON with the following fields for each law found {law number, law text, link to the web page}")
        print(f"answer: {answer}")
        return answer["answer"]
    except Exception as e:
        # Report against the `model` argument: no other name is guaranteed to
        # exist when this function is called.
        print(f"An error occurred with model {model}: {str(e)}")
        return f"An error occurred: {str(e)}"

df = pd.read_csv('data.csv')

# (model checkpoint id, question row index) -> text returned by the model
extracted_laws = {}

for index, row in df.iterrows():
    print(f"Index: {index}")
    print(f"question: {row['Domanda']}, answer_1: {row['Risposta 1']}, answer_2: {row['Risposta 2']}, answer_3: {row['Risposta 3']}")

    for model_name, model_data in model_names.items():
        print(f"{model_name}: {row['Domanda']}")
        # Send this row's question to the model (not a fixed placeholder).
        extracted_laws[model_data["model"], index] = extract_laws_from_question(model_data["model"], row['Domanda'])

# Convert the dictionary to a DataFrame: one row per (model, question) pair.
# extract_laws_from_question returns a single string, so it is stored as-is in
# one column rather than being spread over per-law fields.
df_extracted_laws = pd.DataFrame(
    [
        {'Model': model_id, 'Question Index': question_index, 'Extracted Laws': laws}
        for (model_id, question_index), laws in extracted_laws.items()
    ],
    columns=['Model', 'Question Index', 'Extracted Laws'],
)
        

An error occurred with model Meta-Llama: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.
Meta-Llama: An error occurred: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.


In [None]:
# Extract different laws from .rtf, .xml or pdf files from Normattiva

# Function to split text into chunks
def split_text(text, max_chunk_size=7000, chunk_overlap=100):
    """Split ``text`` into documents with a ``RecursiveCharacterTextSplitter``.

    Note: this definition replaces the regex-based ``split_text(text, pattern)``
    defined in an earlier cell of this notebook.

    Args:
        text: The text to split.
        max_chunk_size: Passed to the splitter as ``chunk_size``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``create_documents([text])`` on the splitter.
    """
    splitter_options = {
        "separators": [".\n"],  # the only separator given to the splitter
        "chunk_size": max_chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

# Function to map (summarize each chunk)
def map_summarize_chunks(chunks):
    """Summarize each chunk on its own (the "map" step).

    Uses the module-level ``map_prompt`` and ``llm``, composed as
    ``map_prompt | llm``.

    Args:
        chunks: Iterable of objects exposing the chunk text as
            ``page_content``.

    Returns:
        A list with one summary per chunk, in the same order.
    """
    map_chain = map_prompt | llm
    chunk_summaries = []
    for chunk in chunks:
        result = map_chain.invoke({"text": chunk.page_content})
        # Accept both a message-like result exposing `.content` and a result
        # that is already the plain text.
        chunk_summaries.append(getattr(result, "content", result))
    return chunk_summaries

# Function to reduce (summarize the combined summaries)
def reduce_summary(chunk_summaries):
    """Summarize the per-chunk summaries into one text (the "reduce" step).

    Uses the module-level ``reduce_prompt`` and ``llm``, composed as
    ``reduce_prompt | llm``.

    Args:
        chunk_summaries: Iterable of summary strings; they are joined with a
            single space before being sent to the model.

    Returns:
        The final summary text.
    """
    combined_text = " ".join(chunk_summaries)
    reduce_chain = reduce_prompt | llm
    result = reduce_chain.invoke({"text": combined_text})
    # Accept both a message-like result exposing `.content` and a result that
    # is already the plain text.
    return getattr(result, "content", result)

def process_pdf(filepath):
    """Summarize one PDF: extract its text, chunk it, then map and reduce.

    Side effect: each chunk is also written to ``chunk<i>.txt`` (relative
    path), where ``i`` is the chunk's position.

    Args:
        filepath: Path of the PDF to summarize.

    Returns:
        The value returned by ``reduce_summary`` for the chunk summaries.
    """
    chunks = split_text(extract_text_from_pdf(filepath))

    # Dump the chunks before summarizing them
    for position, chunk in enumerate(chunks):
        write_to_file(f'chunk{position}.txt', chunk.page_content)

    return reduce_summary(map_summarize_chunks(chunks))

# Default templates
# Map step: prompt used to summarize a single chunk.
map_prompt_template = (
    "Summarize the following text:\n"
    "\n"
    "{text}\n"
    "\n"
    "Summary:"
)
map_prompt = PromptTemplate(input_variables=["text"], template=map_prompt_template)

# Reduce step: prompt used to summarize the combined chunk summaries.
reduce_prompt_template = (
    "Summarize the following combined summaries:\n"
    "\n"
    "{text}\n"
    "\n"
    "Final Summary:"
)

# Model, tokenizer and text-generation pipeline built from one checkpoint id
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

generator = pipeline('text-generation', tokenizer=tokenizer, model=model)

class HuggingFaceTextGenerationChain(Chain):
    """Chain that sends its ``text`` input to a text-generation pipeline.

    Input key: ``text``. Output key: ``generated_text``.

    Args:
        generator: Callable text-generation pipeline. It is called with the
            input text and must return a list whose first item has a
            ``'generated_text'`` entry.
    """

    # Declared as a field because the LangChain `Chain` base class is a
    # pydantic model: attributes have to be declared, not assigned ad hoc.
    generator: Any = None

    def __init__(self, generator, **kwargs):
        # Keep the positional `generator` argument while routing it through
        # the base-class initializer as a declared field.
        super().__init__(generator=generator, **kwargs)

    @property
    def input_keys(self):
        """Names of the inputs this chain expects."""
        return ["text"]

    @property
    def output_keys(self):
        """Names of the outputs this chain produces."""
        return ["generated_text"]

    def _call(self, inputs, run_manager=None):
        """Generate text for ``inputs['text']``.

        Returns:
            ``{"generated_text": <first generated sequence>}``.
        """
        text = inputs['text']
        # max_new_tokens bounds only the generated part, so a long input text
        # does not use up the whole generation budget.
        generated_text = self.generator(text, max_new_tokens=100, num_return_sequences=1)[0]['generated_text']
        return {"generated_text": generated_text}

hugging_face_chain = HuggingFaceTextGenerationChain(generator)
chain = SimpleSequentialChain(chains=[hugging_face_chain])

# Use the chain
# A SimpleSequentialChain reads its single input from "input" and returns the
# last step's result under "output".
result = chain.invoke({"input": "Tell me a story about artificial intelligence."})
print(result['output'])


# Example other component
class DummyChain(Chain):
    """Example chain that upper-cases its input text.

    By default it reads and writes the ``text`` key; both key names can be
    changed through ``input_key`` / ``output_key``.
    """

    input_key: str = "text"
    output_key: str = "text"

    @property
    def input_keys(self):
        """Names of the inputs this chain expects."""
        return [self.input_key]

    @property
    def output_keys(self):
        """Names of the outputs this chain produces."""
        return [self.output_key]

    def _call(self, inputs, run_manager=None):
        """Return the upper-cased input under the output key."""
        text = inputs[self.input_key]
        # Perform some dummy operations
        processed_text = text.upper()
        return {self.output_key: processed_text}

# The dummy step reads "raw_text" and writes "text": a SequentialChain does not
# accept a step whose output key repeats a key that is already available.
dummy_chain = DummyChain(input_key="raw_text")
hugging_face_chain = HuggingFaceTextGenerationChain(generator)

# Combine chains
complex_chain = SequentialChain(
    chains=[dummy_chain, hugging_face_chain],
    input_variables=["raw_text"],
    output_variables=["generated_text"],
)

# Use the complex chain with an input
result = complex_chain.invoke({"raw_text": "Describe the future of technology."})
print(result['generated_text'])
    
# 4-bit NF4 quantization settings (double quantization, bfloat16 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# from_pretrained takes the checkpoint id (model_name), not the model object
# that was loaded earlier.
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, quantization_config=bnb_config, device_map="cuda")

# LLM used by map_summarize_chunks / reduce_summary through `prompt | llm`
SUMMARY_MAX_NEW_TOKENS = 256  # explicit cap on the tokens generated per call
summary_generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=SUMMARY_MAX_NEW_TOKENS,
)
llm = HuggingFacePipeline(pipeline=summary_generator)

reduce_prompt = PromptTemplate(template=reduce_prompt_template, input_variables=["text"])

path = "/home/utente/Desktop/Thesis/Documents/Downloaded"
# PDF_PATH overrides the default folder; falling back to `path` avoids a
# KeyError when the variable is not set.
pdf_dir = os.environ.get("PDF_PATH", path)

for filename in os.listdir(pdf_dir):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_dir, filename)
        final_summary = process_pdf(pdf_path)

        # `chain` is a SimpleSequentialChain: its single input key is "input"
        laws_json = chain.invoke({"input": final_summary})
        print(f"Extracted JSON: {laws_json}")