In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from datasets import load_dataset
import bitsandbytes
from huggingface_hub import notebook_login
import torch
import numpy as np
import pandas as pd
import os
import json
import re

In [2]:
os.environ["OPENAI_API_KEY"] = ""
os.environ["OPENAI_API_VERSION"] = "2024-02-01"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://mg-openai-adv.openai.azure.com/"
os.environ["PDF_PATH"] = "/home/utente/Scaricati/Legislative pdfs"

In [3]:
notebook_login()

comma = load_dataset("disi-unibo-nlp/COMMA", "it")

anto_ds = comma['train'].filter(lambda example : "abrogat" in example["full_text"].lower() or "antinom" in example["full_text"].lower())
anto_ds

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Dataset({
    features: ['id', 'ruling_type', 'epigraph', 'body', 'decision', 'maxims_text', 'maxims_title', 'full_text', 'num_maxims', 'maxims_len', 'full_text_len', 'judgment_type', 'constitutional_parameters', 'maxims'],
    num_rows: 1860
})

In [7]:
models = {
    #"Saul": {'model_name': 'Equall/Saul-7B-Instruct-v1', 'context_window': 1024, 'prompt_function': lambda system_prompt, user_prompt: f"<|system|>\n{system_prompt}|<user>|\n{user_prompt}\n|<assistant>|\n\n"}, #Modello addestrato su testi legali
    #"Llamantino": {'model_name': 'swap-uniba/LLaMAntino-2-7b-hf-dolly-ITA', 'context_window': 8000, 'prompt_function': lambda system_prompt, user_prompt: f"Di seguito è riportata un'istruzione che descrive un'attività, abbinata ad un input che fornisce ulteriore informazione.\nScrivi una risposta che soddisfi adeguatamente la richiesta.\n\n### Istruzione:\n{system_prompt}\n\n### Input:\n{user_prompt}\n\n### Risposta:\n"}, # Doesn't work with transformers
    "Meta-Llama": {'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct', 'context_window': 8000, 'quantization_type': 0, 'prompt_function': lambda system_prompt, user_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"},
    "Falcon-7B": {'model_name': 'tiiuae/falcon-7b-instruct', 'context_window': 512, 'quantization_type': 0, 'prompt_function': lambda system_prompt, user_prompt: f"User: {user_prompt}\nAssistant:{system_prompt}"},
    "Mixtral-8x22B": {'model_name': 'mistralai/Mixtral-8x22B-Instruct-v0.1', 'context_window': 1024, 'quantization_type': 1, 'prompt_function': lambda system_prompt, user_prompt: f"[INST] {system_prompt} {user_prompt}\n[/INST]"},
    "Mixtral-7B": {'model_name': 'mistralai/Mistral-7B-Instruct-v0.2', 'context_window': 32000, 'quantization_type': 1, 'prompt_function': lambda system_prompt, user_prompt: f"[INST] {system_prompt} {user_prompt}\n[/INST]"},
    #"Minerva-3B": {'model_name': 'sapienzanlp/Minerva-3B-base-v1.0', 'context_window': 512, 'prompt_function': lambda system_prompt, user_prompt: f"{system_prompt} {user_prompt}"}, # Modello italiano della Sapienza
    #"deepset/roberta-base-squad2" : {'model_name': 'deepset/roberta-base-squad2', 'context_window': 512}, # Modello per il question answering                                <|system|>{systemPrompt}<|end|><|user|>{userPrompt}<|end|><|assistant|>
    #"Phi-small" : {'model_name': 'microsoft/Phi-3-small-8k-instruct', 'context_window': 8000, 'quantization_type': 1, 'prompt_function': lambda system_prompt, user_prompt: f"<|system|>{system_prompt}<|end|><|<user>|{user_prompt}<|end|><|assistant|>"},
    #"Phi-small" : {'model_name': 'microsoft/Phi-3-small-128k-instruct', 'context_window': 128000, 'quantization_type': 1, 'prompt_function': lambda system_prompt, user_prompt: f"<|system|>{system_prompt}<|end|><|<user>|{user_prompt}<|end|><|assistant|>"},
    #"Phi-medium" : {'model_name': 'microsoft/Phi-3-medium-4k-instruct', 'context_window': 8000, 'quantization_type': 1, 'prompt_function': lambda system_prompt, user_prompt: f"<|system|>{system_prompt}<|end|><|<user>|{user_prompt}<|end|><|assistant|>"},
    #"Phi-medium-quantized" : {'model_name': 'kaitchup/Phi-3-medium-128k-instruct-awq-4bit', 'context_window': 8000, 'quantization_type': 0, 'prompt_function': lambda system_prompt, user_prompt: f"<|system|>{system_prompt}<|end|><|<user>|{user_prompt}<|end|><|assistant|>"},
}

def init_model(model_name, quantization_config=0):
    if quantization_config == 0:
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
        
        if "Phi" in model_name:
            model.trust_remote_code = True
            
    elif quantization_config == 1:
        bnb_config = BitsAndBytesConfig(
                                    load_in_4bit=True,
                                    #bnb_4bit_use_double_quant=True,
                                    bnb_4bit_quant_type="nf4",
                                    bnb_4bit_compute_dtype=torch.bfloat16,
                                )
        
        model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="cuda")
        
        if "Phi" in model_name:
            model.trust_remote_code = True
        
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    return model, tokenizer

def split_text(text, max_chunk_size=7000, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(separators=[
        "\n\n",
        "\n",
        ".",
    ],
    chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap)
    
    return text_splitter.split_text(text)

### Use open source llms to generate the tuples

In [5]:
valid_texts = np.array([])

for item in anto_ds["full_text"]:
    if any(word in item for word in ["antinomi", "antonim", "abroga"]):# and len(item) < 31000:
        valid_texts = np.append(valid_texts, item)
        
print(valid_texts.shape)
valid_texts[0]

(1860,)


'Nel giudizio di legittimità costituzionale degli artt. 55 e 56 della legge 12 febbraio 1968, n. 132 (legge ospedaliera), dell\'art. 132 del r.d. 4 febbraio 1915, n. 148 (nuovo testo unico della legge comunale e provinciale), e dell\'art. 81 del r.d. 5 febbraio 1891, n. 99 (approvazione del regolamento sulle istituzioni pubbliche di beneficenza), promosso con ordinanza emessa l\'8 giugno 1971 dalla Corte d\'appello di Milano nel procedimento civile vertente tra Marandola Paolo, Veltri Cornelio e Azzarelli Vittorio, iscritta al n. 426 del registro ordinanze 1971 e pubblicata nella Gazzetta Ufficiale della Repubblica n. 4 del 5 gennaio 1972. Visti gli atti di Costituzione di Marandola Paolo e Veltri Cornelio e d\'intervento del Presidente del Consiglio dei ministri; udito nell\'udienza pubblica del 17 ottobre 1973 il Giudice relatore Giulio Gionfrida; udito il sostituto avvocato generale dello Stato Michele Savarese, per il Presidente del Consiglio dei ministri. RITENUTO IN FATTO. 1. - I

In [8]:
results_os = []
for model_data_key in models.keys():
    model_data = models[model_data_key]
    model, tokenizer = init_model(model_data["model_name"], model_data["quantization_type"])
    
    for i, text in enumerate(valid_texts):
        if i > 10:
            break
        
        chunks = split_text(text, model_data["context_window"]*0.8)
        for chunk in chunks:
            inputs = tokenizer.encode(model_data["prompt_function"] (
                """I am going to give you a text, extract all pairs of antinomian laws as JSON like {"Law 1": "...", "Law 2": "...", "Are antinomial": true if they are, false otherwise} for each pair. Answer directly without any preamble or comment. Answer always using a correct Italian. DON'T PUT MORE THAN ONE PAIR IN A SINGLE JSON.""",
                chunk
            ), return_tensors="pt", truncation=True).to("cuda")
            
            outputs = model.generate(inputs, max_length=int(model_data["context_window"]*0.8), num_return_sequences=1)
            output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print([chunk, output])
            results_os.append([text, output])
                    
            del inputs, outputs, output
            torch.cuda.empty_cache()
        
    df = pd.DataFrame(results_os)
    df.to_csv(os.getcwd() + "/work/documents/COMMA_Extraction_" + model_data_key + ".csv", index=False)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100257 for open-end generation.
  x = [xi.to_sparse_csr() for xi in x]


OutOfResources: out of resource: shared memory, Required: 90112, Hardware limit: 65536. Reducing block sizes or `num_stages` may help.

### Generate the tuples with GPT APIs

In [None]:
# Set up the model and the prompt template through AzureChatOpenAI

# GPT3.5: mg-gpt-35-turbo-16k
# GPT4: mg-gpt-4-0613
llm = AzureChatOpenAI(deployment_name="mg-gpt-35-turbo-16k", temperature=0.7)

prompt_template = """
    I am going to give you a text, extract all pairs of antinomian laws as JSON like {{"Law 1": "...", "Law 2": "...", "Are antinomial": true if they are, false otherwise}} for each pair. Answer directly without any preamble or comment. Answer always using a correct Italian. DON'T PUT MORE THAN ONE PAIR IN A SINGLE JSON.
    {text}
    """

messages = [
    ("system", "You are a helpful assistant specialized in analyzing legal texts."),
    ("user", prompt_template)
]

prompt = ChatPromptTemplate.from_messages(messages)

chain = prompt | llm | StrOutputParser()

In [None]:
results_gpt = []

for i, text in enumerate(valid_texts):
    if i > 10:
        break
    chunks = split_text(text, int(16000*0.8))
    
    for chunk in chunks:
        output = chain.invoke({"text": chunk })
        
        results_gpt.append([chunk, output])        

In [None]:
def fix_json(json_like_str):
    json_like_str = json_like_str.replace("'", "\"")
    json_like_str = json_like_str.replace("""{"Law 1": "Reference to law 1", "Law 2": "Reference to law 2", "Are antinomial": True if they are, False otherwise}""", "")
    json_like_str = json_like_str.replace("True", "true")
    json_like_str = json_like_str.replace("False", "false")
    return json_like_str

json_pattern = re.compile(r'\{.*?\}')
true_count = 0
results = []

for text, output in results_gpt:
    json_matches = json_pattern.findall(output)
    print(output)
    for json_str in json_matches:
        json_str = fix_json(json_str)
        
        #print(">>>>>>\nOriginal JSON:\n", json_str)
        
        try:
            json_obj = json.loads(json_str)
            results.append(json_obj)
            true_count += 1 if json_obj["Are antinomial"] == True else 0
            print("Extracted JSON:", json_obj)
        except json.JSONDecodeError as e:
            print(json_str)
            print("Failed to decode JSON:", e, ) 
            
print(f"True count: {true_count}")

df = pd.DataFrame(results)
df.to_csv(os.getcwd() + "/work/documents/COMMA_Extraction_GPT35.csv", index=False)

### Analyzing the results

In [None]:
for result in results:    
    print(f"Input: {result[0]}\n\nOutput: {result[1]}\n\n")

In [None]:
llama_results = results

In [None]:
df = pd.DataFrame(llama_results, columns=["Llama Input", "Llama Output"])

# add to the dataframe also the columns of the mixtral results
df["Mixtral Input"] = [result[1] for result in mixtral_results]
df["Mixtral Output"] = [result[1] for result in mixtral_results]
