In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
from datasets import load_dataset
from datetime import datetime
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download of the used dataset
ita_const = load_dataset("ItalianLegalNLP/ItalianConstitution", "constitution", split='it')
ita_const = ita_const.to_pandas()
ita_const.head()
ita_const.to_csv('ita_const_orig.csv', index=False)

In [3]:
# Remove the Nones in the article_commas field
def remove_none_values(d):
    if isinstance(d, dict):
        return {k: remove_none_values(v) for k, v in d.items() if v is not None}
    elif isinstance(d, list):
        return [remove_none_values(v) for v in d if v is not None]
    else:
        return d
    
for i in range(len(ita_const)):
    ita_const['article_commas'][i] = remove_none_values(ita_const['article_commas'][i])

ita_const.to_csv('ita_const.csv', index=False)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  ita_const['article_commas'][i] = remove_none_values(ita_const['article_commas'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [10]:
# Set a testing text
test_text = [ "L'Italia è una Repubblica democratica, fondata sul lavoro. La sovranità appartiene al popolo, che la esercita nelle forme e nei limiti della Costituzione.", 
             "La democrazia italiana si basa sul lavoro e sul potere del popolo.",
             "In Italia, il popolo esercita la sovranità nei limiti stabiliti dalla Costituzione.",
             "L'Italia è una nazione dove i diritti dei lavoratori sono fondamentali.",
             "L'Italia è governata da una monarchia autoritaria, non dal popolo.",
             "In Italia, il potere è nelle mani di pochi, ignorando la volontà del popolo.",
             "La sovranità in Italia è detenuta esclusivamente dal governo, senza alcun limite." ]

# Use of hugging face models to get embeddings 

def get_embeddings(texts, model, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model)
    
    bnb_config = BitsAndBytesConfig(
                                load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.bfloat16,
                               )
    model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True, quantization_config=bnb_config, device_map="cuda")
    
    embeddings = {}
    
    for i, text in enumerate(texts):        
        #inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        inputs = inputs.to(device='cuda')
        
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings[f"{model_name}-{i}"] = outputs.logits[-1].mean(dim=1)
    return embeddings

def save_embeddings(file_name, embeddings):
    for embedding_name, embedding in embeddings.items():
        print(f"Embeddings: {embedding}")
    
        with open( "~/work/embeddings/" + embedding_name + '.pkl', 'wb') as f:
            pickle.dump(embedding, f)
        
def retrieve_embeddings(file_name):
    with open(file_name + '.pkl', 'rb') as f:
        loaded_embeddings = pickle.load(f)
    return loaded_embeddings
    

# !!! For the models that don't have the padding token, try changing the padding parameter to False/None(automatic configuration) in the tokenizer function !!!

model_names = {
    "LegalBert": 'pile-of-law/legalbert-large-1.7M-2',
    "Multilingual Bert": 'google-bert/bert-base-multilingual-cased',
    "Roberta": 'FacebookAI/xlm-roberta-base',
    "Saul": 'Equall/Saul-7B-Instruct-v1', #Modello interessante ma manca il padding token
    "ChatLaw": 'chestnutlzj/ChatLaw-Text2Vec',
    "Meta-Llama": 'meta-llama/Meta-Llama-3-8B', #Modello interessante ma mi devono dare l'accesso alla repo
    "MPT-7B": 'mosaicml/mpt-7b', #Modello interessante ma manca il padding token
    "Falcon-7B": 'tiiuae/falcon-7b', #Modello interessante ma manca il padding token 
}

bert_modes = {
    "LegalBert": 'pile-of-law/legalbert-large-1.7M-2',
    "Multilingual Bert": 'google-bert/bert-base-multilingual-cased',
    "Roberta": 'FacebookAI/xlm-roberta-base',
}

non_bert_models = {
    "Saul": 'Equall/Saul-7B-Instruct-v1',
    "Meta-Llama": 'meta-llama/Meta-Llama-3-8B',
    "MPT-7B": 'mosaicml/mpt-7b',
    "Falcon-7B": 'tiiuae/falcon-7b',
}

embeddings_dict = {}

print("Tokenizing:")
for sentence in test_text:
    print(f"  - {sentence}")
ok = False
for model_name, model_ in model_names.items():
    if model_name != 'MPT-7B':
        continue
    print(model_name + "...", end='')
    start_time = datetime.now()
    save_embeddings(f"{model_name}", get_embeddings(test_text, model_, model_name))
    
    end_time = datetime.now()
    print(f"done in {(end_time - start_time).total_seconds()} seconds!")

"""
for i, sentence in enumerate(test_text):
    for model_name, model_ in model_names.items():
        print(model_name + "...", end='')
        start_time = datetime.now()
        
        embeddings = get_embeddings(sentence, model_)
        embeddings_dict[model_name, i] = embeddings
        
        end_time = datetime.now()
        print(f"done in {(end_time - start_time).total_seconds()} seconds!")
        
    
print(embeddings_dict)"""

Tokenizing:
  - L'Italia è una Repubblica democratica, fondata sul lavoro. La sovranità appartiene al popolo, che la esercita nelle forme e nei limiti della Costituzione.
  - La democrazia italiana si basa sul lavoro e sul potere del popolo.
  - In Italia, il popolo esercita la sovranità nei limiti stabiliti dalla Costituzione.
  - L'Italia è una nazione dove i diritti dei lavoratori sono fondamentali.
  - L'Italia è governata da una monarchia autoritaria, non dal popolo.
  - In Italia, il potere è nelle mani di pochi, ignorando la volontà del popolo.
  - La sovranità in Italia è detenuta esclusivamente dal governo, senza alcun limite.
MPT-7B...

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.44s/it]


Embeddings: tensor([23.8906,  9.3672,  6.1055, 17.0000, 17.1875, 12.9609,  4.1211,  8.5000,
         9.4922, 15.9297,  8.8516, 12.2578, 15.5391,  6.2266, 16.8438,  8.5391,
         6.8125, 17.7656, 27.2188, 13.8438,  4.3633,  0.6367,  3.2246, 13.0938,
         2.5195,  4.7969,  5.0938, 18.0938,  7.2109,  8.0703, 12.5156, 19.2969,
        14.5000, 12.1328,  3.3301,  1.7109, 14.7969,  9.5234,  3.8984,  4.4883,
        12.1172, 12.8984,  8.3516,  4.4688,  3.7676, 12.7344,  6.8320,  1.9961,
         1.9082,  3.9434, 12.0391, 27.6562], device='cuda:0',
       dtype=torch.float16)


FileNotFoundError: [Errno 2] No such file or directory: '~/work/embeddings/MPT-7B-0.pkl'

In [11]:
# Check the distance between the embeddings

sentence_orig = retrieve_embeddings("LegalBert-0")
sentence_pos_1 = retrieve_embeddings("LegalBert-1")
sentence_pos_2 = retrieve_embeddings("LegalBert-2")
sentence_pos_3 = retrieve_embeddings("LegalBert-3")
sentence_neg_1 = retrieve_embeddings("LegalBert-4")
sentence_neg_2 = retrieve_embeddings("LegalBert-5")
sentence_neg_3 = retrieve_embeddings("LegalBert-6")

print(sentence_orig)

embedding_orig = torch.tensor(sentence_orig, dtype=torch.float32)
embedding_pos_1 = torch.tensor(sentence_pos_1, dtype=torch.float32)
embedding_pos_2 = torch.tensor(sentence_pos_2, dtype=torch.float32)
embedding_pos_3 = torch.tensor(sentence_pos_3, dtype=torch.float32)
embedding_neg_1 = torch.tensor(sentence_neg_1, dtype=torch.float32)
embedding_neg_2 = torch.tensor(sentence_neg_2, dtype=torch.float32)
embedding_neg_3 = torch.tensor(sentence_neg_3, dtype=torch.float32)

# Reshape the tensors to 2D (1, -1) to use with F.cosine_similarity
embedding_orig_reshaped = embedding_orig.unsqueeze(0)
embedding_pos_1_reshaped = embedding_pos_1.unsqueeze(0)
embedding_pos_2_reshaped = embedding_pos_2.unsqueeze(0)
embedding_pos_3_reshaped = embedding_pos_3.unsqueeze(0)
embedding_neg_1_reshaped = embedding_neg_1.unsqueeze(0)
embedding_neg_2_reshaped = embedding_neg_2.unsqueeze(0)
embedding_neg_3_reshaped = embedding_neg_3.unsqueeze(0)

similarity_orig_pos_1 = F.cosine_similarity(embedding_orig_reshaped, embedding_pos_1_reshaped)
similarity_orig_pos_2 = F.cosine_similarity(embedding_orig_reshaped, embedding_pos_2_reshaped)
similarity_orig_pos_3 = F.cosine_similarity(embedding_orig_reshaped, embedding_pos_3_reshaped)
similarity_orig_neg_1 = F.cosine_similarity(embedding_orig_reshaped, embedding_neg_1_reshaped)
similarity_orig_neg_2 = F.cosine_similarity(embedding_orig_reshaped, embedding_neg_2_reshaped)
similarity_orig_neg_3 = F.cosine_similarity(embedding_orig_reshaped, embedding_neg_3_reshaped)

print(similarity_orig_pos_1)
print("Cosine Similarity with pos_1:", similarity_orig_pos_1.item())
print("Cosine Similarity with pos_2:", similarity_orig_pos_2.item())
print("Cosine Similarity with pos_3:", similarity_orig_pos_3.item())
print("Cosine Similarity with neg_1:", similarity_orig_neg_1.item())
print("Cosine Similarity with neg_2:", similarity_orig_neg_2.item())
print("Cosine Similarity with neg_3:", similarity_orig_neg_3.item())

tensor([[-0.1121, -0.0832, -1.0378,  ...,  0.6271, -0.3492, -0.1108]],
       device='cuda:0')


  embedding_orig = torch.tensor(sentence_orig, dtype=torch.float32)
  embedding_pos_1 = torch.tensor(sentence_pos_1, dtype=torch.float32)
  embedding_pos_2 = torch.tensor(sentence_pos_2, dtype=torch.float32)
  embedding_pos_3 = torch.tensor(sentence_pos_3, dtype=torch.float32)
  embedding_neg_1 = torch.tensor(sentence_neg_1, dtype=torch.float32)
  embedding_neg_2 = torch.tensor(sentence_neg_2, dtype=torch.float32)
  embedding_neg_3 = torch.tensor(sentence_neg_3, dtype=torch.float32)


RuntimeError: a Tensor with 1024 elements cannot be converted to Scalar

In [7]:
from bs4 import BeautifulSoup


# Reading the data inside the xml
# file to a variable under the name 
# data
with open('./Test.xml', 'r') as f:
	data = f.read()

# Passing the stored data inside
# the beautifulsoup parser, storing
# the returned object 
Bs_data = BeautifulSoup(data, "xml")

# Finding all instances of tag 
# `unique`
b_unique = Bs_data.find_all('unique')

print("b_unique")
print(b_unique)

# Using find() to extract attributes 
# of the first instance of the tag
b_name = Bs_data.find('book')#, {'name':'Frank'})

print("b_name")
print(b_name)

# Extracting the data stored in a
# specific attribute of the 
# `child` tag
value = b_name.get('author')

print("value")
print(value)


FileNotFoundError: [Errno 2] No such file or directory: 'Codice penale.xml'

In [None]:
# Use of OpenAI tiktoken to get embeddings
# NOT USEFUL !

import tiktoken

encodings = {
    "gpt-4o" : "o200k_base",
    "gpt-3.5" : "cl100k_base"
}

for model_name, encoding_name in encodings.items():
    enc = tiktoken.get_encoding(encoding_name)
    embeddings = enc.encode(df['text'].tolist())
    np.save(f"{model_name}_embeddings.npy", embeddings)

#enc = tiktoken.encoding_for_model("gpt-4o")
enc1 = tiktoken.get_encoding("o200k_base") # GPT-4o
enc2 = tiktoken.get_encoding("cl100k_base") # GPT-3.5 GPT-4 and others

print(enc1.encode("hello world"))
print("----------------------")
print(enc2.encode("hello world"))

[24912, 2375]
----------------------
[15339, 1917]
