In [2]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download of the used dataset
ita_const = load_dataset("ItalianLegalNLP/ItalianConstitution", "constitution", split='it')
ita_const = ita_const.to_pandas()
ita_const.head()
ita_const.to_csv('ita_const_orig.csv', index=False)

In [4]:
# Remove the Nones in the article_commas field
def remove_none_values(d):
    if isinstance(d, dict):
        return {k: remove_none_values(v) for k, v in d.items() if v is not None}
    elif isinstance(d, list):
        return [remove_none_values(v) for v in d if v is not None]
    else:
        return d
    
for i in range(len(ita_const)):
    ita_const['article_commas'][i] = remove_none_values(ita_const['article_commas'][i])

ita_const.to_csv('ita_const.csv', index=False)

# Set a testing text
test_text = ["L'Italia è una Repubblica democratica, fondata sul lavoro. La sovranità appartiene al popolo, che la esercita nelle forme e nei limiti della Costituzione."]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  ita_const['article_commas'][i] = remove_none_values(ita_const['article_commas'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [5]:
# Use of hugging face models to get embeddings

def get_embeddings(texts, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# !!! For the models that don't have the padding token, try changing the padding parameter to False/None(automatic configuration) in the tokenizer function !!!

model_names = {
    "Falcon-7B": 'tiiuae/falcon-7b', #Modello interessante ma manca il padding token 
    "LegalBert": 'pile-of-law/legalbert-large-1.7M-2',
    "Multilingual Bert": 'google-bert/bert-base-multilingual-cased',
    "Roberta": 'FacebookAI/xlm-roberta-base',
    "Saul": 'Equall/Saul-7B-Instruct-v1', #Modello interessante ma manca il padding token
    "ChatLaw": 'chestnutlzj/ChatLaw-Text2Vec',
    "Meta-Llama": 'meta-llama/Meta-Llama-3-8B', #Modello interessante ma mi devono dare l'accesso alla repo
    "MPT-7B": 'mosaicml/mpt-7b', #Modello interessante ma manca il padding token
}

embeddings_dict = {}

for model_name, model_ in model_names.items():
    print(model_name + "...", end='')
    embeddings = get_embeddings(test_text, model_)
    embeddings_dict[model_name] = embeddings
    print("done!")
    
    
print(embeddings_dict)

Falcon-7B...

Loading checkpoint shards: 100%|██████████| 2/2 [01:11<00:00, 35.94s/it]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [7]:
from bs4 import BeautifulSoup


# Reading the data inside the xml
# file to a variable under the name 
# data
with open('./Test.xml', 'r') as f:
	data = f.read()

# Passing the stored data inside
# the beautifulsoup parser, storing
# the returned object 
Bs_data = BeautifulSoup(data, "xml")

# Finding all instances of tag 
# `unique`
b_unique = Bs_data.find_all('unique')

print("b_unique")
print(b_unique)

# Using find() to extract attributes 
# of the first instance of the tag
b_name = Bs_data.find('book')#, {'name':'Frank'})

print("b_name")
print(b_name)

# Extracting the data stored in a
# specific attribute of the 
# `child` tag
value = b_name.get('author')

print("value")
print(value)


FileNotFoundError: [Errno 2] No such file or directory: 'Codice penale.xml'

In [None]:
# Use of OpenAI tiktoken to get embeddings
# NOT USEFUL !

import tiktoken

encodings = {
    "gpt-4o" : "o200k_base",
    "gpt-3.5" : "cl100k_base"
}

for model_name, encoding_name in encodings.items():
    enc = tiktoken.get_encoding(encoding_name)
    embeddings = enc.encode(df['text'].tolist())
    np.save(f"{model_name}_embeddings.npy", embeddings)

#enc = tiktoken.encoding_for_model("gpt-4o")
enc1 = tiktoken.get_encoding("o200k_base") # GPT-4o
enc2 = tiktoken.get_encoding("cl100k_base") # GPT-3.5 GPT-4 and others

print(enc1.encode("hello world"))
print("----------------------")
print(enc2.encode("hello world"))

[24912, 2375]
----------------------
[15339, 1917]
