In [18]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from datasets import load_dataset

In [19]:
# Download of the used dataset
ita_const = load_dataset("ItalianLegalNLP/ItalianConstitution", "constitution", split='it')
ita_const = ita_const.to_pandas()
ita_const.head()
ita_const.to_csv('ita_const_orig.csv', index=False)

In [20]:
# Remove the Nones in the article_commas field
def remove_none_values(d):
    if isinstance(d, dict):
        return {k: remove_none_values(v) for k, v in d.items() if v is not None}
    elif isinstance(d, list):
        return [remove_none_values(v) for v in d if v is not None]
    else:
        return d
    
for i in range(len(ita_const)):
    ita_const['article_commas'][i] = remove_none_values(ita_const['article_commas'][i])

ita_const.to_csv('ita_const.csv', index=False)

# Set a testing text
test_text = ["L'Italia è una Repubblica democratica, fondata sul lavoro. La sovranità appartiene al popolo, che la esercita nelle forme e nei limiti della Costituzione."]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  ita_const['article_commas'][i] = remove_none_values(ita_const['article_commas'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

In [21]:
# Use of hugging face models to get embeddings

def get_embeddings(texts, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

model_names = {
    "LegalBert": 'pile-of-law/legalbert-large-1.7M-2',
    "Multilingual Bert": 'google-bert/bert-base-multilingual-cased',
    "Roberta": 'FacebookAI/roberta-base',
    "Saul": 'Equall/Saul-7B-Instruct-v1',
    "ChatLaw": 'chestnutlzj/ChatLaw-Text2Vec',
    "Meta-Llama": 'meta-llama/Meta-Llama-3-8B',
    "MPT-7B": 'mosaicml/mpt-7b',
    "Falcon-7B": 'tiiuae/falcon-7b'
}

embeddings_dict = {}

for model_name, model_ in model_names.items():
    embeddings = get_embeddings(test_text, model_)
    embeddings_dict[model_name] = embeddings
    
    
print(embeddings_dict)



tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/238k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [16]:
# Use of OpenAI tiktoken to get embeddings
import tiktoken

encodings = {
    "gpt-4o" : "o200k_base",
    "gpt-3.5" : "cl100k_base"
}

for model_name, encoding_name in encodings.items():
    enc = tiktoken.get_encoding(encoding_name)
    embeddings = enc.encode(df['text'].tolist())
    np.save(f"{model_name}_embeddings.npy", embeddings)

#enc = tiktoken.encoding_for_model("gpt-4o")
enc1 = tiktoken.get_encoding("o200k_base") # GPT-4o
enc2 = tiktoken.get_encoding("cl100k_base") # GPT-3.5 GPT-4 and others

print(enc1.encode("hello world"))
print("----------------------")
print(enc2.encode("hello world"))

[24912, 2375]
----------------------
[15339, 1917]
