In [1]:
from datasets import Dataset
import json

label_to_id = {"TITLE": 0, "PERSON": 1, "DATE": 2}


# Charger les données depuis le fichier JSON
with open("generated_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)


dataset = Dataset.from_dict(data)
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'labels'],
    num_rows: 100
})


In [2]:
from transformers import AutoTokenizer

# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

text = "Contrat de partenariat signé par Sophie Leclerc le 5 août 2019."

tokenized_inputs = tokenizer(text, return_offsets_mapping=True, truncation=True, padding=True)

# Récupérer les tokens et leurs offsets
tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"])
offsets = tokenized_inputs["offset_mapping"]

print("Tokens:", tokens)
print("Offsets:", offsets)

Tokens: ['<s>', '▁Contrat', '▁de', '▁partenariat', '▁signé', '▁par', '▁Sophie', '▁Leclerc', '▁le', '▁5', '▁août', '▁2019.', '</s>']
Offsets: [(0, 0), (0, 7), (8, 10), (11, 22), (23, 28), (29, 32), (33, 39), (40, 47), (48, 50), (51, 52), (53, 57), (58, 63), (0, 0)]


In [11]:
sentence = dataset[0]
print(sentence)

tokenized_inputs = tokenizer(text, return_offsets_mapping=True, truncation=True, padding=True)
print(tokenized_inputs)

print(sentence["labels"])

for offset in tokenized_inputs["offset_mapping"]:
    print(offset)

{'text': 'Accord commercial signé par Sophie Leclerc le 2 février 2021.', 'labels': [[0, 17, 0], [28, 42, 1], [46, 60, 2]]}
{'input_ids': [5, 19217, 8, 2455, 2917, 37, 7834, 18673, 16, 205, 995, 14224, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 7), (8, 10), (11, 22), (23, 28), (29, 32), (33, 39), (40, 47), (48, 50), (51, 52), (53, 57), (58, 63), (0, 0)]}
[[0, 17, 0], [28, 42, 1], [46, 60, 2]]
(0, 0)
(0, 7)
(8, 10)
(11, 22)
(23, 28)
(29, 32)
(33, 39)
(40, 47)
(48, 50)
(51, 52)
(53, 57)
(58, 63)
(0, 0)


In [None]:
from transformers import AutoTokenizer

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding=True, return_offsets_mapping=True)
    labels = []

    for i, label_set in enumerate(examples["labels"]):
        offsets = tokenized_inputs["offset_mapping"][i]
        label_ids = []

        for offset in offsets:
            if offset == (0, 0):  # Tokens spéciaux comme <s> ou <pad>
                label_ids.append(-100)
            else:
                # Trouver le label correspondant à cet offset
                token_label = "O"  # Par défaut, aucun label
                for start, end, label in label_set:
                    if offset[0] >= start and offset[1] <= end:
                        token_label = label
                        break
                label_ids.append(label_to_id.get(token_label, 0))  # Convertir en ID

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Appliquer la tokenisation et l'alignement
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 100/100 [00:00<00:00, 2250.04 examples/s]


In [21]:
tokenized_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 100
})

In [3]:
import os 
import sys
os.path.dirname(sys.executable)

'c:\\Users\\mhanania\\python\\Lecture de contrats\\AI transformers for contract reading\\.venv\\Scripts'

In [26]:
print(
    tokenized_dataset[1]["text"],"\n",
    dataset["labels"][1],"\n",
    tokenizer.convert_ids_to_tokens(tokenized_dataset[1]["input_ids"]),"\n",
    tokenized_dataset[1]["labels"],"\n",
    tokenized_dataset[1]["input_ids"],"\n",
    tokenized_dataset[1]["attention_mask"],"\n",
)

Contrat de partenariat signé par Sophie Leclerc le 5 août 2019. 
 [[0, 22, 0], [33, 47, 1], [51, 62, 2]] 
 ['<s>', '▁Contrat', '▁de', '▁partenariat', '▁signé', '▁par', '▁Sophie', '▁Leclerc', '▁le', '▁5', '▁août', '▁2019.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>'] 
 [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, -100, -100, -100, -100] 
 [5, 19217, 8, 2455, 2917, 37, 7834, 18673, 16, 205, 995, 14224, 6, 1, 1, 1, 1] 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0] 



In [4]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

model = AutoModelForTokenClassification.from_pretrained("camembert-base", num_labels=len(label_to_id))

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
100%|██████████| 21/21 [01:07<00:00,  3.19s/it]

{'train_runtime': 67.0648, 'train_samples_per_second': 4.473, 'train_steps_per_second': 0.313, 'train_loss': 0.4465455100649879, 'epoch': 3.0}





TrainOutput(global_step=21, training_loss=0.4465455100649879, metrics={'train_runtime': 67.0648, 'train_samples_per_second': 4.473, 'train_steps_per_second': 0.313, 'total_flos': 2602784194200.0, 'train_loss': 0.4465455100649879, 'epoch': 3.0})

In [5]:
model.save_pretrained("./custom_camembert_ner")
tokenizer.save_pretrained("./custom_camembert_ner")


('./custom_camembert_ner\\tokenizer_config.json',
 './custom_camembert_ner\\special_tokens_map.json',
 './custom_camembert_ner\\tokenizer.json')

## Charger le modèle 

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Charger le modèle et le tokenizer
model = AutoModelForTokenClassification.from_pretrained("./custom_camembert_ner")
tokenizer = AutoTokenizer.from_pretrained("./custom_camembert_ner")


# Texte à analyser
text = "Accord signé par Alice Dupont le 15 février 2023."

# Tokeniser le texte
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)


In [7]:
# Obtenir les logits (scores bruts)
outputs = model(**inputs)
logits = outputs.logits

# Obtenir les prédictions
predictions = torch.argmax(logits, dim=2)  # Dim=2 correspond à la dimension des labels

# Afficher les résultats
print(predictions)


tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [9]:
# Associer les tokens et les prédictions
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
labels = predictions[0].tolist()

# Remettre les étiquettes au format lisible
label_map = {0: "O", 1: "PERSON", 2: "TITLE", 3: "DATE"}
results = []

for token, label_idx in zip(tokens, labels):
    if label_idx != -100:  # Ignorer les tokens spéciaux
        results.append((token, label_map[label_idx]))

# Afficher les résultats
print("Predictions:")
for token, label in results:
    print(f"{token}: {label}")


Predictions:
<s>: O
▁Accord: O
▁signé: O
▁par: O
▁Alice: O
▁Dupont: O
▁le: O
▁15: O
▁février: O
▁20: O
23: O
.: O
</s>: O
