In [1]:
# !pip install -q transformers numpy torch

In [2]:
import json

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, BertForTokenClassification, BertTokenizer
from transformers import CamembertForTokenClassification, CamembertTokenizer, CamembertTokenizerFast

In [3]:
class NERTrainer:
    """Fine-tune and evaluate a CamemBERT token-classification (NER) model.

    Every example in ``train_data`` / ``test_data`` is a dict with two keys:

    * ``'text'``: the raw string to tokenize;
    * ``'label'``: either a list of character spans
      ``{'start': int, 'end': int, 'label': int}`` (aligned to tokens here),
      or a (possibly nested) list of per-token integer label ids.

    Conventions introduced by this class:

    * tokens not covered by any span receive ``OUTSIDE_LABEL`` (0), so entity
      label ids must start at 1 and ``num_labels`` must be ``max_id + 1``;
    * special tokens and padding receive ``IGNORE_INDEX`` (-100) and are
      excluded from both the loss and the accuracy.
    """

    # Target value skipped by torch.nn.CrossEntropyLoss (its default ignore_index).
    IGNORE_INDEX = -100
    # Label id given to real tokens that belong to no annotated span.
    OUTSIDE_LABEL = 0

    def __init__(self, model_name, num_labels, train_data, test_data, batch_size=8, num_epochs=3, learning_rate=1e-5, max_length=128):
        """Load the tokenizer and model and set up the optimizer and loss.

        Args:
            model_name: Hugging Face model id or local path.
            num_labels: Size of the classification head (including label 0).
            train_data: List of example dicts used by ``train``.
            test_data: List of example dicts used by ``evaluate``.
            batch_size: Mini-batch size for both data loaders.
            num_epochs: Number of passes over ``train_data``.
            learning_rate: AdamW learning rate.
            max_length: Token length every example is padded/truncated to.
        """
        # A fast tokenizer is required: only fast tokenizers can return the
        # character offsets needed to align span annotations with tokens.
        self.tokenizer = CamembertTokenizerFast.from_pretrained(model_name)
        # ignore_mismatched_sizes lets a checkpoint whose classification head has a
        # different number of labels be loaded; the head is then re-initialised.
        self.model = CamembertForTokenClassification.from_pretrained(
            model_name, num_labels=num_labels, ignore_mismatched_sizes=True
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.max_length = max_length
        self.train_data = train_data
        self.test_data = test_data
        # torch.optim.AdamW replaces the deprecated transformers.AdamW;
        # weight_decay=0.0 keeps the old default (torch's own default is 0.01).
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=0.0)
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=self.IGNORE_INDEX)

    @staticmethod
    def _is_span_annotation(label):
        """Return True when ``label`` is a list of span dicts (an empty list counts as 'no spans')."""
        return isinstance(label, (list, tuple)) and all(isinstance(entry, dict) for entry in label)

    def _labels_from_spans(self, spans, offsets, attention_mask):
        """Project character spans onto tokens.

        Args:
            spans: Iterable of ``{'start', 'end', 'label'}`` dicts.
            offsets: ``(seq_len, 2)`` tensor of per-token character offsets.
            attention_mask: ``(seq_len,)`` tensor, 1 for real tokens.

        Returns:
            ``(seq_len,)`` long tensor of label ids.
        """
        token_starts = offsets[:, 0]
        token_ends = offsets[:, 1]
        # Special and padding tokens come back with an empty (0, 0) offset.
        is_real = (attention_mask == 1) & (token_ends > token_starts)

        token_labels = torch.full((offsets.shape[0],), self.IGNORE_INDEX, dtype=torch.long)
        token_labels[is_real] = self.OUTSIDE_LABEL

        for span in spans:
            label_id = span['label']
            start, end = span['start'], span['end']
            # Skip sentinel entries (unknown label => -1) and empty/invalid spans.
            if label_id < 0 or start < 0 or end <= start:
                continue
            overlaps = is_real & (token_starts < end) & (token_ends > start)
            token_labels[overlaps] = label_id

        return token_labels

    def _pad_token_labels(self, label, seq_len):
        """Flatten per-token label ids and pad/truncate them to ``seq_len``."""
        flat = torch.tensor(label, dtype=torch.long).reshape(-1)[:seq_len]
        padded = torch.full((seq_len,), self.IGNORE_INDEX, dtype=torch.long)
        padded[:flat.numel()] = flat
        return padded

    def prepare_data(self, data, shuffle=True):
        """Tokenize ``data`` and wrap it in a ``DataLoader``.

        Args:
            data: Non-empty list of example dicts (see the class docstring).
            shuffle: Whether the loader reshuffles every epoch.

        Returns:
            A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``,
            each of shape ``(batch, max_length)``.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("prepare_data() needs at least one example.")

        input_ids = []
        attention_masks = []
        labels = []

        for example in data:
            text = example['text']
            label = example['label']
            uses_spans = self._is_span_annotation(label)

            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt',
                return_offsets_mapping=uses_spans,
            )
            input_ids.append(encoding['input_ids'])
            attention_masks.append(encoding['attention_mask'])

            if uses_spans:
                token_labels = self._labels_from_spans(
                    label, encoding['offset_mapping'][0], encoding['attention_mask'][0]
                )
            else:
                token_labels = self._pad_token_labels(label, encoding['input_ids'].shape[1])
            labels.append(token_labels)

        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        # stack (not cat) so labels keep the (num_examples, seq_len) layout that
        # TensorDataset needs to match input_ids on the first dimension.
        labels = torch.stack(labels, dim=0)

        dataset = TensorDataset(input_ids, attention_masks, labels)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

        return dataloader

    def train(self):
        """Fine-tune the model on ``train_data`` and print the mean loss per epoch."""
        train_dataloader = self.prepare_data(self.train_data)

        for epoch in range(self.num_epochs):
            self.model.train()
            total_loss = 0.0
            num_batches = 0

            for batch in train_dataloader:
                input_ids, attention_mask, labels = batch
                input_ids, attention_mask, labels = input_ids.to(self.device), attention_mask.to(self.device), labels.to(self.device)

                # A batch without a single supervised token would yield a NaN loss.
                if not (labels != self.IGNORE_INDEX).any():
                    continue

                self.optimizer.zero_grad()
                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                loss = self.loss_fn(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()
                num_batches += 1

            average_loss = total_loss / max(num_batches, 1)
            print(f"Epoch {epoch+1}/{self.num_epochs}, Loss: {average_loss:.4f}")

    def evaluate(self):
        """Print and return token-level accuracy on ``test_data``.

        Accuracy is computed over all supervised tokens (padding and special
        tokens are excluded), pooled across batches.

        Returns:
            The accuracy as a float in ``[0, 1]`` (0.0 when nothing is supervised).
        """
        # Evaluation order does not matter, so the loader is not shuffled.
        test_dataloader = self.prepare_data(self.test_data, shuffle=False)
        self.model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in test_dataloader:
                input_ids, attention_mask, labels = batch
                input_ids, attention_mask, labels = input_ids.to(self.device), attention_mask.to(self.device), labels.to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predicted_labels = torch.argmax(logits, dim=2)

                supervised = labels != self.IGNORE_INDEX
                correct += ((predicted_labels == labels) & supervised).sum().item()
                total += supervised.sum().item()

        average_accuracy = correct / total if total else 0.0
        print(f"Average Accuracy: {average_accuracy:.4f}")
        return average_accuracy


In [4]:
# Load the JSON data exported from Label Studio
with open('../../data/raw/data449.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Entity name -> integer id. Ids start at 1; id 0 is not assigned to any entity.
label_to_int = {
    "SYND": 1, "DIR": 2, "ENT": 3, "DATE": 4, "CAD": 5, "INT": 6, "OUV": 7, "NCAD": 8, "NOUV": 9, "TOUS": 10,
    "AG CAD": 11, "AG INT": 12, "AG OUV": 13, "AG NCAD": 14, "AG NOUV": 15,
    "AI CAD": 16, "AI INT": 17, "AI OUV": 18, "AI NCAD": 19, "AI NOUV": 20,
    "AG": 21, "AI": 22, "ATO": 23,
    "ATOT CAD": 24, "ATOT INT": 25, "ATOT OUV": 26, "ATOT NCAD": 27, "ATOT NOUV": 28,
    "PPV": 29, "PPVm": 30,
}


# Format the data: one {'text', 'label'} dict per task, where 'label' is a list
# of {'start', 'end', 'label'} character spans with integer label ids.
formatted_data = []

for item in data:
    text = item['text']
    # NOTE(review): assumes a task without annotations may lack the 'label' key
    # (or hold None); such tasks are kept with an empty annotation list.
    labels = item.get('label') or []

    ner_annotations = []

    for annotation in labels:
        start = annotation['start']
        end = annotation['end']
        label = annotation['labels'][0]

        label_id = label_to_int.get(label, -1)
        if label_id != -1:
            # The annotation offsets already index into `text`. Searching for the
            # entity string with text.find() would return its FIRST occurrence and
            # mislocate any entity whose text appears more than once.
            start_idx = start
            end_idx = end
        else:
            # Unknown label: keep a sentinel entry (-1 everywhere).
            start_idx = -1
            end_idx = -1

        ner_annotations.append({
            'start': start_idx,
            'end': end_idx,
            'label': label_id
        })

    formatted_data.append({
        'text': text,
        'label': ner_annotations
    })

In [7]:
# Split the data into training and test sets (80% - 20%)
split_ratio = 0.8
split_index = int(len(formatted_data) * split_ratio)
train_data = formatted_data[:split_index]
test_data = formatted_data[split_index:]

# Example usage:
# Train the model
# Label ids in label_to_int run from 1 to 30, so the classifier needs
# max_id + 1 outputs: with only 30 outputs (indices 0..29) the id 30 would be
# out of range for the loss. Index 0 is left for tokens outside any entity.
num_labels = max(label_to_int.values()) + 1
trainer = NERTrainer("Jean-Baptiste/camembert-ner", num_labels, train_data, test_data)
trainer.train()

# Evaluate the model
trainer.evaluate()

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: 