In [227]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import CamembertTokenizerFast, TFAutoModelForTokenClassification, create_optimizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import warnings


warnings.filterwarnings("ignore")
# Chargement des datasets
df = pd.read_csv('../../data/reservation-first-dataset-train.csv')
df_test = pd.read_csv('../../data/reservation-first-dataset-test.csv')

model_name = 'etalab-ia/camembert-base-squadFR-fquad-piaf'
#model_name = 'camembert-base'

# Charger le tokenizer Camembert
tokenizer = CamembertTokenizerFast.from_pretrained(model_name)

# Créer un LabelEncoder pour les labels
label_encoder = LabelEncoder()
label_encoder.fit(['B-DEP', 'I-DEP', 'B-ARR', 'I-ARR', 'O'])  # Les classes de labels

# Fonction pour encoder les données
def encode_data_and_tokenize(data, tokenizer, label_encoder, max_length=42):
    tokens = []
    labels = []

    for i, row in data.iterrows():
        sentence = row['Phrase']
        departure = row['Départ']
        arrival = row['Arrivée']

        # Trouver toutes les positions des entités Départ et Arrivée
        dep_positions = []
        arr_positions = []

        start = 0  # Trouver toutes les occurrences de départ
        while True:
            idx = sentence.find(departure, start)
            if idx == -1:
                break
            dep_positions.append((idx, idx + len(departure)))
            start = idx + len(departure)

        start = 0  # Trouver toutes les occurrences d'arrivée
        while True:
            idx = sentence.find(arrival, start)
            if idx == -1:
                break
            arr_positions.append((idx, idx + len(arrival)))
            start = idx + len(arrival)

        # Tokenisation de la phrase
        tokenized_input = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            return_offsets_mapping=True,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Récupérer les tokens et offsets
        tokenized_text = tokenizer.convert_ids_to_tokens(tokenized_input.input_ids[0])
        offsets = tokenized_input['offset_mapping'][0].numpy()
        label_list = ['O'] * len(tokenized_text)

        # Attribution des labels aux tokens
        for j, (offset_start, offset_end) in enumerate(offsets):
            if offset_start == 0 and offset_end == 0:
                continue  # Token de padding

            # Vérifier si le token chevauche une entité de départ
            for entity_start, entity_end in dep_positions:
                if (offset_start >= entity_start) and (offset_end <= entity_end):
                    if offset_start == entity_start:
                        label_list[j] = 'B-DEP'
                    else:
                        label_list[j] = 'I-DEP'
                    break
            # Vérifier si le token chevauche une entité d'arrivée
            for entity_start, entity_end in arr_positions:
                if (offset_start >= entity_start) and (offset_end <= entity_end):
                    if offset_start == entity_start:
                        label_list[j] = 'B-ARR'
                    else:
                        label_list[j] = 'I-ARR'
                    break

        # Encoder les labels
        label_ids = label_encoder.transform(label_list)
        
        # Convertir les tokens en int32
        tokens.append(np.array(tokenized_input.input_ids[0].numpy(), dtype=np.int32))
        labels.append(np.array(label_ids, dtype=np.int32))

    print("\n// Encoding completed")
    return np.array(tokens, dtype=np.int32), np.array(labels, dtype=np.int32)

# Appel de la fonction avec ton DataFrame
tokens, labels = encode_data_and_tokenize(df, tokenizer, label_encoder)
tokens_eval, labels_eval = encode_data_and_tokenize(df_test, tokenizer, label_encoder)

# Vérification de la tokenisation et des labels
for idx, row in df.head(2).iterrows():
    phrase = row['Phrase']
    tokens_display = tokenizer.convert_ids_to_tokens(tokens[idx])
    labels_display = label_encoder.inverse_transform(labels[idx])

    print(f"\nPhrase: {phrase}")
    print(f"Tokens: {tokens_display}")
    print(f"Labels: {labels_display}")

# Encoder les datasets
tokens, labels = encode_data_and_tokenize(df, tokenizer, label_encoder)
tokens_eval, labels_eval = encode_data_and_tokenize(df_test, tokenizer, label_encoder)

# Créer les datasets TensorFlow
train_dataset_tf = tf.data.Dataset.from_tensor_slices((tokens, labels)).batch(16)
eval_dataset_tf = tf.data.Dataset.from_tensor_slices((tokens_eval, labels_eval)).batch(16)

# Charger le modèle Camembert
model = TFAutoModelForTokenClassification.from_pretrained(model_name, num_labels=5)

# Optimiseur et perte
initial_lr=5e-5
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_lr, 
    decay_steps=1000, 
    decay_rate=0.96, 
    staircase=True
)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compiler le modèle
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Calculer les fréquences des classes dans les labels
class_counts = np.bincount(labels.flatten())
total_count = len(labels.flatten())
# Calculer le poids pour chaque classe (inversement proportionnel à leur fréquence)
class_weights = {i: total_count / count for i, count in enumerate(class_counts)}

# Entraîner le modèle
model.fit(train_dataset_tf, validation_data=eval_dataset_tf, epochs=15, class_weight=class_weights)

# Sauvegarder le modèle et le tokenizer
model.save_pretrained('../models/ia-base-finetuned')
tokenizer.save_pretrained('../models/etalab-ia-finetuned')

# Évaluation finale
preds = model.predict(eval_dataset_tf).logits
pred_labels = np.argmax(preds, axis=-1)
true_labels = labels_eval.flatten()

# Décoder les prédictions
pred_decoded = [label_encoder.inverse_transform(p) for p in pred_labels]
true_decoded = [label_encoder.inverse_transform(t) for t in labels_eval]
print('labels: ', true_decoded)

# Générer un rapport de classification
print(classification_report(true_decoded, pred_decoded))



// Encoding completed

// Encoding completed

Phrase: montre-moi les trains dimanche allant de Jarville-la-Malgrange à La Bassée-Violaines en première classe sans correspondance partant l'après midi
Tokens: ['<s>', '▁montre', '-', 'moi', '▁les', '▁trains', '▁dimanche', '▁allant', '▁de', '▁Jar', 'ville', '-', 'la', '-', 'Mal', 'g', 'range', '▁à', '▁La', '▁Bas', 'sée', '-', 'Vi', 'ola', 'ines', '▁en', '▁première', '▁classe', '▁sans', '▁correspondance', '▁partant', '▁l', "'", 'après', '▁midi', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

All PyTorch model weights were used when initializing TFCamembertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
labels:  [array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'B-DEP', 'I-DEP', 'O', 'B-ARR', 'I-ARR', 'I-ARR', 'I-ARR',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O'], dtype='<U5'), array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DEP', 'I-DEP',
   

ValueError: multiclass-multioutput is not supported