# Training

## Préparation des données

In [7]:
import pandas as pd
import numpy as np

# Define the paths to the CSV files
file_path_train = '/home/jovyan/data/reservation-first-dataset-train.csv'
file_path_test = '/home/jovyan/data/reservation-first-dataset-test.csv'

# Load the data. Missing cells are replaced by empty strings so that every
# column can later be handled as text.
try:
    data_train = pd.read_csv(file_path_train).fillna('')  # Handle possible NaN values
    data_test = pd.read_csv(file_path_test).fillna('')    # Handle possible NaN values
except FileNotFoundError:
    print("Error : please verify the file paths")
    # Re-raise: without the data frames the rest of the cell cannot run, and
    # swallowing the error would only surface later as a misleading NameError.
    raise
else:
    print("-> loaded successfully")

# Quick sanity checks: dataset sizes and column names
print(f"Train sentences: {len(data_train)}")
print(f"Test sentences: {len(data_test)}")
print(data_train.columns)
print(data_test.columns)

# Preview the first rows of each split
print("-> Head train data")
print(data_train.head())
print("-> Head test data")
print(data_test.head())

-> loaded successfully
Train sentences: 207
Test sentences: 20
Index(['Phrase', 'Départ', 'Arrivée'], dtype='object')
Index(['Phrase', 'Départ', 'Arrivée'], dtype='object')
-> Head train data
                                              Phrase                Départ  \
0  montrer les trains de Gargan à Valdahon Camp M...                Gargan   
1  quels trains sont disponibles de Montbard à Sa...              Montbard   
2  j'ai besoin d'un train demain de Saint-Jodard ...          Saint-Jodard   
3  montre-moi les trains de Montlouis-sur-Loire à...   Montlouis-sur-Loire   
4  tous les trains de Nanteuil-le-Haudouin à Vign...  Nanteuil-le-Haudouin   

                   Arrivée  
0  Valdahon Camp Militaire  
1      Saint-Romain-le-Puy  
2                Champigny  
3                 Maroeuil  
4        Vigneux-sur-Seine  
-> Head test data
                                              Phrase               Départ  \
0  S'il vous plaît, donnez-moi des trains d'Imphy...                Im

In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL must be set BEFORE TensorFlow (or any package that
# imports it, such as transformers) is imported, otherwise the native runtime
# has already initialised its logging and the setting has no effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = all logs, 1 = info hidden, 2 = info and warning hidden, 3 = everything hidden

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertTokenizerFast, TFBertForTokenClassification

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)  # Hide Python-side TensorFlow warnings


# Paths to the CSV files
file_path_train = '/home/jovyan/data/reservation-first-dataset-train.csv'
file_path_test = '/home/jovyan/data/reservation-first-dataset-test.csv'

# Load the data; missing cells become empty strings
data_train = pd.read_csv(file_path_train).fillna('')
data_test = pd.read_csv(file_path_test).fillna('')

# Préparer les données pour l'entraînement 
# Encodage des phrases et des labels correspondants
# data = CSV en input / max_length = Longueur maximale des phrases
def _find_entity_span(phrase, entity, taken=None):
    """Return the (start, end) character span of ``entity`` inside ``phrase``.

    The first occurrence that does not overlap ``taken`` (another
    (start, end) span, or None) is returned. Returns None when ``entity`` is
    empty or cannot be found. The search is case-sensitive.
    """
    if not entity:
        return None
    position = phrase.find(entity)
    while position != -1:
        span = (position, position + len(entity))
        if taken is None or span[1] <= taken[0] or span[0] >= taken[1]:
            return span
        position = phrase.find(entity, position + 1)
    return None


def encode_data(data, tokenizer, label_encoder, max_length=128):
    """Tokenize each phrase of ``data`` and build one entity label per token.

    Args:
        data: DataFrame with the columns 'Phrase', 'Départ' and 'Arrivée'.
        tokenizer: tokenizer exposing ``encode_plus`` (with offset mapping
            support) and ``convert_ids_to_tokens``.
        label_encoder: fitted encoder whose ``transform`` maps the labels
            'O', 'B-DEP' and 'B-ARR' to integer ids.
        max_length: length every sequence is truncated / padded to.

    Returns:
        A ``(tokens, labels)`` tuple of numpy arrays built from one row per
        phrase: the input ids and the matching label ids.

    A token is labelled 'B-DEP' / 'B-ARR' only when its character offsets lie
    inside the span where the departure / arrival name occurs in the phrase.
    Every other token (including tokens with an empty offset span, which is
    how special and padding tokens are expected to be reported) stays 'O'.
    """
    tokens = []
    labels = []

    print("Starting data encoding...")
    for i, row in data.iterrows():
        phrase = row['Phrase']
        dep = str(row['Départ'])
        arr = str(row['Arrivée'])

        if i < 5:  # only trace the first rows
            print(f"\n- Ligne {i+1}")
            print("Phrase originale :", phrase)

        tokenized_input = tokenizer.encode_plus(
            phrase,
            add_special_tokens=True,  # [CLS] at the start and [SEP] at the end
            return_offsets_mapping=True,
            return_tensors="tf",  # return the encoding as tensors
            max_length=max_length,
            truncation=True,  # cut phrases longer than max_length
            padding="max_length"  # pad so that all sequences share the same length
        )

        tokenized_text = tokenizer.convert_ids_to_tokens(tokenized_input.input_ids[0])
        offsets = tokenized_input['offset_mapping'].numpy()[0]
        label_list = ['O'] * len(tokenized_text)

        if i < 5:
            print("Tokens encodés :", tokenized_text)
            print("Offsets :", offsets)

        # Locate both entities in the phrase. The longer name is searched
        # first so that a short name contained in the long one (e.g. "Paris"
        # vs "Paris Nord") cannot claim the long name's span.
        if len(arr) > len(dep):
            arr_span = _find_entity_span(phrase, arr)
            dep_span = _find_entity_span(phrase, dep, taken=arr_span)
        else:
            dep_span = _find_entity_span(phrase, dep)
            arr_span = _find_entity_span(phrase, arr, taken=dep_span)

        # Label tokens by character position rather than by substring
        # membership, so unrelated words that merely share letters with a
        # station name are not tagged.
        for j, (start, end) in enumerate(offsets):
            if start == end:
                # Empty span: no text behind this token (special / padding).
                # A start offset of 0 alone is valid: it is the first word.
                continue
            if dep_span is not None and dep_span[0] <= start and end <= dep_span[1]:
                label_list[j] = 'B-DEP'
            elif arr_span is not None and arr_span[0] <= start and end <= arr_span[1]:
                label_list[j] = 'B-ARR'

        if i < 5:
            print("Labels après encodage des entités :", label_list)

        # Convert the string labels to integer ids
        label_ids = label_encoder.transform(label_list)
        tokens.append(tokenized_input.input_ids.numpy()[0])
        labels.append(label_ids)

        if i < 5:
            print("IDs des label", label_ids)

    print("Encoding completed")
    return np.array(tokens), np.array(labels)


# Fast variant of the multilingual BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Label vocabulary: outside, departure and arrival tags
unique_labels = ['O', 'B-DEP', 'B-ARR']
label_encoder = LabelEncoder()
label_encoder.fit(unique_labels)
print(
    "-> mapping labels et ids :",
    dict(zip(label_encoder.classes_, range(len(label_encoder.classes_)))),
)

# Turn both splits into (input ids, label ids) arrays
train_tokens, train_labels = encode_data(data_train, tokenizer, label_encoder, max_length=128)
test_tokens, test_labels = encode_data(data_test, tokenizer, label_encoder, max_length=128)

# Report the shape of every encoded array
print("- shape train tokens", train_tokens.shape)
print("- shape trains labels", train_labels.shape)
print("- shape test tokens", test_tokens.shape)
print("- shape test labels", test_labels.shape)

# Load the pretrained model with one output class per known label
model = TFBertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(label_encoder.classes_),
)
print("-> modèle chargé avec", len(label_encoder.classes_), "labels")

# Training configuration: Adam optimiser, and a sparse cross-entropy loss
# constructed with from_logits=True
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

# Fine-tune for 3 epochs, holding out 10% of the training rows for validation
model.fit(
    train_tokens,
    train_labels,
    epochs=3,
    batch_size=16,
    validation_split=0.1,
)

# Score the fine-tuned model on the test split
model.evaluate(test_tokens, test_labels)



## Construction et Entraînement du Modèle TensorFlow

## Évaluation du modèle

In [None]:

# Evaluate the model on the encoded test split.
# No `trainer` object is defined in the visible cells of this notebook: the
# model is compiled and trained through Keras, so it is evaluated through the
# same Keras API.
model.evaluate(test_tokens, test_labels)

# Example function to get predictions
def get_predictions(text):
    """Return the tokens of ``text`` that the model tags as an entity.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``label_encoder`` objects.

    Args:
        text: the sentence to analyse.

    Returns:
        A list of ``(token, label)`` tuples, one per token whose predicted
        label is not 'O'.
    """
    inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True)
    outputs = model(inputs)
    # Highest-scoring class id for every token position
    predictions = np.argmax(outputs.logits, axis=-1)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].numpy()[0])

    # Map class ids back to label names with the encoder that was used to
    # build the training labels, so ids and names cannot drift apart.
    predicted_labels = label_encoder.inverse_transform(predictions[0])

    # Keep only the tokens that were tagged as an entity
    return [
        (token, str(label))
        for token, label in zip(tokens, predicted_labels)
        if label != 'O'
    ]

# Try the prediction helper on a sample French travel request
example_sentence = "Je dois voyager de Paris à Lyon"
print(
    get_predictions(example_sentence)
)