In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Layer
from tensorflow.keras.layers import GlobalAveragePooling1D, LayerNormalization, MultiHeadAttention
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Confirmation message, reached only if every import above succeeded.
print("Bibliothèques importées avec succès.")

In [None]:
# Location of the intention-detection CSV. The fallback is the original,
# machine-specific absolute path; set the INTENT_DATASET_PATH environment
# variable to point at the file on another machine without editing this cell.
DATASET_PATH = os.environ.get("INTENT_DATASET_PATH") or "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/text/text_intention_detector.csv"

def load_dataset(path=None):
    """Read the semicolon-delimited dataset CSV into a DataFrame.

    Args:
        path: Optional path to the CSV file. When omitted, the module-level
            ``DATASET_PATH`` is used, so ``load_dataset()`` keeps working as
            before.

    Returns:
        pandas.DataFrame: The parsed file contents.

    Raises:
        FileNotFoundError: If the resolved path is not an existing file
            (missing path, or a directory).
    """
    dataset_path = DATASET_PATH if path is None else path

    # isfile (rather than exists) so that a directory is reported here as a
    # bad path instead of failing later inside read_csv.
    if not os.path.isfile(dataset_path):
        raise FileNotFoundError(f"Le chemin du dataset est incorrect : {dataset_path}")

    print("Chargement des données...")
    data = pd.read_csv(dataset_path, delimiter=';')
    print("Données chargées avec succès.")
    return data

# Load the dataset once at cell level; `data` is the DataFrame that
# preprocess_data() receives further down.
data = load_dataset()

# Print the first rows (DataFrame.head) as a quick sanity check of the parse.
print("\nLes premières lignes du jeu de données :")
print(data.head())

In [None]:
def preprocess_data(data, test_size=0.2, val_size=0.2, random_state=42):
    """Derive integer labels from the flag columns and split into train/val/test.

    The label of a row is the position (0, 1, 2) of the flag column
    ``is_correct`` / ``is_not_trip`` / ``is_unknown`` that holds the row-wise
    maximum. The input frame is left untouched: labels are built as a separate
    Series instead of being written back as a new column.

    NOTE(review): ``idxmax`` returns the first column holding the maximum, so a
    row where no flag is set would be labelled ``is_correct`` — confirm the
    flags are strictly one-hot.

    Args:
        data: DataFrame with a ``sentence`` column and the three flag columns.
        test_size: Fraction of all rows held out as the test set.
        val_size: Fraction of the remaining (non-test) rows used for validation.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    flag_columns = ['is_correct', 'is_not_trip', 'is_unknown']
    label_mapping = {name: index for index, name in enumerate(flag_columns)}

    # Standalone Series (named 'label' like the former column) so the caller's
    # DataFrame is not mutated as a side effect of calling this function.
    labels = data[flag_columns].idxmax(axis=1).map(label_mapping).rename('label')

    # First carve out the test set, then split what is left into train/val.
    X_train, X_test, y_train, y_test = train_test_split(
        data['sentence'], labels, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split the raw sentences and their integer labels into train / validation /
# test subsets; the *_raw names hold untokenised text.
X_train_raw, X_val_raw, X_test_raw, y_train, y_val, y_test = preprocess_data(data)

In [None]:
# Tokenizer settings: `vocab_size` is passed as the tokenizer's `num_words`
# cap (and later as the embedding's input_dim); `max_length` is the `maxlen`
# every sequence is padded to.
vocab_size = 20000
max_length = 100

# Fit the vocabulary on the training sentences only, so validation and test
# text does not influence the word index; "<OOV>" is the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_raw)

# Convert each split to integer sequences, padded at the end (padding='post').
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train_raw), maxlen=max_length, padding='post')
X_val_padded = pad_sequences(tokenizer.texts_to_sequences(X_val_raw), maxlen=max_length, padding='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test_raw), maxlen=max_length, padding='post')

print("Tokenisation et padding terminés.")

In [None]:
class SelfAttentionLayer(Layer):
    """Multi-head self-attention followed by dropout, a residual add and layer norm.

    Args:
        embed_dim: Value passed to ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        **kwargs: Standard ``Layer`` keyword arguments (e.g. ``name``),
            forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Kept on the instance so get_config() can describe how to rebuild
        # this layer when a saved model is reloaded.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.norm = LayerNormalization(epsilon=1e-6)
        self.dropout = Dropout(0.1)

    def call(self, inputs, training=None):
        """Apply self-attention to ``inputs`` and return the normalised residual.

        Args:
            inputs: Tensor used as both query and value of the attention.
            training: Optional flag forwarded to the dropout; ``None`` lets
                Keras decide from the calling context.

        Returns:
            ``LayerNormalization(inputs + Dropout(attention(inputs, inputs)))``.
        """
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout(attn_output, training=training)
        return self.norm(inputs + attn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

def create_self_attention_model(vocab_size, embed_dim, max_length, num_labels, num_heads):
    """Build and compile the self-attention text classifier.

    Architecture: Embedding -> SelfAttentionLayer -> GlobalAveragePooling1D ->
    Dropout(0.2) -> Dense(softmax).

    NOTE(review): the embedding is created without ``mask_zero``, so padded
    positions take part in attention and in the average pooling — confirm this
    is intended.

    Args:
        vocab_size: ``input_dim`` of the embedding (number of token indices).
        embed_dim: Embedding width, also forwarded to ``SelfAttentionLayer``.
        max_length: Length of each input token sequence.
        num_labels: Number of output classes.
        num_heads: Number of attention heads.

    Returns:
        A ``Model`` compiled with Adam (learning rate 5e-4), sparse categorical
        cross-entropy loss and the accuracy metric.
    """
    inputs = Input(shape=(max_length,))
    # The sequence length is already fixed by the Input shape, so the
    # deprecated `input_length` argument is no longer passed to Embedding.
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    x = SelfAttentionLayer(embed_dim, num_heads)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.2)(x)
    outputs = Dense(num_labels, activation='softmax')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=5e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Model hyperparameters: embedding width, number of attention heads, and the
# number of output classes (is_correct / is_not_trip / is_unknown).
embed_dim = 128
num_heads = 4
num_labels = 3

# Build the compiled model and print its layer/parameter summary.
model = create_self_attention_model(vocab_size, embed_dim, max_length, num_labels, num_heads)
model.summary()

In [None]:
# Training hyperparameters, passed to model.fit as `epochs` and `batch_size`.
num_epochs = 5
batch_size = 32

# Train on the padded training split, with the validation split supplied as
# validation_data; the object returned by fit() is kept in `history`.
print("Début de l'entraînement...")
history = model.fit(
    X_train_padded, y_train,
    validation_data=(X_val_padded, y_val),
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=1
)
print("Entraînement terminé.")

In [None]:
# Evaluate on the held-out test split; the model was compiled with a single
# 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
print("\nÉvaluation sur l'ensemble de test...")
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Perte: {loss:.4f}, Précision: {accuracy:.4f}")

# Predicted class = index of the largest softmax output for each sample.
y_pred = np.argmax(model.predict(X_test_padded), axis=1)
# True labels are passed first, predictions second.
cm = confusion_matrix(y_test, y_pred)

def plot_confusion_matrix(cm, labels):
    """Draw a confusion matrix as an annotated heatmap and display it.

    Args:
        cm: Matrix of integer counts (cells are formatted with ``'d'``).
        labels: Tick labels used on both the x and the y axis.
    """
    heatmap_options = dict(
        annot=True,
        fmt='d',
        xticklabels=labels,
        yticklabels=labels,
        cmap="Blues",
    )
    # seaborn returns the Axes it drew on, so the title and axis labels are
    # set on that object rather than through the pyplot state functions.
    axes = sns.heatmap(cm, **heatmap_options)
    axes.set_title("Matrice de Confusion")
    axes.set_xlabel("Prédictions")
    axes.set_ylabel("Valeurs Réelles")
    plt.show()

# Labels are listed in the same order as the integer classes 0, 1, 2.
plot_confusion_matrix(cm, labels=["is_correct", "is_not_trip", "is_unknown"])

In [None]:
def predict_new_texts(model, tokenizer, new_texts, max_length,
                      labels=("is_correct", "is_not_trip", "is_unknown")):
    """Print the predicted score of each label for every input text.

    Args:
        model: Object exposing ``predict(batch)`` that returns one row of
            per-label scores for each input row.
        tokenizer: Object exposing ``texts_to_sequences(texts)``.
        new_texts: Iterable of strings, or a single string (treated as a
            one-element list).
        max_length: ``maxlen`` used when padding the token sequences.
        labels: Label names, in the order of the model's output columns.

    Returns:
        None. Results are printed; nothing is printed for empty input.
    """
    # A bare string would otherwise be iterated character by character by the
    # tokenizer, producing one "text" per character.
    if isinstance(new_texts, str):
        new_texts = [new_texts]
    else:
        new_texts = list(new_texts)

    # Nothing to score: return before calling predict() on an empty batch.
    if not new_texts:
        return

    sequences = tokenizer.texts_to_sequences(new_texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    predictions = model.predict(padded_sequences)

    for text, scores in zip(new_texts, predictions):
        print(f"\nTexte: {text}")
        for label, score in zip(labels, scores):
            print(f" - {label}: {round(score * 100, 2)}%")

# Sample sentences used to try the trained model on unseen input.
new_texts = [
    "Je veux aller de Port-Boulet à Le Havre.",
    "Je veux aller de Nantes à Nantes.",
    "Comment aller à Niort depuis Troyes ?"
]
# Prints the per-label percentages for each sentence above.
predict_new_texts(model, tokenizer, new_texts, max_length)