In [None]:
# Importation des librairies nécessaires
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.display import display
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Reached only if none of the imports above raised.
print("imported libraries")

In [None]:
# Report the device the model will run on.
# The network in this notebook is a Keras/TensorFlow model, so GPU visibility
# is queried from TensorFlow itself: PyTorch seeing a CUDA device does not
# imply that TensorFlow can use it (and vice versa).
gpus = tf.config.list_physical_devices('GPU')
device = 'cuda' if gpus else 'cpu'
print(f"Running on device: {device}")

In [None]:
# Load the language-detection dataset (semicolon-separated CSV).
# NOTE(review): absolute, machine-specific path -- the notebook only runs as-is
# on the original author's machine.
input_csv = r"C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/text/text_lang_detector.csv"
df = pd.read_csv(input_csv, delimiter=";")
df.head()

In [None]:
# Preprocess the data
def preprocess_data(df):
    """Extract the input sentences and binary labels from the raw DataFrame.

    Rows with a missing sentence or label are dropped so that downstream
    tokenisation (which calls string methods on every sentence) and training
    never receive NaN values. The input DataFrame itself is not modified and
    the surviving rows keep their original index.

    Args:
        df: DataFrame containing at least the columns ``sentence`` and
            ``is_not_french``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the ``sentence`` column cast to
        ``str`` and ``y`` is the ``is_not_french`` column, both restricted to
        the complete rows and sharing the same index.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    required = ["sentence", "is_not_french"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Keep only rows where both the text and its label are present.
    complete = df.dropna(subset=required)

    X = complete["sentence"].astype(str)
    y = complete["is_not_french"]
    return X, y

# Extract the sentences (X) and their labels (y) from the loaded DataFrame.
X, y = preprocess_data(df)

In [None]:
# Split the data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and labels into train and test partitions.

    Thin wrapper around ``train_test_split`` that forwards ``test_size`` and
    ``random_state`` unchanged.

    Args:
        X: input samples.
        y: labels aligned with ``X``.
        test_size: forwarded to ``train_test_split`` (default 0.2).
        random_state: forwarded to ``train_test_split`` (default 42).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return train_inputs, test_inputs, train_labels, test_labels

# Split using the function's defaults (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = split_data(X, y)

In [None]:
# Tokenise the sentences and turn them into fixed-length integer sequences
max_words = 10000  # passed to the tokenizer as num_words
max_len = 100  # maximum sequence length

# Fit the vocabulary on the training sentences only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Training set: text -> integer sequences -> padded to max_len
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)

# Test set: same transformation, reusing the tokenizer fitted above
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [None]:
# Build the CNN model
embedding_dim = 50

model = Sequential([
    # Embedding layer
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),

    # Two convolution + max-pooling stages
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),

    # Flatten, then fully connected layers
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),  # binary output (French vs Not French)
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
# Early stopping (with restore_best_weights) selects the final weights based on
# val_loss, so the validation data must NOT be the test set: otherwise the test
# metrics reported afterwards are computed on data that was used for model
# selection. A slice of the training data is held out for validation instead.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    X_train_pad,
    np.asarray(y_train),  # plain array so that validation_split can slice it
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Predict on the test set: threshold the model's sigmoid outputs at 0.5
y_proba = model.predict(X_test_pad)
y_pred = (y_proba > 0.5).astype(int)

# Evaluate the model
print("### Classification Report ###")
report = classification_report(y_test, y_pred)
print(report)

print("### Accuracy ###")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

In [None]:
# Confusion matrix
def plot_confusion_matrix(y_test, y_pred):
    """Plot the confusion matrix of the binary French / Not French classifier.

    The matrix is always computed over both classes (label 0 = French,
    label 1 = Not French), so it stays 2x2 and matches the two tick labels
    even when one class is absent from ``y_test`` and ``y_pred``.

    Args:
        y_test: true labels (0 or 1).
        y_pred: predicted labels (0 or 1).
    """
    class_names = ["French", "Not French"]
    # Pin the label order explicitly instead of letting it be inferred from the data.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Plot the confusion matrix for the test-set predictions.
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Save the trained model
# NOTE(review): absolute, machine-specific path.
model_path = r"C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/models/text_classification/lang_detector/cnn_lang_detector.h5"
model.save(model_path)

# Persist the fitted tokenizer next to the model: the saved network only
# understands the integer ids produced by this exact vocabulary, so without it
# the model cannot be used for inference in a fresh session.
tokenizer_path = model_path.rsplit(".", 1)[0] + "_tokenizer.joblib"
joblib.dump(tokenizer, tokenizer_path)

print(f"Modèle sauvegardé dans : {model_path}")
print(f"Tokenizer sauvegardé dans : {tokenizer_path}")

In [None]:
# Example predictions on new sentences
new_sentences = [
    "I would like to go to the station.",
    "Voglio andare alla stazione.",
    "je veux partir de Marseille à Paris",
    "Ich möchte zum Bahnhof gehen.",
    "Quiero morir",
    "Je veux aller à la gare."
]

# Encode the new sentences with the same tokenizer and padding as the training data
new_sentences_seq = tokenizer.texts_to_sequences(new_sentences)
new_sentences_pad = pad_sequences(new_sentences_seq, maxlen=max_len)

# Predict. The single-unit output layer yields an (n, 1) array; flatten it so
# each prediction is a plain scalar rather than a 1-element array.
predictions = model.predict(new_sentences_pad)
predictions = (predictions > 0.5).astype(int).ravel()

# Display the results (label 0 = French, label 1 = Not French)
results = pd.DataFrame({
    'Sentence': new_sentences,
    'Predicted Language': ['French' if prediction == 0 else 'Not French' for prediction in predictions]
})

display(results)