In [8]:
#import os
#import numpy as np
#import pandas as pd
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import classification_report, confusion_matrix
#import matplotlib.pyplot as plt
#import seaborn as sns
#from tensorflow.keras.models import Model
#from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Layer
#from tensorflow.keras.layers import GlobalAveragePooling1D, LayerNormalization, MultiHeadAttention
#from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.preprocessing.text import Tokenizer
#from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pandas as pd

print("Bibliothèques importées avec succès.")

Bibliothèques importées avec succès.


In [9]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Print the name of the GPU
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

CUDA is available. Using GPU.
GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [10]:
# Load dataset
DATASET_PATH = r"C:\Users\Seed\Projects\Travel-Order-Resolver\ai\nlp\dataset\text\text_intention_detector.csv"
data = pd.read_csv(DATASET_PATH, delimiter=';')

print("\nLes premières lignes du jeu de données :")
print(data.head())


Les premières lignes du jeu de données :
                                            sentence  is_correct  is_not_trip  \
0  Y a-t-il un moyen d'aller de Montreux-Vieux à ...           1            0   
1  I would like to travel from culoz to buswiller...           0            0   
2           He lied when he said he didn't like her.           0            1   
3                         ?N|ajOLIY6;DOM'mKavLZZnkAi           0            0   
4                         a(c}sMyu7/97.[-IA@m k0rN0U           0            0   

   is_unknown  
0           0  
1           0  
2           0  
3           1  
4           1  


In [11]:
# Preprocess data
label_mapping = {'is_correct': 0, 'is_not_trip': 1, 'is_unknown': 2}
data['label'] = data[['is_correct', 'is_not_trip', 'is_unknown']].idxmax(axis=1).map(label_mapping)
X_train, X_test, y_train, y_test = train_test_split(data['sentence'], data['label'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [12]:
# Define a simple dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze(), torch.tensor(label)

# Define a simple model
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        pooled = torch.mean(embedded, dim=1)
        output = self.fc(pooled)
        return output
    
# Tokenizer and model parameters
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab_size = tokenizer.vocab_size
embed_dim = 128
num_classes = 3
max_len = 128

print("Tokenisation et padding terminés.")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Tokenisation et padding terminés.


In [13]:
# Create datasets and dataloaders
train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_len)
val_dataset = TextDataset(X_val.tolist(), y_val.tolist(), tokenizer, max_len)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Initialize model, loss function, and optimizer
model = TextClassifier(vocab_size, embed_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
from tqdm import tqdm

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    for input_ids, attention_mask, labels in progress_bar:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        progress_bar.set_postfix(loss=running_loss/len(train_loader))
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

print("Entraînement terminé.")

Epoch 1/10: 100%|██████████| 26035/26035 [05:30<00:00, 78.67batch/s, loss=0.00668]


Epoch 1/10, Loss: 0.0066838651555684095


Epoch 2/10: 100%|██████████| 26035/26035 [05:30<00:00, 78.81batch/s, loss=0.00105] 


Epoch 2/10, Loss: 0.0010540862999793598


Epoch 3/10: 100%|██████████| 26035/26035 [05:33<00:00, 78.07batch/s, loss=0.000549]


Epoch 3/10, Loss: 0.0005490388180762079


Epoch 4/10: 100%|██████████| 26035/26035 [05:31<00:00, 78.43batch/s, loss=0.000367]


Epoch 4/10, Loss: 0.0003670860301954392


Epoch 5/10: 100%|██████████| 26035/26035 [05:38<00:00, 76.89batch/s, loss=0.000295]


Epoch 5/10, Loss: 0.00029530764172506856


Epoch 6/10: 100%|██████████| 26035/26035 [05:30<00:00, 78.79batch/s, loss=0.000261]


Epoch 6/10, Loss: 0.0002614685000693668


Epoch 7/10: 100%|██████████| 26035/26035 [05:30<00:00, 78.73batch/s, loss=0.000229]


Epoch 7/10, Loss: 0.00022862430519621593


Epoch 8/10: 100%|██████████| 26035/26035 [05:31<00:00, 78.46batch/s, loss=0.000187]


Epoch 8/10, Loss: 0.00018730818654540132


Epoch 9/10: 100%|██████████| 26035/26035 [05:31<00:00, 78.49batch/s, loss=0.000164]


Epoch 9/10, Loss: 0.00016371678813408938


Epoch 10/10: 100%|██████████| 26035/26035 [05:35<00:00, 77.61batch/s, loss=0.000157]

Epoch 10/10, Loss: 0.00015679976096112473
Entraînement terminé.





In [16]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'model.pth')
print("Model saved successfully.")

Model saved successfully.


In [17]:
# Evaluation
model.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)
        y_pred.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

print(classification_report(y_true, y_pred, target_names=label_mapping.keys()))

              precision    recall  f1-score   support

  is_correct       1.00      1.00      1.00    103521
 is_not_trip       1.00      1.00      1.00     78404
  is_unknown       1.00      1.00      1.00     78424

    accuracy                           1.00    260349
   macro avg       1.00      1.00      1.00    260349
weighted avg       1.00      1.00      1.00    260349



In [21]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()

def predict_new_texts(model, tokenizer, new_texts, max_length):
    dataset = TextDataset(new_texts, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

    model.eval()
    with torch.no_grad():
        for i, (input_ids, attention_mask) in enumerate(dataloader):
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            outputs = model(input_ids, attention_mask)
            predictions = torch.softmax(outputs, dim=1).cpu().numpy()

            print(f"\nTexte: {new_texts[i]}")
            for j, label in enumerate(["is_correct", "is_not_trip", "is_unknown"]):
                print(f" - {label}: {round(predictions[0][j] * 100, 2)}%")

# Example usage
new_texts = [
    "Je veux aller de Port-Boulet à Le Havre.",
    "Je veux aller de Nantes à Nantes.",
    "Comment aller à Niort depuis Troyes ?",
    "Je veux aller de Lyon à Marseille.",
    "Comment puis-je rejoindre Bordeaux depuis Lille ?",
    "Je dois me rendre à Montpellier depuis Paris.",
    "Y a-t-il un train direct de Strasbourg à Metz ?",
    "Comment aller de Toulouse à Rennes ?",
    "J'aime voyager dans toute la France.",
    "Le Havre est une belle ville.",
    "Je réfléchis à visiter Nantes un jour.",
    "Les trains entre Niort et Troyes sont souvent rapides.",
    "Aller à Marseille, c'est toujours une aventure."
]

predict_new_texts(model, tokenizer, new_texts, max_len) 


Texte: Je veux aller de Port-Boulet à Le Havre.
 - is_correct: 67.72%
 - is_not_trip: 32.28%
 - is_unknown: 0.0%

Texte: Je veux aller de Nantes à Nantes.
 - is_correct: 99.79%
 - is_not_trip: 0.21%
 - is_unknown: 0.0%

Texte: Comment aller à Niort depuis Troyes ?
 - is_correct: 0.0%
 - is_not_trip: 100.0%
 - is_unknown: 0.0%

Texte: Je veux aller de Lyon à Marseille.
 - is_correct: 99.67%
 - is_not_trip: 0.33%
 - is_unknown: 0.0%

Texte: Comment puis-je rejoindre Bordeaux depuis Lille ?
 - is_correct: 100.0%
 - is_not_trip: 0.0%
 - is_unknown: 0.0%

Texte: Je dois me rendre à Montpellier depuis Paris.
 - is_correct: 100.0%
 - is_not_trip: 0.0%
 - is_unknown: 0.0%

Texte: Y a-t-il un train direct de Strasbourg à Metz ?
 - is_correct: 12.06%
 - is_not_trip: 87.94%
 - is_unknown: 0.0%

Texte: Comment aller de Toulouse à Rennes ?
 - is_correct: 5.2%
 - is_not_trip: 94.8%
 - is_unknown: 0.0%

Texte: J'aime voyager dans toute la France.
 - is_correct: 0.0%
 - is_not_trip: 100.0%
 - is_unkn