In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import matplotlib.pyplot as plt
import seaborn as sns

# Reaching this line means every import in this cell resolved.
print("Bibliothèques importées avec succès.")

  from .autonotebook import tqdm as notebook_tqdm



Bibliothèques importées avec succès.


In [2]:
from torch import cuda

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Running on device: {device}")

Running on device: cuda


In [3]:
# Location of the intent-detection CSV. The default is a machine-specific
# absolute Windows path; set the INTENT_DATASET_PATH environment variable to
# point at the file on any other machine without editing the notebook.
DATASET_PATH = os.environ.get(
    "INTENT_DATASET_PATH",
    r"C:\Users\Seed\Projects\Travel-Order-Resolver\ai\nlp\dataset\text\text_intention_detector.csv",
)

def load_dataset(path=None):
    """Read the intent-detection CSV into a pandas DataFrame.

    Args:
        path: Optional path to the CSV file. When omitted, the module-level
            ``DATASET_PATH`` is used, so ``load_dataset()`` keeps working
            exactly as before.

    Returns:
        pandas.DataFrame: The file parsed with ``;`` as the field delimiter.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file
            (missing path, or a directory).
    """
    if path is None:
        path = DATASET_PATH
    # isfile() rather than exists(): a directory passes exists() and would only
    # fail later, inside read_csv, with a less helpful error.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Le chemin du dataset est incorrect : {path}")
    print("Chargement des données...")
    data = pd.read_csv(path, delimiter=';')
    print("Données chargées avec succès.")
    return data

# Load the CSV once; `data` is the raw DataFrame consumed by the next cells.
data = load_dataset()

Chargement des données...
Données chargées avec succès.


In [4]:
# Print the header line, then the first rows of the raw frame on the next line.
print("\nLes premières lignes du jeu de données :", data.head(), sep="\n")

def preprocess_data(data, test_size=0.2, val_size=0.2, random_state=42, drop_unlabeled=False):
    """Derive integer labels from the flag columns and build the three splits.

    The label of a row is the position of its largest flag among
    ``is_correct`` (0), ``is_not_trip`` (1) and ``is_unknown`` (2). Because
    ``idxmax`` resolves ties in favour of the first column, a row whose three
    flags are all 0 ends up labelled ``is_correct``. Such rows are reported
    with a warning and can be excluded with ``drop_unlabeled=True``.

    Side effect: a ``'label'`` column is written to ``data`` in place.

    Args:
        data: DataFrame with a ``sentence`` column and the three flag columns.
        test_size: Forwarded to the first ``train_test_split`` call; share of
            the rows held out as the test split.
        val_size: Forwarded to the second ``train_test_split`` call; share of
            the remaining (non-test) rows used for validation.
        random_state: Seed forwarded to both ``train_test_split`` calls.
        drop_unlabeled: When True, rows without any positive flag are left out
            of all three splits. Defaults to False, which keeps them.

    Returns:
        DatasetDict with ``train``, ``validation`` and ``test`` splits, each
        holding a ``text`` and a ``label`` column.
    """
    import warnings

    flag_columns = ['is_correct', 'is_not_trip', 'is_unknown']
    label_mapping = {'is_correct': 0, 'is_not_trip': 1, 'is_unknown': 2}

    flags = data[flag_columns]
    data['label'] = flags.idxmax(axis=1).map(label_mapping)

    # Rows where no flag is set carry no real class; idxmax still picks the
    # first column for them, so make that visible instead of staying silent.
    no_positive_flag = flags.max(axis=1) <= 0
    n_unlabeled = int(no_positive_flag.sum())
    usable = data
    if n_unlabeled and drop_unlabeled:
        usable = data.loc[~no_positive_flag]
    elif n_unlabeled:
        warnings.warn(
            f"{n_unlabeled} ligne(s) n'ont aucun indicateur à 1 ; idxmax leur attribue "
            "le label 0 ('is_correct'). Utilisez drop_unlabeled=True pour les exclure.",
            stacklevel=2,
        )

    # First carve out the test split, then split the remainder into train/validation.
    X_train, X_test, y_train, y_test = train_test_split(
        usable['sentence'], usable['label'], test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )

    def _to_dataset(texts, targets):
        # Convert one pair of pandas Series into a Hugging Face Dataset.
        return Dataset.from_dict({'text': texts.tolist(), 'label': targets.tolist()})

    dataset = DatasetDict({
        'train': _to_dataset(X_train, y_train),
        'validation': _to_dataset(X_val, y_val),
        'test': _to_dataset(X_test, y_test),
    })

    print("Données préparées pour Transformers.")
    return dataset

# Build the train/validation/test DatasetDict from the raw frame.
dataset = preprocess_data(data)


Les premières lignes du jeu de données :
                                            sentence  is_correct  is_not_trip  \
0  Y a-t-il un moyen d'aller de Montreux-Vieux à ...           1            0   
1  I would like to travel from culoz to buswiller...           0            0   
2           He lied when he said he didn't like her.           0            1   
3                         ?N|ajOLIY6;DOM'mKavLZZnkAi           0            0   
4                         a(c}sMyu7/97.[-IA@m k0rN0U           0            0   

   is_unknown  
0           0  
1           0  
2           0  
3           1  
4           1  
Données préparées pour Transformers.


In [5]:
# Checkpoint name handed to both the tokenizer loader and the model loader.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=3: one output per class produced by the label mapping.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

print(f"Modèle {model_name} chargé avec succès.")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Modèle distilgpt2 chargé avec succès.


In [6]:
# Give the tokenizer a padding token when the checkpoint does not define one.
if tokenizer.pad_token is None:
    # Register a dedicated '[PAD]' token. Aliasing the pad token to EOS first
    # would be pointless: add_special_tokens replaces it right away.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The vocabulary may have grown, so resize the embedding matrix to match.
    model.resize_token_embeddings(len(tokenizer))

# Keep the model configuration's padding id in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper applied to the dataset splits.
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed ``max_length`` of 100.
    """
    fixed_length_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 100,
    }
    return tokenizer(example["text"], **fixed_length_options)

# Tokenize every split in batches, drop the raw text column and expose the
# targets under the "labels" name.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Ask the datasets to return torch-formatted values when indexed.
tokenized_datasets.set_format("torch")

print("Données tokenisées avec succès.")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Map: 100%|██████████| 833116/833116 [00:28<00:00, 29499.25 examples/s]
Map: 100%|██████████| 208280/208280 [00:07<00:00, 29738.03 examples/s]
Map: 100%|██████████| 260349/260349 [00:08<00:00, 29619.95 examples/s]

Données tokenisées avec succès.





In [None]:

# Align the model configuration's padding id with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best "accuracy" once training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    # Enable fp16 mixed precision only when a CUDA device is present, so the
    # notebook still runs on CPU-only machines instead of failing here.
    fp16=cuda.is_available(),
)

def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is an
            array of per-class scores, or a tuple whose first element is that
            array; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": <float>}``, the share of rows whose highest-scoring
        class equals the reference label.
    """
    logits, labels = eval_pred
    # When predictions arrive as a tuple of arrays, the first element is
    # assumed to hold the class scores.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(np.asarray(logits), axis=-1)
    # Cast to a built-in float so the metric logs/serializes as a plain number.
    acc = float(np.mean(preds == np.asarray(labels)))
    return {"accuracy": acc}

# Wire the model, the train/validation splits and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated in recent transformers releases; the
    # replacement keyword is `processing_class=`.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training; the two prints bracket the call to trainer.train().
print("Début de l'entraînement...")
trainer.train()
print("Entraînement terminé.")

  trainer = Trainer(


Début de l'entraînement...


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate on the test split and print whatever trainer.evaluate returns.
print("\nÉvaluation sur l'ensemble de test...")
results = trainer.evaluate(tokenized_datasets["test"])
print("Résultats :", results)

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Reference class ids.
        y_pred: Predicted class ids.
        labels: Display names for the classes, ordered by class id; class ids
            are assumed to be ``0 .. len(labels) - 1``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Pin the class ids so the matrix always has one row/column per display
    # name, even when a class is absent from both y_true and y_pred.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    ax = sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap="Blues")
    ax.set_title("Matrice de Confusion")
    ax.set_xlabel("Prédictions")
    ax.set_ylabel("Valeurs Réelles")
    plt.show()

# Run inference on the test split once and take both the class scores and the
# reference labels from that single prediction output, so the two arrays come
# from the same pass and in the same row order.
test_output = trainer.predict(tokenized_datasets["test"])
test_preds = test_output.predictions
# Predicted class ids (the historical variable name is kept).
test_labels = np.argmax(test_preds, axis=1)
true_labels = test_output.label_ids

plot_confusion_matrix(true_labels, test_labels, labels=["is_correct", "is_not_trip", "is_unknown"])

In [None]:
def predict_new_texts(new_texts, model, tokenizer):
    """Print the class probabilities predicted by ``model`` for each text.

    Args:
        new_texts: A list of sentences, or a single sentence as a string.
            Empty input prints nothing.
        model: Sequence-classification model whose output exposes ``.logits``
            with one score per class (is_correct, is_not_trip, is_unknown).
        tokenizer: Callable that turns the texts into a mapping of tensors
            accepted by ``model(**inputs)``.

    Returns:
        None. Results are printed, one block per input text.
    """
    import torch

    # Accept a bare string without iterating over its characters.
    if isinstance(new_texts, str):
        new_texts = [new_texts]
    else:
        new_texts = list(new_texts)
    if not new_texts:
        return

    inputs = tokenizer(new_texts, return_tensors="pt", padding=True, truncation=True, max_length=100)
    # The model may live on the GPU (e.g. after training) while the tokenizer
    # returns CPU tensors; move the inputs to wherever the weights are.
    target_device = next(model.parameters()).device
    inputs = {key: value.to(target_device) for key, value in inputs.items()}

    # Run in eval mode without gradient tracking, then restore the
    # training flag so the caller's model state is left unchanged.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    # Bring the probabilities back to the CPU before converting to NumPy.
    probs = outputs.logits.float().softmax(dim=-1).cpu().numpy()

    for i, text in enumerate(new_texts):
        print(f"\nTexte: {text}")
        for j, label in enumerate(["is_correct", "is_not_trip", "is_unknown"]):
            print(f" - {label}: {round(probs[i][j] * 100, 2)}%")

# Sample sentences passed to predict_new_texts with the notebook's model and tokenizer.
new_texts = [
    "Je veux aller de Port-Boulet à Le Havre.",
    "Je veux aller de Nantes à Nantes.",
    "Comment aller à Niort depuis Troyes ?"
]
predict_new_texts(new_texts, model, tokenizer)