# Importations

In [1]:
import numpy as np
import json
import joblib
import string

import nltk
from nltk.stem import SnowballStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# # Download NLTK resources (if needed)
# nltk.download('punkt')

In [2]:
# Shared French Snowball stemmer instance; stem() below delegates to it.
stemmer = SnowballStemmer("french")

In [3]:
# Read the intent definitions from intents.json (UTF-8) into `intents`
with open('intents.json', mode='r', encoding='utf-8') as intents_file:
    intents = json.loads(intents_file.read())

# Fonctions utiles

In [4]:
def remove_punct(text: str, replacement: str = "") -> str:
    """Strip ASCII punctuation from ``text``.

    Every character listed in ``string.punctuation`` is substituted with
    ``replacement``. Only ASCII punctuation is covered, so typographic
    characters such as the curly apostrophe are left untouched.

    With the default empty ``replacement`` the characters are simply deleted,
    which glues French elisions and hyphenated inversions together
    ("d'admission" -> "dadmission", "allez-vous" -> "allezvous"). Pass
    ``replacement=" "`` to keep the two parts as separate words instead.

    Args:
        text: The string to clean.
        replacement: String substituted for each punctuation character.
            Defaults to ``""`` (deletion).

    Returns:
        A copy of ``text`` with every ASCII punctuation character replaced.
    """
    # One translation table covering every punctuation character.
    table = str.maketrans({char: replacement for char in string.punctuation})
    return text.translate(table)
    

def tokenize(sentence):
    """Split ``sentence`` into tokens with NLTK's ``word_tokenize``.

    The call uses ``word_tokenize``'s default arguments.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens


def stem(word):
    """Return the stem of ``word``.

    The word is lowercased first, then passed to the module-level ``stemmer``.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)

In [5]:
def bag_of_words(tokenized_sentence, words):
    """Encode a tokenized sentence as a binary bag-of-words vector.

    Each token is stemmed with ``stem``; the entries of ``words`` are compared
    as-is against those stems.

    Args:
        tokenized_sentence: Iterable of token strings.
        words: Vocabulary sequence; position ``i`` of the result corresponds
            to ``words[i]``.

    Returns:
        A ``float32`` NumPy array of length ``len(words)`` holding 1 where the
        vocabulary entry matches one of the sentence's stems and 0 elsewhere.
    """
    # A set gives constant-time membership tests, instead of rescanning a
    # list once per vocabulary entry.
    sentence_stems = {stem(token) for token in tokenized_sentence}
    # Start from all zeros and flag the vocabulary entries that are present.
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, vocab_word in enumerate(words):
        if vocab_word in sentence_stems:
            bag[idx] = 1

    return bag

# Prétraitement

In [6]:
words = []
tags = []
xy = []
# Walk through every intent and each of its example questions
for intent in intents['intentions']:
    tag = intent['tag']
    # Record the intent's tag
    tags.append(tag)
    for question in intent['questions']:
        # Strip punctuation, then split the question into tokens
        tokens = tokenize(remove_punct(question))
        # Grow the raw vocabulary with this question's tokens
        words += tokens
        # Keep the (tokens, tag) pair as one training example
        xy.append((tokens, tag))

In [7]:
# Stem (and lowercase) every collected token, keeping only the sorted unique stems.
# NOTE: `words` and `tags` are rebound in place, so re-running this cell alone
# stems the already-stemmed vocabulary a second time.
words = sorted({stem(token) for token in words})
# Deduplicate and sort the tags as well
tags = sorted(set(tags))

print(len(xy), "questions")
print(len(tags), "tags:", tags)
print(len(words), "mots à base unique:", words)

103 questions
21 tags: ['bourse', 'contacts', 'cout_dsi', 'cout_licence', 'cout_master', 'horaire_dsi', 'horaire_licence', 'horaire_master', 'infos_dsi', 'infos_ecole', 'infos_formations', 'infos_licence', 'infos_master', 'localisation', 'prerequis_dsi', 'prerequis_licence', 'prerequis_master', 'programme_dsi', 'programme_licence', 'programme_master', 'salutations']
147 mots à base unique: ['accrédit', 'adress', 'allezvous', 'arriv', 'artificiel', 'atil', 'autor', 'avezvous', 'avoir', 'big', 'bonjour', 'bonsoir', 'bours', 'ce', 'cet', 'chez', 'combien', 'comm', 'comment', 'compétent', 'condit', 'connaitr', 'contact', 'cour', 'coût', 'criter', 'dadmiss', 'dadmissibil', 'dan', 'dat', 'de', 'différent', 'dinform', 'dinscript', 'dir', 'disponibl', 'dobten', 'don', 'du', 'déroul', 'déroulent', 'détud', 'e', 'en', 'enseign', 'est', 'estce', 'estel', 'et', 'financ', 'financi', 'format', 'fourn', 'genr', 'heur', 'horair', 'ia', 'il', 'inform', 'intelligent', 'intens', 'je', 'joindr', 'jour', '

In [8]:
# Build the training data as NumPy arrays, one entry per (tokens, tag) pair in `xy`.
# X: the bag-of-words vector of each tokenized question
X_train = np.array([bag_of_words(question_tokens, words) for question_tokens, _tag in xy])
# y: the position of each question's tag in the `tags` list
y_train = np.array([tags.index(question_tag) for _tokens, question_tag in xy])

# Creation et entrainement du modele

In [9]:
# Create the model: a multinomial naive Bayes classifier
naive_bayes = MultinomialNB()
# Train it on the bag-of-words vectors (X_train) and their tag indices (y_train)
naive_bayes.fit(X_train, y_train)

In [10]:
# Predict on X_train, the same data the model was fitted on; no held-out set is used,
# so the metrics computed from y_pred describe fit to the training data only.
y_pred = naive_bayes.predict(X_train)

In [11]:
# Score the predictions against the training labels (y_train)
accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred)

print("Accuracy:", accuracy)
# Header and report on separate lines, exactly as two consecutive prints would emit them
print("Classification Report:", report, sep="\n")

Accuracy: 0.9320388349514563
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         6
           2       0.80      0.80      0.80         5
           3       0.83      1.00      0.91         5
           4       0.83      1.00      0.91         5
           5       0.67      1.00      0.80         8
           6       1.00      0.83      0.91         6
           7       1.00      1.00      1.00         7
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         4
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         4
          13       1.00      1.00      1.00         6
          14       1.00      0.50      0.67         4
          15       1.00      1.00      1.00         4
          16       1.00      

# Sauvegarde

In [12]:
# Persist the trained classifier to modele.pkl
joblib.dump(naive_bayes, 'modele.pkl')

['modele.pkl']

In [13]:
# Persist the vocabulary list to words.pkl (presumably so bag-of-words vectors
# can be rebuilt with the same word order when the model is loaded elsewhere)
joblib.dump(words, 'words.pkl')

['words.pkl']

In [14]:
# NOTE(review): disabled manual check of a single prediction. Before re-enabling it:
#   - `model` is not defined in the visible notebook; the trained estimator is `naive_bayes`.
#   - training strips punctuation with remove_punct() before tokenizing, this snippet does not.
#   - the prediction is an index into `tags`, not the tag name itself.
# sentence = tokenize("Quel genre de formations offrez-vous ?")
# texte_vectorise = bag_of_words(sentence, words)
# prediction = model.predict([texte_vectorise])[0]
# prediction