In [26]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

# Chargement des données

df = pd.read_excel('./extract_normalised_name_fr_training_data.xlsx')


# Extraire chacune des colonnes dans des variables X et Y

X = df['designation_fr']
Y = df['normalised_name']


# Tokenizer notre data

X = X.apply(lambda x: word_tokenize(x.lower()))

# Réduction des mots à leur racines/formes de base

stemmer = SnowballStemmer('french')
X = X.apply(lambda tokens: [stemmer.stem(token) for token in tokens])

# Rattacher les mots séparés en un seul text

X = X.apply(lambda tokens: ' '.join(tokens))

# Vectoriser le texte avec CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Export de notre vectorizer pour réutilisation directe

# with open('trained_Nlp_vectorizer.pkl', 'wb') as file:
#      pickle.dump(vectorizer, file)

        
#Séparation de nos données en données d'entraînement et données de test

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


#Instanciation du model ici nous allons utiliser la regression logistique avec une régularisation l2 pour éviter l'overfitting

clf = LogisticRegression(penalty='l2')


# Entrainement du modèle 

clf.fit(X_train, Y_train)

# Export du modèle pour réutilisation directe 

# with open('trained_Nlp_model.pkl', 'wb') as file:
#      pickle.dump(clf, file)
        
# Evaluation du modéle

Y_pred = clf.predict(X_test)


# calcul des metrics :  accuracy, precision, recall, et F1-score

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
f1_score = f1_score(Y_test, Y_pred, average='weighted')

# Affichage des metrics calculées

print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1-score: {:.2f}".format(f1_score))

# Fonction predict pour tester notre modele 

def predict(name):
    # refaire les mêmes traitements sur le nom d'équipements qu'on veut normaliser
    tokens = word_tokenize(name.lower())
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    preprocessed_name = ' '.join(stemmed_tokens)
    
    # Vectoriser le texte avec CountVectorizer
    
    X_test = vectorizer.transform([preprocessed_name])
    
   # Prédire à l'aide du model entrainé

    predictions = [(label, prob) for label, prob in zip(clf.classes_,  clf.predict_proba(X_test)[0])]
    predictions.sort(key=lambda x: x[1], reverse=True)
  
    predictions = predictions[:3] # J'ai choisi d'afficher la liste des trois premiers classes d'apprtenance avec leurs probabilités
       
    if predictions:
        return predictions
    else:
        return None

# Test

equipment_names = [' réacteurs ', 'Transformateurs Triphasé (T13)    ', '1 Conge À Sirop ppp Électrique 123']
for name in equipment_names:
    predicted_names = predict(name)
    if predicted_names is not None:
        print("-------------------------------------------")
        print(f"Equipment Name: {name}")
        for prediction in predicted_names:
            print(f"Predicted Name: {prediction[0]}, Confidence: {prediction[1]}",)


Accuracy: 0.99
Precision: 0.98
Recall: 0.99
F1-score: 0.98
-------------------------------------------
Equipment Name:  réacteurs 
Predicted Name: INOX_REACTOR, Confidence: 0.9745685809048135
Predicted Name: PREPARATION_VESSEL, Confidence: 0.006181249632191923
Predicted Name: BOTTLE_FILLING_MACHINE, Confidence: 0.0029160785598131285
-------------------------------------------
Equipment Name: Transformateurs Triphasé (T13)    
Predicted Name: TRANSFORMER, Confidence: 0.9953646596363122
Predicted Name: BOTTLE_FILLING_MACHINE, Confidence: 0.0006705574476757722
Predicted Name: MELTING_POT, Confidence: 0.0005835853483289741
-------------------------------------------
Equipment Name: 1 Conge À Sirop ppp Électrique 123
Predicted Name: MELTING_POT, Confidence: 0.9426550956526978
Predicted Name: TRANSFORMER, Confidence: 0.025044844403544566
Predicted Name: BOTTLE_FILLING_MACHINE, Confidence: 0.004718479860022788


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
pip install pltk

^C
Note: you may need to restart the kernel to use updated packages.
