In [None]:
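# One-time environment setup: uncomment and run these lines on a fresh machine.
# The nltk.download calls require `import nltk` first; they fetch the WordNet,
# Punkt tokenizer and stop-word resources used by the preprocessing cell below.
#%pip install nltk
#%pip install --upgrade pip
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')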

In [None]:
import pickle
# Restore the objects stored in data.pkl (a 5-item sequence, unpacked below).
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open("data.pkl", "rb") as f:
    (
        questions_matrix,
        data,
        best_num_clusters,
        tfidf_vectorizer,
        tfidf_vectorizer1,
    ) = pickle.load(f)

# Restore the model and its training history from model.pkl.
with open("model.pkl", "rb") as f1:
    model, history = pickle.load(f1)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2
import matplotlib.pyplot as plt

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_question(user_question):
    """Normalise a question string before vectorisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and the result is tokenised with NLTK's
    ``word_tokenize``. English stop words are then dropped and each remaining
    token is lemmatised with ``WordNetLemmatizer``.

    Args:
        user_question: The raw question text.

    Returns:
        The processed tokens joined with single spaces.
    """
    # Lower-case, strip punctuation, then tokenise
    cleaned_text = re.sub(r'[^\w\s]', '', user_question.lower())
    raw_tokens = word_tokenize(cleaned_text)

    # Drop English stop words
    english_stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in raw_tokens if token not in english_stop_words]

    # Lemmatise what is left and rebuild the sentence
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(map(lemmatize, content_tokens))

In [None]:
# NOTE(review): `user_question` is not assigned anywhere in the visible notebook,
# so it must already exist in the kernel (e.g. injected by the caller) — confirm.
# Re-running this cell feeds the already-processed text through preprocessing again.
user_question = preprocess_question(user_question) 

### Entraînement du modèle de classification / Prédiction

In [None]:
def extract_first_line(text):
    """Return the part of ``text`` that precedes its first newline.

    The whole string is returned when it contains no newline.
    """
    first_line, _, _ = text.partition('\n')
    return first_line

In [None]:
def recommend_responses(new_question, model, df, tfidf_vectorizer, threshold=0.4, top_n=25):
    """Recommend stored answers whose questions are similar to ``new_question``.

    The model scores every cluster for the question. Clusters are then visited
    from the highest to the lowest score; inside each one, the stored questions
    whose cosine similarity with the new question is strictly greater than
    ``threshold`` contribute their answers. Visiting stops as soon as at least
    ``top_n`` distinct answers have been collected or no cluster is left.

    Args:
        new_question: Question text, already preprocessed by the caller.
        model: Object whose ``predict`` returns one row of per-cluster scores
            for the vectorised question. The column index of each score is
            compared with the values of ``df['cluster']``.
        df: DataFrame with the columns ``'cluster'``, ``'body_x'`` (stored
            question text) and ``'body_y'`` (answer text).
        tfidf_vectorizer: Fitted vectoriser exposing ``transform``.
        threshold: Similarity a stored question must exceed to be recommended.
        top_n: Maximum number of answers returned.

    Returns:
        A tuple ``(response_list, recommended_df)``. ``recommended_df`` has the
        columns ``'body_y'``, ``'cluster'`` and ``'similarity'``, sorted by
        decreasing similarity and limited to ``top_n`` rows; ``response_list``
        is its ``'body_y'`` column as a list. Both are empty when no stored
        question exceeds the threshold.
    """
    new_question_vector = tfidf_vectorizer.transform([new_question])

    # The question never changes, so the cluster scores are predicted once
    # instead of on every pass of the loop.
    predicted_scores = model.predict(new_question_vector.toarray())
    ordered_clusters = np.argsort(-predicted_scores, axis=1).flatten()

    seen_answers = set()
    recommended_responses = []

    # Iterating over the ranked clusters guarantees termination even when the
    # model has fewer outputs than `df` has distinct cluster values.
    for predicted_cluster in ordered_clusters:
        if len(recommended_responses) >= top_n:
            break

        cluster_questions = df[df['cluster'] == predicted_cluster]
        if cluster_questions.empty:
            # Nothing stored for this cluster: there is nothing to compare against.
            continue

        # Cosine similarity between the new question and every question of the cluster
        cluster_question_vectors = tfidf_vectorizer.transform(cluster_questions['body_x'])
        similarities = cosine_similarity(new_question_vector, cluster_question_vectors).flatten()

        above_threshold = similarities > threshold
        if not above_threshold.any():
            continue

        cluster_recommendations = (
            cluster_questions.iloc[np.flatnonzero(above_threshold)][['body_y', 'cluster']].copy()
        )
        cluster_recommendations['similarity'] = similarities[above_threshold]

        # Keep only the first occurrence of each answer, preserving insertion order
        for record in cluster_recommendations.to_dict('records'):
            if record['body_y'] not in seen_answers:
                seen_answers.add(record['body_y'])
                recommended_responses.append(record)

    if not recommended_responses:
        # An empty frame has no 'similarity' column to sort on, so return an
        # explicitly shaped empty result instead of raising KeyError.
        return [], pd.DataFrame(columns=['body_y', 'cluster', 'similarity'])

    recommended_df = (
        pd.DataFrame(recommended_responses)
        .sort_values(by='similarity', ascending=False)
        .head(top_n)
    )

    # Answers only, in the same order as the table
    response_list = recommended_df['body_y'].tolist()

    return response_list, recommended_df

In [None]:
# Get the recommended answers together with their similarity scores
recommended_responses, recommended_df = recommend_responses(user_question, model, data, tfidf_vectorizer1)

# Build the 15 best rows with the similarity rendered as a percentage string.
# Sorting happens on the numeric score before formatting, and the chain works on
# new frames so that `recommended_df` itself is left untouched.
similarities_data = (
    recommended_df
    .sort_values(by='similarity', ascending=False)
    .head(15)
    .assign(similarity=lambda frame: frame['similarity'].map(lambda score: f"{score*100:.2f}%"))
)

# Show the recommended answers as a numbered list
print("Recommended responses:")
for i, response in enumerate(recommended_responses, start=1):
    print(f"{i}. {response}")


In [None]:
# Turn the answers into a single-column table, drop repeated answers and keep
# at most 15 rows. The column is named 'Answers' at construction time, so no
# later rename is needed.
recommended_responses = (
    pd.DataFrame(recommended_responses, columns=['Answers'])
    .drop_duplicates()
    .iloc[:15]
)

print(similarities_data.head(2))

In [None]:
# Export the training history to CSV, limited to its first 15 rows
history_df = pd.DataFrame(history.history).head(15)
history_df.to_csv('history.csv', index=False)

In [None]:
# Write the similarity table and the answers table to CSV files, without the index column
similarities_data.to_csv('data.csv',index=False)
recommended_responses.to_csv('recommendations.csv', index= False)

In [None]:
# Plot the training and validation loss curves
plt.figure()
plt.plot(history.history['loss'], label='Entraînement')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Courbe de la fonction de perte')
plt.xlabel('Épochs')
plt.ylabel('Perte')
# The legend must be drawn before saving, otherwise loss_plot.png is written without it
plt.legend()
plt.savefig('loss_plot.png')
plt.show()

In [None]:
# Plot the training and validation accuracy curves
plt.figure()
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.title('Courbe de la précision du modèle')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
# The legend must be drawn before saving, otherwise accuracy_plot.png is written without it
plt.legend()
plt.savefig('accuracy_plot.png')
# The figure is closed rather than shown: only the saved file is produced here
plt.close()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score

# NOTE(review): the triple-quoted string below is disabled code kept as a bare
# string expression; nothing inside it is executed. The "BufferError" fused onto
# `y_pred` looks like a stray edit, and X_train / y_train are not defined anywhere
# in the visible notebook — confirm both before re-enabling this block.
"""BufferErrory_pred = model.predict(X_train)
y_pred_clusters = np.argmax(y_pred, axis=1)

# Calculer la matrice de confusion
conf_matrix = confusion_matrix(y_train, y_pred_clusters)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot(cmap=plt.cm.Blues)

# Afficher la matrice de confusion
plt.title('Matrice de Confusion')
plt.show()"""