In [6]:
import certifi
ca = certifi.where()

In [7]:
import pymongo as mongo
import pandas as pd

#client = mongo.MongoClient("mongodb+srv://hadyltitri:QScSgXpsINfxAfQC@cluster0.zvbmwjb.mongodb.net/xyzdb?retryWrites=true&w=majority", tlsCAFile=ca)
client = mongo.MongoClient("mongodb://localhost:27017/")
db = client["admin"]

collection_qst = db["questions_clean"]
collection_ans = db["answers_clean"]

# Identifier et supprimer les doublons basés sur l'attribut 'answer_id' dans la collection answers_clean
duplicates = collection_ans.aggregate([
    { '$group': { '_id': '$answer_id', 'duplicates': { '$addToSet': '$_id' }, 'count': { '$sum': 1 } } },
    { '$match': { 'count': { '$gt': 1 } } }
])

for doc in duplicates:
    doc['duplicates'].pop(0)  # Conserver un élément, supprimer les autres
    collection_ans.delete_many({ '_id': { '$in': doc['duplicates'] } })

In [8]:
questions_df = list(collection_qst.find({}))
questions_df = pd.json_normalize(questions_df)
df_questions = pd.DataFrame(questions_df)

answers_df = list(collection_ans.find({}))
answers_df = pd.json_normalize(answers_df)
df_answers = pd.DataFrame(answers_df)

df_merged = pd.merge(df_questions,df_answers,on='question_id',how='inner')

data = df_merged[['title_x','body_x', 'body_y']]

In [9]:
data.head(2)

Unnamed: 0,title_x,body_x,body_y
0,39parametercompiledvalue39 missing execution p...,question parameter sniffing execution plan att...,You can disable the parameter sniffing. When t...
1,ok index column available another index,doubt creating index table condition table 10 ...,Generally speaking having two indexes sharing ...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Vectorisation des textes
tfidf_vectorizer = TfidfVectorizer()

questions_matrix = tfidf_vectorizer.fit_transform(data['title_x'])

In [11]:
# extraire les noms des fonctionnalités/Le vocabumaire extraits des questions
tfidf_tokens = tfidf_vectorizer.get_feature_names_out()

In [12]:
questions_matrix.shape

(59500, 18031)

In [13]:
# Paramètres pour GridSearchCV
param_grid = {'n_clusters': range(7, 15)}

# Tester différents nombres de clusters
# Créer un scorer pour la métrique silhouette
silhouette_scorer = make_scorer(silhouette_score)

# Recherche par grille pour le nombre de clusters avec KMeans
kmeans = KMeans(random_state=42)

grid_search = GridSearchCV(kmeans, param_grid, scoring=silhouette_scorer)
grid_search.fit(questions_matrix)
best_num_clusters = grid_search.best_params_['n_clusters']

Traceback (most recent call last):
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 980, in _score
    scores = scorer(estimator, X_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'



In [14]:
kmeans_clustering = KMeans(n_clusters=best_num_clusters, random_state=42)
question_clusters = kmeans_clustering.fit_predict(questions_matrix)

In [15]:
cluster_label = kmeans_clustering.labels_
data['cluster'] = question_clusters

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cluster'] = question_clusters


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2
import matplotlib.pyplot as plt

In [17]:
# Prétraiter les questions
tfidf_vectorizer1 = TfidfVectorizer(max_features=50000)  
tfidf_matrix = tfidf_vectorizer1.fit_transform(data['title_x'])

In [18]:
# Créer et entraîner un modèle Keras
input_dim = tfidf_matrix.shape[1]

model = Sequential()
model.add(Dense(256, input_dim=input_dim, activation='relu', kernel_regularizer=l2(0.001)))#256: le nombre de neurones, input_dim : nombre de features (vecteurs) que cette couche attend en entrée
model.add(Dropout(0.5)) # 50%=0.5 : pourcentage de neurones à supprimer ou abondonner pour réduire l'overfitting.
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(best_num_clusters, activation='softmax'))  

model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [19]:
# Entraîner le modèle
X_train = tfidf_matrix.toarray()
y_train = data['cluster'].values

history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m744/744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 97ms/step - accuracy: 0.7810 - loss: 0.9049 - val_accuracy: 0.9779 - val_loss: 0.2479
Epoch 2/5
[1m744/744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 95ms/step - accuracy: 0.9767 - loss: 0.2501 - val_accuracy: 0.9830 - val_loss: 0.2156
Epoch 3/5
[1m744/744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 96ms/step - accuracy: 0.9832 - loss: 0.2202 - val_accuracy: 0.9850 - val_loss: 0.2005
Epoch 4/5
[1m744/744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 97ms/step - accuracy: 0.9851 - loss: 0.2051 - val_accuracy: 0.9837 - val_loss: 0.1974
Epoch 5/5
[1m744/744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 97ms/step - accuracy: 0.9859 - loss: 0.1949 - val_accuracy: 0.9831 - val_loss: 0.1917


In [20]:
import pickle
with open("data.pkl", "wb") as f:
    pickle.dump([questions_matrix, data, best_num_clusters, tfidf_vectorizer,tfidf_vectorizer1], f)

with open("model.pkl", "wb") as f1:
    pickle.dump([ model, history], f1)