In [4]:
import json
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

# --- Configuration -----------------------------------------------------------
# Everything a reader is likely to tune lives here instead of being buried in
# the calls below.
RANDOM_SEED = 42  # shared by KMeans and the train/test split
TEST_SIZE = 0.2   # fraction of questions held out for evaluation
n_clusters = 50   # tune this value between 20–100 depending on question variety

# --- Load dataset -------------------------------------------------------------
with open("fixed_qa_dataset.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

questions = [item["question"] for item in qa_data]

# Fail early with a clear message instead of after the embedding step:
# KMeans cannot form more clusters than there are samples.
if len(questions) < n_clusters:
    raise ValueError(
        f"Need at least {n_clusters} questions to form {n_clusters} clusters, "
        f"got {len(questions)}"
    )

# --- Generate sentence embeddings ---------------------------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(questions)

# --- Cluster questions into pseudo-classes ------------------------------------
# The cluster ids produced here are the ONLY labels the classifier sees.
# n_init is pinned explicitly: scikit-learn changed its default (10 -> "auto"),
# so leaving it out makes the clustering depend on the installed version even
# with a fixed random_state.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=RANDOM_SEED)
labels = kmeans.fit_predict(embeddings)

# --- Train/test split ---------------------------------------------------------
# NOTE: KMeans was fitted on all embeddings before this split, so the held-out
# rows already influenced their own labels. The scores below measure how well
# the classifier reproduces the clustering, not answer quality.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# --- Train classifier ---------------------------------------------------------
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# --- Evaluate -----------------------------------------------------------------
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("✅ Accuracy:", round(acc * 100, 2), "%")
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\n🧾 Confusion Matrix:")
# Passing labels= fixes the matrix to n_clusters x n_clusters so that row/column
# i always corresponds to cluster id i, even if a cluster is absent from the
# test split.
conf_mat = confusion_matrix(y_test, y_pred, labels=np.arange(n_clusters))
# NumPy elides arrays with more than 1000 elements by default, which would hide
# everything but the corners of this matrix; raise the threshold for this print.
with np.printoptions(threshold=conf_mat.size, linewidth=250):
    print(conf_mat)

# --- Save classifier ----------------------------------------------------------
with open("trained_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

# --- Save embeddings and clustered data ---------------------------------------
# "cluster_labels" is aligned index-for-index with "data" and "embeddings".
with open("embedded_qa_dataset.pkl", "wb") as f:
    pickle.dump({
        "data": qa_data,
        "embeddings": embeddings,
        "cluster_labels": labels
    }, f)


✅ Accuracy: 96.36 %

📊 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.97      1.00      0.98        87
           2       1.00      1.00      1.00         6
           3       0.93      1.00      0.97        28
           4       1.00      1.00      1.00        12
           5       0.92      0.85      0.88        13
           6       1.00      1.00      1.00        11
           7       1.00      1.00      1.00        40
           8       0.95      0.87      0.91        23
           9       1.00      0.92      0.96        13
          10       0.93      0.78      0.85        18
          11       1.00      0.94      0.97        17
          12       1.00      1.00      1.00        15
          13       1.00      1.00      1.00        13
          14       1.00      1.00      1.00         9
          15       1.00      0.79      0.88        14
          16       1.00      1.00  