In [None]:
# Importing Libraries
import json
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer

# Load the raw question/answer records from disk.
# presumably the JSON holds a list of {"question": ..., "answer": ...} records — verify
with open("fixed_qa_dataset.json", mode="r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Keep only answers that occur more than once; single-example answers are
# removed to avoid training issues further down.
answer_counts = df["answer"].value_counts()
valid_answers = answer_counts.index[(answer_counts > 1).to_numpy()]
keep_row = df["answer"].isin(valid_answers)
df = df.loc[keep_row].reset_index(drop=True)

# Abort early when the filter leaves fewer than two rows to work with.
if len(df) <= 1:
    raise ValueError(" Not enough data after filtering low-frequency answers.")

# Embed every question with the pretrained "all-MiniLM-L6-v2" model.
model = SentenceTransformer("all-MiniLM-L6-v2")
questions = df["question"].tolist()
X = model.encode(questions, show_progress_bar=True)

# Turn the answer labels into integer category codes for the classifier.
answer_categories = df["answer"].astype("category")
y = answer_categories.cat.codes

# Train/test split, stratified by answer whenever scikit-learn allows it.
#
# A stratified split is only feasible when there is more than one class AND
# both partitions contain at least as many samples as there are classes;
# otherwise train_test_split raises ValueError. Answers with as few as two
# examples are kept by the filtering step, so the 20% test partition can end
# up smaller than the number of classes. In that case fall back to a plain
# random split instead of crashing.
test_size = 0.2
n_samples = len(y)
n_classes = len(np.unique(y))
# Same rounding scikit-learn applies to a fractional test_size.
n_test = int(np.ceil(test_size * n_samples))
n_train = n_samples - n_test

can_stratify = n_classes > 1 and min(n_train, n_test) >= n_classes
stratify = y if can_stratify else None

if n_classes > 1 and not can_stratify:
    print(
        "Warning: too few samples per class for a stratified split "
        f"({n_classes} classes, {n_train} train / {n_test} test samples); "
        "using a random split instead."
    )

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=stratify, random_state=42
)

# Train the classifier on the question embeddings.
#
# The explicit `multi_class="multinomial"` argument was removed: it has been
# deprecated since scikit-learn 1.5 (it emits a FutureWarning and is slated
# for removal). With the lbfgs solver the default already fits a multinomial
# model for targets with more than two classes, so results are unchanged for
# a multiclass answer set.
# NOTE(review): with exactly two classes, older scikit-learn versions would
# fit a plain binary model here instead — confirm the dataset is multiclass.
clf = LogisticRegression(C=10, max_iter=2000, solver="lbfgs")
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out test split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Macro averaging weights every class equally; zero_division=0 scores classes
# with no predicted/true samples as 0 instead of emitting a warning.
precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
recall = recall_score(y_test, y_pred, average="macro", zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)

print(" Accuracy:", round(accuracy * 100, 2), "%")
# Previously computed but never shown: report the macro-averaged metrics too.
print(" Precision (macro):", round(precision * 100, 2), "%")
print(" Recall (macro):", round(recall * 100, 2), "%")
print("\n Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\n Confusion Matrix:")
print(conf_matrix)

# Persist the fitted classifier.
# NOTE(review): the classifier is fitted on integer category codes, and the
# code-to-answer mapping is not written by this block — confirm the consumer
# can rebuild it from the saved records.
with open("trained_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

# Persist the filtered records together with their question embeddings so the
# chatbot can load them without re-encoding.
with open("embedded_qa_dataset.pkl", "wb") as f:
    chatbot_payload = {
        "data": df.to_dict(orient="records"),
        "embeddings": X,
    }
    pickle.dump(chatbot_payload, f)


Batches:   0%|          | 0/91 [00:00<?, ?it/s]