In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the answer table and the two saved .npy arrays from the working directory.
# NOTE(review): later cells index both arrays by df row position, so they are
# presumably row-aligned with the CSV - confirm against the export step.
df = pd.read_csv(filepath_or_buffer='data_with_embeddings.csv')
answer_embeddings = np.load(file='answer_embeddings.npy')
category_embeddings_all = np.load(file='category_embeddings.npy')

In [3]:
# One embedding per unique category.
# np.unique(..., return_index=True) returns the sorted distinct labels together
# with the position of each label's first occurrence; those positions are used
# to pick both the label and its row in `category_embeddings_all`.
category_labels = df['category']

# The positional lookup below is only meaningful when the embedding array has
# exactly one row per row of `df`; fail fast rather than select the wrong rows.
if len(category_embeddings_all) != len(category_labels):
    raise ValueError(
        f"category_embeddings_all has {len(category_embeddings_all)} rows "
        f"but df has {len(category_labels)}; they must be row-aligned"
    )

# Index the result with [1] instead of unpacking into `_`: binding `_` in a
# notebook shadows IPython's last-output variable.
first_idx = np.unique(category_labels, return_index=True)[1]
unique_categories = category_labels.iloc[first_idx].reset_index(drop=True)
category_embeddings = category_embeddings_all[first_idx]

In [4]:
# Pairwise cosine similarity between every answer embedding (rows) and every
# per-category embedding (columns).
sims_matrix = cosine_similarity(X=answer_embeddings, Y=category_embeddings)

In [5]:
# For each answer, pick the column (category) with the highest similarity and
# store the matching label as the prediction.
best_idx = np.argmax(sims_matrix, axis=1)
df['predicted_category'] = unique_categories.take(best_idx).values


In [26]:
# Attach one `sim_<category>` column per category, then write the enriched
# table to disk. Transposing the matrix lets each category label be paired
# directly with its column of scores.
for cat, scores in zip(unique_categories, sims_matrix.T):
    df[f'sim_{cat}'] = scores

df.to_csv('answers_with_predicted_category_and_sims.csv', index=False)

In [29]:
# Score the predicted labels against the labelled categories: overall accuracy
# first, then the per-class precision / recall / F1 table.
y_true, y_pred = df['category'], df['predicted_category']

overall_accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", overall_accuracy)

per_class_report = classification_report(y_true, y_pred, digits=4)
print(per_class_report)

Accuracy: 0.77
                   precision    recall  f1-score   support

    Career Growth     1.0000    1.0000    1.0000        17
     Compensation     1.0000    0.6923    0.8182        13
         Job Role     0.8000    1.0000    0.8889        12
       Management     0.8000    1.0000    0.8889        16
 Work Environment     0.6250    1.0000    0.7692        20
Work-Life Balance     0.0000    0.0000    0.0000        12
Workplace Culture     1.0000    0.3000    0.4615        10

         accuracy                         0.7700       100
        macro avg     0.7464    0.7132    0.6895       100
     weighted avg     0.7490    0.7700    0.7253       100



In [30]:
# Confusion matrix with rows and columns both ordered by `unique_categories`
# (scikit-learn puts true labels on rows, predicted labels on columns), saved
# as a labelled CSV.
confusion_counts = confusion_matrix(y_true, y_pred, labels=unique_categories)
conf_mat = pd.DataFrame(
    confusion_counts,
    index=unique_categories,
    columns=unique_categories,
)
conf_mat.to_csv('confusion_matrix.csv')