# In [6]:
# Notebook 4 – predict_subcategories_for_training_data.ipynb
import pandas as pd, numpy as np, pickle
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, accuracy_score

# Predict, for every answer, the sub-category (within the answer's own
# category) whose name embedding is most cosine-similar to the answer
# embedding; then write the predictions, report accuracy against the labelled
# sub-category, and persist the lookup tables for later reuse.

df = pd.read_csv('data_with_embeddings.csv')
answer_emb = np.load('answer_embeddings.npy')

# Rows of `df` and rows of `answer_emb` are paired purely by position, so a
# length mismatch would pair answers with the wrong embeddings (or fail with
# an IndexError part-way through).  Fail early, before loading the model.
n_rows = len(df)
if len(answer_emb) != n_rows:
    raise ValueError(
        f"answer_embeddings.npy has {len(answer_emb)} rows but "
        f"data_with_embeddings.csv has {n_rows}; they must align row-for-row"
    )

model = SentenceTransformer('all-MiniLM-L6-v2')

# One row per distinct (category, subcategory) combination; `subcat_emb[i]`
# is the embedding of the sub-category name stored in row i of `pairs`.
pairs = df[['category', 'subcategory']].drop_duplicates().reset_index(drop=True)
subcat_emb = model.encode(pairs['subcategory'].tolist(), show_progress_bar=True)

# map category → indices of its sub‑categories in `pairs`
cat_to_idx = {}
for i, cat in enumerate(pairs['category']):
    cat_to_idx.setdefault(cat, []).append(i)

# map category → positions (not index labels) of the answers carrying it
rows_by_cat = {}
for pos, cat in enumerate(df['category']):
    rows_by_cat.setdefault(cat, []).append(pos)

# Score all answers of a category against all of that category's
# sub-categories with one similarity matrix, instead of one call per answer.
best_pair_idx = np.empty(n_rows, dtype=np.intp)
# Keep the embeddings' own float precision so the values written to the CSV
# are formatted the same way as the per-row scores were.
sims_best = np.empty(n_rows, dtype=np.result_type(answer_emb.dtype, subcat_emb.dtype))
for cat, rows in rows_by_cat.items():
    idxs = np.asarray(cat_to_idx[cat])
    # sims[r, c]: similarity of answer rows[r] to sub-category idxs[c]
    sims = cosine_similarity(answer_emb[rows], subcat_emb[idxs])
    best_local = sims.argmax(axis=1)
    best_pair_idx[rows] = idxs[best_local]
    sims_best[rows] = sims[np.arange(len(rows)), best_local]

# Translate the winning `pairs` row of each answer into its sub-category name.
pred_subcats = pairs['subcategory'].to_numpy()[best_pair_idx]

df['predicted_subcategory'] = pred_subcats
df['sim_best_subcat'] = sims_best
df.to_csv('answers_with_predicted_subcategory.csv', index=False)

print("Sub‑category accuracy:", accuracy_score(df['subcategory'], df['predicted_subcategory']))
print(classification_report(df['subcategory'], df['predicted_subcategory'], digits=4))

# save mapping and embeddings for reuse
# NOTE: this is a pickle file; only load it back from a trusted location.
with open('subcat_mapping.pkl', 'wb') as f:
    pickle.dump((pairs, subcat_emb, cat_to_idx), f)
