In [None]:
# ----- PREPROCESS & EXPORT -----
import pandas as pd, re, spacy, nltk, os
from nltk.corpus import stopwords
from tqdm import tqdm
# Enable Series.progress_apply (used when the preprocessing is run).
tqdm.pandas()

# Download the NLTK 'stopwords' corpus that stopwords.words() reads below.
nltk.download('stopwords')

# English stop words as a set for fast membership tests inside preprocess().
stop_words = set(stopwords.words('english'))

# spaCy pipeline used by preprocess() for lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    """Normalise one free-text answer into a space-joined string of lemmas.

    Steps:
      1. Anything that is not a ``str`` (NaN, None, numbers, ...) yields ''.
      2. Runs of non-ASCII characters are replaced by a space, then the text
         is lower-cased.
      3. Every character that is not a-z or whitespace is replaced by a space.
      4. The text is parsed with the module-level spaCy pipeline ``nlp``; a
         token's lemma is kept when the lemma is not in ``stop_words`` and the
         token is tagged NOUN, VERB or ADJ.

    Returns:
        str: the kept lemmas joined by single spaces ('' when nothing is kept).
    """
    if not isinstance(text, str):
        return ''

    ascii_lowered = re.sub(r'[^\x00-\x7F]+', ' ', text).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', ascii_lowered)

    kept_pos = {'NOUN', 'VERB', 'ADJ'}
    lemmas = []
    for token in nlp(letters_only):
        lemma = token.lemma_
        if lemma in stop_words:
            continue
        if token.pos_ in kept_pos:
            lemmas.append(lemma)
    return ' '.join(lemmas)

# Source data; per the original note the expected columns are
# answer, category, subcategory.
raw = pd.read_csv('your_dataset.csv')

# Clean every answer (with a progress bar) and keep the result next to the raw text.
cleaned = raw['answer'].progress_apply(preprocess)
raw['cleaned_answer'] = cleaned

# Persist the frame, including the new column, without the index.
raw.to_csv('preprocessed_training_data.csv', index=False)


In [2]:
# Notebook – train_subcategory_models_per_category.ipynb
import os, joblib, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Train one sub-category classifier per category and save it under sub_models/.
#   * category with exactly one sub-category -> sub_models/{cat}_single.pkl
#       holding {'single': <label>}
#   * category with several sub-categories   -> sub_models/{cat}.pkl
#       holding {'classifier': ..., 'label_encoder': ...}
os.makedirs('sub_models', exist_ok=True)

# Columns read by this cell: answer, category, subcategory.
df    = pd.read_csv('input_train_data.csv')
model = SentenceTransformer('emb/sentence_model')          # same encoder used for category model

# dropna(): a missing category would select zero rows below (`== NaN` never
# matches) and the fit would then fail on an empty training set.
for cat in df['category'].dropna().unique():
    # Rows without a sub-category label carry no training signal; keeping them
    # could store NaN as the "single" label or break the label encoder.
    sub_df    = df[df['category'] == cat].dropna(subset=['subcategory'])
    n_subcats = sub_df['subcategory'].nunique()
    if n_subcats == 0:                                      # nothing labelled for this category
        continue

    single_path = f'sub_models/{cat}_single.pkl'
    multi_path  = f'sub_models/{cat}.pkl'

    if n_subcats == 1:                                      # only one sub‑cat → no model needed
        joblib.dump({'single': sub_df['subcategory'].iloc[0]}, single_path)
        continue

    # Missing answers become '' (the prediction cell does the same) so only
    # strings are handed to the encoder.
    texts = sub_df['answer'].fillna('').astype(str).tolist()
    X  = model.encode(texts, show_progress_bar=False)
    le = LabelEncoder()
    y  = le.fit_transform(sub_df['subcategory'])

    clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
    clf.fit(X, y)

    joblib.dump({'classifier': clf, 'label_encoder': le}, multi_path)

    # The prediction cell checks {cat}_single.pkl first; a leftover file from
    # an earlier run (when this category had one sub-category) would shadow
    # the classifier that was just trained.
    if os.path.exists(single_path):
        os.remove(single_path)


In [None]:
# Notebook – train_subcategory_models_per_category.ipynb  (with per‑category accuracy)
import os, joblib, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Train one sub-category classifier per category, save it under sub_models/,
# and export a per-category accuracy summary.
#   * category with exactly one sub-category -> sub_models/{cat}_single.pkl
#       holding {'single': <label>}
#   * category with several sub-categories   -> sub_models/{cat}.pkl
#       holding {'classifier': ..., 'label_encoder': ...}
os.makedirs('sub_models', exist_ok=True)

# Columns read by this cell: cleaned_answer, category, subcategory.
df    = pd.read_csv('preprocessed_training_data.csv')
model = SentenceTransformer('emb/sentence_model')          # same encoder used for category model

results = []                                               # one summary row per trained category

# dropna(): a missing category would select zero rows below (`== NaN` never
# matches) and the fit would then fail on an empty training set.
for cat in df['category'].dropna().unique():
    # Rows without a sub-category label carry no training signal; keeping them
    # could store NaN as the "single" label or break the label encoder.
    sub_df    = df[df['category'] == cat].dropna(subset=['subcategory'])
    n_subcats = sub_df['subcategory'].nunique()
    if n_subcats == 0:                                      # nothing labelled for this category
        continue

    single_path = f'sub_models/{cat}_single.pkl'
    multi_path  = f'sub_models/{cat}.pkl'

    # if only one subcategory, no classifier needed
    if n_subcats == 1:
        single_sub = sub_df['subcategory'].iloc[0]
        joblib.dump({'single': single_sub}, single_path)
        results.append({'category': cat, 'subcats': 1, 'accuracy': 1.0})
        continue

    # embeddings
    # An answer that preprocessing reduced to '' is written to the CSV as an
    # empty field and read back as NaN; restore '' so only strings are encoded.
    texts = sub_df['cleaned_answer'].fillna('').astype(str).tolist()
    X  = model.encode(texts, show_progress_bar=False)

    # label‑encode subcategories
    le = LabelEncoder()
    y  = le.fit_transform(sub_df['subcategory'])

    # train classifier
    clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
    clf.fit(X, y)

    # Training accuracy: scored on the same rows the classifier was fit on,
    # so it is an optimistic figure, not a held-out estimate.
    y_pred = clf.predict(X)
    acc    = accuracy_score(y, y_pred)
    results.append({'category': cat,
                    'subcats': n_subcats,
                    'accuracy': round(acc, 4)})

    # save model & encoder
    joblib.dump({'classifier': clf, 'label_encoder': le}, multi_path)

    # The prediction cell checks {cat}_single.pkl first; a leftover file from
    # an earlier run (when this category had one sub-category) would shadow
    # the classifier that was just trained.
    if os.path.exists(single_path):
        os.remove(single_path)

# export accuracy summary
pd.DataFrame(results).to_csv('sub_models/subcategory_training_accuracy.csv', index=False)


In [6]:
# Notebook – predict_subcategory_from_category_output.ipynb
import os, joblib, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer

# Predict a sub-category for every row, using the per-category artifacts that
# the training cells saved under sub_models/.
model = SentenceTransformer('emb/sentence_model')

# Output of the category model. Columns read by this cell: answer, predicted_category.
df = pd.read_csv('output/answers_with_predicted_category_supervised.csv')

# The raw `answer` text is encoded (missing values -> '').
# NOTE(review): the sub-models must have been trained on the same text column.
# One training cell in this notebook encodes `answer`, the other
# `cleaned_answer` — confirm which one produced the files in sub_models/.
X = model.encode(df['answer'].fillna('').tolist(), show_progress_bar=True)

# str(category) -> (kind, artifact); filled lazily so each file is read from
# disk once instead of once per row.
sub_model_cache = {}


def load_sub_model(cat):
    """Return ``(kind, artifact)`` for a predicted category.

    kind is:
      'single'  - sub_models/{cat}_single.pkl exists; artifact is its dict
                  ({'single': <label>}). This file takes precedence.
      'multi'   - sub_models/{cat}.pkl exists; artifact is its dict
                  ({'classifier': ..., 'label_encoder': ...}).
      'missing' - neither file exists; artifact is None.

    joblib.load unpickles arbitrary objects, so only files written by the
    training cells should be placed in sub_models/.
    """
    key = str(cat)                      # the file names depend only on str(cat)
    if key not in sub_model_cache:
        file_single = f'sub_models/{cat}_single.pkl'
        file_multi  = f'sub_models/{cat}.pkl'
        if os.path.exists(file_single):                         # only one sub‑cat
            sub_model_cache[key] = ('single', joblib.load(file_single))
        elif os.path.exists(file_multi):                        # multi sub‑cat model
            sub_model_cache[key] = ('multi', joblib.load(file_multi))
        else:                                                   # no model found
            sub_model_cache[key] = ('missing', None)
    return sub_model_cache[key]


pred_sub, sub_conf = [], []
for vec, cat in zip(X, df['predicted_category']):
    kind, data = load_sub_model(cat)

    if kind == 'single':
        # Only one possible label, so it is reported with full confidence.
        pred_sub.append(data['single'])
        sub_conf.append(1.0)
    elif kind == 'multi':
        clf       = data['classifier']
        le        = data['label_encoder']
        probs     = clf.predict_proba([vec])[0]
        idx       = probs.argmax()                              # most probable encoded label
        pred_sub.append(le.inverse_transform([idx])[0])
        sub_conf.append(probs[idx])
    else:
        pred_sub.append('unknown')
        sub_conf.append(0.0)

df['predicted_subcategory']   = pred_sub
df['subcat_confidence_score'] = sub_conf
df['method_used']             = 'data_science_team'

# Export is currently disabled; uncomment to write the selected columns to CSV.
#cols = ['responseid', 'answer', 'predicted_category',
#        'predicted_subcategory', 'subcat_confidence_score', 'method_used']
#df[cols].to_csv('new_predictions_with_category_and_subcategory.csv', index=False)
df

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,answer,category,subcategory,predicted_category,confidence_score,predicted_subcategory,subcat_confidence_score,method_used
0,There is a clear path for promotions and recog...,Career Growth,Promotions,Career Growth,0.431380,Promotions,0.613528,data_science_team
1,There is a clear path for promotions and recog...,Career Growth,Promotions,Career Growth,0.431380,Promotions,0.613528,data_science_team
2,There is clarity in what is expected from my p...,Job Role,Clarity,Job Role,0.439469,Clarity,0.458004,data_science_team
3,Our leaders are approachable and provide clear...,Workplace Culture,Leadership,Workplace Culture,0.399964,Leadership,0.518551,data_science_team
4,Ergonomic furniture helps with long working ho...,Work Environment,Ergonomics,Work Environment,0.554878,Ergonomics,0.584357,data_science_team
...,...,...,...,...,...,...,...,...
95,Management communicates updates regularly.,Management,Communication,Management,0.463981,Communication,0.589659,data_science_team
96,Office facilities are well-maintained and reso...,Work Environment,Facilities,Work Environment,0.413396,Facilities,0.558563,data_science_team
97,Ergonomic furniture helps with long working ho...,Work Environment,Ergonomics,Work Environment,0.554878,Ergonomics,0.584357,data_science_team
98,Ergonomic furniture helps with long working ho...,Work Environment,Ergonomics,Work Environment,0.554878,Ergonomics,0.584357,data_science_team
