In [4]:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from deep_translator import GoogleTranslator
import spacy
from nltk.corpus import stopwords
from nltk import download

In [6]:
# Setup
tqdm.pandas()
download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_lg')
translator = GoogleTranslator(source='auto', target='en')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Load new data (must include 'responseid', 'answer')
new_df = pd.read_csv('new_answers.csv')

In [10]:
# UTF-8 clean + remove gibberish
def normalize_text(text):
    try:
        text = str(text).encode('utf-8', errors='ignore').decode('utf-8')
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ASCII chars
        return text.strip()
    except:
        return ""

In [11]:
# Translate if not English
def translate_to_english(text):
    try:
        return translator.translate(text)
    except:
        return text

In [13]:
# Clean and preprocess
def preprocess_text(text):
    text = normalize_text(text)
    text = translate_to_english(text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'[^a-z\s]', '', text)
    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if token.lemma_ not in stop_words
        and token.pos_ in {'NOUN', 'VERB', 'ADJ'}
        and not token.is_stop
        and not token.is_punct
        and len(token.text) > 2
    ]
    return ' '.join(tokens)

In [14]:
# Apply cleaning
new_df['cleaned_answer'] = new_df['answer'].progress_apply(preprocess_text)

100%|██████████| 100/100 [00:20<00:00,  4.94it/s]


In [16]:
# Load model & label encoder
clf = joblib.load('emb/classifier.pkl')
le = joblib.load('emb/label_encoder.pkl')
model = SentenceTransformer('emb/sentence_model')

In [17]:
# Encode cleaned answers
X_new = model.encode(new_df['cleaned_answer'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [18]:
# Predict
y_probs = clf.predict_proba(X_new)
y_pred = clf.predict(X_new)
y_pred_labels = le.inverse_transform(y_pred)
confidence = np.max(y_probs, axis=1)

In [20]:
# Save predictions
new_df['predicted_category'] = y_pred_labels
new_df['confidence_score']   = confidence
new_df['method_used']        = 'data_science_team'

In [21]:
# Final export
cols = ['responseid', 'answer', 'predicted_category', 'confidence_score', 'method_used']
new_df[cols].to_csv('new_predictions_with_category.csv', index=False)

KeyError: "['responseid'] not in index"