In [None]:
import pandas as pd
import json
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression  # MUCH faster than GridSearch
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib
import nltk


In [None]:
# Fetch the NLTK resources this notebook relies on (tokenizer models and the
# stop-word corpus). quiet=True suppresses the downloader's progress output.
for resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(resource, quiet=True)

print("="*60)
print("CPU-OPTIMIZED - FAST TRAINING")
print("="*60)

# Load data
# The files are decoded explicitly as UTF-8 so the result does not depend on
# the platform's default text encoding (which is not UTF-8 everywhere).
with open("skills_assessment_data/train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("skills_assessment_data/test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Fail fast with a readable message if the columns the later cells index
# ("text" and "label") are absent, instead of a KeyError further down.
for split_name, split_df in (("train", train_df), ("test", test_df)):
    missing_columns = {"text", "label"} - set(split_df.columns)
    if missing_columns:
        raise ValueError(
            f"{split_name} data is missing required column(s): {sorted(missing_columns)}"
        )

In [None]:
# Tokens that must never be filtered out as stop words: negations and
# sentiment-bearing terms that the classifier should still see.
keep_words = {"delighted", "moved", "uplifted", "breathtaking", "soared", "empty", "annoyed", "not", "no", "barely"}

# NLTK's English stop-word list with the protected tokens removed.
stop_words = set(stopwords.words("english")).difference(keep_words)

# Shared stemmer instance used by the preprocessing step.
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize one document into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; turn a curly apostrophe into a straight one;
    expand "n't" to " not"; replace every character other than a-z,
    whitespace, "$", "!" and "'" with a space; tokenize with
    ``word_tokenize``; drop tokens found in the module-level ``stop_words``
    set; stem what remains with the module-level ``stemmer``.

    Args:
        text: The raw document. Any non-string value (for example ``None`` or
            the float NaN pandas uses for a missing cell) is treated as an
            empty document.

    Returns:
        The surviving stemmed tokens joined by single spaces, or ``""`` when
        no token survives.
    """
    if not isinstance(text, str):
        return ""

    # A curly apostrophe (U+2019) would be stripped by the character filter
    # below before the "n't" expansion could see it, silently dropping the
    # negation; normalize it to a straight apostrophe first.
    text = text.lower().replace("\u2019", "'")
    text = re.sub(r"n't", " not", text)
    # Substitute a space rather than nothing so that words separated only by
    # punctuation or digits ("good,but", "well-made") are not fused into a
    # single bogus token.
    text = re.sub(r"[^a-z\s$!']", " ", text)
    tokens = word_tokenize(text)
    # Stop words are filtered on the unstemmed token, then survivors are stemmed.
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

print("Preprocessing (fast)...")
# Keep an untouched copy of the input the first time this cell runs and always
# derive "text" from that copy. Overwriting "text" from itself would feed
# already-stemmed output back through the preprocessing on every re-run, and
# stemming an already-stemmed token can change it again.
for frame in (train_df, test_df):
    if "raw_text" not in frame.columns:
        frame["raw_text"] = frame["text"]
    frame["text"] = frame["raw_text"].apply(preprocess_text)

In [None]:
# Bag-of-words counts over unigrams and bigrams.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),   # single words plus adjacent word pairs
    min_df=2,             # skip terms seen in fewer than 2 documents
    max_df=0.85,          # skip terms present in more than 85% of documents
    max_features=10000,   # cap the vocabulary size
)

# A single logistic-regression fit with fixed hyperparameters (no search).
classifier = LogisticRegression(solver='saga', C=2.0, max_iter=500, random_state=42)

# Vectorizing and classifying are chained so one fit/predict call runs both.
pipeline = Pipeline(steps=[
    ("vectorizer", vectorizer),
    ("classifier", classifier),
])

print("Training (30-60 seconds on CPU)...")
pipeline.fit(train_df["text"], train_df["label"])

In [None]:
# Accuracy the run is aiming for; used for both the check and the messages.
TARGET_ACCURACY = 0.90

# Score the fitted pipeline on the test frame.
predictions = pipeline.predict(test_df["text"])
accuracy = accuracy_score(test_df["label"], predictions)

# The emoji are written as Unicode escapes: the literal characters had been
# corrupted into mojibake, and escapes survive any source-file re-encoding.
print(f"\n\U0001F3AF ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")

if accuracy >= TARGET_ACCURACY:
    print(f"\u2705 {TARGET_ACCURACY:.0%} ACHIEVED!")
else:
    # Shortfall is reported in percentage points below the target.
    print(f"\u26A0 {(TARGET_ACCURACY-accuracy)*100:.2f}% short")

print("\n" + classification_report(test_df["label"], predictions))

# Save
# NOTE: the persisted pipeline contains only the vectorizer and classifier.
# Text passed to it after loading must first go through the same
# preprocessing that was applied to the "text" column it was fitted on.
joblib.dump(pipeline, 'skills_assessment.joblib')