# Text Classification with SVM (Kaggle Version)

**Self-contained notebook for Kaggle** — it needs no external `embeddings/` or `utils/` files: all preprocessing and TF-IDF / Skip-gram / CBOW code is inlined. It uses only libraries that are preinstalled on Kaggle (pandas, numpy, scikit-learn, nltk, gensim, matplotlib, seaborn).

- **Embeddings**: TF-IDF, Skip-gram (Word2Vec), CBOW (Word2Vec) — defined in this notebook
- **Dataset**: Add your dataset in Kaggle and set `DATA_PATH` in the next cell
- **Text column**: Product Title  
- **Label column**: Cluster Label

In [None]:
# Kaggle: point DATA_PATH at the dataset CSV.
# (Add Data -> your dataset -> copy the path of the CSV file.)
import os

DATA_PATH = '/kaggle/input/pricerunner-aggregate-csv/pricerunner_aggregate.csv'

# When the Kaggle input path is absent (e.g. a local run), use the local copy instead.
DATA_PATH = DATA_PATH if os.path.exists(DATA_PATH) else 'data/pricerunner_aggregate.csv'
print(f"Using data: {DATA_PATH}")

# Speed knobs.
#   FAST_MODE          True = much faster run (~10–20 min); False = full data & grid (~hours)
#   MAX_CLASSES        keep only the N most frequent classes (full data has ~7800)
#   MAX_TRAIN_SAMPLES  upper bound on the training-set size when FAST_MODE is on
FAST_MODE, MAX_CLASSES, MAX_TRAIN_SAMPLES = True, 300, 8000

In [None]:
# Download the NLTK resources used by the preprocessing code
# (Kaggle has NLTK; may need punkt_tab in addition to punkt).
import nltk

for name in ['punkt_tab', 'punkt', 'stopwords', 'wordnet']:
    try:
        # nltk.download() reports most failures by returning False instead of
        # raising, so the return value has to be checked explicitly.
        downloaded = nltk.download(name, quiet=True)
    except Exception as e:  # best effort: one missing resource must not stop the loop
        print(f"  {name}: {e}")
    else:
        if downloaded:
            print(f"✓ {name}")
        else:
            print(f"  {name}: download failed")
print("NLTK ready.")

In [None]:
# Imports (all available on Kaggle)
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import seaborn as sns

print("All libraries imported.")

## Text Preprocessor

In [None]:
class TextPreprocessor:
    """Turn raw text into a list of cleaned tokens.

    Pipeline: lowercase and strip URLs / e-mail addresses / non-letters,
    tokenize with NLTK, drop short tokens and (optionally) English stop
    words, then (optionally) lemmatize with WordNet.

    Args:
        min_word_length: Tokens with fewer characters than this are dropped.
        remove_stopwords: Drop NLTK English stop words when true.
        lemmatize: Run surviving tokens through ``WordNetLemmatizer`` when true.
    """

    # Substitution patterns, applied in this order by clean_text().
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    _EMAIL_PATTERN = re.compile(r'\S+@\S+')
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self, min_word_length=2, remove_stopwords=True, lemmatize=True):
        self.min_word_length = min_word_length
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize

        if remove_stopwords:
            self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = set()

        if lemmatize:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = None

    def clean_text(self, text):
        """Return ``text`` lowercased with everything but letters and single spaces removed.

        Missing values (anything ``pd.isna`` reports as NA) become ``""``.
        Note that digits are removed together with punctuation.
        """
        if pd.isna(text):
            return ""
        lowered = str(text).lower()
        without_urls = self._URL_PATTERN.sub('', lowered)
        without_emails = self._EMAIL_PATTERN.sub('', without_urls)
        letters_only = self._NON_LETTER_PATTERN.sub('', without_emails)
        return self._WHITESPACE_PATTERN.sub(' ', letters_only).strip()

    def tokenize(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def preprocess(self, text):
        """Clean, tokenize, filter and (optionally) lemmatize ``text``; returns a token list."""
        kept = []
        for token in self.tokenize(self.clean_text(text)):
            if len(token) < self.min_word_length:
                continue
            if self.remove_stopwords and token in self.stop_words:
                continue
            kept.append(token)

        if not (self.lemmatize and self.lemmatizer):
            return kept
        return [self.lemmatizer.lemmatize(token) for token in kept]

    def preprocess_for_tfidf(self, text):
        """Return the preprocessed tokens joined by single spaces (input for TF-IDF)."""
        return ' '.join(self.preprocess(text))

    def preprocess_for_embeddings(self, text):
        """Return the preprocessed tokens as a list (input for Word2Vec)."""
        return self.preprocess(text)

# Shared preprocessor instance used by the cells below; the arguments spell out
# the constructor defaults (2-character minimum, stop-word removal, lemmatization).
preprocessor = TextPreprocessor(min_word_length=2, remove_stopwords=True, lemmatize=True)
print("TextPreprocessor defined.")

## TF-IDF, Skip-gram, CBOW (no external files)

In [None]:
class TFIDFEmbedding:
    """Wrapper around scikit-learn's ``TfidfVectorizer`` that returns dense arrays.

    The vectorizer is created with ``lowercase=False``, so the texts passed in
    are used with whatever casing they already have.

    Args:
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.
        ngram_range: Passed to ``TfidfVectorizer(ngram_range=...)``.
        min_df: Passed to ``TfidfVectorizer(min_df=...)``.
        max_df: Passed to ``TfidfVectorizer(max_df=...)``.

    Attributes:
        vectorizer: The underlying ``TfidfVectorizer``.
        is_fitted: True once ``fit`` or ``fit_transform`` has completed.
    """

    def __init__(self, max_features=10000, ngram_range=(1, 2), min_df=2, max_df=0.95):
        self.vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range,
                                          lowercase=False, min_df=min_df, max_df=max_df)
        self.is_fitted = False

    def fit(self, texts):
        """Learn the vocabulary and IDF weights from ``texts`` (an iterable of strings)."""
        self.vectorizer.fit(texts)
        self.is_fitted = True

    def transform(self, texts):
        """Return the TF-IDF matrix for ``texts`` as a dense ``numpy`` array."""
        return self.vectorizer.transform(texts).toarray()

    def fit_transform(self, texts):
        """Fit on ``texts`` and return their dense TF-IDF matrix in one step."""
        matrix = self.vectorizer.fit_transform(texts)
        # Keep the flag consistent with fit(): the vectorizer is fitted after this call.
        self.is_fitted = True
        return matrix.toarray()


class SkipGramEmbedding:
    """Document embeddings obtained by averaging Word2Vec (``sg=1``) word vectors.

    Args:
        vector_size: Passed to ``Word2Vec(vector_size=...)``; also the width of
            the matrix returned by ``transform``.
        window: Passed to ``Word2Vec(window=...)``.
        min_count: Passed to ``Word2Vec(min_count=...)``.
        workers: Passed to ``Word2Vec(workers=...)``.
        epochs: Passed to ``Word2Vec(epochs=...)``.

    Attributes:
        model: The trained ``Word2Vec`` model, or ``None`` before ``fit``.
        is_fitted: True once ``fit`` has completed.
    """

    def __init__(self, vector_size=300, window=5, min_count=2, workers=4, epochs=10):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.epochs = epochs
        self.model = None
        self.is_fitted = False

    def fit(self, tokenized_texts):
        """Train a Word2Vec model (``sg=1``) on ``tokenized_texts`` (lists of tokens)."""
        self.model = Word2Vec(sentences=tokenized_texts, vector_size=self.vector_size,
                               window=self.window, min_count=self.min_count,
                               workers=self.workers, sg=1, epochs=self.epochs)
        self.is_fitted = True

    def transform(self, tokenized_texts):
        """Return one averaged word vector per document.

        Tokens missing from the model vocabulary are skipped; a document with
        no known token maps to the zero vector.

        Args:
            tokenized_texts: Iterable of token lists.

        Returns:
            A float64 array of shape ``(n_documents, vector_size)``; the shape
            is ``(0, vector_size)`` for empty input.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError("SkipGramEmbedding.transform() called before fit().")

        word_vectors = self.model.wv
        rows = []
        for tokens in tokenized_texts:
            known = [word_vectors[t] for t in tokens if t in word_vectors]
            rows.append(np.mean(known, axis=0) if known else np.zeros(self.vector_size))
        # Fixed dtype and an explicit reshape keep the result 2-D and float64
        # regardless of the input (including an empty list of documents).
        return np.array(rows, dtype=float).reshape(-1, self.vector_size)


class CBOWEmbedding:
    """Document embeddings obtained by averaging Word2Vec (``sg=0``) word vectors.

    Args:
        vector_size: Passed to ``Word2Vec(vector_size=...)``; also the width of
            the matrix returned by ``transform``.
        window: Passed to ``Word2Vec(window=...)``.
        min_count: Passed to ``Word2Vec(min_count=...)``.
        workers: Passed to ``Word2Vec(workers=...)``.
        epochs: Passed to ``Word2Vec(epochs=...)``.

    Attributes:
        model: The trained ``Word2Vec`` model, or ``None`` before ``fit``.
        is_fitted: True once ``fit`` has completed.
    """

    def __init__(self, vector_size=300, window=5, min_count=2, workers=4, epochs=10):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.epochs = epochs
        self.model = None
        self.is_fitted = False

    def fit(self, tokenized_texts):
        """Train a Word2Vec model (``sg=0``) on ``tokenized_texts`` (lists of tokens)."""
        self.model = Word2Vec(sentences=tokenized_texts, vector_size=self.vector_size,
                               window=self.window, min_count=self.min_count,
                               workers=self.workers, sg=0, epochs=self.epochs)
        self.is_fitted = True

    def transform(self, tokenized_texts):
        """Return one averaged word vector per document.

        Tokens missing from the model vocabulary are skipped; a document with
        no known token maps to the zero vector.

        Args:
            tokenized_texts: Iterable of token lists.

        Returns:
            A float64 array of shape ``(n_documents, vector_size)``; the shape
            is ``(0, vector_size)`` for empty input.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError("CBOWEmbedding.transform() called before fit().")

        word_vectors = self.model.wv
        rows = []
        for tokens in tokenized_texts:
            known = [word_vectors[t] for t in tokens if t in word_vectors]
            rows.append(np.mean(known, axis=0) if known else np.zeros(self.vector_size))
        # Fixed dtype and an explicit reshape keep the result 2-D and float64
        # regardless of the input (including an empty list of documents).
        return np.array(rows, dtype=float).reshape(-1, self.vector_size)

print("TFIDFEmbedding, SkipGramEmbedding, CBOWEmbedding defined.")

## 1. Load and Explore Data

In [None]:
# Load the CSV file selected by DATA_PATH
df = pd.read_csv(DATA_PATH)

# Strip whitespace from column names so the lookups by name below match
# even when the CSV header is padded with spaces
df.columns = df.columns.str.strip()

print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# Column names for rest of notebook
text_col = 'Product Title'
label_col = 'Cluster Label'

# Fail here with a readable message instead of a bare KeyError in a later cell
missing_cols = [col for col in (text_col, label_col) if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in {DATA_PATH}; "
        f"available columns: {df.columns.tolist()}"
    )

# Show first rows. This must stay the LAST expression of the cell: Jupyter only
# renders the value of a cell's final expression.
df.head()

## 2. Preprocess

In [None]:
# Keep only rows that have both a title and a label
df_clean = df[[text_col, label_col]].dropna()

# Run the preprocessing pipeline once per title. The TF-IDF input is simply the
# same tokens joined by spaces (that is what preprocess_for_tfidf does), so it
# is derived from the token lists instead of preprocessing every title twice.
texts_tokenized = [preprocessor.preprocess_for_embeddings(t) for t in df_clean[text_col]]
texts_tfidf = [' '.join(tokens) for tokens in texts_tokenized]

print(f"Samples: {len(texts_tfidf)}")
if texts_tfidf:  # nothing to preview when every row was dropped
    print(f"Sample (TF-IDF): {texts_tfidf[0][:80]}...")
    print(f"Sample (tokens): {texts_tokenized[0][:10]}...")

## 3. Fit Embeddings (in-notebook only, no saved files)

In [None]:
# FAST_MODE uses a smaller TF-IDF vocabulary and fewer Word2Vec epochs
max_feat, w2v_epochs = (5000, 5) if FAST_MODE else (10000, 10)

tfidf_emb = TFIDFEmbedding(max_features=max_feat, ngram_range=(1, 2))
skipgram_emb = SkipGramEmbedding(vector_size=300, window=5, min_count=2, epochs=w2v_epochs)
cbow_emb = CBOWEmbedding(vector_size=300, window=5, min_count=2, epochs=w2v_epochs)

# NOTE(review): the embeddings are fitted on the whole corpus, i.e. before the
# train/val/test split in section 4, so validation and test texts influence the
# vocabulary / word vectors. Confirm that this is intended.
for progress_message, embedding, corpus in (
    ("Fitting TF-IDF...", tfidf_emb, texts_tfidf),
    ("Fitting Skip-gram...", skipgram_emb, texts_tokenized),
    ("Fitting CBOW...", cbow_emb, texts_tokenized),
):
    print(progress_message)
    embedding.fit(corpus)
print("All embeddings fitted.")

## 4. Train/Val/Test Split

In [None]:
from collections import Counter
import random


def _split_indices(indices, labels, test_size, stratify):
    """Split ``indices`` / ``labels`` with ``train_test_split`` (random_state=42).

    When ``stratify`` is true a stratified split is attempted first. If
    scikit-learn rejects it with ``ValueError`` (for example because a class
    has a single member left), a plain random split is used instead of
    aborting the notebook.

    Returns:
        ``(idx_a, idx_b, labels_a, labels_b)`` exactly as ``train_test_split`` does.
    """
    if stratify:
        try:
            return train_test_split(indices, labels, test_size=test_size,
                                    random_state=42, stratify=labels)
        except ValueError as err:
            print(f"Stratified split not possible ({err}); falling back to a random split.")
    return train_test_split(indices, labels, test_size=test_size, random_state=42)


# --- Encode labels, dropping classes that are too small ----------------------
raw_labels = df_clean[label_col].tolist()

# This cell overwrites texts_tfidf / texts_tokenized below; if it is re-run
# without re-running the Preprocess cell the lists no longer line up with
# df_clean. Stop instead of silently pairing texts with the wrong labels.
if not (len(texts_tfidf) == len(texts_tokenized) == len(raw_labels)):
    raise RuntimeError(
        "texts_tfidf / texts_tokenized are out of sync with df_clean; "
        "re-run the Preprocess cell before this one."
    )

min_samples_per_class = 2
class_counts = Counter(raw_labels)
valid_indices = [i for i, label in enumerate(raw_labels)
                 if class_counts[label] >= min_samples_per_class]

if len(valid_indices) < len(raw_labels):
    texts_tfidf = [texts_tfidf[i] for i in valid_indices]
    texts_tokenized = [texts_tokenized[i] for i in valid_indices]
    raw_labels = [raw_labels[i] for i in valid_indices]

if not raw_labels:
    raise ValueError(
        f"No samples left after dropping classes with fewer than {min_samples_per_class} samples."
    )

# Fit the encoder once, on the original label values, so that
# label_encoder.classes_ / inverse_transform still give the real labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(raw_labels)

# --- 70 / 15 / 15 split over sample indices -----------------------------------
test_size = 0.15
val_size = 0.15 / 0.85  # 15% of the full data, expressed relative to the remaining 85%
class_counts_after = Counter(y)
num_classes = len(class_counts_after)
min_class_count = min(class_counts_after.values())
total_samples = len(y)
min_test_samples = int(test_size * total_samples)
# Stratify only if the test split can hold at least one sample of every class
can_stratify = (min_test_samples >= num_classes) and (min_class_count >= 2)

X_temp, X_test, y_temp, y_test = _split_indices(list(range(total_samples)), y, test_size, can_stratify)
X_train, X_val, y_train, y_val = _split_indices(X_temp, y_temp, val_size, can_stratify)

texts_tfidf_train = [texts_tfidf[i] for i in X_train]
texts_tfidf_val = [texts_tfidf[i] for i in X_val]
texts_tfidf_test = [texts_tfidf[i] for i in X_test]
texts_tokenized_train = [texts_tokenized[i] for i in X_train]
texts_tokenized_val = [texts_tokenized[i] for i in X_val]
texts_tokenized_test = [texts_tokenized[i] for i in X_test]

# --- FAST_MODE: keep only top MAX_CLASSES and optionally cap train size --------
if FAST_MODE and MAX_CLASSES is not None:
    y_train, y_val, y_test = np.asarray(y_train), np.asarray(y_val), np.asarray(y_test)

    top_set = {c for c, _ in Counter(y_train.tolist()).most_common(MAX_CLASSES)}
    keep_train = [i for i, label in enumerate(y_train.tolist()) if label in top_set]
    if MAX_TRAIN_SAMPLES and len(keep_train) > MAX_TRAIN_SAMPLES:
        # Private RNG: same draw as random.seed(42) + random.sample, without
        # reseeding the global generator.
        keep_train = random.Random(42).sample(keep_train, MAX_TRAIN_SAMPLES)

    # Subsampling can remove every training example of a class. Val/test rows
    # are kept only for classes that are still present in train, otherwise the
    # re-encoding below would meet labels it has never seen.
    surviving_classes = np.unique(y_train[keep_train])
    surviving_set = set(surviving_classes.tolist())
    keep_val = [i for i, label in enumerate(y_val.tolist()) if label in surviving_set]
    keep_test = [i for i, label in enumerate(y_test.tolist()) if label in surviving_set]

    # Re-number the surviving classes 0..K-1 (sorted order is preserved) and
    # rebuild the encoder on their original label values.
    y_train = np.searchsorted(surviving_classes, y_train[keep_train])
    y_val = np.searchsorted(surviving_classes, y_val[keep_val])
    y_test = np.searchsorted(surviving_classes, y_test[keep_test])
    label_encoder = LabelEncoder().fit(label_encoder.classes_[surviving_classes])

    X_train = [X_train[i] for i in keep_train]
    X_val = [X_val[i] for i in keep_val]
    X_test = [X_test[i] for i in keep_test]
    texts_tfidf_train = [texts_tfidf_train[i] for i in keep_train]
    texts_tfidf_val = [texts_tfidf_val[i] for i in keep_val]
    texts_tfidf_test = [texts_tfidf_test[i] for i in keep_test]
    texts_tokenized_train = [texts_tokenized_train[i] for i in keep_train]
    texts_tokenized_val = [texts_tokenized_val[i] for i in keep_val]
    texts_tokenized_test = [texts_tokenized_test[i] for i in keep_test]
    print(f"FAST_MODE: Train {len(y_train)}, Val {len(y_val)}, Test {len(y_test)}, Classes {len(np.unique(y_train))}")

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

## 5. Transform Splits with Embeddings

In [None]:
# Turn each split into a feature matrix with the already-fitted embeddings.
# Every embedding is applied to the (train, val, test) triple in that order.
X_train_tfidf, X_val_tfidf, X_test_tfidf = (
    tfidf_emb.transform(split)
    for split in (texts_tfidf_train, texts_tfidf_val, texts_tfidf_test)
)

X_train_skipgram, X_val_skipgram, X_test_skipgram = (
    skipgram_emb.transform(split)
    for split in (texts_tokenized_train, texts_tokenized_val, texts_tokenized_test)
)

X_train_cbow, X_val_cbow, X_test_cbow = (
    cbow_emb.transform(split)
    for split in (texts_tokenized_train, texts_tokenized_val, texts_tokenized_test)
)

print(f"TF-IDF: {X_train_tfidf.shape}, Skip-gram: {X_train_skipgram.shape}, CBOW: {X_train_cbow.shape}")

## 6. Baseline SVM

In [None]:
def _fit_baseline_svm(name, X_fit, y_fit, X_eval, y_eval):
    """Fit a default-parameter SVC and print its validation accuracy and macro-F1.

    Args:
        name: Label used in the printed line (e.g. ``"TF-IDF"``).
        X_fit, y_fit: Training features and labels.
        X_eval, y_eval: Validation features and labels.

    Returns:
        ``(fitted_model, validation_predictions)``.
    """
    model = SVC(random_state=42, probability=False)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    # zero_division=0 matches the test-set evaluation in section 8: classes
    # that are never predicted score 0 explicitly.
    macro_f1 = f1_score(y_eval, predictions, average='macro', zero_division=0)
    print(f"{name} Val Accuracy: {accuracy:.4f}, F1: {macro_f1:.4f}")
    return model, predictions


baseline_svm_tfidf, y_val_pred_baseline_tfidf = _fit_baseline_svm(
    "TF-IDF", X_train_tfidf, y_train, X_val_tfidf, y_val)

baseline_svm_skipgram, y_val_pred_baseline_skipgram = _fit_baseline_svm(
    "Skip-gram", X_train_skipgram, y_train, X_val_skipgram, y_val)

baseline_svm_cbow, y_val_pred_baseline_cbow = _fit_baseline_svm(
    "CBOW", X_train_cbow, y_train, X_val_cbow, y_val)

## 7. Hyperparameter Tuning

In [None]:
# FAST_MODE: minimal grid (1 setting) + cv=2 for much faster tuning
if FAST_MODE:
    svm_param_grid = {'C': [1.0], 'kernel': ['linear'], 'class_weight': ['balanced']}
    grid_cv_folds, use_probability = 2, False
else:
    c_values = [0.01, 0.1, 1.0, 10.0]
    class_weights = [None, 'balanced']
    # One sub-grid per kernel: `gamma` has no effect on the linear kernel, so
    # crossing it with 'linear' would only repeat identical fits.
    svm_param_grid = [
        {'kernel': ['linear'], 'C': c_values, 'class_weight': class_weights},
        {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto'], 'class_weight': class_weights},
    ]
    grid_cv_folds, use_probability = 3, True
print("Param grid:", svm_param_grid)
print(f"CV folds: {grid_cv_folds}")

In [None]:
print("GridSearch SVM + TF-IDF...")

# Search settings collected in one place: macro-F1 scoring, all cores, and a
# refit of the best configuration on the full training set.
tfidf_search_options = {
    'param_grid': svm_param_grid,
    'cv': grid_cv_folds,
    'scoring': 'f1_macro',
    'n_jobs': -1,
    'verbose': 1,
    'refit': True,
}
tfidf_estimator = SVC(random_state=42, probability=use_probability)
svm_tfidf = GridSearchCV(tfidf_estimator, **tfidf_search_options)
svm_tfidf.fit(X_train_tfidf, y_train)

print(f"Best: {svm_tfidf.best_params_}  Score: {svm_tfidf.best_score_:.4f}")

# Validation predictions come from the refitted search object
y_val_pred_tfidf = svm_tfidf.predict(X_val_tfidf)
print(f"Val F1: {f1_score(y_val, y_val_pred_tfidf, average='macro'):.4f}")


In [None]:
print("GridSearch SVM + Skip-gram...")

# Grid search over svm_param_grid, scored by macro-F1; refit=True retrains the
# best configuration on the whole training set.
svm_skipgram = GridSearchCV(
    SVC(random_state=42, probability=use_probability),
    param_grid=svm_param_grid,
    cv=grid_cv_folds,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
    refit=True,
)
svm_skipgram.fit(X_train_skipgram, y_train)

# A format spec such as ':.4f' is only valid inside an f-string
print(f"Best: {svm_skipgram.best_params_}  Score: {svm_skipgram.best_score_:.4f}")

y_val_pred_skipgram = svm_skipgram.predict(X_val_skipgram)
print(f"Val F1: {f1_score(y_val, y_val_pred_skipgram, average='macro'):.4f}")

In [None]:
print("GridSearch SVM + CBOW...")

# Grid search over svm_param_grid, scored by macro-F1; refit=True retrains the
# best configuration on the whole training set.
svm_cbow = GridSearchCV(
    SVC(random_state=42, probability=use_probability),
    param_grid=svm_param_grid,
    cv=grid_cv_folds,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1,
    refit=True,
)
svm_cbow.fit(X_train_cbow, y_train)

# A format spec such as ':.4f' is only valid inside an f-string
print(f"Best: {svm_cbow.best_params_}  Score: {svm_cbow.best_score_:.4f}")

y_val_pred_cbow = svm_cbow.predict(X_val_cbow)
print(f"Val F1: {f1_score(y_val, y_val_pred_cbow, average='macro'):.4f}")

## 8. Test Set Evaluation

In [None]:
# Predict the held-out test split with each tuned model
y_test_pred_tfidf = svm_tfidf.predict(X_test_tfidf)
y_test_pred_skipgram = svm_skipgram.predict(X_test_skipgram)
y_test_pred_cbow = svm_cbow.predict(X_test_cbow)

# Accuracy and macro-F1 per embedding; the insertion order fixes the row order
# of the comparison table.
test_predictions = {
    'TF-IDF': y_test_pred_tfidf,
    'Skip-gram': y_test_pred_skipgram,
    'CBOW': y_test_pred_cbow,
}
results = {
    embedding_name: {
        'accuracy': accuracy_score(y_test, predicted),
        'f1_macro': f1_score(y_test, predicted, average='macro', zero_division=0),
    }
    for embedding_name, predicted in test_predictions.items()
}
comparison_df = pd.DataFrame(results).T
print(comparison_df.round(4))

In [None]:
# One bar chart per metric, comparing the three embeddings side by side
embedding_names = ['TF-IDF', 'Skip-gram', 'CBOW']
bar_colors = ['#3498db', '#e74c3c', '#2ecc71']
metrics = ['accuracy', 'f1_macro']

fig, ax = plt.subplots(1, 2, figsize=(12, 4))
for panel, metric in zip(ax, metrics):
    scores = [results[emb][metric] for emb in embedding_names]
    panel.bar(embedding_names, scores, color=bar_colors)
    panel.set_ylim(0, 1)  # both metrics are plotted on a fixed 0..1 axis
    panel.set_title(metric)
plt.suptitle('SVM on Kaggle: TF-IDF vs Skip-gram vs CBOW')
plt.tight_layout()
plt.show()