# 08 - Bag-of-Words + Logistic Regression

This notebook trains a simple Bag-of-Words classifier on lookup tables and evaluates on annotated LinkedIn CVs.

We compare two variants:
1. Baseline (no oversampling)
2. Oversampling minority classes to a minimum count per class

Training data: lookup tables (department-v2.csv, seniority-v2.csv)
Evaluation data: annotated LinkedIn CVs (real-world)


## 1. Setup


In [1]:
import json
from datetime import datetime
from pathlib import Path
import warnings
import re

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

import sys
sys.path.append('../')
from src.data.loader import load_label_lists, load_evaluation_dataset

warnings.filterwarnings('ignore')

DATA_DIR = Path('../data')
RESULTS_DIR = Path('./results')
RESULTS_DIR.mkdir(exist_ok=True)

RANDOM_STATE = 42

TOKEN_PATTERN = r'(?u)\b[\w\+\#\.\-]{2,}\b'


## 2. Load Data


In [2]:
# Load lookup tables (training data)
dept_df, sen_df = load_label_lists(
    DATA_DIR,
    fix_encoding=True,
    deduplicate=True,
    max_per_class=None
)

# Load annotated CVs for evaluation
eval_df = load_evaluation_dataset(DATA_DIR)

print(f"Department lookup: {len(dept_df):,} examples")
print(f"Seniority lookup:  {len(sen_df):,} examples")
print(f"Annotated CVs:     {len(eval_df):,} positions")


Applying encoding fix...
Deduplicating department labels...
  Deduplication: 10145 -> 10145 (removed 0 duplicates)
Deduplicating seniority labels...
  Deduplication: 9428 -> 9428 (removed 0 duplicates)
Department lookup: 10,145 examples
Seniority lookup:  9,428 examples
Annotated CVs:     478 positions


## 3. Oversampling Helper


In [3]:
def oversample_minority(df, label_col='label', min_samples=500, random_state=42):
    """
    Oversample minority classes up to min_samples per class.
    This does not undersample majority classes.
    """
    groups = []
    counts = df[label_col].value_counts()
    for label, count in counts.items():
        group = df[df[label_col] == label]
        if count < min_samples:
            extra = group.sample(min_samples - count, replace=True, random_state=random_state)
            group = pd.concat([group, extra], ignore_index=True)
        groups.append(group)
    result = pd.concat(groups, ignore_index=True)
    result = result.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return result


def clean_text_keep_symbols(text, lowercase=True):
    if not text:
        return ""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\-\.\,\&\/\(\)\+\#]', '', text)
    text = text.strip()
    if lowercase:
        text = text.lower()
    return text


ABBREV_MAP = {
    'sr': 'senior',
    'jr': 'junior',
    'mgr': 'manager',
    'vp': 'vice president',
    'svp': 'senior vice president',
    'avp': 'assistant vice president',
    'dir': 'director',
    'cto': 'chief technology officer',
    'cfo': 'chief financial officer',
    'ceo': 'chief executive officer',
    'coo': 'chief operating officer',
    'hr': 'human resources',
    'qa': 'quality assurance',
    'ux': 'user experience',
    'ui': 'user interface',
    'ml': 'machine learning',
    'nlp': 'natural language processing'
}

SENIORITY_TOKENS = {
    'intern', 'junior', 'jr', 'senior', 'sr', 'lead', 'principal', 'head',
    'manager', 'director', 'vp', 'vice', 'president', 'chief', 'c-level',
    'staff', 'assistant', 'associate'
}

DEPARTMENT_TOKENS = {
    'engineering', 'engineer', 'dev', 'developer', 'software', 'data', 'analytics',
    'sales', 'marketing', 'finance', 'accounting', 'hr', 'human', 'resources',
    'legal', 'operations', 'ops', 'product', 'design', 'ux', 'ui',
    'support', 'customer', 'success', 'business', 'development', 'research',
    'security', 'it', 'qa', 'quality', 'people'
}


def normalize_abbreviations(text, mapping):
    for short, full in mapping.items():
        text = re.sub(rf'\b{re.escape(short)}\b', full, text)
    return text


def remove_tokens(text, tokens):
    if not tokens:
        return text
    pattern = r'\b(?:' + '|'.join(re.escape(t) for t in tokens) + r')\b'
    return re.sub(pattern, ' ', text)


def preprocess_text(text, drop_seniority=False, drop_department=False):
    text = clean_text_keep_symbols(text, lowercase=True)
    text = normalize_abbreviations(text, ABBREV_MAP)
    if drop_seniority:
        text = remove_tokens(text, SENIORITY_TOKENS)
    if drop_department:
        text = remove_tokens(text, DEPARTMENT_TOKENS)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def preprocess_series(texts, **kwargs):
    return [preprocess_text(t, **kwargs) for t in texts]


def get_preprocess_kwargs(eval_label_col):
    if eval_label_col == 'department':
        return {'drop_seniority': True}
    if eval_label_col == 'seniority':
        return {'drop_department': True}
    return {}


def compute_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    f1_weighted = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )[2]
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    per_class_f1 = {
        label: metrics['f1-score']
        for label, metrics in report.items()
        if label not in ['accuracy', 'macro avg', 'weighted avg']
    }
    return {
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1_macro": float(f1_macro),
        "f1_weighted": float(f1_weighted),
        "per_class_f1": {k: float(v) for k, v in per_class_f1.items()}
    }


## 4. Train and Evaluate (Department)


In [4]:
def build_vectorizers(max_features_word, max_features_char):
    word_vectorizer = TfidfVectorizer(
        max_features=max_features_word,
        ngram_range=(1, 2),
        analyzer='word',
        lowercase=True,
        min_df=2,
        max_df=0.95,
        token_pattern=TOKEN_PATTERN,
        sublinear_tf=True
    )
    char_vectorizer = TfidfVectorizer(
        max_features=max_features_char,
        ngram_range=(3, 5),
        analyzer='char_wb',
        lowercase=True,
        min_df=2,
        max_df=0.95
    )
    return word_vectorizer, char_vectorizer


def train_eval_bow(
    train_df,
    eval_df,
    eval_label_col,
    max_features_word,
    max_features_char,
    oversample=False,
    min_samples=500
):
    df = train_df[['text', 'label']].dropna().copy()

    if oversample:
        df = oversample_minority(df, label_col='label', min_samples=min_samples, random_state=RANDOM_STATE)

    X_train, X_val, y_train, y_val = train_test_split(
        df['text'].tolist(),
        df['label'].tolist(),
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=df['label']
    )

    preprocess_kwargs = get_preprocess_kwargs(eval_label_col)
    X_train_texts = preprocess_series(X_train, **preprocess_kwargs)
    X_val_texts = preprocess_series(X_val, **preprocess_kwargs)

    word_vectorizer, char_vectorizer = build_vectorizers(max_features_word, max_features_char)

    X_train_word = word_vectorizer.fit_transform(X_train_texts)
    X_val_word = word_vectorizer.transform(X_val_texts)

    X_train_char = char_vectorizer.fit_transform(X_train_texts)
    X_val_char = char_vectorizer.transform(X_val_texts)

    X_train_vec = hstack([X_train_word, X_train_char])
    X_val_vec = hstack([X_val_word, X_val_char])

    clf = LogisticRegression(
        max_iter=2000,
        C=1.0,
        solver='saga',
        multi_class='multinomial',
        n_jobs=-1,
        class_weight='balanced',
        random_state=RANDOM_STATE
    )
    clf.fit(X_train_vec, y_train)

    # In-distribution evaluation
    val_preds = clf.predict(X_val_vec)
    in_dist = compute_metrics(y_val, val_preds)

    # Real-world evaluation (annotated CVs)
    eval_subset = eval_df[eval_df[eval_label_col].notna()].copy()
    eval_texts_raw = eval_subset['title'].fillna('').tolist()
    eval_texts = preprocess_series(eval_texts_raw, **preprocess_kwargs)
    eval_labels = eval_subset[eval_label_col].tolist()

    eval_word = word_vectorizer.transform(eval_texts)
    eval_char = char_vectorizer.transform(eval_texts)
    eval_vec = hstack([eval_word, eval_char])

    eval_preds = clf.predict(eval_vec)
    real_world = compute_metrics(eval_labels, eval_preds)

    return {
        "in_distribution": in_dist,
        "real_world": real_world,
        "vocab_size": int(len(word_vectorizer.vocabulary_) + len(char_vectorizer.vocabulary_)),
        "vocab_size_word": int(len(word_vectorizer.vocabulary_)),
        "vocab_size_char": int(len(char_vectorizer.vocabulary_))
    }


def print_summary(name, results):
    in_acc = results['in_distribution']['accuracy']
    rw_acc = results['real_world']['accuracy']
    rw_f1 = results['real_world']['f1_macro']
    print(f"{name} - In-dist acc: {in_acc:.4f} | Real-world acc: {rw_acc:.4f} | Real-world F1 (macro): {rw_f1:.4f}")


dept_baseline = train_eval_bow(
    dept_df,
    eval_df,
    eval_label_col='department',
    max_features_word=5000,
    max_features_char=5000,
    oversample=False
)

print_summary("Department (baseline)", dept_baseline)


dept_oversampled = train_eval_bow(
    dept_df,
    eval_df,
    eval_label_col='department',
    max_features_word=5000,
    max_features_char=5000,
    oversample=True,
    min_samples=500
)

print_summary("Department (oversampled)", dept_oversampled)


Department (baseline) - In-dist acc: 0.9236 | Real-world acc: 0.2824 | Real-world F1 (macro): 0.4074
Department (oversampled) - In-dist acc: 0.9778 | Real-world acc: 0.2971 | Real-world F1 (macro): 0.4435


## 5. Train and Evaluate (Seniority)


In [5]:
sen_baseline = train_eval_bow(
    sen_df,
    eval_df,
    eval_label_col='seniority',
    max_features_word=3000,
    max_features_char=3000,
    oversample=False
)

print_summary("Seniority (baseline)", sen_baseline)


sen_oversampled = train_eval_bow(
    sen_df,
    eval_df,
    eval_label_col='seniority',
    max_features_word=3000,
    max_features_char=3000,
    oversample=True,
    min_samples=500
)

print_summary("Seniority (oversampled)", sen_oversampled)


Seniority (baseline) - In-dist acc: 0.9814 | Real-world acc: 0.4686 | Real-world F1 (macro): 0.4406
Seniority (oversampled) - In-dist acc: 0.9832 | Real-world acc: 0.4582 | Real-world F1 (macro): 0.4317


## 6. Save Results


In [6]:
results = {
    "approach": "TF-IDF (word + char) + Logistic Regression",
    "department": {
        "baseline": dept_baseline,
        "oversampled": dept_oversampled
    },
    "seniority": {
        "baseline": sen_baseline,
        "oversampled": sen_oversampled
    },
    "metadata": {
        "vectorizer": "TfidfVectorizer (word + char_wb)",
        "word_ngram_range": "(1, 2)",
        "char_ngram_range": "(3, 5)",
        "token_pattern": TOKEN_PATTERN,
        "sublinear_tf": True,
        "min_df": 2,
        "max_df": 0.95,
        "class_weight": "balanced",
        "preprocessing": {
            "normalize_abbreviations": True,
            "drop_seniority_tokens_for_department": True,
            "drop_department_tokens_for_seniority": True
        },
        "oversample_min_samples": 500,
        "train_source": "lookup tables",
        "eval_source": "annotated CVs (title)"
    },
    "timestamp": datetime.now().isoformat()
}

output_path = RESULTS_DIR / 'bow_results.json'
with open(output_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to: {output_path}")


Results saved to: results/bow_results.json
