# 08 - Bag-of-Words + Logistic Regression

This notebook trains a simple Bag-of-Words (unigram + bigram counts) logistic-regression classifier on the lookup tables and evaluates it on annotated LinkedIn CVs.

We compare two variants:
1. Baseline (no oversampling)
2. Oversampling minority classes to a minimum count per class

Training data: lookup tables (department-v2.csv, seniority-v2.csv)
Evaluation data: annotated LinkedIn CVs (real-world)


## 1. Setup


In [1]:
import json
from datetime import datetime
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

import sys
sys.path.append('../')
from src.data.loader import load_label_lists, load_evaluation_dataset

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (e.g. solver
# convergence or deprecation notices) will not be shown either.
warnings.simplefilter('ignore')

# Seed shared by the train/validation split and the oversampling helper.
RANDOM_STATE = 42

# Input data lives one level above the notebook; results are written next to it.
DATA_DIR = Path('../data')
RESULTS_DIR = Path('./results')
RESULTS_DIR.mkdir(exist_ok=True)


## 2. Load Data


In [None]:
# Training data: the department and seniority lookup tables.
dept_df, sen_df = load_label_lists(
    DATA_DIR, fix_encoding=True, deduplicate=True, max_per_class=None
)

# Evaluation data: the annotated CV positions.
eval_df = load_evaluation_dataset(DATA_DIR)

# Report how many rows each source contributed.
print(f"Department lookup: {len(dept_df):,} examples")
print(f"Seniority lookup:  {len(sen_df):,} examples")
print(f"Annotated CVs:     {len(eval_df):,} positions")


## 3. Oversampling and Metrics Helpers


In [None]:
def oversample_minority(df, label_col='label', min_samples=500, random_state=42):
    """
    Oversample minority classes up to min_samples per class.

    Classes with fewer than ``min_samples`` rows are topped up by sampling
    their own rows with replacement. Classes already at or above the
    threshold are left untouched, so majority classes are never undersampled.
    Rows whose label is missing are not carried over, because
    ``value_counts`` skips missing values.

    Args:
        df: DataFrame holding one example per row.
        label_col: Name of the column containing the class label.
        min_samples: Minimum number of rows each class should have afterwards.
        random_state: Seed used for both the resampling and the final shuffle.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index. If ``df`` has no
        labelled rows, an empty DataFrame with the same columns is returned.
    """
    counts = df[label_col].value_counts()
    if counts.empty:
        # No labelled rows at all: pd.concat([]) would raise, so return an
        # empty frame with the same columns instead.
        return df.iloc[0:0].reset_index(drop=True)

    groups = []
    for label, count in counts.items():
        group = df[df[label_col] == label]
        shortfall = min_samples - count
        if shortfall > 0:
            # Draw only the missing rows; the original rows are all kept.
            extra = group.sample(shortfall, replace=True, random_state=random_state)
            group = pd.concat([group, extra], ignore_index=True)
        groups.append(group)

    result = pd.concat(groups, ignore_index=True)
    # Shuffle so the classes are not stored in contiguous runs.
    return result.sample(frac=1, random_state=random_state).reset_index(drop=True)


def compute_metrics(y_true, y_pred):
    """
    Summarize classification quality as a JSON-serializable dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Dict with ``accuracy``, macro-averaged ``precision`` / ``recall`` /
        ``f1_macro``, ``f1_weighted``, and ``per_class_f1`` (label -> F1).
        All values are plain Python floats. Undefined ratios count as 0
        (``zero_division=0``).
    """
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )
    _, _, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )

    # The report mixes per-class rows with aggregate rows; keep only the classes.
    aggregate_rows = {'accuracy', 'macro avg', 'weighted avg'}
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    per_class_f1 = {}
    for name, row in report.items():
        if name in aggregate_rows:
            continue
        per_class_f1[name] = float(row['f1-score'])

    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(macro_precision),
        "recall": float(macro_recall),
        "f1_macro": float(macro_f1),
        "f1_weighted": float(weighted_f1),
        "per_class_f1": per_class_f1
    }


## 4. Train and Evaluate (Department)


In [None]:
def train_eval_bow(
    train_df,
    eval_df,
    eval_label_col,
    max_features,
    oversample=False,
    min_samples=500
):
    """
    Train a Bag-of-Words + Logistic Regression classifier and evaluate it twice.

    The lookup table is split 80/20 (stratified by label) into a training and
    a validation part. The classifier is fitted on unigram + bigram counts of
    the training part, then scored on the validation part ("in-distribution")
    and on the titles of the annotated CVs ("real-world").

    Args:
        train_df: Lookup table with ``text`` and ``label`` columns.
        eval_df: Annotated CV positions with a ``title`` column and the
            column named by ``eval_label_col``.
        eval_label_col: Column of ``eval_df`` holding the gold label.
        max_features: Vocabulary size cap passed to ``CountVectorizer``.
        oversample: If True, minority classes in the training part are
            oversampled to ``min_samples`` rows each.
        min_samples: Per-class minimum used when ``oversample`` is True.

    Returns:
        Dict with ``in_distribution`` and ``real_world`` metric dicts (see
        ``compute_metrics``) and the fitted ``vocab_size``.
    """
    df = train_df[['text', 'label']].dropna()

    # Split BEFORE oversampling. Oversampling first would place copies of the
    # same minority row in both parts, leaking training examples into the
    # validation set and inflating the in-distribution scores. Splitting first
    # also gives the baseline and oversampled variants the same validation rows.
    train_part, val_part = train_test_split(
        df,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=df['label']
    )

    if oversample:
        # Only the training part is resampled; validation keeps its natural mix.
        train_part = oversample_minority(
            train_part, label_col='label', min_samples=min_samples, random_state=RANDOM_STATE
        )

    X_train = train_part['text'].tolist()
    y_train = train_part['label'].tolist()
    X_val = val_part['text'].tolist()
    y_val = val_part['label'].tolist()

    vectorizer = CountVectorizer(
        max_features=max_features,
        ngram_range=(1, 2),
        analyzer='word',
        lowercase=True,
        min_df=2,
        max_df=0.95
    )

    # The vocabulary is learned from the training part only.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the project's pinned version.
    clf = LogisticRegression(
        max_iter=1000,
        C=1.0,
        solver='lbfgs',
        multi_class='multinomial',
        n_jobs=-1
    )
    clf.fit(X_train_vec, y_train)

    # In-distribution evaluation (held-out lookup-table rows)
    val_preds = clf.predict(X_val_vec)
    in_dist = compute_metrics(y_val, val_preds)

    # Real-world evaluation (annotated CVs); rows without a gold label are skipped
    eval_subset = eval_df[eval_df[eval_label_col].notna()]
    eval_texts = eval_subset['title'].fillna('').tolist()
    eval_labels = eval_subset[eval_label_col].tolist()

    eval_preds = clf.predict(vectorizer.transform(eval_texts))
    real_world = compute_metrics(eval_labels, eval_preds)

    return {
        "in_distribution": in_dist,
        "real_world": real_world,
        "vocab_size": int(len(vectorizer.vocabulary_))
    }


def print_summary(name, results):
    """
    Print a one-line summary of a ``train_eval_bow`` result.

    Args:
        name: Label shown at the start of the line.
        results: Dict with ``in_distribution`` and ``real_world`` metric dicts;
            reads ``accuracy`` from both and ``f1_macro`` from ``real_world``.
    """
    real_world = results['real_world']
    in_acc = results['in_distribution']['accuracy']
    rw_acc, rw_f1 = real_world['accuracy'], real_world['f1_macro']
    print(f"{name} - In-dist acc: {in_acc:.4f} | Real-world acc: {rw_acc:.4f} | Real-world F1 (macro): {rw_f1:.4f}")


# Department, variant 1: train on the lookup table as-is.
dept_baseline = train_eval_bow(
    dept_df, eval_df, eval_label_col='department', max_features=5000, oversample=False
)
print_summary("Department (baseline)", dept_baseline)


# Department, variant 2: minority classes oversampled to at least 500 rows each.
dept_oversampled = train_eval_bow(
    dept_df, eval_df, eval_label_col='department', max_features=5000,
    oversample=True, min_samples=500
)
print_summary("Department (oversampled)", dept_oversampled)


## 5. Train and Evaluate (Seniority)


In [None]:
# Seniority, variant 1: train on the lookup table as-is (smaller vocabulary cap
# than the department runs: 3000 vs 5000 features).
sen_baseline = train_eval_bow(
    sen_df, eval_df, eval_label_col='seniority', max_features=3000, oversample=False
)
print_summary("Seniority (baseline)", sen_baseline)


# Seniority, variant 2: minority classes oversampled to at least 500 rows each.
sen_oversampled = train_eval_bow(
    sen_df, eval_df, eval_label_col='seniority', max_features=3000,
    oversample=True, min_samples=500
)
print_summary("Seniority (oversampled)", sen_oversampled)


## 6. Save Results


In [None]:
# Collect both tasks and both variants into one JSON-serializable record.
results = {
    "approach": "Bag-of-Words + Logistic Regression",
    "department": {
        "baseline": dept_baseline,
        "oversampled": dept_oversampled
    },
    "seniority": {
        "baseline": sen_baseline,
        "oversampled": sen_oversampled
    },
    # These values mirror the settings hard-coded in train_eval_bow and in the
    # experiment calls above; keep them in sync if those change.
    "metadata": {
        "vectorizer": "CountVectorizer",
        "ngram_range": "(1, 2)",
        "min_df": 2,
        "max_df": 0.95,
        "oversample_min_samples": 500,
        "train_source": "lookup tables",
        "eval_source": "annotated CVs (title)"
    },
    "timestamp": datetime.now().isoformat()
}

output_path = RESULTS_DIR / 'bow_results.json'
# Serialize in memory first: opening the file for writing truncates it, so a
# serialization error mid-dump would destroy the previous results and leave a
# partial file behind. This way the file is only touched once the JSON is ready.
payload = json.dumps(results, indent=2)
output_path.write_text(payload, encoding='utf-8')

print(f"Results saved to: {output_path}")
