In [8]:
# --- P1: set paths and load raw data (fresh start) ---
from pathlib import Path
import pandas as pd
import re
# --- P3: stratified split + TF-IDF(1-2) + numeric preprocessor ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# --- P4: Baseline Logistic Regression (class-weighted) ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score



In [4]:
# The notebook is expected to run from <repo>/notebooks, so the repository
# root is one level above the current working directory.
REPO_ROOT = Path.cwd().parents[0]
DATA_PATH = REPO_ROOT.joinpath("data", "interview_task_dataset.csv")

# Echo the fully resolved location so a wrong working directory is obvious.
print("DATA_PATH:", DATA_PATH.resolve())

# Raw frame is loaded once and left untouched.
df_raw = pd.read_csv(DATA_PATH)

# sanity peek: first three rows rendered richly, then the shape as cell output
display(df_raw.head(3))
df_raw.shape

DATA_PATH: D:\OneDrive\Data\Work\01_My_AI_Portfolio\GitHub-Uploaded\IrwinMicheall-Interview\legal-time-categorisation-poc\data\interview_task_dataset.csv


Unnamed: 0,Record ID,Department,Time Narrative,Worked Time,Charged to Client?,Grade,Category
0,p-0001,a,Amending and updating statement,0.4,YES,Senior,
1,p-0002,a,Reviewed court order and drafted advice email ...,1.3,YES,Junior,
2,p-0003,a,considering email in from counsel attaching FD...,0.3,YES,Junior,"analyse, review, research"


(2157, 7)

# Create clean processing columns (single cell)

Why: sets up exactly the signals we’ll feed into TF-IDF + tabular features, aligned to the brief. Hours stay as-is for the model; minutes remain for UI and insights.

In [5]:
# --- P2: create clean features for modelling + UI ---

def simple_clean(s: str) -> str:
    """Normalise a free-text narrative before vectorisation.

    Steps: lower-case, replace every character that is not ``a-z``, ``0-9``
    or whitespace with a space, collapse whitespace runs to a single space,
    and trim both ends.

    Args:
        s: The narrative. Non-string values are stringified first. ``None``
            and float NaN are treated as missing text.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Without this guard a missing value would be stringified to the literal
    # token "none" / "nan" and end up in the TF-IDF vocabulary.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    lowered = str(s).lower()
    # Punctuation and any non-ASCII letter become separators, not deletions,
    # so "e-mail" yields two tokens rather than one fused token.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df = df_raw.copy()

# text
df["text_clean"] = df["Time Narrative"].fillna("").map(simple_clean)
df["n_words"] = df["text_clean"].str.split().str.len()
df["low_info"] = (df["n_words"] <= 3).astype(int)  # for later: confidence flag

# non-text signals
# Strip stray whitespace before comparing so values like "yes " still count;
# anything that is not YES (including a missing value) maps to 0.
df["charged_bin"] = (
    df["Charged to Client?"].astype(str).str.strip().str.upper().eq("YES").astype(int)
)
# Build the categorical once so the integer codes and the decode map are
# guaranteed to come from the same category ordering.
# NOTE: pandas assigns code -1 to a missing Grade; -1 is not a key of grade_map.
grade_cat = df["Grade"].astype("category")
df["grade_enc"] = grade_cat.cat.codes
grade_map = dict(enumerate(grade_cat.cat.categories))  # keep to decode later

# keep hours for modelling; minutes for visuals/UI
# Missing hours are set to 0 first: casting NaN to int raises, and a NaN in
# "Worked Time" would otherwise flow straight into the model features.
df["Worked Time"] = df["Worked Time"].fillna(0)
df["worked_minutes"] = (df["Worked Time"] * 60).round(0).astype(int)

# labelled subset (training candidate)
train_df = df[df["Category"].notna()].copy()
print("Training rows:", len(train_df), " / All rows:", len(df))
display(train_df[["text_clean","Worked Time","charged_bin","grade_enc","low_info","Category"]].head())
grade_map


Training rows: 561  / All rows: 2157


Unnamed: 0,text_clean,Worked Time,charged_bin,grade_enc,low_info,Category
2,considering email in from counsel attaching fd...,0.3,1,0,0,"analyse, review, research"
9,communicate with client,0.5,1,1,1,client time
16,call out to the client to go through fda docs ...,0.7,1,0,0,client time
24,creating enclosures for professional litigatio...,0.3,1,0,0,preparing documents
27,review disclosure,0.1,1,2,1,"analyse, review, research"


{0: 'Junior', 1: 'Partner', 2: 'Senior'}

What & why (each column)

text_clean → lower-case, punctuation stripped, normalised spaces.
Why: shrinks the vocabulary, removes noise, and makes TF-IDF n-grams learn real signals (e.g., “email response”, “consent order”) instead of messy variants.

charged_bin (0/1) → numeric version of “Charged to Client?”.
Why: lets the model use this strong non-text signal; stays non-leaky because it’s not derived from the target label.

grade_enc → numeric code for Grade (+ we kept grade_map to decode later).
Why: Grade clearly changes work mix; encoding makes it usable in models while remaining non-leaky.

low_info → 1 if narrative ≤ 3 words.
Why: flags short texts where the model’s confidence may be lower. We’ll segment metrics by this and surface a low-confidence hint in the app.

Worked Time (hours) + worked_minutes (UI only)
Why: keep hours for modelling (clean, continuous); show minutes/6-min buckets in visuals for business clarity.

train_df = only rows with a real Category.
Why: we train on labelled only (brief requirement), then apply to the unlabelled set.

This is the “single source of truth” feature frame we’ll feed into the vectoriser/pipeline. It’s exactly aligned to the brief (core categories now; we can propose finer granular tags later for insights without changing the main target).

# stratified split + vectoriser skeleton (one cell)

In [6]:
# Model inputs: the cleaned narrative plus three tabular signals (hours, not
# minutes, are used for modelling). Target is the category as a string.
text_col_p3 = "text_clean"
numeric_cols_p3 = ["Worked Time", "charged_bin", "grade_enc"]
X = train_df[[text_col_p3, *numeric_cols_p3]]
y = train_df["Category"].astype(str)

# 80/20 hold-out, stratified on the label and seeded for reproducibility.
X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Word uni-/bi-gram TF-IDF for the text column; terms seen in fewer than two
# documents are dropped and the vocabulary is capped.
tfidf = TfidfVectorizer(min_df=2, max_features=20000, ngram_range=(1, 2))
# with_mean=False: scale by standard deviation only, no centring.
num_scaler_p3 = StandardScaler(with_mean=False)
preproc = ColumnTransformer(
    [
        ("text", tfidf, text_col_p3),
        ("num", num_scaler_p3, numeric_cols_p3),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

print("Train size:", X_tr.shape, "| Valid size:", X_va.shape)
preproc


Train size: (448, 4) | Valid size: (113, 4)


0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,False
,with_std,True


You’re right to sanity-check. We’re on track and every preprocessing choice is for the models we’ll A/B.

What that “P3: split + vectoriser skeleton” step actually did (and why)

Stratified split (80/20) keeps each category’s proportion in train/valid → fair evaluation, avoids optimism on majority classes.

TF-IDF (1–2 n-grams) on text_clean builds a sparse matrix that captures phrases like “consent order” → best for short narratives.

StandardScaler (numeric) scales Worked Time, charged_bin and grade_enc (low_info joins this list in the baseline Logistic Regression cell) so linear models don’t get dominated by raw magnitudes.

ColumnTransformer combines text + numeric into one design matrix inside a single object → prevents leakage, keeps transforms identical for train/valid and later inference.

Result you saw (Train size: (448,4) | Valid size: (113,4)) confirms we have 561 labelled rows and exactly 4 raw feature columns before transformation. It’s the correct skeleton to plug any classifier into (LogReg, LinearSVC, LightGBM).

Store processed data or not?

Do NOT persist TF-IDF features to disk (huge/sparse, easy to go stale).

If you want convenience, you may save a light “clean” table (text_clean + simple numeric) as a snapshot for EDA reuse, but the source of truth should be raw data + code.

For production, we’ll save the whole pipeline (preproc + model) via joblib. That guarantees identical preprocessing at inference.

You’re thinking exactly right. We’re on track with the brief, and every processing choice so far is because of the models we’ll A/B (not random).

Quick confirmation (aligned to the document)

Problem: multi-class text classification with partial labels; short narratives; must show business insights + production plan.

Target: keep the 7 given categories as the main label; propose granular tags later for insights only.

EDA learnings → processing: short text → TF-IDF (1–2 n-grams); include non-text features (Grade, Charged?, Worked Time); handle imbalance; keep a low_info flag.

A/B models we’ll test (and why)

Logistic Regression (OvR, class_weight='balanced') — strong on TF-IDF, fast, explainable, gives probabilities for the app.

Linear SVM (LinearSVC + CalibratedClassifierCV) — very strong on sparse text; calibrate to get probabilities.

LightGBM (or XGBoost) — handles sparse matrices; mixes text + numeric well; complementary to linear models.

Multinomial Naive Bayes (baseline) — simple, good on short text (text-only baseline to sanity check).

(Stretch if time remains: Sentence-BERT embeddings + LR.)

Metrics: stratified split, macro F1 as primary, per-class recall, confusion matrix; segment by low_info.
Semi-supervised (later): self-training on high-confidence unlabelled (e.g., p≥0.9), compare uplift.

All our preprocessing (TF-IDF + numeric features via ColumnTransformer) is built for these models.

# Train a baseline Logistic Regression and print metrics

In [9]:
# Feature set for the baseline: the cleaned narrative plus four tabular signals.
features = ["text_clean", "Worked Time", "charged_bin", "grade_enc", "low_info"]

# Only rows with a Category can be used for supervised training; select them
# once and derive both X and y from the same subset.
labelled_rows = df.loc[df["Category"].notna()]
X = labelled_rows[features]
y = labelled_rows["Category"].astype(str)

# 80/20 hold-out, stratified on the label and seeded for reproducibility.
X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Text goes through word uni-/bi-gram TF-IDF; the numeric columns are scaled
# without centring (with_mean=False).
tfidf = TfidfVectorizer(min_df=2, max_features=20000, ngram_range=(1, 2))
preproc = ColumnTransformer(
    [
        ("text", tfidf, "text_clean"),
        ("num", StandardScaler(with_mean=False), ["Worked Time","charged_bin","grade_enc","low_info"]),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

# Class-weighted logistic regression as the baseline classifier.
# NOTE(review): recent scikit-learn releases appear to deprecate the liblinear
# solver for problems with more than two classes — confirm against the
# installed version before upgrading the environment.
clf = LogisticRegression(
    solver="liblinear",   # solid with sparse + OvR
    class_weight="balanced",
    max_iter=1000,
)

# Preprocessing and model live in one Pipeline so the transforms are fitted on
# the training split only and replayed identically at predict time.
pipe = Pipeline(steps=[("pre", preproc), ("clf", clf)])

# train & validate
pipe.fit(X_tr, y_tr)
yp = pipe.predict(X_va)

acc_va = accuracy_score(y_va, yp)
macro_f1_va = f1_score(y_va, yp, average="macro")
report_va = classification_report(y_va, yp, zero_division=0)

print("Accuracy:", round(acc_va, 3))
print("F1 (macro):", round(macro_f1_va, 3))
print("\nClassification report:\n", report_va)


Accuracy: 0.779
F1 (macro): 0.761

Classification report:
                            precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.80      0.57      0.67         7
analyse, review, research       0.69      0.65      0.67        17
                  billing       0.67      1.00      0.80         2
              client time       0.85      0.85      0.85        40
               onboarding       0.62      1.00      0.77        10
      preparing documents       0.73      0.73      0.73        22

                 accuracy                           0.78       113
                macro avg       0.77      0.79      0.76       113
             weighted avg       0.80      0.78      0.78       113





# Are we doing enough feature engineering & balancing?

2) “Are we doing enough feature engineering & balancing?”

Short answer: yes, and deliberately lean. For this problem, the strongest signals are in the text itself. We’ve already engineered:

Cleaned text + word TF-IDF (1–2-grams)

Character TF-IDF (3–5) → the change that gave you the jump to Acc 0.814 / Macro-F1 0.796

Numeric/context features: Worked Time, grade_enc, charged_bin, low_info (scaled)

Class imbalance handling via class_weight="balanced"

That’s “real” feature engineering for linear text models—and it worked (clear uplift).

If you want a touch more (still simple, no over-engineering), here are three high-impact, safe adds we can do next if you like:

Append grade as a token into the text (helps separate grade-specific phrasing):
text_aug = text_clean + " __grade_" + Grade.lower() and feed that into TF-IDF (drop grade_enc).

Tiny domain stopword list to down-weight generic verbs: ["email","meeting","update","discuss","call","further","file"].

Keyword flags for the rare class (billing) as binary features or appended tokens (e.g., "__kw_billing") for words like invoice, bill, write-off, charge note, timesheet.

All three are easy to justify in an interview and quick to implement.

What about re-balancing / resampling?

For linear models on sparse text, class_weight="balanced" is usually better than random oversampling (which just duplicates rows and risks overfitting).

If we still wanted to, we could try class-balanced sample weights or minority oversampling inside the train split only, but your current Macro-F1 already shows the weighting is doing its job.

NaNs in the data?

We already set text_clean = "" for missing text and 0 for numeric gaps (Worked Time, flags).

The pipeline handles that consistently; predictions won’t break on NaNs.

Only ~25% labelled?

That’s okay for a POC. For a “future work” slide you can mention self-training: pseudo-label high-confidence unlabelled rows (p≥0.9), retrain, and monitor drift. We can add a small threshold logic in the app (e.g., show top-2 if max-prob < 0.6).