In [12]:
# Minimal, inline preprocessing for modelling (no imports from src)
import re
import pandas as pd
from pathlib import Path

In [14]:
def simple_clean(s: str) -> str:
    """Normalise free text to lowercase ASCII alphanumerics separated by single spaces.

    The input is coerced with ``str`` first, so non-string values are cleaned
    via their string form rather than raising.

    Args:
        s: Raw text (or any value convertible with ``str``).

    Returns:
        The lowercased text in which every character other than ``a-z``,
        ``0-9`` or whitespace has been replaced by a space, runs of whitespace
        have been collapsed to one space, and leading/trailing whitespace has
        been removed.
    """
    lowered = str(s).lower()
    # Punctuation and symbols become spaces so neighbouring tokens stay apart.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip()

def build_features(df_raw: pd.DataFrame):
    """Derive the modelling columns from the raw time-entry frame.

    Works on a copy, so ``df_raw`` is left untouched. Columns added:

    - ``text_clean``: ``"Time Narrative"`` with missing values replaced by
      ``""`` and then passed through ``simple_clean``.
    - ``charged_bin``: 1 when ``"Charged to Client?"`` equals ``"YES"``
      (case-insensitive, surrounding whitespace ignored), otherwise 0.
    - ``grade_enc``: integer category code of ``"Grade"`` (-1 for missing
      values). Codes are derived from the grades present in ``df_raw``, so
      they are only comparable between frames with the same set of grades.
    - ``n_words``: number of whitespace-separated tokens in ``text_clean``.
    - ``low_info``: 1 when ``n_words`` is 3 or fewer, otherwise 0.

    Args:
        df_raw: Raw frame containing at least the columns
            ``"Time Narrative"``, ``"Charged to Client?"``, ``"Grade"`` and
            ``"Category"``.

    Returns:
        A ``(df, labelled)`` tuple: the full feature frame, and an independent
        copy of only those rows whose ``"Category"`` is not missing.
    """
    df = df_raw.copy()
    df["text_clean"] = df["Time Narrative"].fillna("").map(simple_clean)

    # Trim before comparing so padded values such as "Yes " are not silently
    # encoded as "not charged".
    charged = df["Charged to Client?"].astype(str).str.strip().str.upper()
    df["charged_bin"] = charged.eq("YES").astype(int)

    df["grade_enc"] = df["Grade"].astype("category").cat.codes
    df["n_words"] = df["text_clean"].str.split().str.len()
    df["low_info"] = (df["n_words"] <= 3).astype(int)

    # No minutes conversion of "Worked Time" here: per the original note it is
    # only needed for the UI, not for the model.
    labelled = df[df["Category"].notna()].copy()
    return df, labelled

In [15]:
# Read the raw CSV from <parent of the working directory>/data and derive the
# full feature frame plus the labelled subset.
REPO_ROOT = Path.cwd().parent
DATA_PATH = REPO_ROOT.joinpath("data", "interview_task_dataset.csv")
df_raw = pd.read_csv(DATA_PATH)
df, train_df = build_features(df_raw)

# train_df holds only the rows that carry a Category label.
print("Labelled rows:", train_df.shape[0])
display(train_df.iloc[:3])

Labelled rows: 561


Unnamed: 0,Record ID,Department,Time Narrative,Worked Time,Charged to Client?,Grade,Category,text_clean,charged_bin,grade_enc,n_words,low_info
2,p-0003,a,considering email in from counsel attaching FD...,0.3,YES,Junior,"analyse, review, research",considering email in from counsel attaching fd...,1,0,8,0
9,p-0010,a,Communicate (with client),0.5,YES,Partner,client time,communicate with client,1,1,3,1
16,p-0017,a,Call out to the client to go through FDA docs ...,0.7,YES,Junior,client time,call out to the client to go through fda docs ...,1,0,16,0


In [None]:
# M2: preprocessor + split + baseline Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score

# features & target
# The column lists are defined once and reused by both the feature selection
# and the ColumnTransformer so the two cannot drift apart.
TEXT_COL = "text_clean"
NUM_COLS = ["Worked Time", "charged_bin", "grade_enc", "low_info"]
features = [TEXT_COL, *NUM_COLS]
X = train_df[features]
y = train_df["Category"].astype(str)

# stratified split (keeps class balance in both train and validation)
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# text + numeric transformer (built for short text + our A/B models)
# TEXT_COL is passed as a scalar (not a list) so TfidfVectorizer receives a
# 1-D series of documents.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=20000)
preproc = ColumnTransformer(
    transformers=[
        ("text", tfidf, TEXT_COL),
        # with_mean=False scales without centring, which keeps sparse input sparse
        ("num", StandardScaler(with_mean=False), NUM_COLS),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

# baseline classifier (robust on TF-IDF; class_weight handles imbalance)
# "Category" is a multiclass target. The liblinear solver only handles binary
# problems natively, and recent scikit-learn releases emit a FutureWarning for
# multiclass use (slated to become an error), so use lbfgs, which supports the
# multinomial loss directly.
clf = LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")

pipe = Pipeline([("pre", preproc), ("clf", clf)])
pipe.fit(X_tr, y_tr)

yp = pipe.predict(X_va)

acc = accuracy_score(y_va, yp)
macro_f1 = f1_score(y_va, yp, average="macro")
print(f"Accuracy:  {acc:.3f}")
print(f"Macro F1:  {macro_f1:.3f}\n")
print(classification_report(y_va, yp, zero_division=0))
