In [57]:
import re
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, make_scorer, get_scorer
import numpy as np
import plotly.express as px
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.naive_bayes import ComplementNB
from sklearn.decomposition import TruncatedSVD
from lightgbm import LGBMClassifier
import joblib

In [None]:
def simple_clean(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

def build_features(df_raw: pd.DataFrame):
    df = df_raw.copy()
    df["text_clean"]  = df["Time Narrative"].fillna("").map(simple_clean)
    df["charged_bin"] = df["Charged to Client?"].astype(str).str.upper().eq("YES").astype(int)
    df["grade_enc"]   = df["Grade"].astype("category").cat.codes
    df["n_words"]     = df["text_clean"].str.split().str.len()
    df["low_info"]    = (df["n_words"] <= 3).astype(int)
    return df, df[df["Category"].notna()].copy()

In [9]:
REPO_ROOT = Path.cwd().parent
DATA_PATH = REPO_ROOT / "data" / "interview_task_dataset.csv"
df_raw = pd.read_csv(DATA_PATH)
df, train_df = build_features(df_raw)

print("Labelled rows:", len(train_df))
display(train_df.head(3))

Labelled rows: 561


Unnamed: 0,Record ID,Department,Time Narrative,Worked Time,Charged to Client?,Grade,Category,text_clean,charged_bin,grade_enc,n_words,low_info
2,p-0003,a,considering email in from counsel attaching FD...,0.3,YES,Junior,"analyse, review, research",considering email in from counsel attaching fd...,1,0,8,0
9,p-0010,a,Communicate (with client),0.5,YES,Partner,client time,communicate with client,1,1,3,1
16,p-0017,a,Call out to the client to go through FDA docs ...,0.7,YES,Junior,client time,call out to the client to go through fda docs ...,1,0,16,0


## Train the baseline Logistic Regression

In [10]:
features = ["text_clean", "Worked Time", "charged_bin", "grade_enc", "low_info"]
X = train_df[features]
y = train_df["Category"].astype(str)

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=20000)
preproc = ColumnTransformer(
    transformers=[
        ("text", tfidf, "text_clean"),
        ("num", StandardScaler(with_mean=False), ["Worked Time","charged_bin","grade_enc","low_info"]),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)
preproc



0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,False
,with_std,True


In [17]:
clf = LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear")
pipe = Pipeline([("pre", preproc), ("clf", clf)])

pipe.fit(X_tr, y_tr)
yp = pipe.predict(X_va)

print("Accuracy:", round(accuracy_score(y_va, yp), 3))
print("Macro F1:", round(f1_score(y_va, yp, average="macro"), 3))
print("\n", classification_report(y_va, yp, zero_division=0))

Accuracy: 0.779
Macro F1: 0.761

                            precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.80      0.57      0.67         7
analyse, review, research       0.69      0.65      0.67        17
                  billing       0.67      1.00      0.80         2
              client time       0.85      0.85      0.85        40
               onboarding       0.62      1.00      0.77        10
      preparing documents       0.73      0.73      0.73        22

                 accuracy                           0.78       113
                macro avg       0.77      0.79      0.76       113
             weighted avg       0.80      0.78      0.78       113





## confusion matrix + low_info diagnostics

In [21]:

# Confusion matrix
labels = list(pipe.classes_)
cm = confusion_matrix(y_va, yp, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"true: {c}" for c in labels], columns=[f"pred: {c}" for c in labels])

fig = px.imshow(cm_df, text_auto=True, color_continuous_scale="Blues",
                title="Confusion matrix — Logistic Regression (valid)")
fig.update_layout(template="simple_white")
fig.show()

In [22]:
pairs = (pd.DataFrame({"true": y_va, "pred": yp})
           .value_counts().reset_index(name="rows")
           .sort_values("rows", ascending=False))
display(pairs.head(8))

va_meta = X_va.copy()
va_meta["true"] = y_va.values
va_meta["pred"] = yp

def seg_metrics(mask, name):
    y_t, y_p = va_meta.loc[mask, "true"], va_meta.loc[mask, "pred"]
    print(f"{name} — n={len(y_t)} | Acc={accuracy_score(y_t, y_p):.3f} | MacroF1={f1_score(y_t, y_p, average='macro'):.3f}")

seg_metrics(va_meta["low_info"]==1, "LOW-INFO (≤3 words)")
seg_metrics(va_meta["low_info"]==0, "NORMAL")

Unnamed: 0,true,pred,rows
0,client time,client time,34
1,preparing documents,preparing documents,16
2,Other comms,Other comms,11
3,"analyse, review, research","analyse, review, research",11
4,onboarding,onboarding,10
5,admin,admin,4
6,client time,preparing documents,3
7,preparing documents,"analyse, review, research",3


LOW-INFO (≤3 words) — n=15 | Acc=0.733 | MacroF1=0.714
NORMAL — n=98 | Acc=0.786 | MacroF1=0.753


In [26]:
# 1) features & split
features = ["text_clean", "Worked Time", "charged_bin", "grade_enc", "low_info"]
X = train_df[features]
y = train_df["Category"].astype(str)

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 2) preprocessor = WORD TF-IDF + CHAR TF-IDF + numeric scaler
custom_stop = ["out"]  # tiny noise reducer

word_tfidf = TfidfVectorizer(
    ngram_range=(1, 2), min_df=2, max_features=20000, stop_words=custom_stop
)
char_tfidf = TfidfVectorizer(
    analyzer="char", ngram_range=(3, 5), min_df=2
)

preproc = ColumnTransformer(
    transformers=[
        ("word", word_tfidf, "text_clean"),
        ("char", char_tfidf, "text_clean"),
        ("num",  StandardScaler(with_mean=False), ["Worked Time", "charged_bin", "grade_enc", "low_info"]),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)
preproc

0,1,2
,transformers,"[('word', ...), ('char', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,['out']
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,False
,with_std,True


In [28]:
# 3) classifier (OvR wrapper avoids the liblinear warning)
clf = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear")
)

# 4) pipeline = preprocessor + classifier
pipe = Pipeline([("pre", preproc), ("clf", clf)])

# 5) fit, predict, metrics
pipe.fit(X_tr, y_tr)
yp = pipe.predict(X_va)

print("Accuracy:", round(accuracy_score(y_va, yp), 3))
print("Macro F1:", round(f1_score(y_va, yp, average="macro"), 3))
print("\n", classification_report(y_va, yp, zero_division=0))

Accuracy: 0.814
Macro F1: 0.796

                            precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.71      0.71      0.71         7
analyse, review, research       0.75      0.71      0.73        17
                  billing       0.67      1.00      0.80         2
              client time       0.88      0.88      0.88        40
               onboarding       0.71      1.00      0.83        10
      preparing documents       0.77      0.77      0.77        22

                 accuracy                           0.81       113
                macro avg       0.78      0.83      0.80       113
             weighted avg       0.83      0.81      0.81       113



In [29]:
# 6) quick confusion view (top pairs)
pairs = (pd.DataFrame({"true": y_va, "pred": yp})
           .value_counts().reset_index(name="rows")
           .sort_values("rows", ascending=False))
display(pairs.head(8))

Unnamed: 0,true,pred,rows
0,client time,client time,35
1,preparing documents,preparing documents,17
2,"analyse, review, research","analyse, review, research",12
3,Other comms,Other comms,11
4,onboarding,onboarding,10
5,admin,admin,5
6,Other comms,client time,2
7,preparing documents,"analyse, review, research",2


In [None]:
labels = list(pipe.classes_)
cm = confusion_matrix(y_va, yp, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"true: {c}" for c in labels], columns=[f"pred: {c}" for c in labels])
fig = px.imshow(cm_df, text_auto=True, color_continuous_scale="Blues",
                title="Confusion matrix — LR (word+char TF-IDF)")
fig.update_layout(template="simple_white")
fig.show()

### Tiny next step (A/B #2): try LinearSVC + calibrated probs with the same preprocessor'

In [32]:
# Model B — LinearSVC (calibrated) with the SAME preproc (word+char+num)
svc_base = LinearSVC(class_weight="balanced")
svc_cal  = CalibratedClassifierCV(svc_base, method="sigmoid", cv=5)

pipe_svc = Pipeline([("pre", preproc), ("clf", svc_cal)])
pipe_svc.fit(X_tr, y_tr)

yp_svc = pipe_svc.predict(X_va)

print("LinearSVC — Accuracy:", round(accuracy_score(y_va, yp_svc), 3))
print("LinearSVC — Macro F1:", round(f1_score(y_va, yp_svc, average="macro"), 3))
print("\n", classification_report(y_va, yp_svc, zero_division=0))

LinearSVC — Accuracy: 0.779
LinearSVC — Macro F1: 0.76

                            precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.80      0.57      0.67         7
analyse, review, research       0.60      0.71      0.65        17
                  billing       0.67      1.00      0.80         2
              client time       0.83      0.88      0.85        40
               onboarding       0.73      0.80      0.76        10
      preparing documents       0.76      0.73      0.74        22

                 accuracy                           0.78       113
                macro avg       0.77      0.77      0.76       113
             weighted avg       0.79      0.78      0.78       113



In [33]:
pairs_svc = (pd.DataFrame({"true": y_va, "pred": yp_svc})
               .value_counts().reset_index(name="rows")
               .sort_values("rows", ascending=False))
display(pairs_svc.head(8))

Unnamed: 0,true,pred,rows
0,client time,client time,35
1,preparing documents,preparing documents,16
2,"analyse, review, research","analyse, review, research",12
3,Other comms,Other comms,11
4,onboarding,onboarding,8
5,preparing documents,"analyse, review, research",5
6,admin,admin,4
7,Other comms,client time,3


In [34]:
labels = list(pipe_svc.classes_)
cm = confusion_matrix(y_va, yp_svc, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"true: {c}" for c in labels],
                        columns=[f"pred: {c}" for c in labels])
fig = px.imshow(cm_df, text_auto=True, color_continuous_scale="Blues",
                title="Confusion matrix — LinearSVC (valid)")
fig.update_layout(template="simple_white")
fig.show()

## Model C: Multinomial Logistic Regression (saga)

In [35]:
# Model C — Multinomial Logistic Regression (saga)
clf_mnlr = LogisticRegression(
    solver="saga",
    multi_class="multinomial",
    class_weight="balanced",
    max_iter=4000,
    n_jobs=-1,
)

pipe_mnlr = Pipeline([("pre", preproc), ("clf", clf_mnlr)])
pipe_mnlr.fit(X_tr, y_tr)
yp_mnlr = pipe_mnlr.predict(X_va)

print("Multinomial LR — Accuracy:", round(accuracy_score(y_va, yp_mnlr), 3))
print("Multinomial LR — Macro F1:", round(f1_score(y_va, yp_mnlr, average="macro"), 3))
print("\n", classification_report(y_va, yp_mnlr, zero_division=0))





Multinomial LR — Accuracy: 0.796
Multinomial LR — Macro F1: 0.784

                            precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.71      0.71      0.71         7
analyse, review, research       0.71      0.71      0.71        17
                  billing       0.67      1.00      0.80         2
              client time       0.87      0.85      0.86        40
               onboarding       0.71      1.00      0.83        10
      preparing documents       0.73      0.73      0.73        22

                 accuracy                           0.80       113
                macro avg       0.77      0.82      0.78       113
             weighted avg       0.81      0.80      0.80       113



In [36]:
pairs_mnlr = (pd.DataFrame({"true": y_va, "pred": yp_mnlr})
                .value_counts().reset_index(name="rows")
                .sort_values("rows", ascending=False))
display(pairs_mnlr.head(8))

Unnamed: 0,true,pred,rows
0,client time,client time,34
1,preparing documents,preparing documents,16
2,"analyse, review, research","analyse, review, research",12
3,Other comms,Other comms,11
4,onboarding,onboarding,10
5,admin,admin,5
6,client time,preparing documents,3
7,preparing documents,"analyse, review, research",3


In [37]:
labels = list(pipe_mnlr.classes_)
cm = confusion_matrix(y_va, yp_mnlr, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"true: {c}" for c in labels],
                        columns=[f"pred: {c}" for c in labels])
fig = px.imshow(cm_df, text_auto=True, color_continuous_scale="Blues",
                title="Confusion matrix — Multinomial LR (valid)")
fig.update_layout(template="simple_white")
fig.show()

# sanity-check stability with CV

In [42]:
# Robust CV for the champion model using the built-in "f1_macro" scorer
features = ["text_clean", "Worked Time", "charged_bin", "grade_enc", "low_info"]
X_all = train_df[features]
y_all = train_df["Category"].astype(str)

champ_clf  = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear")
)
champ_pipe = Pipeline([("pre", preproc), ("clf", champ_clf)])

In [44]:
# keep CV safe & readable
k = 3 
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

cv_res = cross_validate(
    champ_pipe, X_all, y_all,
    cv=cv,
    scoring="f1_macro",  
    n_jobs=1,             
    error_score="raise"
)

scores = cv_res["test_score"]
print("Macro-F1 per fold:", scores.round(3))
print("Mean ± SD:", f"{scores.mean():.3f} ± {scores.std():.3f}")

Macro-F1 per fold: [0.91  0.774 0.762]
Mean ± SD: 0.815 ± 0.067


## A1) Complement Naive Bayes (text-only)

In [46]:
# A1: Complement Naive Bayes (text-only) on your X_tr / X_va
nb_pre = ColumnTransformer(
    transformers=[
        ("w", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=40000), "text_clean"),
        ("c", TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2, max_features=60000), "text_clean"),
    ],
    remainder="drop",
)

nb_pipe = Pipeline([("pre", nb_pre), ("clf", ComplementNB(alpha=0.5))])

nb_pipe.fit(X_tr, y_tr)
nb_pred = nb_pipe.predict(X_va)

print(f"ComplementNB — Acc: {accuracy_score(y_va, nb_pred):.3f}  "
      f"MacroF1: {f1_score(y_va, nb_pred, average='macro'):.3f}")
print(classification_report(y_va, nb_pred, digits=2, zero_division=0))

ComplementNB — Acc: 0.743  MacroF1: 0.651
                           precision    recall  f1-score   support

              Other comms       0.80      0.80      0.80        15
                    admin       1.00      0.14      0.25         7
analyse, review, research       0.58      0.65      0.61        17
                  billing       1.00      0.50      0.67         2
              client time       0.82      0.90      0.86        40
               onboarding       0.67      0.60      0.63        10
      preparing documents       0.71      0.77      0.74        22

                 accuracy                           0.74       113
                macro avg       0.80      0.62      0.65       113
             weighted avg       0.76      0.74      0.73       113



## A2) SVD + LightGBM (multiclass, balanced)

In [53]:
# A2: TF-IDF -> SVD -> LightGBM on your X_tr / X_va
num_cols = ["Worked Time","charged_bin","grade_enc","low_info"]

svd_pre = ColumnTransformer(
    transformers=[
        ("w", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=40000), "text_clean"),
        ("c", TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2, max_features=60000), "text_clean"),
        ("num", StandardScaler(with_mean=False), num_cols),
    ],
    remainder="drop",
)

lgbm_pipe = Pipeline([
    ("pre", svd_pre),
    ("svd", TruncatedSVD(n_components=300, random_state=42)),
    ("clf", LGBMClassifier(
        objective="multiclass",
        class_weight="balanced",
        n_estimators=300,
        num_leaves=31,
        learning_rate=0.1,
        random_state=42
    ))
])

lgbm_pipe.fit(X_tr, y_tr)
lgbm_pred = lgbm_pipe.predict(X_va)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44885
[LightGBM] [Info] Number of data points in the train set: 448, number of used features: 300
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910



X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [55]:
print(f"LGBM+SVD — Acc: {accuracy_score(y_va, lgbm_pred):.3f}  "
      f"MacroF1: {f1_score(y_va, lgbm_pred, average='macro'):.3f}")
print(classification_report(y_va, lgbm_pred, digits=2, zero_division=0))

LGBM+SVD — Acc: 0.761  MacroF1: 0.705
                           precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.67      0.29      0.40         7
analyse, review, research       0.92      0.71      0.80        17
                  billing       0.67      1.00      0.80         2
              client time       0.80      0.90      0.85        40
               onboarding       0.43      0.60      0.50        10
      preparing documents       0.71      0.77      0.74        22

                 accuracy                           0.76       113
                macro avg       0.74      0.71      0.70       113
             weighted avg       0.78      0.76      0.76       113



## B) Micro-tweak: Grade-as-token with LR champion

In [56]:
# B: Grade-as-token augmentation with the same split
X_tr_aug = X_tr.copy()
X_va_aug = X_va.copy()
X_tr_aug = X_tr_aug.join(train_df.loc[X_tr.index, "Grade"])
X_va_aug = X_va_aug.join(train_df.loc[X_va.index, "Grade"])

X_tr_aug["text_aug"] = X_tr_aug["text_clean"].fillna("") + " __grade_" + X_tr_aug["Grade"].str.lower()
X_va_aug["text_aug"] = X_va_aug["text_clean"].fillna("") + " __grade_" + X_va_aug["Grade"].str.lower()

num_cols = ["Worked Time","charged_bin","grade_enc","low_info"]

pre_aug = ColumnTransformer(
    transformers=[
        ("w", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=40000), "text_aug"),
        ("c", TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2, max_features=60000), "text_aug"),
        ("num", StandardScaler(with_mean=False), num_cols),
    ],
    remainder="drop",
)

lr_aug = Pipeline([
    ("pre", pre_aug),
    ("clf", OneVsRestClassifier(
        LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear")
    ))
])

lr_aug.fit(X_tr_aug, y_tr)
aug_pred = lr_aug.predict(X_va_aug)

print(f"LR (grade-as-token) — Acc: {accuracy_score(y_va, aug_pred):.3f}  "
      f"MacroF1: {f1_score(y_va, aug_pred, average='macro'):.3f}")
print(classification_report(y_va, aug_pred, digits=2, zero_division=0))


LR (grade-as-token) — Acc: 0.796  MacroF1: 0.772
                           precision    recall  f1-score   support

              Other comms       1.00      0.73      0.85        15
                    admin       0.80      0.57      0.67         7
analyse, review, research       0.73      0.65      0.69        17
                  billing       0.67      1.00      0.80         2
              client time       0.89      0.85      0.87        40
               onboarding       0.62      1.00      0.77        10
      preparing documents       0.72      0.82      0.77        22

                 accuracy                           0.80       113
                macro avg       0.78      0.80      0.77       113
             weighted avg       0.82      0.80      0.80       113



## Save the champion LR with grade mapping

In [76]:
# Refit on ALL labelled rows and save artifact WITH the grade mapping
features = ["text_clean", "Worked Time", "charged_bin", "grade_enc", "low_info"]
X_all = train_df[features]
y_all = train_df["Category"].astype(str)

word_tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=20000, stop_words=["out"])
char_tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=2)

pre = ColumnTransformer(
    transformers=[
        ("word", word_tfidf, "text_clean"),
        ("char", char_tfidf, "text_clean"),
        ("num",  StandardScaler(with_mean=False), ["Worked Time","charged_bin","grade_enc","low_info"]),
    ],
    sparse_threshold=0.3,
)

clf = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear")
)

champion = Pipeline([("pre", pre), ("clf", clf)])
champion.fit(X_all, y_all)

grade2code = dict(train_df[["Grade","grade_enc"]].drop_duplicates().values.tolist())

Path(f"{REPO_ROOT}/models").mkdir(exist_ok=True)
joblib.dump(
    {"model": champion, "labels": champion.classes_.tolist(), "grade2code": grade2code},
    f"{REPO_ROOT}/models/champion_lr_v1.joblib"
)
print("Saved -> models/champion_lr_v1.joblib")
print("grade2code:", grade2code)

Saved -> models/champion_lr_v1.joblib
grade2code: {'Junior': 0, 'Partner': 1, 'Senior': 2}


## Model Testing

In [60]:
ART = joblib.load("models/champion_lr_v1.joblib")
MODEL = ART["model"]
GRADE2CODE = ART["grade2code"]

In [61]:
def _clean(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

def predict_rows(rows):
    """
    rows: list of dicts with keys:
        text, worked_time, grade, charged_to_client  (YES/NO or 1/0)
    returns: preds (np.array[str]), scores_df (top-3 decision scores), X_df (features used)
    """
    recs = []
    for r in rows:
        txt = _clean(r.get("text",""))
        words = txt.split()
        charged_raw = str(r.get("charged_to_client","")).upper()
        charged = 1 if charged_raw in ("YES","Y","TRUE","1") else 0
        grade_key = str(r.get("grade","")).title()  # 'Junior','Senior','Partner'
        recs.append({
            "text_clean": txt,
            "Worked Time": float(r.get("worked_time", 0.0)),
            "charged_bin": int(charged),
            "grade_enc": int(GRADE2CODE.get(grade_key, 0)),  # default 0 if unseen
            "low_info": int(len(words) <= 3),
        })
    X_df = pd.DataFrame.from_records(recs)

    preds = MODEL.predict(X_df)

    # Top-3 scores (uses decision_function since LR(liblinear) in OvR has no predict_proba)
    scores_df = None
    if hasattr(MODEL, "predict_proba"):
        probs = MODEL.predict_proba(X_df)  # shape (n, K)
        classes = MODEL.classes_
        topk_idx = np.argsort(probs, axis=1)[:, -3:]
        rows_out = [{classes[j]: float(probs[i, j]) for j in topk_idx[i]} for i in range(len(X_df))]
        scores_df = pd.DataFrame(rows_out)
    elif hasattr(MODEL, "decision_function"):
        margins = np.atleast_2d(MODEL.decision_function(X_df))
        classes = MODEL.classes_
        topk_idx = np.argsort(margins, axis=1)[:, -3:]
        rows_out = [{classes[j]: float(margins[i, j]) for j in topk_idx[i]} for i in range(len(X_df))]
        scores_df = pd.DataFrame(rows_out)
    
    conf = []
    for i, row in scores_df.iterrows():
        # highest probability across available entries in the row
        top = max(v for v in row.dropna().values.tolist())
        conf.append(top)

    scores_df["__confidence"] = conf
    scores_df["__needs_review"] = (scores_df["__confidence"] < 0.50)  # threshold you can tune

    return preds, scores_df, X_df

In [62]:
# --- smoke test (three tiny examples) ---
rows = [
    {"text":"email out with draft letter to client", "worked_time":0.3, "grade":"Junior",  "charged_to_client":"YES"},
    {"text":"prepare consent order and revise form", "worked_time":0.7, "grade":"Senior",  "charged_to_client":"YES"},
    {"text":"onboarding paperwork and admin setup",  "worked_time":0.5, "grade":"Partner", "charged_to_client":"NO"},
]
preds, scores_df, X_used = predict_rows(rows)
print("Predictions:", preds)
display(scores_df)
display(X_used)

Predictions: ['client time' 'preparing documents' 'onboarding']


Unnamed: 0,Other comms,preparing documents,client time,"analyse, review, research",billing,admin,onboarding,__confidence,__needs_review
0,0.069852,0.288418,0.571812,,,,,0.571812,False
1,,0.602952,0.106491,0.239542,,,,0.602952,False
2,,,,,0.138575,0.210086,0.495593,0.495593,True


Unnamed: 0,text_clean,Worked Time,charged_bin,grade_enc,low_info
0,email out with draft letter to client,0.3,1,0,0
1,prepare consent order and revise form,0.7,1,2,0
2,onboarding paperwork and admin setup,0.5,0,1,0


## Batch score the whole dataset

In [78]:
# Batch-score the entire dataset and save a report

assert 'df' in globals(), "I need the full DataFrame 'df' already in memory."
ART = joblib.load(f"{REPO_ROOT}/models/champion_lr_v1.joblib")
MODEL = ART["model"]

In [64]:
feat_cols = ["text_clean","Worked Time","charged_bin","grade_enc","low_info"]
X_all = df[feat_cols]

# predictions
pred = MODEL.predict(X_all)

In [65]:
top1_prob = None
top2_label = None
if hasattr(MODEL, "predict_proba"):
    P = MODEL.predict_proba(X_all)          
    C = MODEL.classes_
    top_idx = np.argsort(P, axis=1)
    top1 = top_idx[:, -1]
    top2 = top_idx[:, -2]
    top1_prob = P[np.arange(len(P)), top1]
    top2_label = C[top2]
else:
    M = np.atleast_2d(MODEL.decision_function(X_all))
    C = MODEL.classes_
    top_idx = np.argsort(M, axis=1)
    top1 = top_idx[:, -1]
    top2 = top_idx[:, -2]
    # margin isn't a probability; scale to [0,1] for display only
    mmin, mmax = M.min(), M.max()
    top1_prob = (M[np.arange(len(M)), top1] - mmin) / (mmax - mmin + 1e-9)
    top2_label = C[top2]

In [66]:
OUT = pd.DataFrame({
    "Record ID": df.get("Record ID", pd.Series(range(len(df)))),
    "Time Narrative": df["Time Narrative"],
    "Grade": df["Grade"],
    "Worked Time": df["Worked Time"],
    "Charged to Client?": df["Charged to Client?"],
    "predicted_category": pred,
    "top1_confidence": top1_prob,
    "top2_suggestion": top2_label,
})
OUT["needs_review"] = OUT["top1_confidence"] < 0.50

In [67]:
cov = (OUT["needs_review"] == False).mean()
by_class = OUT["predicted_category"].value_counts().sort_values(ascending=False)

print(f"Automation coverage @0.50 threshold: {cov:.1%} of rows auto-classified")
display(by_class.to_frame("rows"))

Automation coverage @0.50 threshold: 72.9% of rows auto-classified


Unnamed: 0_level_0,rows
predicted_category,Unnamed: 1_level_1
client time,791
preparing documents,435
"analyse, review, research",311
Other comms,263
onboarding,196
admin,130
billing,31


In [82]:
Path(f"{REPO_ROOT}/reports").mkdir(exist_ok=True, parents=True)
csv_path = Path(f"{REPO_ROOT}/reports/predictions_v1.csv")
OUT.to_csv(csv_path, index=False)
print("Saved:", csv_path.resolve())

Saved: D:\OneDrive\Data\Work\01_My_AI_Portfolio\GitHub-Uploaded\IrwinMicheall-Interview\legal-time-categorisation-poc\reports\predictions_v1.csv


In [69]:
display(OUT.head(10))

Unnamed: 0,Record ID,Time Narrative,Grade,Worked Time,Charged to Client?,predicted_category,top1_confidence,top2_suggestion,needs_review
0,p-0001,Amending and updating statement,Senior,0.4,YES,preparing documents,0.637494,"analyse, review, research",False
1,p-0002,Reviewed court order and drafted advice email ...,Junior,1.3,YES,client time,0.455111,"analyse, review, research",True
2,p-0003,considering email in from counsel attaching FD...,Junior,0.3,YES,"analyse, review, research",0.458921,client time,True
3,p-0004,Communicate (other party(s)/other outside lawy...,Junior,0.1,YES,Other comms,0.712444,preparing documents,False
4,p-0005,Filing physical documents,Junior,0.1,NO,admin,0.498349,onboarding,True
5,p-0006,Emailing client to acknowledge safe receipt of...,Junior,0.1,YES,client time,0.423673,preparing documents,True
6,p-0007,considered email and order from client ; short...,Senior,0.1,YES,client time,0.468386,"analyse, review, research",True
7,p-0008,Draft/ Revise post-nup,Senior,0.3,YES,preparing documents,0.660083,"analyse, review, research",False
8,p-0009,Exchange of emails with client,Partner,0.2,YES,client time,0.628054,Other comms,False
9,p-0010,Communicate (with client),Partner,0.5,YES,client time,0.771812,Other comms,False


# Global explainability
## Top words per class (model coefficients):

In [None]:
ovr = pipe.named_steps["clf"]    
pre = pipe.named_steps["pre"]

feat_names = []
for name, trans, cols in pre.transformers_:
    if name in ("word","char"):
        fn = trans.get_feature_names_out()
        feat_names.extend(fn)
    elif name=="num":
        feat_names.extend(cols)

def top_terms_for(class_ix, k=12):
    lr = ovr.estimators_[class_ix]
    coefs = lr.coef_.ravel()
    top = np.argsort(coefs)[-k:][::-1]
    df = pd.DataFrame({
        "feature": [feat_names[i] for i in top],
        "coef": coefs[top]
    })
    return df

for i, cls in enumerate(ovr.classes_):
    print(f"\nTop terms for class: {cls}")
    display(top_terms_for(i, 10))



Top terms for class: Other comms


Unnamed: 0,feature,coef
0,communicate,1.944389
1,other,1.370527
2,team,1.273974
3,communicate experts,1.218342
4,communicate other,1.186821
5,experts,1.095448
6,outside,1.008781
7,external,0.879041
8,other external,0.879041
9,re mediation,0.683159



Top terms for class: admin


Unnamed: 0,feature,coef
0,internal,1.033298
1,handover,0.907077
2,low_info,0.73612
3,leave,0.730064
4,team,0.727259
5,2x,0.708005
6,drive,0.70764
7,share,0.70764
8,share drive,0.70764
9,travel,0.631356



Top terms for class: analyse, review, research


Unnamed: 0,feature,coef
0,review,2.871804
1,reviewing,1.508694
2,reviewed,1.258053
3,evie,1.038175
4,revie,1.038175
5,vie,1.038175
6,rev,1.032307
7,eview,1.024128
8,iew,1.024128
9,view,1.024128



Top terms for class: billing


Unnamed: 0,feature,coef
0,bill,2.141158
1,billing,1.253919
2,drafted,1.15686
3,final,1.056481
4,invoice,0.922564
5,checked,0.885755
6,bil,0.839054
7,bill,0.839054
8,bill,0.839054
9,final bill,0.83171



Top terms for class: client time


Unnamed: 0,feature,coef
0,client,1.987411
1,email,0.964568
2,charged_bin,0.832755
3,with client,0.82968
4,attending,0.805598
5,client with,0.803455
6,call,0.794814
7,new,0.772116
8,attend,0.754498
9,the client,0.74704



Top terms for class: onboarding


Unnamed: 0,feature,coef
0,loe,1.538211
1,id,1.496756
2,onboarding,1.264577
3,tob,1.196275
4,ra,0.955199
5,and ra,0.953323
6,loe and,0.921405
7,of engagement,0.772155
8,engagement,0.772155
9,obtain id,0.718906



Top terms for class: preparing documents


Unnamed: 0,feature,coef
0,revise,1.747874
1,letter to,1.172843
2,drafting,1.121386
3,form,1.110667
4,amend,1.07639
5,draft,1.059699
6,updating,1.041358
7,offer,0.982022
8,drafted,0.793981
9,d81,0.785757


## Business value estimate

In [72]:
rows_total     = 2000     
coverage       = 0.729
manual_secs    = 20

auto_rows      = int(rows_total * coverage)
mins_saved     = auto_rows * manual_secs / 60
hrs_saved      = mins_saved / 60

print(f"Auto-classified rows: {auto_rows} / {rows_total}")
print(f"Labelling time saved: {hrs_saved:.1f} hours/week")

Auto-classified rows: 1458 / 2000
Labelling time saved: 8.1 hours/week


In [75]:
# If a junior para/legal does labeling, showing a gross cost equivalent:
junior_rate = 200
value_equiv = hrs_saved * junior_rate
print(f"Equivalent capacity value @£{junior_rate}/h: ~£{value_equiv:,.0f}/week")

Equivalent capacity value @£200/h: ~£1,620/week
