In [None]:
from pathlib import Path
import pandas as pd
import plotly.express as px
import plotly.io as pio
import re
from collections import Counter

# Default Plotly renderer for every fig.show() in this notebook.
# A preceding assignment of "vscode" was dropped: it was overwritten on the very
# next line and therefore never took effect.
pio.renderers.default = "notebook_connected"


In [2]:
# The repository root is taken to be the parent of the current working directory.
REPO_ROOT = Path.cwd().parent
# Input CSV: <REPO_ROOT>/data/interview_task_dataset.csv
DATA_PATH = REPO_ROOT.joinpath("data", "interview_task_dataset.csv")
# Stop right away, reporting the fully resolved path, if the file is missing.
assert DATA_PATH.exists(), f"File not found: {DATA_PATH.resolve()}"

## Load + Inspect, then Label Sparsity

In [3]:
# Read the CSV at DATA_PATH into the working frame and preview its first three rows.
df = pd.read_csv(DATA_PATH)
df.head(3)

Unnamed: 0,Record ID,Department,Time Narrative,Worked Time,Charged to Client?,Grade,Category
0,p-0001,a,Amending and updating statement,0.4,YES,Senior,
1,p-0002,a,Reviewed court order and drafted advice email ...,1.3,YES,Junior,
2,p-0003,a,considering email in from counsel attaching FD...,0.3,YES,Junior,"analyse, review, research"


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(2157, 7)

In [6]:
# Per-column dtype and non-null count; reveals which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2157 entries, 0 to 2156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Record ID           2157 non-null   object 
 1   Department          2157 non-null   object 
 2   Time Narrative      2157 non-null   object 
 3   Worked Time         2157 non-null   float64
 4   Charged to Client?  2157 non-null   object 
 5   Grade               2157 non-null   object 
 6   Category            561 non-null    object 
dtypes: float64(1), object(6)
memory usage: 118.1+ KB


In [7]:
# Label sparsity: how many rows carry a Category value, how many do not,
# and the labelled share of the whole frame in percent.
labelled = df["Category"].count()  # count() skips missing values
unlabelled = len(df) - labelled
share = labelled / len(df) * 100

for line in (
    f"\nLabelled:   {labelled:,}",
    f"Unlabelled: {unlabelled:,}",
    f"Share labelled: {share:.1f}%",
):
    print(line)


Labelled:   561
Unlabelled: 1,596
Share labelled: 26.0%


## Charge rate by grade (all rows)

In [8]:
# Charge rate per grade over all rows:
#   rows        - number of entries for the grade
#   charged_yes - entries whose "Charged to Client?" equals "YES" (case-insensitive)
#   charged_%   - charged_yes as a percentage of rows, rounded to 1 decimal
# Built as a single chained expression (no inplace mutation), so re-running the
# cell always rebuilds the same table from df.
tbl = (
    df.assign(charged=df["Charged to Client?"].str.upper().eq("YES"))
      .groupby("Grade", as_index=False)
      .agg(rows=("charged", "size"), charged_yes=("charged", "sum"))
      .assign(**{"charged_%": lambda t: (t["charged_yes"] / t["rows"] * 100).round(1)})
      .sort_values("Grade")
)

tbl

Unnamed: 0,Grade,rows,charged_yes,charged_%
0,Junior,893,686,76.8
1,Partner,268,224,83.6
2,Senior,996,868,87.1


In [None]:
# Bar chart of charge rate per grade, bars ordered from highest to lowest rate.
plot_df = tbl.sort_values("charged_%", ascending=False).copy()

fig = (
    px.bar(plot_df, x="Grade", y="charged_%", text="charged_%")
    .update_traces(
        # Bar labels show the rate with one decimal; hover additionally shows the row count.
        texttemplate="%{text:.1f}%",
        textposition="outside",
        hovertemplate="Grade: %{x}<br>Charge rate: %{y:.1f}%<br>Rows: %{customdata}",
        customdata=plot_df["rows"],
    )
    .update_layout(
        title=dict(text="Charge rate by grade", x=0.02, xanchor="left"),
        template="simple_white",
        font=dict(family="Segoe UI, Arial, sans-serif", size=14),
        # Fixed 0-100 axis so the bars read as percentages.
        yaxis=dict(title="Charged (%)", range=[0, 100], ticksuffix="%"),
        xaxis_title=None,
        uniformtext_minsize=12,
        bargap=0.35,
        margin=dict(t=60, r=30, l=60, b=60),
    )
)

fig.show()


## Labelled category mix (class balance)

In [10]:
# Labelled subset: rows that have a Category value (copied so later edits do not touch df).
df_l = df.dropna(subset=["Category"]).copy()

# Rows per category (most frequent first) plus each category's share of the labelled subset.
cat_counts = (
    df_l["Category"]
    .value_counts()
    .rename_axis("Category")
    .reset_index(name="rows")
    .assign(**{"share_%": lambda t: (t["rows"] / t["rows"].sum() * 100).round(1)})
)
cat_counts



Unnamed: 0,Category,rows,share_%
0,client time,199,35.5
1,preparing documents,109,19.4
2,"analyse, review, research",85,15.2
3,Other comms,75,13.4
4,onboarding,49,8.7
5,admin,32,5.7
6,billing,12,2.1


In [11]:
# Bar height = number of labelled rows per category; bar label = share of the labelled subset.
fig = (
    px.bar(
        cat_counts,
        x="Category",
        y="rows",
        text="share_%",
        title="Labelled subset: category mix",
    )
    .update_traces(texttemplate="%{text}%", textposition="outside")
    .update_layout(template="simple_white", xaxis_title=None, yaxis_title="Rows")
)
fig.show()

## Text length (word count) distribution

In [36]:
# Word count per narrative: whitespace-split token count, with missing text treated as "" (0 words).
df["n_words"] = df["Time Narrative"].fillna("").str.split().str.len()
# Summary statistics of the word counts.
df["n_words"].describe()

count    2157.000000
mean        8.121465
std         5.025546
min         1.000000
25%         5.000000
50%         7.000000
75%        10.000000
max        54.000000
Name: n_words, dtype: float64

In [37]:
# Distribution of narrative length in words, over all rows.
fig = (
    px.histogram(
        df,
        x="n_words",
        nbins=40,
        title="Time Narrative length (word count)",
    )
    .update_layout(
        template="simple_white",
        xaxis_title="Words per entry",
        yaxis_title="Rows",
        margin=dict(t=60, r=30, l=60, b=60),
    )
)
fig.show()

## Mark “low-info” narratives

In [38]:
# A narrative with at most this many words is flagged as "low-info".
# Kept in one place so the flag and the printed message cannot drift apart.
LOW_INFO_MAX_WORDS = 3

# 1 = low-info, 0 = otherwise (integer flag, so the column mean is the low-info share).
df["low_info"] = (df["n_words"] <= LOW_INFO_MAX_WORDS).astype(int)

low_info_share = round(df["low_info"].mean() * 100, 1)
print(f"Low-info narratives (≤{LOW_INFO_MAX_WORDS} words): {low_info_share}%")

Low-info narratives (≤3 words): 14.9%


In [39]:
# Share of rows that carry a Category label, split by the low_info flag (in percent, 1 decimal).
lab_rate = (
    df["Category"].notna()
      .groupby(df["low_info"])
      .mean()
      .mul(100)
      .round(1)
      .rename("labelled_%")
)
lab_rate

low_info
0    26.4
1    23.9
Name: labelled_%, dtype: float64

## Worked Time vs Charged?


In [16]:
# Plotting view of df with a tidy "Yes"/"No" charged label
# (values other than YES/NO, after upper-casing, become missing).
dfv = df.assign(
    charged=df["Charged to Client?"].str.upper().map({"YES": "Yes", "NO": "No"})
)

# One box per charged group; only suspected outliers are drawn as individual points.
fig = (
    px.box(
        dfv,
        x="charged",
        y="Worked Time",
        color="charged",
        title="Worked Time vs Charged?",
        points="suspectedoutliers",
    )
    .update_layout(template="simple_white", xaxis_title=None, yaxis_title="Hours")
)
fig.show()

In [17]:
# Row count, median and mean of Worked Time for each charged value, rounded to 2 decimals.
dfv.groupby("charged")["Worked Time"].agg(count="size", median="median", mean="mean").round(2)

Unnamed: 0_level_0,count,median,mean
charged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,379,0.5,0.66
Yes,1778,0.3,0.58


In [41]:
# Worked Time is multiplied by 60 and rounded to whole minutes.
df["worked_minutes"] = (df["Worked Time"] * 60).round(0).astype(int)

# Snap to the nearest multiple of 6 minutes, ties rounding up.
# Integer arithmetic is used on purpose: Series.round() rounds halves to the
# nearest even value, so e.g. 9 and 15 minutes would both land in the 12 bucket.
df["worked_min_bucket6"] = (df["worked_minutes"] + 3) // 6 * 6

# Plotting view with a tidy "Yes"/"No" charged label.
dfv = df.assign(charged=df["Charged to Client?"].str.upper().map({"YES": "Yes", "NO": "No"}))
fig = px.box(
    dfv, x="charged", y="worked_minutes", color="charged",
    title="Worked minutes vs Charged?"
)
fig.update_layout(template="simple_white", xaxis_title=None, yaxis_title="Minutes")
fig.show()

In [42]:
# Row count, median and mean of worked_minutes for each charged value.
dfv.groupby("charged")["worked_minutes"].agg(count="size", median="median", mean="mean")

Unnamed: 0_level_0,count,median,mean
charged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,379,30.0,39.593668
Yes,1778,18.0,35.004499


## “Leak rate”: non-billable share among long sessions (≥ 60 min)

In [43]:
THRESH = 60  # minutes

# Derived columns used by the long-session analysis.
df["worked_minutes"] = (df["Worked Time"] * 60).round(0).astype(int)
df["charged"] = df["Charged to Client?"].str.upper().map({"YES": "Yes", "NO": "No"})
df["long"] = df["worked_minutes"].ge(THRESH)  # True for sessions of at least THRESH minutes

# Among long sessions only: row count per charged value and its share in percent.
overall = (
    df[df["long"]]
      .groupby("charged")
      .size()
      .reset_index(name="rows")
      .assign(**{"share_%": lambda t: (t["rows"] / t["rows"].sum() * 100).round(1)})
)
display(overall)


Unnamed: 0,charged,rows,share_%
0,No,78,19.5
1,Yes,321,80.5


In [None]:
# Long sessions only: number of "No" / "Yes" charged rows per grade.
by_grade = (
    df.loc[df["long"]]
      .pivot_table(index="Grade", columns="charged", values="worked_minutes", aggfunc="size", fill_value=0)
      # Guarantee both columns exist (filled with 0) even if one value never occurs.
      .reindex(columns=["No", "Yes"], fill_value=0)
      .reset_index()
      .rename_axis(None, axis=1)
)

# Non-billable share = No / (No + Yes), in percent.
# A zero denominator is masked with where(), which yields NaN and keeps the
# column numeric; substituting pd.NA would turn it into an object column.
long_total = by_grade["No"] + by_grade["Yes"]
by_grade["non_billable_share_%"] = (by_grade["No"] / long_total.where(long_total > 0) * 100).round(1)
by_grade = by_grade.sort_values("non_billable_share_%", ascending=False)

by_grade

Unnamed: 0,Grade,No,Yes,non_billable_share_%
1,Partner,16,44,26.7
0,Junior,36,120,23.1
2,Senior,26,157,14.2


In [24]:
# Non-billable share of long sessions per grade; the threshold in the title follows THRESH.
fig = (
    px.bar(
        by_grade,
        x="Grade",
        y="non_billable_share_%",
        text="non_billable_share_%",
        title=f"Non-billable share for long sessions (≥{THRESH} min)",
    )
    .update_traces(texttemplate="%{text}%", textposition="outside")
    .update_layout(template="simple_white", yaxis_title="Non-billable (%)", xaxis_title=None)
)
fig.show()

In [44]:
# Worked Time is multiplied by 60 and rounded to whole minutes.
df["worked_minutes"] = (df["Worked Time"] * 60).round(0).astype(int)

# Snap to the nearest multiple of 6 minutes, ties rounding up.
# Integer arithmetic is used on purpose: Series.round() rounds halves to the
# nearest even value, so e.g. 9 and 15 minutes would both land in the 12 bucket.
df["worked_min_bucket6"] = (df["worked_minutes"] + 3) // 6 * 6

fig = px.histogram(
    df, x="worked_min_bucket6",
    title="Worked minutes — 6-minute billing buckets",
)
fig.update_layout(
    template="simple_white",
    xaxis_title="Minutes (6-min buckets)",
    yaxis_title="Rows",
)
fig.update_xaxes(tickmode="linear", dtick=30)  # ticks every 30 min for readability
fig.show()


## Grade × Category heatmap (labelled subset)

In [45]:
# Labelled subset: rows that have a Category value.
df_l = df.dropna(subset=["Category"]).copy()

# Row counts for every Grade x Category combination (0 where a combination never occurs).
pvt = pd.pivot_table(
    df_l,
    index="Grade",
    columns="Category",
    values="Worked Time",
    aggfunc="size",
    fill_value=0,
)
pvt



Category,Other comms,admin,"analyse, review, research",billing,client time,onboarding,preparing documents
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Junior,32,18,21,6,73,32,47
Partner,6,3,16,3,29,4,6
Senior,37,11,48,3,97,13,56


In [28]:
# Heatmap of the Grade x Category count table, with the count printed in each cell.
fig = (
    px.imshow(
        pvt,
        text_auto=True,
        color_continuous_scale="Blues",
        aspect="auto",
        title="Labelled subset: Grade × Category (counts)",
    )
    .update_layout(template="simple_white", xaxis_title="Category", yaxis_title="Grade")
)
fig.show()

## Convert counts → within-grade percentages

In [46]:
# Labelled subset: rows that have a Category value.
df_l = df.dropna(subset=["Category"]).copy()

# Row counts for every Grade x Category combination.
pvt_counts = pd.pivot_table(
    df_l,
    index="Grade",
    columns="Category",
    values="Worked Time",
    aggfunc="size",
    fill_value=0,
)

# Divide each row by its own total so every grade's row sums to ~100%.
pvt_pct = (
    pvt_counts
    .div(pvt_counts.sum(axis="columns"), axis="index")
    .mul(100)
    .round(1)
)
pvt_pct

Category,Other comms,admin,"analyse, review, research",billing,client time,onboarding,preparing documents
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Junior,14.0,7.9,9.2,2.6,31.9,14.0,20.5
Partner,9.0,4.5,23.9,4.5,43.3,6.0,9.0
Senior,14.0,4.2,18.1,1.1,36.6,4.9,21.1


In [47]:
# Heatmap of the within-grade percentage table, with the value printed in each cell.
fig = (
    px.imshow(
        pvt_pct,
        text_auto=True,
        color_continuous_scale="Blues",
        aspect="auto",
        title="Grade × Category — within-grade %",
    )
    .update_layout(template="simple_white", xaxis_title="Category", yaxis_title="Grade")
)
fig.show()

## Top keywords per category (labelled subset)

In [48]:
# Labelled subset: rows that have a Category value.
df_l = df.dropna(subset=["Category"]).copy()

# Words excluded from the keyword counts, one string per group:
# function words, more function words, negation/affirmation,
# and the words that make up the category names themselves.
stop = set(
    (
        "the a an and or of to for on in at by with from is are was were be been being "
        "this that these those it its as into about over under between after before during "
        "not no yes "
        "client time other comms admin billing onboarding preparing documents analyse review research"
    ).split()
)

def tokenize(s: str, stopwords=None):
    """Split a narrative into lower-case keyword tokens.

    The text is lower-cased, every character other than a-z, 0-9 or whitespace
    is replaced by a space, and the result is split on whitespace. A token is
    kept only if it is longer than 2 characters, is not a stopword and is not
    purely numeric.

    Parameters
    ----------
    s : str
        Narrative text. ``None`` and float NaN are treated as "no text".
    stopwords : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop`` set.

    Returns
    -------
    list of str
        Kept tokens in their original order; empty for missing text.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (s != s is true only for NaN).
    if s is None or (isinstance(s, float) and s != s):
        return []
    if stopwords is None:
        stopwords = stop
    text = re.sub(r"[^a-z0-9\s]", " ", str(s).lower())
    return [t for t in text.split() if len(t) > 2 and t not in stopwords and not t.isdigit()]

# Token list for every labelled narrative (one list of kept words per row).
df_l["tokens"] = df_l["Time Narrative"].map(tokenize)

def top_terms(n=10, frame=None):
    """Return the ``n`` most frequent tokens for each category.

    Parameters
    ----------
    n : int, default 10
        Maximum number of terms per category.
    frame : pandas.DataFrame, optional
        Frame with a ``Category`` column and a ``tokens`` column holding one
        list of tokens per row. Defaults to the notebook-level ``df_l``, so
        existing ``top_terms(10)`` calls behave as before.

    Returns
    -------
    dict
        Maps each category to a DataFrame with columns ``term`` and ``count``,
        most frequent term first.
    """
    if frame is None:
        frame = df_l
    out = {}
    for cat, g in frame.groupby("Category"):
        # Flatten the per-row token lists into a single frequency count.
        counts = Counter(tok for toks in g["tokens"] for tok in toks)
        out[cat] = pd.DataFrame(counts.most_common(n), columns=["term", "count"])
    return out

# Top-10 term table for every category; display the one for "client time".
tops = top_terms(10)

tops["client time"]

Unnamed: 0,term,count
0,email,96
1,call,38
2,out,24
3,letter,23
4,communicate,20
5,draft,17
6,hearing,16
7,meeting,15
8,advice,14
9,update,13


## Top bigrams per category

In [49]:
# Labelled subset: rows that have a Category value.
df_l = df.dropna(subset=["Category"]).copy()

# Minimal stopword list (same idea as before), one string per group:
# function words, more function words, negation/affirmation,
# and the words that make up the category names themselves.
stop = set(
    (
        "the a an and or of to for on in at by with from is are was were be been being "
        "this that these those it its as into about over under between after before during "
        "not no yes "
        "client time other comms admin billing onboarding preparing documents analyse review research"
    ).split()
)

def tokenize(s: str, stopwords=None):
    """Split a narrative into lower-case keyword tokens.

    The text is lower-cased, every character other than a-z, 0-9 or whitespace
    is replaced by a space, and the result is split on whitespace. A token is
    kept only if it is longer than 2 characters, is not a stopword and is not
    purely numeric.

    Parameters
    ----------
    s : str
        Narrative text. ``None`` and float NaN are treated as "no text".
    stopwords : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop`` set.

    Returns
    -------
    list of str
        Kept tokens in their original order; empty for missing text.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (s != s is true only for NaN).
    if s is None or (isinstance(s, float) and s != s):
        return []
    if stopwords is None:
        stopwords = stop
    cleaned = re.sub(r"[^a-z0-9\s]", " ", str(s).lower())
    return [t for t in cleaned.split() if len(t) > 2 and t not in stopwords and not t.isdigit()]

# (Re)create the tokens column on this cell's fresh copy of df_l:
# one list of kept words per labelled narrative.
df_l["tokens"] = df_l["Time Narrative"].map(tokenize)

def bigram_terms(tokens):
    """Yield "first second" strings for adjacent token pairs.

    A pair is emitted only when both tokens are at least 3 characters long
    and neither appears in the notebook-level ``stop`` set.
    """
    for i in range(len(tokens) - 1):
        left, right = tokens[i], tokens[i + 1]
        # Length check first, stopword check second.
        if min(len(left), len(right)) >= 3 and stop.isdisjoint((left, right)):
            yield f"{left} {right}"

def top_bigrams(cat, n=12):
    """Return the ``n`` most frequent bigrams among narratives labelled ``cat``.

    Reads the ``tokens`` column of the notebook-level ``df_l`` and returns a
    DataFrame with columns ``bigram`` and ``count``, most frequent first.
    """
    in_category = df_l["Category"] == cat
    counts = Counter(
        pair
        for toks in df_l.loc[in_category, "tokens"]
        for pair in bigram_terms(toks)
    )
    return pd.DataFrame(counts.most_common(n), columns=["bigram", "count"])

# Print each category name followed by its top-12 bigram table.
for cat in ("client time", "preparing documents"):
    print(cat)
    display(top_bigrams(cat, 12))



client time


Unnamed: 0,bigram,count
0,email out,14
1,next steps,12
2,call out,6
3,out update,6
4,email exchange,5
5,considering file,5
6,file further,5
7,initial meeting,5
8,email response,4
9,draft letter,4


preparing documents


Unnamed: 0,bigram,count
0,draft revise,9
1,consent order,7
2,brief counsel,4
3,finalise letter,3
4,first draft,3
5,attendance note,3
6,revise form,3
7,professional litigation,2
8,enclosures brief,2
9,revise d81,2
