In [8]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

# Choose ONE plotly renderer; a second assignment would silently override the first.
# "notebook_connected" loads plotly.js from a CDN, which keeps the saved notebook small.
# Switch this to "vscode" if figures do not render inside VS Code.
pio.renderers.default = "notebook_connected"


In [9]:
# The notebook normally lives in <repo>/notebooks, so the repo root is one level up
# from the working directory. Some front ends may start the kernel in the repo root
# instead, so fall back to the working directory itself when the file is found there.
_DATA_FILE = Path("data") / "interview_task_dataset.csv"
_root_candidates = (Path.cwd().parent, Path.cwd())
REPO_ROOT = next(
    (root for root in _root_candidates if (root / _DATA_FILE).exists()),
    _root_candidates[0],  # keep the original default so the error below names it
)
DATA_PATH = REPO_ROOT / _DATA_FILE

# Fail loudly with a real exception: a bare `assert` is skipped under `python -O`.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"File not found: {DATA_PATH.resolve()}")

In [10]:
# Load the time-entry dataset; the cells below read from `df`.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Show the first three rows as a quick look at the columns and values.
df.head(n=3)

Unnamed: 0,Record ID,Department,Time Narrative,Worked Time,Charged to Client?,Grade,Category
0,p-0001,a,Amending and updating statement,0.4,YES,Senior,
1,p-0002,a,Reviewed court order and drafted advice email ...,1.3,YES,Junior,
2,p-0003,a,considering email in from counsel attaching FD...,0.3,YES,Junior,"analyse, review, research"


In [17]:
# Column overview: dtypes and non-null counts.
df.info()

# A row is "labelled" when its Category value is present.
n_rows = len(df)
labelled = df["Category"].notna().sum()
unlabelled = n_rows - labelled
share = labelled / n_rows * 100

# One print call, one line per figure (the leading "\n" separates it from df.info()).
print(
    f"\nLabelled:   {labelled:,}",
    f"Unlabelled: {unlabelled:,}",
    f"Share labelled: {share:.1f}%",
    sep="\n",
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2157 entries, 0 to 2156
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Record ID           2157 non-null   object 
 1   Department          2157 non-null   object 
 2   Time Narrative      2157 non-null   object 
 3   Worked Time         2157 non-null   float64
 4   Charged to Client?  2157 non-null   object 
 5   Grade               2157 non-null   object 
 6   Category            561 non-null    object 
dtypes: float64(1), object(6)
memory usage: 118.1+ KB

Labelled:   561
Unlabelled: 1,596
Share labelled: 26.0%


In [11]:
# Charge rate by grade: number of entries per grade, how many were charged to the
# client, and the charged share in percent (rounded to one decimal).
# Built as a single chain with no in-place mutation, so re-running the cell always
# produces the same `tbl` from `df`.
tbl = (
    df.assign(
        # Normalise case AND surrounding whitespace so values like " yes" or "YES "
        # still count as charged.
        charged=lambda d: d["Charged to Client?"].str.strip().str.upper().eq("YES")
    )
    .groupby("Grade", as_index=False)
    .agg(rows=("charged", "size"), charged_yes=("charged", "sum"))
    .assign(**{"charged_%": lambda t: (t["charged_yes"] / t["rows"] * 100).round(1)})
    .sort_values("Grade")
)

tbl

Unnamed: 0,Grade,rows,charged_yes,charged_%
0,Junior,893,686,76.8
1,Partner,268,224,83.6
2,Senior,996,868,87.1


What you just did (and why)
What: You grouped by Grade and computed how many entries were charged and the % charged.

Why (business): This shows baseline billing efficiency by seniority. In this dataset, Seniors have the highest charge rate (~87%), followed by Partners (~84%), with Juniors lowest (~77%). This becomes a talking point and feeds the revenue-uplift calculation later (e.g., “if Juniors’ charge rate rose by 5 percentage points → £X more per week”).

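Why (modelling): Suggests that “Charged to Client?” and Grade are informative features that are likely to correlate with category; note that this table does not include Category, so that link still needs to be checked on the labelled subset. It also flags a risk to watch: if a category is essentially “Admin time”, we will likely see a lower charge rate for it — good to validate, but avoid building in any leakage assumptions.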

In [13]:
# Order bars from the highest to the lowest charge rate so the chart reads left→right.
plot_df = tbl.sort_values(by="charged_%", ascending=False).copy()

fig = px.bar(plot_df, x="Grade", y="charged_%", text="charged_%")

# Value labels sit above each bar; the hover box also shows the row count per grade.
fig.update_traces(
    customdata=plot_df["rows"],
    hovertemplate="Grade: %{x}<br>Charge rate: %{y:.1f}%<br>Rows: %{customdata}",
    textposition="outside",
    texttemplate="%{text:.1f}%",
)

# Presentation only: fixed 0–100 % axis, left-aligned title, no x-axis title.
fig.update_layout(
    template="simple_white",
    title={"text": "Charge rate by grade", "x": 0.02, "xanchor": "left"},
    font={"family": "Segoe UI, Arial, sans-serif", "size": 14},
    xaxis_title=None,
    yaxis={"title": "Charged (%)", "range": [0, 100], "ticksuffix": "%"},
    bargap=0.35,
    uniformtext_minsize=12,
    margin={"t": 60, "r": 30, "l": 60, "b": 60},
)

fig.show()


In [15]:
# Labelled subset: only rows that already carry a Category (an independent copy of `df`).
df_l = df.loc[df["Category"].notna()].copy()

# Rows per category, most frequent first, plus each category's share of the labelled
# rows in percent (one decimal).
cat_counts = (
    df_l["Category"]
    .value_counts()
    .rename_axis("Category")
    .reset_index(name="rows")
    .assign(**{"share_%": lambda t: (t["rows"] / t["rows"].sum() * 100).round(1)})
)
cat_counts



Unnamed: 0,Category,rows,share_%
0,client time,199,35.5
1,preparing documents,109,19.4
2,"analyse, review, research",85,15.2
3,Other comms,75,13.4
4,onboarding,49,8.7
5,admin,32,5.7
6,billing,12,2.1


In [None]:
# Bar height is the row count per category; the label above each bar is its share in %.
fig = px.bar(
    cat_counts,
    x="Category",
    y="rows",
    text="share_%",
    title="Labelled subset: category mix",
)
fig.update_traces(textposition="outside", texttemplate="%{text}%")
fig.update_layout(yaxis_title="Rows", xaxis_title=None, template="simple_white")
fig.show()