In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter,MultipleLocator

In [2]:
import sys
from pathlib import Path

# Make the project-local `utils` package importable from this notebook.
# NOTE(review): assumes the kernel's working directory is a direct child of the
# repository root (so its parent is the root) — confirm if the notebook moves.
repo_root = Path().resolve().parent

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

print("Added to sys.path:", repo_root)

# Must come after the sys.path tweak above, otherwise `utils` is not found.
from utils.common_functions import save_or_show, load_df


Added to sys.path: D:\Joe\1\Stufffs\Research\Mapping Future Skills in UK Engineering Education


In [3]:
def _clean_num(s):
    # handle "*", "-", commas, blanks → numeric
    return (s.astype(str)
              .str.strip()
              .replace({"*": np.nan, "-": 0, "": np.nan})
              .str.replace(",", "", regex=False)
              .pipe(pd.to_numeric, errors="coerce"))

In [4]:
def load_table4_2023_all_regions(path_xlsx: str) -> pd.DataFrame:
    """
    Combine all region sheets of the 2023 workbook into one tidy DataFrame.

    For every sheet whose name does not look like a cover/notes sheet:
    - the region name is read from cell A2, falling back to the sheet name
      when A2 is missing or blank,
    - the data block is Excel rows 4–24, columns A, D, G, J, M,
    - rows without a label are dropped and the four measure columns are
      converted to numbers with ``_clean_num``.

    Parameters
    ----------
    path_xlsx : str
        Path to the workbook (anything ``pd.ExcelFile`` accepts).

    Returns
    -------
    pd.DataFrame
        Columns, in order:
        ['Year','Region','Broad Industry Group',
         'All_FT_employees','All_PT_employees',
         'All_total_employees','All_total_employment'].
        An empty frame with these columns is returned when no sheet qualifies.

    Raises
    ------
    ValueError
        If a data sheet does not yield exactly the five expected columns.
    ImportError
        Raised by pandas when the Excel engine for the file type (openpyxl for
        .xlsx) is not installed.
    """
    label_col = "Broad Industry Group"
    measure_cols = [
        "All_FT_employees",
        "All_PT_employees",
        "All_total_employees",
        "All_total_employment",
    ]
    out_cols = ["Year", "Region", label_col, *measure_cols]

    # Sheets whose names match these words are documentation, not data.
    skip_re = re.compile(r"(?i)information|contents|cover|readme|about")
    frames = []

    # Open the workbook once and reuse the handle for every read; the context
    # manager closes the file even if a sheet fails to parse.
    with pd.ExcelFile(path_xlsx) as xl:
        for sh in xl.sheet_names:
            if skip_re.search(sh or ""):
                continue

            # --- 1) Region from A2 (second row of column A)
            reg_df = pd.read_excel(xl, sheet_name=sh, header=None, nrows=2, usecols="A")
            region = sh.strip()
            has_a2 = reg_df.shape[0] >= 2 and reg_df.shape[1] >= 1
            if has_a2 and pd.notna(reg_df.iat[1, 0]):
                a2_text = str(reg_df.iat[1, 0]).strip()
                # A blank A2 would otherwise label every row "nan" or "".
                if a2_text:
                    region = a2_text

            # --- 2) Required block: rows 4–24, cols A,D,G,J,M
            # Excel rows are 1-indexed, so skip the first 3 rows, then read 21.
            block = pd.read_excel(
                xl,
                sheet_name=sh,
                header=None,
                usecols="A,D,G,J,M",
                skiprows=3,   # skip rows 1–3
                nrows=21      # rows 4–24 inclusive
            )

            expected_width = 1 + len(measure_cols)
            if block.shape[1] != expected_width:
                raise ValueError(
                    f"Sheet {sh!r}: expected {expected_width} columns "
                    f"(A,D,G,J,M) but read {block.shape[1]}"
                )
            block.columns = [label_col, *measure_cols]

            # Drop rows with no label (sometimes trailing notes bleed in).
            # Missing cells are masked explicitly instead of relying only on
            # their "nan" string representation.
            raw_labels = block[label_col]
            labels = raw_labels.astype(str).str.strip()
            keep = raw_labels.notna() & labels.ne("") & labels.ne("nan")
            block = block.loc[keep].copy()
            block[label_col] = labels.loc[keep]

            # Clean numeric columns
            for c in measure_cols:
                block[c] = _clean_num(block[c])

            # Tag region/year
            block["Region"] = region
            block["Year"] = 2023

            frames.append(block)

    if not frames:
        return pd.DataFrame(columns=out_cols)

    df = pd.concat(frames, ignore_index=True)

    # Remove header/footnote echoes that slipped into the label column.
    drop_patterns = re.compile(r"(?i)source:|notes?$|key$|reference year|thousand")
    df = df[~df[label_col].str.contains(drop_patterns, na=False)].copy()

    # Fixed, documented column order.
    return df[out_cols]

In [5]:
# Location of the 2023 workbook, relative to the notebook's working directory.
path = "../data/ons-table-4/table42023rv2.xlsx"   # <- update if needed

# Parse every region sheet into one tidy frame.
df_2023 = load_table4_2023_all_regions(path)

# Sanity check: overall (rows, columns), then the first ten rows as plain text.
print(
    df_2023.shape,
    df_2023.head(10).to_string(index=False),
    sep="\n",
)


ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.