In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import glob

In [28]:
path = r"../data/table-52"
files = glob.glob(path + "/*.csv")

In [36]:
read_opts = dict(
    header=13, 
    engine="pyarrow",
    dtype_backend="pyarrow",
    # low_memory=False
)

In [37]:
dfs = [pd.read_csv(f, **read_opts) for f in files]
df = pd.concat(dfs, ignore_index=True)

In [50]:
for a in dfs:
    print(a.shape)

(625614, 8)
(626560, 8)
(628353, 8)
(625570, 8)
(621907, 8)


In [48]:
df

Unnamed: 0,CAH level marker,CAH level subject,Entrant marker,Level of study,Mode of study,Academic Year,Permanent address marker,Number
0,CAH level 1,01 Medicine and dentistry,Entrant,All,All,2019/20,England,14275
1,CAH level 1,01 Medicine and dentistry,Entrant,All,All,2019/20,Wales,725
2,CAH level 1,01 Medicine and dentistry,Entrant,All,All,2019/20,Scotland,1695
3,CAH level 1,01 Medicine and dentistry,Entrant,All,All,2019/20,Northern Ireland,600
4,CAH level 1,01 Medicine and dentistry,Entrant,All,All,2019/20,Other UK,35
...,...,...,...,...,...,...,...,...
3127999,HECoS,Total,Not an entrant,All undergraduate,Part-time,2023/24,European Union,1175
3128000,HECoS,Total,Not an entrant,All undergraduate,Part-time,2023/24,Non-European Union,3875
3128001,HECoS,Total,Not an entrant,All undergraduate,Part-time,2023/24,Total Non-UK,5050
3128002,HECoS,Total,Not an entrant,All undergraduate,Part-time,2023/24,Not known,535


In [8]:
df.shape

(3128069, 8)

### Grouping by subject of study

In [54]:
by_subject = df[df["CAH level marker"] == "CAH level 3"]
by_subject.head(10)

Unnamed: 0,CAH level marker,CAH level subject,Entrant marker,Level of study,Mode of study,Academic Year,Permanent address marker,Number
17325,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,England,1350
17326,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Wales,75
17327,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Scotland,85
17328,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Northern Ireland,95
17329,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Other UK,5
17330,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Total UK,1610
17331,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,European Union,135
17332,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Non-European Union,475
17333,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Total Non-UK,610
17334,CAH level 3,01-01-01 Medical sciences (non-specific),Entrant,All,All,2019/20,Not known,0


#### CAH Level 3 Subjects

In [56]:
by_subject["CAH level subject"].unique().tolist()

['01-01-01 Medical sciences (non-specific)',
 '01-01-02 Medicine (non-specific)',
 '01-01-03 Medicine by specialism',
 '01-01-04 Dentistry',
 '02-02-01 Pharmacology',
 '02-02-02 Toxicology',
 '02-02-03 Pharmacy',
 '02-04-01 Nursing (non-specific)',
 '02-04-02 Adult nursing',
 '02-04-03 Community nursing',
 '02-04-04 Midwifery',
 "02-04-05 Children's nursing",
 '02-04-06 Dental nursing',
 '02-04-07 Mental health nursing',
 '02-04-08 Learning disabilities nursing',
 '02-04-09 Others in nursing',
 '02-05-01 Medical technology',
 '02-05-02 Healthcare science (non-specific)',
 '02-05-03 Biomedical sciences (non-specific)',
 '02-05-04 Anatomy, physiology and pathology',
 '02-06-01 Health sciences (non-specific)',
 '02-06-02 Nutrition and dietetics',
 '02-06-03 Ophthalmics',
 '02-06-04 Environmental and public health',
 '02-06-05 Physiotherapy',
 '02-06-06 Complementary and alternative medicine',
 '02-06-07 Counselling, psychotherapy and occupational therapy',
 '03-01-01 Biosciences (non-spec

### Engineering and Technology, Computing, Architecture, building and planning, 

In [71]:
keep = df["CAH level subject"].str.startswith(("10","11","13"), na=False)
df_keep = by_subject[keep].copy()
df_keep.shape

  df_keep = by_subject[keep].copy()


(96756, 8)

In [72]:
df_keep.head(20)

Unnamed: 0,CAH level marker,CAH level subject,Entrant marker,Level of study,Mode of study,Academic Year,Permanent address marker,Number
58828,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,England,5835
58829,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Wales,520
58830,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Scotland,980
58831,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Northern Ireland,185
58832,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Other UK,15
58833,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Total UK,7535
58834,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,European Union,605
58835,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Non-European Union,3340
58836,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Total Non-UK,3940
58837,CAH level 3,10-01-01 Engineering (non-specific),Entrant,All,All,2019/20,Not known,0


In [73]:
mask = (
    df_keep["Entrant marker"].eq("All") &
    df_keep["Level of study"].eq("All") &
    df_keep["Mode of study"].eq("All") &
    df_keep["Permanent address marker"].eq("Total")
)

df_total = df_keep.loc[mask].copy()

In [74]:
df_total

Unnamed: 0,CAH level marker,CAH level subject,Entrant marker,Level of study,Mode of study,Academic Year,Permanent address marker,Number
58904,CAH level 3,10-01-01 Engineering (non-specific),All,All,All,2019/20,Total,24265
59597,CAH level 3,10-01-02 Mechanical engineering,All,All,All,2019/20,Total,35455
60290,CAH level 3,10-01-03 Production and manufacturing engineering,All,All,All,2019/20,Total,11870
60983,CAH level 3,10-01-04 Aeronautical and aerospace engineering,All,All,All,2019/20,Total,13955
61676,CAH level 3,10-01-05 Naval architecture,All,All,All,2019/20,Total,480
...,...,...,...,...,...,...,...,...
2580764,CAH level 3,11-01-08 Others in computing,All,All,All,2023/24,Total,3745
2581457,CAH level 3,13-01-01 Architecture,All,All,All,2023/24,Total,26855
2582150,CAH level 3,13-01-02 Building,All,All,All,2023/24,Total,31560
2582843,CAH level 3,13-01-03 Landscape design,All,All,All,2023/24,Total,1240


##### Add subject column without CAH Level 3 code

In [75]:
col = "CAH level subject"

df_total["Subject"] = (
    df_total[col]
    .str.replace(r"^\s*\d{2}-\d{2}-\d{2}\s+", "", regex=True)
    .str.strip()
)

In [76]:
df_total["Subject"].unique().tolist()

['Engineering (non-specific)',
 'Mechanical engineering',
 'Production and manufacturing engineering',
 'Aeronautical and aerospace engineering',
 'Naval architecture',
 'Bioengineering, medical and biomedical engineering',
 'Civil engineering',
 'Electrical and electronic engineering',
 'Chemical, process and energy engineering',
 'Others in engineering',
 'Minerals technology',
 'Materials technology',
 'Polymers and textiles',
 'Maritime technology',
 'Biotechnology',
 'Others in technology',
 'Materials science',
 'Computer science',
 'Information technology',
 'Information systems',
 'Software engineering',
 'Artificial intelligence',
 'Computer games and animation',
 'Business computing',
 'Others in computing',
 'Architecture',
 'Building',
 'Landscape design',
 'Planning (urban, rural and regional)']

In [80]:
df_total.sample(20)

Unnamed: 0,CAH level marker,CAH level subject,Entrant marker,Level of study,Mode of study,Academic Year,Permanent address marker,Number,Subject
2574747,CAH level 3,10-03-06 Others in technology,All,All,All,2023/24,Total,6515,Others in technology
62259,CAH level 3,"10-01-06 Bioengineering, medical and biomedica...",All,All,All,2019/20,Total,5840,"Bioengineering, medical and biomedical enginee..."
700369,CAH level 3,11-01-08 Others in computing,All,All,All,2020/21,Total,3485,Others in computing
694990,CAH level 3,10-03-07 Materials science,All,All,All,2020/21,Total,1940,Materials science
685750,CAH level 3,10-01-03 Production and manufacturing engineering,All,All,All,2020/21,Total,13025,Production and manufacturing engineering
2575440,CAH level 3,10-03-07 Materials science,All,All,All,2023/24,Total,1790,Materials science
690491,CAH level 3,10-01-10 Others in engineering,All,All,All,2020/21,Total,1010,Others in engineering
1954567,CAH level 3,11-01-07 Business computing,All,All,All,2022/23,Total,4595,Business computing
2583437,CAH level 3,"13-01-04 Planning (urban, rural and regional)",All,All,All,2023/24,Total,6685,"Planning (urban, rural and regional)"
1955249,CAH level 3,11-01-08 Others in computing,All,All,All,2022/23,Total,4120,Others in computing


### Plots