## Imports

In [1]:
from os import environ as ENV
from time import sleep
import logging
from datetime import datetime
import re
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Extract + Transform

### URLs

In [2]:
# Index page of the judiciary website; it is passed to scrape_judge_urls below
# to discover the URL of each individual list.
BASE_URL = "https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/"

In [3]:
def scrape_judge_urls(web_url: str) -> dict[str, str]:
    """Get the url for each list of judges.

    Fetches ``web_url``, finds every ``<a class="card__link">`` element and
    maps the list name taken from the link text to the link's ``href``.

    Args:
        web_url: Address of the index page to scrape.

    Returns:
        A ``{list name: url}`` dict. The dict is empty when the page could
        not be fetched (the failure is logged).
    """
    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR: INFO records are dropped by the default logging
        # configuration, which would make the failure invisible.
        logging.error(f"Error fetching URL: {error}")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    judge_dict = {}
    for judge_list in soup.find_all("a", class_="card__link"):
        url = judge_list.get("href")
        if not url:
            # An anchor without a target cannot be scraped later on.
            continue

        match = re.search(r"(( \(?[A-Z][a-z]*.*\)?)+)</a>", str(judge_list))
        # Fall back to the plain link text when the pattern does not apply.
        label = match.group(2) if match else judge_list.get_text()

        # removeprefix drops the literal words "List of "; lstrip would strip
        # any leading run of those *characters* (turning "Lord" into "rd").
        name = label.strip().removeprefix("List of ")
        judge_dict[name] = url

    return judge_dict

In [4]:
# Scrape the index page once; later cells look up each list's URL by name in this mapping.
judge_dict = scrape_judge_urls(BASE_URL)

In [5]:
# Show the scraped {list name: url} mapping; its keys are what the scraper cells index with.
judge_dict

{'Bench Chairs': 'https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/bench-chairmen-list/',
 'District Judges (Magistrates’ Courts)': 'https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/dj-mags-ct-list/',
 'Diversity and Community Relations Judges (DCRJs)': 'https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/diversity-and-community-relations-judges-list/',
 'Diversity and Community Relations Magistrates': 'https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/diversity-community-and-relations-magistrates/',
 'Judge Advocates General': 'https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/jag-list/',
 'Circuit Judges': 'https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/list-of-members-of-the-judiciary/circuit-judge-list/',
 'District 

### Helper functions

In [6]:
def convert_date(datestr: str) -> datetime:
    """Turn a date string into a datetime.

    Only the first whitespace-separated token is parsed, so trailing notes
    such as ``"18-09-23 (King’s Bench Master 20-06-11)"`` are ignored.

    Args:
        datestr: Text starting with a date written as ``dd-mm-yyyy``,
            ``dd-mm-yy``, ``dd-Mon-yy`` or ``dd-Mon-yyyy``.

    Returns:
        The parsed date as a datetime, or ``None`` (after logging a warning)
        when the text is empty or matches none of the known layouts.
    """
    tokens = datestr.split()
    if not tokens:
        logging.warning("Empty date string")
        return None

    candidate = tokens[0]
    # Try each layout in turn instead of guessing from the string length:
    # "1-Oct-09" and "01-10-09" are both 8 characters long.
    for layout in ("%d-%m-%Y", "%d-%m-%y", "%d-%b-%y", "%d-%b-%Y"):
        try:
            return datetime.strptime(candidate, layout)
        except ValueError:
            continue

    logging.warning(f"Unrecognised date format: {datestr!r}")
    return None

In [7]:
def _split_on_title(judge: str, title: str) -> tuple[str, str]:
    """Split ``judge`` into (prefix, name) around ``title`` or one of its words."""
    # The full title is tried first, then each of its words in order.
    for marker in (title, *title.split()):
        padded = f" {marker} "
        if padded in judge:
            prefix, _, name = judge.partition(padded)
            return prefix, name
        if judge.startswith(f"{marker} "):
            # Title opens the string, so there is no prefix in front of it.
            return "", judge[len(marker) + 1:]
    # No part of the title is present: treat the whole text as the name.
    return "", judge


def extract_name(judge: str, title: str) -> tuple[str, str]:
    """Strip away any prefix and suffix.

    Args:
        judge: Full styled name, e.g. ``"Mrs Justice Lang DBE"``.
        title: Title separating the prefix from the name, e.g. ``"Justice"``
            or ``"Honour Judge"``. If the whole title is absent, its
            individual words are tried in order.

    Returns:
        A ``(name, gender)`` tuple. ``name`` is the text after the title with
        any parenthesised suffix removed. ``gender`` is ``"F"``, ``"M"`` or
        ``"X"`` (unknown), derived from the last word before the title.
    """
    # Collapse runs of whitespace (including non-breaking spaces from the
    # page) so the " title " search below is not defeated by them.
    judge = " ".join(judge.split())

    prefix, name = _split_on_title(judge, title)

    # Only the word directly before the title carries the gender information.
    prefix_words = prefix.split()
    honorific = prefix_words[-1] if prefix_words else ""

    if honorific in ("Her", "Mrs", "Ms", "Miss"):
        gender = "F"
    elif honorific in ("His", "Mr"):
        gender = "M"
    else:
        gender = "X"

    # Drop a parenthesised suffix and the space that precedes it.
    name = name.split("(", 1)[0].strip()

    return name, gender

### Bench Chairs

In [8]:
def scrape_BC(web_url: str) -> pd.DataFrame:
    """Get the bench chairs listed on a page, one row per bench.

    The page is expected to hold one ``<h2 class="wp-block-heading">`` per
    region and one ``<table class="govuk-table">`` per region, in the same
    order; each table row has a bench cell followed by a name cell.

    Args:
        web_url: Address of the bench chairs page.

    Returns:
        A DataFrame with columns ``region``, ``bench`` and ``name``. It is
        empty (but has those columns) when the page could not be fetched.
    """
    columns = ["region", "bench", "name"]

    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR so the failure is visible with default logging.
        logging.error(f"Error fetching URL: {error}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    regions = [heading.text.strip()
               for heading in soup.find_all("h2", class_="wp-block-heading")]
    region_tables = soup.find_all("table", class_="govuk-table")

    if len(region_tables) > len(regions):
        logging.warning("Found %d tables but only %d region headings",
                        len(region_tables), len(regions))

    rows = []
    for index, table in enumerate(region_tables):
        region = regions[index] if index < len(regions) else None

        # Header cells are marked up with a <strong> tag. Look for the tag
        # itself: a substring test on the HTML would also discard data cells
        # such as the surname "Armstrong" and shift every later row.
        cells = [cell.text.strip()
                 for cell in table.find_all("td", class_="govuk-table__cell")
                 if cell.find("strong") is None]

        # Pair bench/name cells; a trailing unpaired cell is ignored.
        for bench, name in zip(cells[0::2], cells[1::2]):
            rows.append({"region": region, "bench": bench, "name": name})

    return pd.DataFrame(rows, columns=columns)

In [9]:
# Scrape the bench chairs page, whose URL comes from the index mapping built above.
BC = scrape_BC(judge_dict["Bench Chairs"])

In [10]:
# First whitespace-separated token of the full name.
BC["first_name"] = BC["name"].str.split().str[0]

In [11]:
# Everything after the first token. Taking only the second token would drop
# further name parts (e.g. "Anne Smith" -> "Anne") for good, because the
# "name" column is removed in the next cell.
BC["last_name"] = BC["name"].str.split(n=1).str[1]

In [12]:
# The combined column is redundant once it has been split into first/last name.
# Note: this cell is not re-runnable on its own; "name" no longer exists afterwards.
BC = BC.drop(columns=["name"])

In [13]:
# Display the tidied bench chairs frame (region, bench, first_name, last_name).
BC

Unnamed: 0,region,bench,first_name,last_name
0,London,Central London,Maeve,Bromwich
1,London,East London,Tina,Hayhow
2,London,North East London,Robert,Chambers
3,London,North London,Gwilwyn,Wright
4,London,North West London,Valerie,Crookes
...,...,...,...,...
70,Wales,Ceredigion & Pembrokeshire,Jennie,Robson
71,Wales,Montgomeryshire,Stephen,Pembroke
72,Wales,North East Wales,Ceri,Hughes
73,Wales,North Central Wales,Hilary,Owen


### District Judges (Magistrates’ Courts) (DJMC)

In [14]:
def scrape_DJMC(web_url: str) -> pd.DataFrame:
    """Get the judges listed on a three-column page, one row per judge.

    Every ``<td class="govuk-table__cell">`` that is not a header cell is
    read in document order and grouped in threes: judge, circuit,
    appointment.

    Args:
        web_url: Address of the list page.

    Returns:
        A DataFrame with columns ``judge``, ``circuit`` and ``appointment``
        (all text). It is empty (but has those columns) when the page could
        not be fetched.
    """
    columns = ["judge", "circuit", "appointment"]

    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR so the failure is visible with default logging.
        logging.error(f"Error fetching URL: {error}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Header cells are marked up with a <strong> tag. Look for the tag itself:
    # a substring test on the HTML would also discard data cells such as the
    # surname "Armstrong" and shift every later row by one column.
    cells = [cell.text.strip()
             for cell in soup.find_all("td", class_="govuk-table__cell")
             if cell.find("strong") is None]

    width = len(columns)
    leftover = len(cells) % width
    if leftover:
        # An incomplete final row cannot be mapped onto the columns.
        logging.warning("Ignoring %d trailing cell(s) that do not form a full row",
                        leftover)
        cells = cells[:len(cells) - leftover]

    rows = [dict(zip(columns, cells[start:start + width]))
            for start in range(0, len(cells), width)]

    return pd.DataFrame(rows, columns=columns)

In [15]:
# Scrape the District Judges (Magistrates’ Courts) page found via the index mapping.
DJMC = scrape_DJMC(judge_dict["District Judges (Magistrates’ Courts)"])

In [16]:
# Parse with the shared helper, as the HCKB and Circuit Judge cells do: it
# tolerates trailing notes after the date and more than one date layout,
# where a fixed strptime format would raise.
DJMC["appointment"] = DJMC["appointment"].apply(convert_date)

In [17]:
# Check that "appointment" was converted to a datetime column.
DJMC.dtypes

judge                  object
circuit                object
appointment    datetime64[ns]
dtype: object

In [18]:
# Display the frame; "judge" still carries the full "District Judge (MC) ..." title here.
DJMC

Unnamed: 0,judge,circuit,appointment
0,District Judge (MC) Allen-Khimani,Midlands,2019-06-08
1,District Judge (MC) Apted,North West,2023-01-09
2,District Judge (MC) Austin,South West,2023-04-14
3,District Judge (MC) Barnes,Midlands,2023-02-06
4,District Judge (MC) Barron,South East,2011-10-13
...,...,...,...
132,District Judge (MC) Wilkinson,Midlands,2009-07-20
133,District Judge (MC) Williams,South East,2022-01-03
134,District Judge (MC) Wilson,South East,2021-11-01
135,District Judge (MC) Young,North East,2020-03-16


### High Court Masters, Costs Judges and Insolvency and Companies Court Judges

In [19]:
def scrape_HCM(web_url: str) -> pd.DataFrame:
    """Get the judges listed on a two-column page, one row per judge.

    Every ``<td class="govuk-table__cell">`` is read in document order and
    grouped in pairs: judge, appointment. No header filtering is applied.

    Args:
        web_url: Address of the list page.

    Returns:
        A DataFrame with columns ``judge`` and ``appointment`` (both text).
        It is empty (but has those columns) when the page could not be
        fetched.
    """
    columns = ["judge", "appointment"]

    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR so the failure is visible with default logging.
        logging.error(f"Error fetching URL: {error}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    cells = [cell.text.strip()
             for cell in soup.find_all("td", class_="govuk-table__cell")]

    if len(cells) % 2:
        # An unpaired final cell cannot be mapped onto the two columns.
        logging.warning("Ignoring a trailing cell that does not form a full row")

    rows = [{"judge": judge, "appointment": appointment}
            for judge, appointment in zip(cells[0::2], cells[1::2])]

    return pd.DataFrame(rows, columns=columns)

In [20]:
# Scrape the High Court Masters page found via the index mapping.
HCM = scrape_HCM(judge_dict["High Court Masters, Costs Judges and Insolvency and Companies Court Judges"])

In [21]:
# Display the raw frame: titles vary per row and some appointment cells
# carry a parenthesised note after the date.
HCM

Unnamed: 0,judge,appointment
0,Senior Master Cook (The King’s Remembrancer),18-09-23 (King’s Bench Master 20-06-11)
1,King’s Bench Master Armstrong,01-11-23
2,King’s Bench Master Dagnall,23-03-20
3,King’s Bench Master Davison,01-02-16 (Admiralty Registrar 02-05-20)
4,King’s Bench Master Eastman,27-04-09
5,King’s Bench Master Gidden,01-10-12
6,King’s Bench Master McCloud,23-06-10
7,King’s Bench Master Stevens,13-04-21
8,King’s Bench Master Sullivan,02-12-19
9,King’s Bench Master Thornett,31-10-16


In [22]:
# Every whitespace-separated token from the "judge" column, in row order
# (duplicates kept).
HCM_judge_vocab = " ".join(HCM["judge"].tolist()).split()

In [23]:
# Peek at the first tokens to see which words make up the titles.
HCM_judge_vocab[:15]

['Senior',
 'Master',
 'Cook',
 '(The',
 'King’s',
 'Remembrancer)',
 'King’s',
 'Bench',
 'Master',
 'Armstrong',
 'King’s',
 'Bench',
 'Master',
 'Dagnall',
 'King’s']

### High Court Judges (Wikipedia)

In [24]:
def scrape_HCJ_wiki(web_url: str) -> pd.DataFrame:
    """Get the judges listed in the first sortable wikitable of a page.

    The first table row is skipped as the header. For each remaining row the
    second to fifth ``<td>`` cells are read as judge, retirement,
    appointment and division.

    Args:
        web_url: Address of the Wikipedia list page.

    Returns:
        A DataFrame with columns ``judge``, ``retirement``, ``appointment``
        and ``division`` (all text). It is empty (but has those columns)
        when the page could not be fetched or holds no such table.
    """
    columns = ["judge", "retirement", "appointment", "division"]

    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR so the failure is visible with default logging.
        logging.error(f"Error fetching URL: {error}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find("table", class_="wikitable sortable")
    if table is None:
        logging.error(f"No sortable wikitable found at {web_url}")
        return pd.DataFrame(columns=columns)

    dict_rows = []
    for tr in table.find_all("tr")[1:]:
        tds = tr.find_all("td")
        if len(tds) < 5:
            # Rows without the full set of cells cannot be mapped reliably.
            continue
        dict_rows.append({"judge": tds[1].text.strip(),
                          "retirement": tds[2].text.strip(),
                          "appointment": tds[3].text.strip(),
                          "division": tds[4].text.strip()})

    return pd.DataFrame(dict_rows, columns=columns)

In [25]:
# Scrape the Wikipedia list for comparison. The result is only displayed here,
# not assigned to a variable, so nothing later in the notebook can reuse it.
scrape_HCJ_wiki(
    "https://en.wikipedia.org/wiki/List_of_High_Court_judges_of_England_and_Wales")

Unnamed: 0,judge,retirement,appointment,division
0,Sir Peter Roth,19 December 2027,1 October 2009,Chancery
1,Dame Lucy Theis,6 November 2035,15 November 2010,Family
2,Sir Robert Hildyard,10 October 2027,3 October 2011,Chancery
3,Dame Beverley Lang,13 October 2030,3 October 2011,King's Bench
4,Sir Philip Moor,15 July 2034,3 October 2011,Family
...,...,...,...,...
103,Sir Eason Rajah,18 January 2042,18 April 2023,Chancery
104,Dame Ruth Henke,,29 September 2023,Family
105,Sir Nicholas Cusworth,20 March 2039,11 January 2024,Family
106,Sir Clive Sheldon,,1 February 2024,King's Bench


### High Court King’s Bench Division

In [26]:
def scrape_HCKB(web_url: str) -> pd.DataFrame:
    """Get the judges listed on a two-column page, one row per judge.

    Every ``<td class="govuk-table__cell">`` that is not a header cell is
    read in document order and grouped in pairs: judge, appointment.

    Args:
        web_url: Address of the list page.

    Returns:
        A DataFrame with columns ``judge`` and ``appointment`` (both text).
        It is empty (but has those columns) when the page could not be
        fetched.
    """
    columns = ["judge", "appointment"]

    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR so the failure is visible with default logging.
        logging.error(f"Error fetching URL: {error}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Header cells contain a <strong> tag; checking for the tag (rather than
    # the literal text "<strong>") also covers tags that carry attributes.
    cells = [cell.text.strip()
             for cell in soup.find_all("td", class_="govuk-table__cell")
             if cell.find("strong") is None]

    if len(cells) % 2:
        # An unpaired final cell cannot be mapped onto the two columns.
        logging.warning("Ignoring a trailing cell that does not form a full row")

    rows = [{"judge": judge, "appointment": appointment}
            for judge, appointment in zip(cells[0::2], cells[1::2])]

    return pd.DataFrame(rows, columns=columns)

In [27]:
# This URL is given directly rather than looked up in judge_dict
# (it sits under "senior-judiciary-list", not the index page scraped above).
HCKB = scrape_HCKB("https://www.judiciary.uk/about-the-judiciary/who-are-the-judiciary/senior-judiciary-list/kings-bench-division-judges/")

In [28]:
# Parse the appointment text into datetimes with the shared helper.
HCKB["appointment"] = HCKB["appointment"].apply(convert_date)

In [29]:
# extract_name returns a (name, gender) pair, so "judge" temporarily holds tuples.
HCKB["judge"] = HCKB["judge"].apply(extract_name, args=("Justice",))

In [30]:
# First element of each tuple: the name with the title prefix removed.
HCKB["name"] = HCKB["judge"].str[0]

In [31]:
# Second element of each tuple: the gender code ("F", "M" or "X").
HCKB["gender"] = HCKB["judge"].str[1]

In [32]:
# Label every row with the list it came from
# (presumably so frames from different lists can be combined later; confirm in the Load step).
HCKB["type"] = "High Court King’s Bench Division"

In [33]:
# This page has no circuit column; an empty string fills the slot so the
# column selection below matches the one used for Circuit Judges.
HCKB["circuit"] = ""

In [34]:
# Keep only the final columns, in the same order as the Circuit Judges frame;
# this drops the intermediate tuple-valued "judge" column.
HCKB = HCKB[["name", "gender", "appointment", "type", "circuit"]]

In [35]:
# Display the tidied frame; post-nominals such as "DBE" remain part of "name".
HCKB

Unnamed: 0,name,gender,appointment,type,circuit
0,Lang DBE,F,2011-10-03,High Court King’s Bench Division,
1,Turner,M,2013-01-28,High Court King’s Bench Division,
2,Jeremy Baker,M,2013-03-25,High Court King’s Bench Division,
3,Jay,M,2013-06-04,High Court King’s Bench Division,
4,Goss,M,2014-10-01,High Court King’s Bench Division,
...,...,...,...,...,...
67,Bright,M,2023-01-11,High Court King’s Bench Division,
68,Constable,M,2023-03-13,High Court King’s Bench Division,
69,Dias DBE,F,2023-03-20,High Court King’s Bench Division,
70,Sheldon,M,2024-02-01,High Court King’s Bench Division,


### Circuit Judges

In [36]:
def scrape_CJ(web_url: str) -> pd.DataFrame:
    """Get the judges listed on a three-column page, one row per judge.

    Every ``<td class="govuk-table__cell">`` that is not a header cell is
    read in document order and grouped in threes: judge, circuit,
    appointment.

    Args:
        web_url: Address of the list page.

    Returns:
        A DataFrame with columns ``judge``, ``circuit`` and ``appointment``
        (all text). It is empty (but has those columns) when the page could
        not be fetched.
    """
    columns = ["judge", "circuit", "appointment"]

    try:
        response = requests.get(web_url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as error:
        # Logged at ERROR so the failure is visible with default logging.
        logging.error(f"Error fetching URL: {error}")
        return pd.DataFrame(columns=columns)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Header cells are marked up with a <strong> tag. Look for the tag itself:
    # a substring test on the HTML would also discard data cells such as the
    # surname "Armstrong" and shift every later row by one column.
    cells = [cell.text.strip()
             for cell in soup.find_all("td", class_="govuk-table__cell")
             if cell.find("strong") is None]

    width = len(columns)
    leftover = len(cells) % width
    if leftover:
        # An incomplete final row cannot be mapped onto the columns.
        logging.warning("Ignoring %d trailing cell(s) that do not form a full row",
                        leftover)
        cells = cells[:len(cells) - leftover]

    rows = [dict(zip(columns, cells[start:start + width]))
            for start in range(0, len(cells), width)]

    return pd.DataFrame(rows, columns=columns)

In [37]:
# Scrape the Circuit Judges page found via the index mapping.
CJ = scrape_CJ(judge_dict["Circuit Judges"])

In [38]:
# Parse the appointment text into datetimes with the shared helper.
CJ["appointment"] = CJ["appointment"].apply(convert_date)

In [39]:
# Split on "Honour Judge" ("His/Her Honour Judge ..."); extract_name returns a
# (name, gender) pair, so "judge" temporarily holds tuples.
CJ["judge"] = CJ["judge"].apply(extract_name, args=("Honour Judge",))

In [40]:
# First element of each tuple: the name with the title prefix removed.
CJ["name"] = CJ["judge"].str[0]

In [41]:
# Second element of each tuple: the gender code ("F", "M" or "X").
CJ["gender"] = CJ["judge"].str[1]

In [42]:
# Label every row with the list it came from
# (presumably so frames from different lists can be combined later; confirm in the Load step).
CJ["type"] = "Circuit Judge"

In [43]:
# Keep only the final columns, in the same order as the HCKB frame;
# this drops the intermediate tuple-valued "judge" column.
CJ = CJ[["name", "gender", "appointment", "type", "circuit"]]

In [44]:
# Display the tidied frame; post-nominals such as "KC" or "CBE" remain part of "name".
CJ

Unnamed: 0,name,gender,appointment,type,circuit
0,Aaronberg KC,M,2018-02-09,Circuit Judge,London
1,Adams,M,2014-01-20,Circuit Judge,North East
2,Nathan Adams,M,2023-11-20,Circuit Judge,North East
3,Adkin,M,2017-10-02,Circuit Judge,North East
4,Afzal CBE,M,2024-04-15,Circuit Judge,North East
...,...,...,...,...,...
737,Woolfall,M,2023-09-18,Circuit Judge,North East
738,Worster,M,2017-04-03,Circuit Judge,Midlands
739,Alistair Charles Wright,M,2022-08-22,Circuit Judge,South Easter
740,Caroline Jane Wright,F,2009-10-20,Circuit Judge,London


In [45]:
# List the distinct circuit labels. The output below shows misspelt and
# inconsistent values from the source page ('Sout East', 'Norhtern',
# 'South Easter', 'Northern' vs 'North East'/'North West').
# TODO: map these to a canonical set before the Load step.
CJ["circuit"].unique()

array(['London', 'North East', 'South East', 'North West', 'South West',
       'Midlands', 'Other Tribunal', 'Wales', 'Northern', 'Sout East',
       'Western', 'North Eastern', 'West', 'North', 'Norhtern',
       'South Easter'], dtype=object)

## Load