In [1]:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.lse.ac.uk'
search_url = 'https://www.lse.ac.uk/Programmes/Search-courses?pageIndex='

# Tunables for the crawl, kept in one place instead of being repeated inline.
total_pages = 22                                   # number of search-result pages
programme_prefix = '/study-at-lse/undergraduate/'  # href prefix of programme pages
request_timeout = 30                               # seconds; requests never times out by default

programme_links = []
seen_links = set()  # guards against the same programme being linked more than once

# Walk every search-result page and collect the undergraduate programme URLs.
for page in range(1, total_pages + 1):
    print(f'Scraping page {page}/{total_pages}...')
    url = f"{search_url}{page}"
    response = requests.get(url, timeout=request_timeout)

    # An error page contains no usable results; report it instead of parsing it silently.
    if response.status_code != 200:
        print(f"⚠️ Skipping {url} — status code {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith(programme_prefix):
            continue
        full_url = base_url + href
        # Keep first-seen order while dropping repeats, so each programme is scraped once.
        if full_url not in seen_links:
            seen_links.add(full_url)
            programme_links.append(full_url)

print('\n'+ f'Total programmes found: {len(programme_links)}')

Scraping page 1/22...
Scraping page 2/22...
Scraping page 3/22...
Scraping page 4/22...
Scraping page 5/22...
Scraping page 6/22...
Scraping page 7/22...
Scraping page 8/22...
Scraping page 9/22...
Scraping page 10/22...
Scraping page 11/22...
Scraping page 12/22...
Scraping page 13/22...
Scraping page 14/22...
Scraping page 15/22...
Scraping page 16/22...
Scraping page 17/22...
Scraping page 18/22...
Scraping page 19/22...
Scraping page 20/22...
Scraping page 21/22...
Scraping page 22/22...

Total programmes found: 42


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_programme_data(url):
    """Scrape the headline facts from one programme page.

    Parameters
    ----------
    url : str
        Absolute URL of the programme page to fetch.

    Returns
    -------
    dict or None
        ``None`` when the page cannot be fetched (network error or a
        non-200 status code); a warning is printed in that case.
        Otherwise a dict with the keys ``degree``, ``a_lvl_req``,
        ``a_lvl_extra``, ``modules_y1``, ``modules_y2``, ``modules_y3``,
        ``nr_applications``, ``intake``, ``ratio``, ``home_fee``,
        ``median_salary``, ``satisfaction``, ``explanation`` and ``work``.
        Scalar fields are the stripped page text, or ``None`` when the
        corresponding element is missing from the page; the module fields
        are (possibly empty) lists of the text found in ``div.code``
        elements under ``#year-1`` .. ``#year-3``.
    """
    try:
        # Without a timeout a stalled connection would block the whole scrape.
        res = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"⚠️ Skipping {url} — request failed ({exc})")
        return None
    if res.status_code != 200:
        print(f"⚠️ Skipping {url} — status code {res.status_code}")
        return None

    soup = BeautifulSoup(res.text, 'html.parser')

    def text_of(selector):
        """Return the stripped text of the first match, or None if nothing matches."""
        elem = soup.select_one(selector)
        return elem.get_text(strip=True) if elem is not None else None

    data = {}

    # Degree
    data['degree'] = text_of('h1 > span')

    # A-level requirement: first whitespace-separated token is the grade
    # string, anything after it is the extra condition.
    alevel_parts = (text_of('#alevels > div > p') or '').split(maxsplit=1)
    data['a_lvl_req'] = alevel_parts[0].strip(',') if alevel_parts else None
    data['a_lvl_extra'] = alevel_parts[1] if len(alevel_parts) > 1 else None

    # Modules (looping through years, assuming up to Year 3)
    for year in range(1, 4):
        data[f'modules_y{year}'] = [
            module.get_text(strip=True)
            for module in soup.select(f'#year-{year} div.code')
        ]

    # Applications statistics
    data['nr_applications'] = text_of("#your-application__overview .block--applications .stats")
    data['intake'] = text_of("#your-application__overview .block--places .stats")
    data['ratio'] = text_of("#your-application__overview .block--ratio .stats")

    # Fees: keep only the first "£1,234"-style amount in the paragraph.
    fee_match = re.search(r'£[\d,]+', text_of('#fees-and-funding__home p') or '')
    data['home_fee'] = fee_match.group() if fee_match else None

    # Median salary
    data['median_salary'] = text_of('#graduate-destinations__overview .salary')

    # Survey data
    data['satisfaction'] = text_of("#satisfaction > div > p")
    data['explanation'] = text_of("#explanation > div > p")
    data['work'] = text_of("#work > div > p")

    return data


In [11]:
# Scrape every collected programme page: successful results are gathered in
# `all_data`, and pages that returned nothing are only counted.
all_data = []
skipped_urls = 0
total_programmes = len(programme_links)

for position, url in enumerate(programme_links, start=1):
    print(f"Scraping {position}/{total_programmes}: {url}")
    info = scrape_programme_data(url)
    if info is not None:
        all_data.append(info)
        continue
    # The scraper signals a page it could not process by returning None.
    skipped_urls += 1

print('\nData scraping complete\n')
print(f'{skipped_urls} programmes skipped in data extraction, due to website loading error (500).')

Scraping 1/42: https://www.lse.ac.uk/study-at-lse/undergraduate/ba-anthropology-and-law
Scraping 2/42: https://www.lse.ac.uk/study-at-lse/undergraduate/ba-geography
Scraping 3/42: https://www.lse.ac.uk/study-at-lse/undergraduate/ba-history
⚠️ Skipping https://www.lse.ac.uk/study-at-lse/undergraduate/ba-history — status code 500
Scraping 4/42: https://www.lse.ac.uk/study-at-lse/undergraduate/ba-social-anthropology
Scraping 5/42: https://www.lse.ac.uk/study-at-lse/undergraduate/bsc-accounting-and-finance
Scraping 6/42: https://www.lse.ac.uk/study-at-lse/undergraduate/bsc-actuarial-science
Scraping 7/42: https://www.lse.ac.uk/study-at-lse/undergraduate/bsc-data-science
Scraping 8/42: https://www.lse.ac.uk/study-at-lse/undergraduate/bsc-econometrics-and-mathematical-economics
Scraping 9/42: https://www.lse.ac.uk/study-at-lse/undergraduate/bsc-economic-history
Scraping 10/42: https://www.lse.ac.uk/study-at-lse/undergraduate/bsc-economic-history-and-geography
Scraping 11/42: https://www.lse.

In [23]:
# Discard any None placeholders (pages that yielded no data), then turn the
# remaining per-programme dicts into a DataFrame, one row per programme.
clean_data = list(filter(lambda record: record is not None, all_data))
df = pd.DataFrame(clean_data)
df

Unnamed: 0,degree,a_lvl_req,a_lvl_extra,modules_y1,modules_y2,modules_y3,nr_applications,intake,ratio,home_fee,median_salary,satisfaction,explanation,work
0,BA Anthropology and Law,AAB,,"[LL141, AN100, AN101, LL142, LL108, LL100, LL1...","[AN253, AN379, LL106, LL143, LL200]",[LL276],250.0,20.0,13:1,"£9,535","£34,500",,,
1,BA Geography,AAA,,"[GY100, GY140, GY144, LSE100]","[GY245, GY246, GY212, GY204, GY206, GY207]",[GY350],377.0,38.0,10:1,"£9,535","£35,000",,,
2,BA Social Anthropology,AAB,,"[AN100, AN101, AN102, LSE100]","[AN286, AN253, AN256, AN273, AN285, AN287, AN2...",[AN397],232.0,30.0,8:1,"£9,535","£34,500",,,
3,BSc Accounting and Finance,AAA,with A in Mathematics,"[LSE100, AC105, AC106, ST107, FM101, EC1A3, EC...","[AC205, AC206, FM210, FM211, FM214, FM215, EC2...","[AC331, AC311, FM310, FM311]",2283.0,140.0,16:1,"£9,535","£35,000",,,
4,BSc Actuarial Science,A*AA,with an A* in Mathematics,"[ST102, MA100, EC1A3, EC1B3, LSE100]","[ST206, ST216, MA221, MA222, ST226, ST227]","[ST302, ST301]",615.0,68.0,9:1,"£9,535","£36,500",,,
5,BSc Data Science,A*AA,with an A* in Mathematics,"[ST102, MA100, ST101, ST115, LSE100, AC102, AC...","[ST206, ST211, ST207, MA214, MA222, MA102, MA2...","[ST310, ST311, ST312, ST300, ST301, ST302, ST3...",633.0,32.0,20:1,"£9,535","£36,500",,,
6,BSc Econometrics and Mathematical Economics,A*AA,with an A* in Mathematics,"[EC1P1, MA108, ST109, EC1A1, EC1B1, EC1C1, LSE...","[EC2A1, EC2B1, EC2C1, EH238, FM214, FM215]","[EC319, EC333, EC336, EC337, EC311]",,,,"£9,535","£55,000",,,
7,BSc Economic History,AAA,including Economics or History,"[EH101, EC1A5, EC1B5, EH102, LSE100]",[EH237],[EH390],267.0,29.0,9:1,"£9,535","£35,000",,,
8,BSc Economic History and Geography,AAB,including Economics or History,"[EH101, GY100, GY140, LSE100, EC1A3, EC1A5, EC...","[GY209, GY210, EH237]","[GY313, GY314, EH308, EH390]",175.0,6.0,29:1,"£9,535","£35,000",,,
9,BSc Economics,A*AA,with an A* in Mathematics,"[EC1P1, MA108, ST109, EC1A1, EC1B1, EC1C1, LSE...","[EC2A1, EC2B1, EC2C1, EH238, FM214, FM215]",[],3731.0,225.0,17:1,"£9,535","£55,000",,,


In [27]:
# Saving as CSV file
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first (it may be absent on a fresh checkout).
output_path = Path('data/degrees/programme_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print("Data has been saved to 'programme_data.csv'")

Data has been saved to 'programme_data.csv'
