In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

base_url = 'https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides'
guide_url = f'{base_url}/undergraduate.htm'

In [3]:
# Getting all links to course guides
response = requests.get(guide_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Finding all tables (each course is stored in departments table)
tables = soup.find_all('table')

course_links = []

for table in tables:
    for a_tag in table.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('../courseGuides/'):
            full_url = base_url + href.split('../courseGuides')[1]
            course_links.append(full_url)

print(f'Found {len(course_links)} course guide links.')

Found 568 course guide links.


In [18]:
# Function to extract course data
def extract_course_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    title = soup.find('title').get_text().split(maxsplit=1)
    code = title[0]
    course = title[1]
    
    data = {'code': code, 'course': course}
    
    key_facts_section = soup.find('div', id='keyFacts-Content')
    items = key_facts_section.find_all('p')
        
    for item in items:
        text = item.get_text(strip=True)
    
        if text.startswith('Department'):
            data['department'] = text.split(':')[1].strip()
        elif text.startswith('Total students'):
            data['total_students'] = text.split(':')[1].strip()
        elif text.startswith('Average class size'):
            data['avg_class_size'] = text.split(':')[1].strip()
        elif text.startswith('Capped'):
            data['capped'] = text.split(':')[1].strip()
        elif text.startswith("Value:"):
            data['units'] = text.split('Value:')[1].strip()

    return data

In [5]:
# Running the scraper
all_course_data = []

for i, url in enumerate(course_links):
    print(f"Scraping {i+1}/{len(course_links)}: {url}")
    course_data = extract_course_data(url)
    all_course_data.append(course_data)

Scraping 1/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC102.htm
Scraping 2/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC103.htm
Scraping 3/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC105.htm
Scraping 4/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC106.htm
Scraping 5/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC205.htm
Scraping 6/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC206.htm
Scraping 7/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC311.htm
Scraping 8/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC312.htm
Scraping 9/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC331.htm
Scraping 10/568: https://www.lse.ac.uk/resources/calendar2024-2025/courseGuides/AC/2024_AC332.htm
Scraping 11/568: https://www.

In [6]:
# Converting to DataFrame
modules_facts = pd.DataFrame(all_course_data)
modules_facts

Unnamed: 0,code,course,department,total_students,avg_class_size,capped,units
0,AC102,Elements of Financial Accounting,Accounting,564,15,No,Half Unit
1,AC103,"Elements of Management Accounting, Financial M...",Accounting,256,18,No,Half Unit
2,AC105,Introduction to Financial Accounting,Accounting,115,39,No,Half Unit
3,AC106,Introduction to Management Accounting,Accounting,115,39,No,Half Unit
4,AC205,Intermediate Financial Accounting,Accounting,Unavailable,Unavailable,No,Half Unit
...,...,...,...,...,...,...,...
563,ST314,Multilevel and Longitudinal Models,Statistics,23,21,Yes (30),Half Unit
564,ST326,Financial Statistics,Statistics,65,33,No,Half Unit
565,ST327,Market Research: An Integrated Approach,Statistics,58,15,Yes (60),One Unit
566,ST330,Stochastic and Actuarial Methods in Finance,Statistics,61,31,No,One Unit


In [7]:
# Cleaning Data
modules_facts['units'] = modules_facts['units'].map({'One Unit': 1.0, 'Half Unit': 0.5, 'Non-credit bearing': 0.0})
modules_facts['total_students'] = modules_facts['total_students'].replace('Unavailable', np.nan).astype(float)
modules_facts['avg_class_size'] = modules_facts['avg_class_size'].replace('Unavailable', np.nan).astype(float)
modules_facts.loc[modules_facts['capped'].str.startswith('No'), 'capped'] = False
modules_facts.loc[modules_facts['capped'] != False, 'capped'] = modules_facts.loc[modules_facts['capped'] != False, 'capped'].str.split(' ').str[1].str.strip('()').astype(int)
modules_facts

Unnamed: 0,code,course,department,total_students,avg_class_size,capped,units
0,AC102,Elements of Financial Accounting,Accounting,564.0,15.0,False,0.5
1,AC103,"Elements of Management Accounting, Financial M...",Accounting,256.0,18.0,False,0.5
2,AC105,Introduction to Financial Accounting,Accounting,115.0,39.0,False,0.5
3,AC106,Introduction to Management Accounting,Accounting,115.0,39.0,False,0.5
4,AC205,Intermediate Financial Accounting,Accounting,,,False,0.5
...,...,...,...,...,...,...,...
563,ST314,Multilevel and Longitudinal Models,Statistics,23.0,21.0,30,0.5
564,ST326,Financial Statistics,Statistics,65.0,33.0,False,0.5
565,ST327,Market Research: An Integrated Approach,Statistics,58.0,15.0,60,1.0
566,ST330,Stochastic and Actuarial Methods in Finance,Statistics,61.0,31.0,False,1.0


In [8]:
# Counting rows containing NaN values
modules_facts[modules_facts['avg_class_size'].isna()].count()

code              107
course            107
department        107
total_students     10
avg_class_size      0
capped            107
units             107
dtype: int64

In [9]:
# Saving to CSV
modules_facts.to_csv('data/modules/modules_key_facts.csv', index=False)

In [84]:
# Force the single column to be called "code"
outside_options = pd.read_csv('data/degrees/ug_outside_options.csv', names=["code"], header=None)

# Now you can safely set index
outside_options.set_index("code", inplace=True)

# Make sure modules_facts has 'code' column
if "code" not in modules_facts.columns:
    modules_facts.reset_index(inplace=True)

modules_facts.set_index("code", inplace=True)

# Add all columns from modules_facts to outside_options using index alignment
for col in modules_facts.columns:
    outside_options[col] = modules_facts[col].reindex(outside_options.index)

outside_options['units'] = outside_options['units'].replace({'Half Unit': 0.5, 'One Unit': 1.0})
outside_options.dropna(how='all', inplace=True)

outside_options

Unnamed: 0_level_0,course,prerequisites,department,total_students,avg_class_size,capped,units
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AC102,Elements of Financial Accounting,[],Accounting,564.0,15.0,False,0.5
AC103,"Elements of Management Accounting, Financial M...",[],Accounting,256.0,18.0,False,0.5
AC205,Intermediate Financial Accounting,"[AC102, AC105]",Accounting,,,False,0.5
AC206,Intermediate Management Accounting,"[AC103, AC106]",Accounting,,,False,0.5
AC311,Results Accountability and Management Control ...,"[AC103, AC200, AC312, AC100]",Accounting,118.0,39.0,False,0.5
...,...,...,...,...,...,...,...
ST313,Ethics for Data Science,"[ST102, EC220, MA100, MA212, EC221, ST202, ST206]",Statistics,37.0,20.0,False,0.5
ST314,Multilevel and Longitudinal Models,"[ST201, ST102, ST109, ST211, ST107]",Statistics,23.0,21.0,30,0.5
ST326,Financial Statistics,"[ST202, ST206, ST211]",Statistics,65.0,33.0,False,0.5
ST327,Market Research: An Integrated Approach,"[ST102, ST203, MG205, MG202, ST109, ST107]",Statistics,58.0,15.0,60,1.0


In [90]:
mutually_exclusive_options = pd.read_csv('data/modules/mutual_exclusive.csv')
mutually_exclusive_outside_options = mutually_exclusive_options[mutually_exclusive_options['Course'].isin(outside_options.index)]

outside_options['mutually_exclusive_courses'] = outside_options.index.to_series().apply(
    lambda x: mutually_exclusive_outside_options.loc[mutually_exclusive_outside_options['Course'] == x, 'Mutually Exclusive Courses'].tolist() if x in mutually_exclusive_outside_options['Course'].values else []

)
outside_options.dropna(axis=1, how='all', inplace=True)
cols = outside_options.columns.tolist()

# Find the index of the 'mutually_exclusive_courses' and 'department' columns
mutually_exclusive_index = cols.index('mutually_exclusive_courses')
department_index = cols.index('department')

# Swap the columns
cols[mutually_exclusive_index], cols[department_index] = cols[department_index], cols[mutually_exclusive_index]

# Reorder the DataFrame columns based on the modified column list
outside_options = outside_options[cols]
outside_options

outside_options.to_csv('data/modules/outside_options.csv', index=True)
outside_options

Unnamed: 0_level_0,course,prerequisites,department,total_students,avg_class_size,capped,units,mutually_exclusive_courses
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AC102,Elements of Financial Accounting,[],Accounting,564.0,15.0,False,0.5,[]
AC103,"Elements of Management Accounting, Financial M...",[],Accounting,256.0,18.0,False,0.5,[]
AC205,Intermediate Financial Accounting,"[AC102, AC105]",Accounting,,,False,0.5,[]
AC206,Intermediate Management Accounting,"[AC103, AC106]",Accounting,,,False,0.5,[]
AC311,Results Accountability and Management Control ...,"[AC103, AC200, AC312, AC100]",Accounting,118.0,39.0,False,0.5,[]
...,...,...,...,...,...,...,...,...
ST313,Ethics for Data Science,"[ST102, EC220, MA100, MA212, EC221, ST202, ST206]",Statistics,37.0,20.0,False,0.5,[]
ST314,Multilevel and Longitudinal Models,"[ST201, ST102, ST109, ST211, ST107]",Statistics,23.0,21.0,30,0.5,[]
ST326,Financial Statistics,"[ST202, ST206, ST211]",Statistics,65.0,33.0,False,0.5,[]
ST327,Market Research: An Integrated Approach,"[ST102, ST203, MG205, MG202, ST109, ST107]",Statistics,58.0,15.0,60,1.0,[ST307]
