In [37]:
# set up web scraping
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

# Base address of the course detail pages.
url = 'https://phdcourses.dk/Course/'

# HTTP headers sent with every request; the User-Agent mimics a desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def get_page(url, timeout=10):
    """Download ``url`` and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or ``None`` when the request fails, the server
        answers with a status other than 200, or the body cannot be parsed.
    """
    try:
        # Without a timeout a stalled connection would block the scrape forever.
        page = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as HTTP errors.
        return None

    if page.status_code != 200:
        return None

    try:
        return BeautifulSoup(page.content, 'html.parser')
    except Exception:
        # Parsing is best-effort: callers treat None as "page unavailable".
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return None



def get_data_from_id(num : int):
    """Scrape the detail page of a single course.

    Parameters
    ----------
    num : int
        Course identifier. It is only interpolated into the page URL, so the
        string ids produced by ``get_ids_from_searchpagenum`` work as well.

    Returns
    -------
    tuple of str or None
        ``(title, phd_school, course_dates, ects, course_link)`` with the
        characters that would break the CSV output removed, or ``None`` when
        the page could not be fetched. Fields missing from the page are
        reported as ``"Not found"`` (``"No link found"`` for the link).
    """
    soup = get_page(f'https://phdcourses.dk/Course/{num}')
    if soup is None:
        return None

    def paragraph_after_header(header_text):
        # First <p> following the first <h3> whose text contains header_text
        # (case-insensitive); None when either element is missing.
        wanted = header_text.lower()
        header = soup.find("h3", string=lambda text: text and wanted in text.lower())
        return header.find_next("p") if header is not None else None

    def get_text_after_header(header_text):
        paragraph = paragraph_after_header(header_text)
        return paragraph.text.strip() if paragraph is not None else "Not found"

    def get_link_after_header(header_text):
        paragraph = paragraph_after_header(header_text)
        link_tag = paragraph.find("a") if paragraph is not None else None
        # .get() instead of ["href"]: an <a> without href must not raise KeyError.
        href = link_tag.get("href") if link_tag is not None else None
        return href if href is not None else "No link found"

    def get_text_from_tag(tag, class_name):
        element = soup.find(tag, class_=class_name)
        return element.text.strip() if element is not None else "Not found"

    # Quotes, semicolons and line breaks are dropped and commas become dots,
    # because the result is later written as a comma-separated file.
    cleanup = str.maketrans({
        '"': None, "'": None, ";": None,
        "\n": None, "\t": None, "\r": None,
        ",": ".",
    })

    def make_legal(text):
        return text.translate(cleanup)

    title = make_legal(get_text_from_tag("h2", "page__title"))
    phd_school = make_legal(get_text_from_tag("h3", "page__subtitle"))
    course_dates = make_legal(get_text_after_header("Course dates"))
    # Strip again: removing the word "points" leaves a dangling space behind.
    ects = make_legal(get_text_after_header("ECTS")).replace("points", "").strip()
    course_link = make_legal(get_link_after_header("Link"))

    return title, phd_school, course_dates, ects, course_link


def get_ids_from_searchpagenum(pagenum : int):
    """Collect the course ids listed on one page of the search results.

    Parameters
    ----------
    pagenum : int
        Number of the search-result page, inserted into the ``page`` query
        parameter.

    Returns
    -------
    list of str
        The last path segment of every ``<a class="subtitle">`` link on the
        page. Empty when the page could not be fetched.
    """
    soup = get_page(f"https://phdcourses.dk/?page={pagenum}&currentSearchWord=&currentEcts=&")

    if soup is None:
        # get_page signals failure with None; skip the page instead of
        # crashing with AttributeError on soup.find_all.
        print(f"Error with search page {pagenum}")
        return []

    # href=True restricts the match to anchors that actually carry a link,
    # so the subscript below cannot raise KeyError.
    return [
        link["href"].split("/")[-1]
        for link in soup.find_all("a", class_="subtitle", href=True)
    ]

In [None]:
sleeptime = 0.25  # pause between two requests, in seconds
N_pages = 90      # number of search-result pages to walk through

print("This script will scrape data from phdcourses.dk")
# Only the pauses are counted; the factor 10 presumably reflects about ten
# courses per search page -- TODO confirm against the site.
print(f"Estimated time: {int((sleeptime*N_pages + sleeptime*N_pages*10)/60)} minutes")

print("Finding course IDs...\n\n")

# Step 1: gather the ids of all courses listed on the search pages.
ids = []
for pn in range(1, N_pages+1):
    print(f"Page {pn}", end = "\r")
    ids.extend(get_ids_from_searchpagenum(pn))
    time.sleep(sleeptime)

print("\n\nGetting course data...\n")
print("")

# Step 2: download the detail page of every course.
data = []
for i, num in enumerate(ids):
    # The trailing spaces overwrite leftovers of a longer previous status
    # line, since "\r" only moves the cursor back to the line start.
    # (This literal was previously broken across several lines, which is a
    # SyntaxError.)
    print(f"Course {num}  - {int(i/len(ids)*100.):d}/100          ", end = "\r")
    data_from_id = get_data_from_id(num)
    if data_from_id is not None:
        data.append(data_from_id)
    else:
        print(f"Error with course {num}")
    time.sleep(sleeptime)
print("")
print("Done!")


# Step 3: store one course per row, fields separated by commas.
data = np.array(data)

np.savetxt("phd_courses.csv", data, delimiter=",", fmt="%s")
print("Data saved to phd_courses.csv")

This script will scrape data from phdcourses.dk
Estimated time: 4 minutes
Finding course IDs...


Page 90

Getting course data...


Course 127737  - 99
Done!
Data saved to phd_courses.csv


In [None]:

# Reload the scraped table as strings. comments=None is required because
# np.loadtxt otherwise treats '#' as the start of a comment and would cut off
# any row whose title or link contains one, breaking the column count.
data = np.loadtxt("phd_courses.csv", delimiter=",", dtype=str, comments=None)

Data saved to phd_courses.csv
