In [26]:
import importlib
import sys

# Reload the local Module *before* binding CourseModule. `from Module import
# CourseModule` copies a reference to the class object that exists at import
# time; reloading afterwards would leave that name pointing at the stale class,
# so edits to Module.py would not be picked up by this notebook.
import Module
importlib.reload(Module)
from Module import CourseModule

from scrape_helper import get_module_links, get_courses
from helpers import courses_dict
import sqlite3
import os

In [5]:
# Fetch the URLs of all courses listed on the overview page
courses = get_courses("https://hpi.de/studium/im-studium/lehrveranstaltungen.html")

# course URL -> trimmed module links (last path segment only)
course_modules = {}

# trimmed module link -> full module URL
module_links_dict = {}

for course_url in courses:
    full_links = get_module_links(course_url)

    # Nothing to record for a course without modules
    if full_links == []:
        print("No modules found for course:", course_url)
        continue

    # Keep only the part after the last "/" of every module URL
    trimmed_links = [full_link.rsplit("/", 1)[-1] for full_link in full_links]

    # Remember the full URL for every trimmed link; a link seen again under
    # another course simply overwrites the earlier entry
    module_links_dict.update(zip(trimmed_links, full_links))

    course_modules[course_url] = trimmed_links


Div with class 'tx-ciuniversity' not found for url: https://hpi.de//studium/im-studium/lehrveranstaltungen/professional-skills.html
No modules found for course: https://hpi.de//studium/im-studium/lehrveranstaltungen/professional-skills.html
Div with class 'tx-ciuniversity' not found for url: https://hpi.de/entrepreneurship/home.html
No modules found for course: https://hpi.de/entrepreneurship/home.html


In [21]:
# trimmed module link -> populated CourseModule object
modules = {}
total_modules = len(module_links_dict)

for position, url_trimmed in enumerate(module_links_dict):
    # Report progress on every tenth module
    if position % 10 == 0:
        print(f"Fetching data for module {position+1}/{total_modules}")

    # Build the module from its full URL and let it load its own data
    module = CourseModule(module_links_dict[url_trimmed])
    module.get_landing_page_information()
    module.get_evaluation_metrics()

    modules[url_trimmed] = module

Fetching data for module 1/115
Fetching data for module 11/115
Fetching data for module 21/115
Fetching data for module 31/115
Fetching data for module 41/115
Fetching data for module 51/115
Fetching data for module 61/115
Fetching data for module 71/115
Fetching data for module 81/115
Fetching data for module 91/115
Fetching data for module 101/115
Fetching data for module 111/115


In [22]:
# Spot check: show all attributes of the third fetched module
vars(list(modules.values())[2])

{'_url': 'https://hpi.de/studium/im-studium/lehrveranstaltungen/it-systems-engineering-ma/lehrveranstaltung/wise-23-24-3925-advanced-track-design-thinking-d_school.html',
 '_url_trimmed': 'wise-23-24-3925-advanced-track-design-thinking-d_school.html',
 '_title': 'Advanced Track Design Thinking (D-School)',
 '_description': None,
 '_dates': 'This hybrid course will be held weekly on two program days (Monday and Thursday) from 9:00 am to 3:00 pm (CET).\nThe dates for the winter semester 22/23\n- First day of the program: 17.10.2022 (as pre-launch)\n- Last day of the program: 16.02.2023\n- Deadline for submission of the final version of the project documentation: 06.03.2023\n\nThe\xa0total time investment\xa0in this course is\xa016-20 hours/week. On program days, we require active participation for a total of 10 working hours (either onsite or virtual). In addition, we expect all participants to invest an additional 6-10 working hours per week in teamwork and individual contributions to t

In [23]:
# Re-create one module on its own and load only its landing-page information
single_module_url = "https://hpi.de/studium/im-studium/lehrveranstaltungen/it-systems-engineering-ma/lehrveranstaltung/wise-23-24-3925-advanced-track-design-thinking-d_school.html"
module = CourseModule(single_module_url)
module.get_landing_page_information()

In [24]:
# Show every attribute of the module created in the previous cell
vars(module)

{'_url': 'https://hpi.de/studium/im-studium/lehrveranstaltungen/it-systems-engineering-ma/lehrveranstaltung/wise-23-24-3925-advanced-track-design-thinking-d_school.html',
 '_url_trimmed': 'wise-23-24-3925-advanced-track-design-thinking-d_school.html',
 '_title': 'Advanced Track Design Thinking (D-School)',
 '_description': None,
 '_dates': 'This hybrid course will be held weekly on two program days (Monday and Thursday) from 9:00 am to 3:00 pm (CET).\nThe dates for the winter semester 22/23\n- First day of the program: 17.10.2022 (as pre-launch)\n- Last day of the program: 16.02.2023\n- Deadline for submission of the final version of the project documentation: 06.03.2023\n\nThe\xa0total time investment\xa0in this course is\xa016-20 hours/week. On program days, we require active participation for a total of 10 working hours (either onsite or virtual). In addition, we expect all participants to invest an additional 6-10 working hours per week in teamwork and individual contributions to t

In [29]:
# Rebuild the database from scratch: remove the file left over from an earlier
# run (if there is one) and open a fresh connection
db_path = "hpi_modules.db"
try:
    os.remove(db_path)
except FileNotFoundError:
    # first run - there is nothing to delete
    pass

con = sqlite3.connect(db_path)

# Run the setup script (expected to create the courses, modules and
# course_modules tables that the following cells insert into)
with open('sqlite_db_setup.sql', 'r') as setup_script:
    con.executescript(setup_script.read())
con.commit()

In [30]:
cur = con.cursor()

# One row per course: the dictionary value goes into the first column and the
# dictionary key into the second
course_rows = [(value, key) for key, value in courses_dict.items()]

try:
    cur.executemany("INSERT INTO courses VALUES (?, ?)", course_rows)
    con.commit()
except sqlite3.IntegrityError:
    print("Courses already exist in the database")

# Show the resulting table content
for course_row in cur.execute("SELECT * FROM courses"):
    print(course_row)

('itse_bachelor', 'IT-Systems Engineering BA')
('itse_master', 'IT-Systems Engineering MA')
('dh_master', 'Digital Health MA')
('de_master', 'Data Engineering MA')
('cyber_master', 'Cybersecurity MA')
('sse_master', 'Software Systems Engineering MA')


In [31]:
# Attributes written to the modules table; the order has to match the order of
# the nine placeholders in the INSERT statement below
module_attributes = (
    "url_trimmed",
    "url",
    "website",
    "title",
    "credits",
    "evap_grade",
    "evap_semester",
    "description",
    "lecturers",
)

for module in modules.values():
    try:
        module_row = tuple(getattr(module, attribute) for attribute in module_attributes)
        cur.execute("INSERT INTO modules VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", module_row)
        # commit after every row so that earlier inserts survive a later failure
        con.commit()
    except sqlite3.IntegrityError:
        print("Module already exists in database: ", module.url_trimmed)

In [32]:
# Link every module to the course / module group / submodule group it belongs to
for module in modules.values():
    for module_group in module.module_groups:
        course_abbreviation = module_group[0]
        group_name = module_group[1]
        submodule_group = module_group[2]
        link_row = (course_abbreviation, module.url_trimmed, group_name, submodule_group)
        try:
            cur.execute("INSERT INTO course_modules VALUES (?, ?, ?, ?)", link_row)
            con.commit()
        except sqlite3.IntegrityError:
            # report the rejected row and carry on with the next one
            print(f"IntegrityError: {course_abbreviation}, {module.url_trimmed}, {group_name}, {submodule_group}")

IntegrityError: itse_bachelor, wise-23-24-3846-3d_computergrafik-i.html, HCGT, 


In [35]:
# Example query: all modules of one course whose evaluation grade is below a
# threshold, joined with their course / module-group information.
course_abbreviation = "dh_master"
max_evap_grade = 1.2

# The course is matched with `=` rather than LIKE: inside a LIKE pattern the
# underscore is a single-character wildcard (and the match is case-insensitive),
# so LIKE 'dh_master' would not be an exact comparison. The values are bound as
# parameters instead of being written into the SQL text.
modules_query = """
            SELECT *
            FROM modules
            JOIN course_modules ON modules.url_trimmed = course_modules.module_url_trimmed
            JOIN courses ON courses.course_abbreviation = course_modules.course_abbreviation
            WHERE courses.course_abbreviation = ?
            AND modules.evap_grade < ?;
            """
for row in cur.execute(modules_query, (course_abbreviation, max_evap_grade)):
    print(row)

('wise-23-24-3858-feature-selection-and-classification-for-hierarchically-structured-feature-spaces.html', 'https://hpi.de/studium/im-studium/lehrveranstaltungen/cybersecurity-ma/lehrveranstaltung/wise-23-24-3858-feature-selection-and-classification-for-hierarchically-structured-feature-spaces.html', None, 'Feature Selection and Classification for hierarchically structured feature spaces', 6, 1.1, 'WiSe 2025/26', '------------------------------------------------------------------------------------------------------------------------------------\nIf you want to join this seminar, please enroll to the moodle course:\nhttps://moodle.hpi.de/course/view.php?id=688\n------------------------------------------------------------------------------------------------------------------------------------\nPowerful predictive models based on high-dimensional data are a cornerstone of modern machine learning applications. Yet, in real-world scenarios, often a vast majority of the available features ar

In [None]:
# Release the database resources. The cursor and connection are named `cur` and
# `con` throughout this notebook; the previous names (`cursor`, `connection`)
# were never defined and raised a NameError, leaving the connection open.
cur.close()
con.close()