In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [2]:
# Saved programme-overview HTML, one (year 1, year 2) pair per programme.
# The paths are relative and use Windows-style backslash separators.
PHYSICS_YEAR1_FILE, PHYSICS_YEAR2_FILE = (
    r'.\HTML_content\year1_phys.txt',
    r'.\HTML_content\year2_phys.txt',
)
CAS_YEAR1_FILE, CAS_YEAR2_FILE = (
    r'.\HTML_content\year1_cas.txt',
    r'.\HTML_content\year2_cas.txt',
)
EM_YEAR1_FILE, EM_YEAR2_FILE = (
    r'.\HTML_content\year1_em.txt',
    r'.\HTML_content\year2_em.txt',
)
DATA_YEAR1_FILE, DATA_YEAR2_FILE = (
    r'.\HTML_content\year1_data.txt',
    r'.\HTML_content\year2_data.txt',
)
MOB_YEAR1_FILE, MOB_YEAR2_FILE = (
    r'.\HTML_content\year1_mobility.txt',
    r'.\HTML_content\year2_mobility.txt',
)
APP_YEAR1_FILE, APP_YEAR2_FILE = (
    r'.\HTML_content\year1_applied.txt',
    r'.\HTML_content\year2_applied.txt',
)
ALS_YEAR1_FILE, ALS_YEAR2_FILE = (
    r'.\HTML_content\year1_als.txt',
    r'.\HTML_content\year2_als.txt',
)

In [3]:
class scrape():
    """Course catalogue for one programme, built from saved HTML or a JSON file.

    The catalogue lives in ``self.courses_information``, a dict keyed by the
    link text of each course, whose values are dicts with the keys:

    * ``'link'`` - the ``href`` of the course-syllabus link (site-relative),
    * ``'lp'``   - list of 1-based study-period indexes with non-zero credits,
    * ``'år'``   - which of the two year files list the course: ``[1]``,
      ``[2]`` or ``[1, 2]``.

    Args:
        YEAR1_FILE: path to the saved HTML of the year-1 overview page.
        YEAR2_FILE: path to the saved HTML of the year-2 overview page.
        loadfile: optional path to a JSON file written by :meth:`save`. When
            given, the catalogue is loaded from it and neither HTML file is
            read nor any HTTP request made.
    """

    _BASE_URL = r'https://www.chalmers.se'
    _SYLLABUS_HREF = re.compile(r'/en/education/your-studies/find-course-and-programme-syllabi/course-syllabus/')
    # Accepts the same texts as r'^\d+\.\d+\sc|\d+\sc$', but captures the whole
    # number (group 1 or group 2) so it can be compared against zero.
    _CREDITS = re.compile(r'^(\d+\.\d+)\sc|(\d+(?:\.\d+)?)\sc$')
    # Seconds to wait for each syllabus page; without it requests.get can block forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, YEAR1_FILE, YEAR2_FILE, loadfile=None):
        if loadfile is not None:
            self.load(loadfile)
            return
        self.YEAR1_FILE = YEAR1_FILE
        self.YEAR2_FILE = YEAR2_FILE
        self.html_content1 = self._read_html(self.YEAR1_FILE)
        self.html_content2 = self._read_html(self.YEAR2_FILE)
        self.soup1 = BeautifulSoup(self.html_content1, 'html.parser')
        self.soup2 = BeautifulSoup(self.html_content2, 'html.parser')
        self.links1 = self.soup1.find_all('a', href=self._SYLLABUS_HREF)
        self.links2 = self.soup2.find_all('a', href=self._SYLLABUS_HREF)
        self.courses_information1 = {link.get_text(): {'link': link['href']} for link in self.links1}
        self.courses_information2 = {link.get_text(): {'link': link['href']} for link in self.links2}
        self.courses_information = self.courses_information1 | self.courses_information2
        # One HTTP request per distinct course to fetch its syllabus page.
        self.html_contents_lps = {
            course_name: requests.get(self._BASE_URL + info['link'], timeout=self._REQUEST_TIMEOUT).text
            for course_name, info in self.courses_information.items()
        }
        for course_name, syllabus_html in self.html_contents_lps.items():
            self.courses_information[course_name]['lp'] = self._study_periods(syllabus_html)
        for course, info in self.courses_information.items():
            listed_in = ((1, self.courses_information1), (2, self.courses_information2))
            info['år'] = [year for year, courses in listed_in if course in courses]

    @staticmethod
    def _read_html(path):
        """Return the full, stripped text of ``path``.

        The whole file is read so that multi-line HTML, a trailing blank line
        or an empty file all produce a well-defined document.
        """
        with open(path, 'r') as FILE:
            return FILE.read().strip()

    @classmethod
    def _has_credits(cls, text):
        """Return True when ``text`` is a credit amount greater than zero.

        Amounts such as ``'10 c'`` or ``'30 c'`` count as non-zero; only a
        numeric value of 0 (``'0 c'``, ``'0.0 c'``) is rejected.
        """
        match = cls._CREDITS.search(text)
        if match is None:
            return False
        return float(match.group(1) or match.group(2)) > 0

    @classmethod
    def _study_periods(cls, syllabus_html):
        """Return the 1-based positions of the credit cells holding credits.

        Raises:
            IndexError: if the page has no element with the expected class.
        """
        soup = BeautifulSoup(syllabus_html, 'html.parser')
        lps_nest = soup.find_all(class_=r'[&>*]:leading-7')[0]
        # The first cell and the last two are skipped.
        # NOTE(review): presumably a label and summary cells - confirm against the page.
        cells = lps_nest.find_all(class_='whitespace-nowrap')[1:-2]
        return [
            position
            for position, cell in enumerate(cells, start=1)
            if cls._has_credits(cell.get_text())
        ]

    def get_lp(self, lp):
        """Print year, study periods and name of every course given in period ``lp``."""
        for course, info in self.courses_information.items():
            if lp in info['lp']:
                print('år: ', info['år'], '\tlp: ', info['lp'], '\t', course)

    def save(self, SAVE_FILE):
        """Write ``courses_information`` to ``SAVE_FILE`` as UTF-8 JSON."""
        with open(SAVE_FILE, 'w', encoding='utf-8') as FILE:
            json.dump(self.courses_information, FILE)

    def load(self, LOAD_FILE):
        """Replace ``courses_information`` with the JSON content of ``LOAD_FILE``."""
        with open(LOAD_FILE, 'r', encoding='utf-8') as FILE:
            self.courses_information = json.load(FILE)

    def get_link(self, name):
        """Print the absolute syllabus URL of every course whose name contains ``name``.

        The match is a case-insensitive substring test.
        """
        needle = name.lower()
        for course, info in self.courses_information.items():
            if needle in course.lower():
                print(f'{course}: \t', self._BASE_URL + info['link'])

    def get_information(self, name):
        """Print year, study periods, name and URL of every course matching ``name``.

        The match is a case-insensitive substring test.
        """
        needle = name.lower()
        for course, info in self.courses_information.items():
            if needle in course.lower():
                print('år: ', info['år'], '\tlp: ', info['lp'], '\t', f'{course}: \t', self._BASE_URL + info['link'])

In [4]:
# One catalogue per programme. Because `loadfile` is supplied, each constructor
# loads the stored JSON and returns before reading the HTML files.
EM = scrape(
    EM_YEAR1_FILE,
    EM_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_em.json',
)
PHYS = scrape(
    PHYSICS_YEAR1_FILE,
    PHYSICS_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_phys.json',
)
CAS = scrape(
    CAS_YEAR1_FILE,
    CAS_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_cas.json',
)
DATA = scrape(
    DATA_YEAR1_FILE,
    DATA_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_data.json',
)
MOB = scrape(
    MOB_YEAR1_FILE,
    MOB_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_mob.json',
)
APP = scrape(
    APP_YEAR1_FILE,
    APP_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_app.json',
)
ALS = scrape(
    ALS_YEAR1_FILE,
    ALS_YEAR2_FILE,
    loadfile=r'.\Courses_information\course_information_als.json',
)

In [5]:
# List every course in the MOB catalogue that has credits in study period 1.
MOB.get_lp(lp=1)

år:  [1] 	lp:  [1] 	 Systems and mechatronics for mobility engineering 
år:  [1] 	lp:  [1] 	 Introduction to propulsion and energy systems for transport 
år:  [1, 2] 	lp:  [1, 2] 	 Marine design project 
år:  [1, 2] 	lp:  [1] 	 Aircraft design 
år:  [1, 2] 	lp:  [1, 2] 	 Automotive engineering project 
år:  [1, 2] 	lp:  [1] 	 Active safety 
år:  [1, 2] 	lp:  [1] 	 Fatigue and fracture 
år:  [1, 2] 	lp:  [1] 	 Introduction to data science and AI 
år:  [1, 2] 	lp:  [1] 	 Quality management 
år:  [2] 	lp:  [1] 	 Infrastructure and urban systems 
år:  [2] 	lp:  [1] 	 Design and analysis of experiments 
år:  [2] 	lp:  [1, 2] 	 Master's thesis in Architecture and civil engineering 
år:  [2] 	lp:  [1, 2] 	 Master's thesis in Electrical engineering 
år:  [2] 	lp:  [1, 2] 	 Master's thesis in Industrial and materials science 
år:  [2] 	lp:  [1, 2] 	 Master's thesis in Mechanics and maritime sciences 
år:  [2] 	lp:  [1, 2] 	 Master's thesis in Technology management and economics 
