Z uporabo knjižnic Selenium, BeautifulSoup in pandas bomo dobili podatke o predmetih na programih UNI in VSŠ. Zaenkrat imamo za vsak predmet njegov id, ime in semester, v katerem se izvaja. Naknadno bo dodan še letnik, v katerem je predmet na voljo.

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd


def remove_last_element(string):
    """Return *string* with its final character dropped."""
    trimmed = string[:-1]
    return trimmed

def year_to_slash(year):
    """Return the academic year that starts in *year* as ``'YY/YY'``.

    Both halves are always two digits wide (e.g. 2018 -> '18/19',
    2008 -> '08/09'), so the result can be sliced by fixed positions.

    Args:
        year: Four-digit starting year, as an int or a numeric string.

    Returns:
        The two-digit start year and the following year joined by '/'.
    """
    start = int(year) % 100
    # Wrap at the century boundary so the second half stays two digits.
    end = (start + 1) % 100
    return f"{start:02d}/{end:02d}"

# Subject-id prefixes by study programme:
#   632 - UNI
#   635 - master's (MAG)
#   637 - VSS
#   638 - doctoral
# All other prefixes belong to other faculties.

subjects = []
url_base = 'https://urnik.fri.uni-lj.si/timetable/fri-'
url_end_1 = '-zimski'
url_end_2 = '-letni'

# One [winter_url, summer_url, 'YY/YY'] entry per academic year 2018/19-2024/25.
urls = [
    [
        url_base + f"{start}_{start + 1}" + url_end_1,
        url_base + f"{start}_{start + 1}" + url_end_2,
        year_to_slash(start),
    ]
    for start in range(2018, 2025)
]

def get_subjects_for_sem(url, subjects, sem, year):
    """Scrape one semester timetable page and collect its UNI/VSS subjects.

    Loads *url* in Chrome, locates the table cell that contains the
    "Subjects" heading and appends one
    ``[subject_id, subject_name, sem, year, program]`` row to *subjects*
    for every link whose id starts with 632 (UNI) or 637 (VSS). Links with
    any other prefix are skipped.

    Args:
        url: Address of the timetable page to load.
        subjects: List that is extended in place with the scraped rows.
        sem: Semester marker stored with each row.
        year: Academic-year label stored with each row.

    Raises:
        ValueError: If the page contains no "Subjects" section.
    """
    program_by_prefix = {"632": "UNI", "637": "VSS"}

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        page_html = driver.page_source
    finally:
        # Always close the browser; otherwise every call leaves a Chrome
        # process running.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')

    heading = soup.find('h2', string='Subjects')
    subjects_section = heading.find_parent('td') if heading is not None else None
    if subjects_section is None:
        raise ValueError(f"No 'Subjects' section found at {url}")

    for a_tag in subjects_section.find_all('a'):
        # The link text is split on its last " ("; subject_id therefore
        # still ends with whatever followed the id (the closing ")").
        subject_name, _, subject_id = a_tag.text.rpartition(" (")
        program = program_by_prefix.get(subject_id[:3])
        if program is not None:
            subjects.append([subject_id, subject_name, sem, year, program])
            
            
# Scrape both semester pages of every academic year: 1 for the first URL
# of each entry, 2 for the second.
for first_sem_url, second_sem_url, year_label in urls:
    for sem_number, sem_url in enumerate((first_sem_url, second_sem_url), start=1):
        get_subjects_for_sem(sem_url, subjects, sem_number, year_label)


# Turn the scraped rows into a table and drop the last character of every
# id (the closing parenthesis left over from the link text).
subject_columns = [
    'subject_id',
    'subject_name',
    'subject_semester',
    'subject_year',
    'subject_program',
]
subjects = pd.DataFrame(subjects, columns=subject_columns)
subjects['subject_id'] = subjects['subject_id'].map(remove_last_element)


# Collapse to one row per subject id: keep the first name, programme and
# semester seen, count the scraped rows and collect their year labels.
aggregations = {
    'subject_name': pd.NamedAgg(column='subject_name', aggfunc='first'),
    'subject_program': pd.NamedAgg(column='subject_program', aggfunc='first'),
    'subject_semester': pd.NamedAgg(column='subject_semester', aggfunc='first'),
    'year_count': pd.NamedAgg(column='subject_year', aggfunc='count'),
    'years_offered': pd.NamedAgg(column='subject_year', aggfunc=list),
}
per_subject = subjects.groupby('subject_id').agg(**aggregations)
subjects = per_subject.reset_index()


# Flag subjects that were offered in every scraped academic year. The
# expected count is taken from `urls` (one entry per year) so it cannot
# drift from the scraped range.
# NOTE(review): year_count counts scraped rows, so an id that appears on
# both semester pages of one year is counted twice - confirm ids are unique
# per year.
subjects['offered_every_year'] = subjects['year_count'] == len(urls)

# Persist the intermediate result for the next notebook cell.
subjects.to_csv('subjects_1.csv', index=False)

In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

import ast

# Reload the table written by the first cell.
subjects = pd.read_csv("./subjects_1.csv")
# years_offered is read back as the text form of a Python list;
# ast.literal_eval parses literals only, so unlike eval it cannot execute
# arbitrary code found in the file.
subjects['years_offered'] = subjects['years_offered'].apply(ast.literal_eval)

def make_subject_url(subject_id, year, sem):
    """Build the timetable allocations URL for one subject.

    Args:
        subject_id: Id placed in the ``subject`` query parameter.
        year: Academic-year label such as '23/24'; the first two characters
            and everything after the third are each prefixed with '20'.
        sem: 1 selects the '-zimski' timetable, anything else '-letni'.

    Returns:
        The complete URL as a string.
    """
    season = 'zimski' if sem == 1 else 'letni'
    first_half, second_half = year[:2], year[3:]
    # No "type=P" (lectures only) filter: labs are included as well because
    # not every subject has lectures.
    return (
        'https://urnik.fri.uni-lj.si/timetable/fri-'
        f'20{first_half}_20{second_half}'
        f'-{season}'
        f'/allocations?subject={subject_id!s}'
    )


# For every subject, open its allocations page for the last year in
# years_offered and store the texts of all 'link-group' links in the
# subject_codes column.
# NOTE(review): subject_semester comes from the first scraped row while the
# year is the last one - confirm subjects do not switch semesters.
driver = webdriver.Chrome()
try:
    # A single browser is reused for all pages and closed afterwards, rather
    # than starting a new Chrome per subject and leaving it open.
    for index, subject in subjects.iterrows():
        year = subject['years_offered'][-1]

        url = make_subject_url(subject['subject_id'], year, subject['subject_semester'])

        driver.get(url)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        subjects_link_group = soup.find_all('a', class_='link-group')

        # Every entry is followed by ", " (including the last one), which
        # is the format the later cells split on.
        string = "".join(a_tag.text + ", " for a_tag in subjects_link_group)

        # `index` already identifies the current row, so no lookup by
        # subject_id is needed.
        subjects.at[index, 'subject_codes'] = string
finally:
    driver.quit()

In [3]:
# Persist the table, now including the subject_codes column, for the next cell.
subjects.to_csv('subjects_2.csv', index=False)

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Reload the table written by the previous cell (includes subject_codes).
subjects = pd.read_csv("./subjects_2.csv")

def remove_last_element(string):
    """Return *string* without its last character; '' is returned as is."""
    return string if string == '' else string[:-1]

def strings_to_year(string):
    """Split a ', '-separated string into the list of its parts.

    NOTE(review): despite the name, no conversion to years happens here,
    and nothing in the visible code calls this function - confirm the
    intended behaviour.

    Args:
        string: Text whose entries are separated by ', '.

    Returns:
        The list of entries, in their original order.
    """
    return string.split(', ')

def extract_years(el):
    """Convert group codes into '<prefix>_UNI' / '<prefix>_VSS' labels.

    Each code is split on '_'. When its second part is 'BUN-RI' or
    'BVS-RI', the first part is kept and suffixed with '_UNI' or '_VSS'
    respectively. Codes with any other second part, or with no '_' at all,
    are ignored.

    Args:
        el: List of code strings, e.g. ['1_BUN-RI', '2_BVS-RI'].

    Returns:
        ['0'] when *el* is exactly ['nan'] (elective subjects that have no
        entries in the timetable), the list of labels when at least one
        code matched, otherwise None.
    """
    program_by_code = {'BUN-RI': 'UNI', 'BVS-RI': 'VSS'}

    if isinstance(el, list) and el == ['nan']:
        # These are elective subjects that have no entries in the timetable.
        return ['0']

    new_list = []
    for code in el:
        split_el = code.split('_')
        if len(split_el) < 2:
            # No '_' separator: nothing to classify, and indexing the
            # second part would raise IndexError.
            continue
        program = program_by_code.get(split_el[1])
        if program is not None:
            new_list.append(split_el[0] + '_' + program)

    return new_list or None
    
    
# Missing subject_codes values become the string 'nan' here, which
# extract_years maps to ['0'].
subjects['subject_codes'] = subjects['subject_codes'].astype(str)
# Split on ', ' and drop the empty item produced by the trailing separator.
subjects['subject_codes'] = subjects['subject_codes'].apply(lambda x: [item for item in x.split(', ') if item])
subjects['student_year'] = subjects['subject_codes'].apply(extract_years)

subjects = subjects.drop(columns=['subject_codes'])

# Subjects without an entry belong to the multimedia or administrative
# informatics programmes, which do not have a code of their own.
subjects = subjects.dropna(subset=['student_year'])
# Remove duplicate values; sorting keeps the order (and so the written CSV)
# identical from run to run, which a bare set does not guarantee.
subjects['student_year'] = subjects['student_year'].apply(lambda x: sorted(set(x)) if isinstance(x, list) else x)

In [5]:
# Persist the final table (student_year added, subject_codes removed).
subjects.to_csv('subjects_3.csv', index=False)