Z uporabo knjižnic Selenium, BeautifulSoup in pandas bomo dobili podatke o predmetih na programih UNI in VSŠ. Zaenkrat imamo za vsak predmet njegov id, ime in semester, v katerem se izvaja. Naknadno bo dodan še letnik, v katerem je predmet na voljo.

In [72]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Table of timetable pages: one row per URL with its academic year
# ('url_year') and semester ('url_semester').
timetables = pd.read_csv('./timetables.csv')
# Filled by get_subjects with one [id, name, semester, year, program] entry
# per subject link.
subjects = []

# Meaning of the first three digits of a subject id:
# 632 - UNI
# 635 - master's
# 637 - VSS
# 638 - doctoral
# the remaining prefixes belong to other faculties

timetables.head(12)

Unnamed: 0,url,url_year,url_semester
0,https://urnik.fri.uni-lj.si/timetable/fri-2019...,19/20,2
1,https://urnik.fri.uni-lj.si/timetable/fri-2019...,19/20,1
2,https://urnik.fri.uni-lj.si/timetable/fri-2020...,20/21,2
3,https://urnik.fri.uni-lj.si/timetable/fri-2020...,20/21,1
4,https://urnik.fri.uni-lj.si/timetable/fri-2021...,21/22,2
5,https://urnik.fri.uni-lj.si/timetable/fri-2021...,21/22,1
6,https://urnik.fri.uni-lj.si/timetable/fri-2022...,22/23,2
7,https://urnik.fri.uni-lj.si/timetable/fri-2022...,22/23,1
8,https://urnik.fri.uni-lj.si/timetable/fri-2023...,23/24,2
9,https://urnik.fri.uni-lj.si/timetable/fri-2023...,23/24,1


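Za vsak predmet dobimo `subject_id`, `subject_name`, `subject_semester`, `subject_year` in `subject_program`. Podatke nato združimo po `subject_id` (nastaneta stolpca `year_count` in `years_offered`) in jih shranimo v vmesno datoteko `checkpoint/subjects_1.csv`.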

In [None]:
def get_subjects(url, subjects, sem, year):
    """Collect the UNI and VSS subjects listed on one timetable page.

    The page at ``url`` is rendered in Chrome, the table cell containing the
    "Subjects" heading is located and every link inside it is read. The link
    text is split at its last " (" into a name and an id part. Links whose id
    part starts with "632" (UNI) or "637" (VSS) are appended to ``subjects``;
    all other links are ignored.

    The id part is stored exactly as it follows " (" in the link text; nothing
    is trimmed from it here.

    Args:
        url: address of the timetable page to scan.
        subjects: list that receives one
            ``[subject_id, subject_name, sem, year, program]`` entry per match
            (modified in place).
        sem: semester value stored with every subject found on this page.
        year: year value stored with every subject found on this page.

    Raises:
        ValueError: if the page has no "Subjects" heading inside a table cell.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        # Close the browser even when loading fails, so repeated calls do not
        # leave Chrome processes running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    heading = soup.find('h2', string='Subjects')
    subjects_section = heading.find_parent('td') if heading is not None else None
    if subjects_section is None:
        raise ValueError(f"no 'Subjects' section found at {url}")

    # Subject-id prefix -> program label.
    programs = {"632": "UNI", "637": "VSS"}

    for a_tag in subjects_section.find_all('a'):
        subject_name, _, subject_id = a_tag.text.rpartition(" (")
        program = programs.get(subject_id[:3])
        if program is not None:
            subjects.append([subject_id, subject_name, sem, year, program])
            
# Visit every timetable link and collect the subjects listed on it.
for _, timetable in timetables.iterrows():
    get_subjects(
        timetable['url'],
        subjects,
        timetable['url_semester'],
        timetable['url_year'],
    )

# Turn the collected rows into a table with named columns.
subjects = pd.DataFrame(
    subjects,
    columns=[
        'subject_id',
        'subject_name',
        'subject_semester',
        'subject_year',
        'subject_program',
    ],
)

# Drop the last character of every id (the superfluous closing parenthesis).
subjects['subject_id'] = subjects['subject_id'].map(lambda raw_id: raw_id[:-1])

# One row per subject id: keep the first name, program and semester seen,
# count the collected rows and gather their years into a list.
subjects = (
    subjects
    .groupby('subject_id')
    .agg(
        subject_name=('subject_name', 'first'),
        subject_program=('subject_program', 'first'),
        subject_semester=('subject_semester', 'first'),
        year_count=('subject_year', 'count'),
        years_offered=('subject_year', list),
    )
    .reset_index()
)

# Intermediate checkpoint for the following steps.
subjects.to_csv("./checkpoint/subjects_1.csv", index=False)
subjects.head(20)

Unnamed: 0,subject_id,subject_name,subject_program,subject_semester,year_count,years_offered
0,63202,Osnove matematične analize,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
1,63203,Diskretne strukture,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
2,63204,Osnove digitalnih vezij,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
3,63205,Fizika,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
4,63207,Linearna algebra,UNI,2,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
5,63208,Osnove podatkovnih baz,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
6,63209,Računalniške komunikacije,UNI,2,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
7,63212,Arhitektura računalniških sistemov,UNI,2,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
8,63213,Verjetnost in statistika,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"
9,63214,Osnove umetne inteligence,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]"


Uvozimo podatke iz prejšnjega koraka in definiramo funkciji, ki ju bomo potrebovali.

In [None]:
import ast

subjects = pd.read_csv("./checkpoint/subjects_1.csv")
# The CSV holds every list of years as text (e.g. "['19/20', '20/21']"), so it
# has to be parsed back into a list. ast.literal_eval accepts only Python
# literals; eval would execute any expression that ends up in the file.
subjects['years_offered'] = subjects['years_offered'].apply(ast.literal_eval)

def get_subject_url(subject_id, year, sem, lectures_only, url_table=None):
    """Build the allocations URL of one subject for a given year and semester.

    Args:
        subject_id: identifier of the subject. It is converted with ``str()``,
            so both text ids and the integer ids produced by reading the
            checkpoint CSV are accepted.
        year: value looked up in the 'url_year' column.
        sem: value looked up in the 'url_semester' column.
        lectures_only: when equal to 1, the "type=P" query parameter is added
            (by the parameter name, this limits the page to lectures).
        url_table: DataFrame with the columns 'url', 'url_year' and
            'url_semester'. Defaults to the module-level ``timetables``.

    Returns:
        The URL as a string, or "" when no row matches ``year`` and ``sem``.
    """
    if url_table is None:
        url_table = timetables

    matches = url_table[(url_table['url_year'] == year) & (url_table['url_semester'] == sem)]
    if matches.empty:
        return ""

    # Only the first matching timetable is used.
    url = matches.iloc[0]['url'] + "/allocations?"
    if lectures_only == 1:
        url += "type=P&"
    return url + "subject=" + str(subject_id)


def get_codes(table, lectures_only):
    """Fill ``table['subject_codes']`` with the group codes of every subject.

    For each row the allocations page is built from 'subject_id', the last
    element of 'years_offered' and 'subject_semester'. The texts of all
    ``<a class="link-group">`` elements on that page are joined with ", " and
    written to the row's 'subject_codes' cell. The codes are later used to
    extract the year of study.

    Args:
        table: DataFrame with the columns 'subject_id', 'years_offered' (a
            list per row) and 'subject_semester'. Modified in place.
        lectures_only: passed on to ``get_subject_url``.
    """
    if table.empty:
        return

    # A single browser is reused for all rows and always closed at the end, so
    # the run does not leave one Chrome process per subject behind.
    driver = webdriver.Chrome()
    try:
        for index, subject in table.iterrows():
            url = get_subject_url(
                subject['subject_id'],
                subject['years_offered'][-1],
                subject['subject_semester'],
                lectures_only,
            )
            if not url:
                # No timetable matches this year/semester; record "no codes"
                # instead of asking the browser to open an empty address.
                table.at[index, 'subject_codes'] = ""
                continue

            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            codes = [a_tag.text for a_tag in soup.find_all('a', class_='link-group')]
            # rstrip also removes separators left behind by empty link texts.
            table.at[index, 'subject_codes'] = ", ".join(codes).rstrip(", ")
    finally:
        driver.quit()

Funkcijo `get_codes` pokličemo na vseh predmetih in upoštevamo samo predavanja. Nato izberemo predmete, ki imajo polje za kode prazno, in zanje funkcijo pokličemo še enkrat, tokrat tako, da upoštevamo tudi vaje.

In [75]:
# First pass: look only at lectures.
get_codes(subjects, 1)

# Subjects for which the first pass found no codes. .copy() makes this an
# independent frame, so get_codes can write into it without pandas warning
# about assigning to a slice of `subjects`.
# NOTE: the name shadows the built-in filter(); it is kept because the
# following cell refers to it.
filter = subjects[subjects['subject_codes'] == ""].copy()

# Second pass for those subjects: no restriction to lectures.
get_codes(filter, 0)

Pridobljene podatke združimo v en stolpec in shranimo v vmesno datoteko `subjects_2.csv`.

In [None]:
# Keep only the key and the codes found in the second pass.
filter = filter.loc[:, ['subject_id', 'subject_codes']]

# Left-join the second-pass codes onto all subjects, prefer the second-pass
# value ('_y') where one exists and fall back to the first-pass value ('_x'),
# then remove the two helper columns.
subjects_merge = (
    subjects
    .merge(filter, how='left', on='subject_id')
    .assign(
        subject_codes=lambda merged: merged['subject_codes_y'].combine_first(
            merged['subject_codes_x']
        )
    )
    .drop(columns=['subject_codes_x', 'subject_codes_y'])
)

# Intermediate checkpoint for the last step.
subjects_merge.to_csv("./checkpoint/subjects_2.csv", index=False)

V zadnjem koraku iz dobljenih kod ekstrahiramo podatke o tem, za študente katerega programa in letnika je predmet na voljo, in jih shranimo v končno datoteko `subjects_3.csv`.

In [None]:
# Load the intermediate checkpoint that contains the 'subject_codes' column.
subjects = pd.read_csv("./checkpoint/subjects_2.csv")

def extract_years(el):
    """Translate a subject's group codes into "<prefix>_<program>" labels.

    Every code is split on "_". When its second part is 'BUN-RI' or 'BVS-RI',
    the first part (presumably the year of study) is combined with 'UNI' or
    'VSS' respectively. Codes with any other second part, or without an
    underscore at all, are skipped.

    Args:
        el: list of code strings belonging to one subject.

    Returns:
        ``['ni vnosov']`` when ``el`` is exactly ``['nan']``, the list of
        labels when at least one code matched, otherwise ``None``.
    """
    if isinstance(el, list) and el == ['nan']:
        # These are elective subjects that have no entries in the timetable.
        return ['ni vnosov']

    programs = {'BUN-RI': 'UNI', 'BVS-RI': 'VSS'}
    new_list = []
    for code in el:
        parts = code.split('_')
        if len(parts) < 2:
            # No group part to inspect; indexing parts[1] would fail here.
            continue
        program = programs.get(parts[1])
        if program is not None:
            new_list.append(parts[0] + '_' + program)

    # Not a UNI or VSS subject; such rows are removed later.
    if not new_list:
        return None
    return new_list
    
# Convert the codes to strings for easier handling and split them into a list,
# discarding empty items left by a superfluous comma.
# Then extract the year-of-study labels and drop the now redundant column.
subjects['subject_codes'] = subjects['subject_codes'].astype(str)
subjects['subject_codes'] = subjects['subject_codes'].apply(lambda x: [item for item in x.split(', ') if item])
subjects['student_year'] = subjects['subject_codes'].apply(extract_years)
subjects = subjects.drop(columns=['subject_codes'])

# Rows for which extract_years returned nothing are removed. According to the
# original note, these subjects belong to the Multimedia or Administrative
# Informatics programme but have no code of their own.
subjects = subjects.dropna(subset=['student_year'])

# Remove duplicate labels. dict.fromkeys keeps the first-seen order, so the
# written file is the same on every run; the iteration order of a set of
# strings can differ between Python processes.
subjects['student_year'] = subjects['student_year'].apply(lambda x: list(dict.fromkeys(x)) if isinstance(x, list) else x)

subjects.to_csv("subjects_3.csv", index=False)