Z uporabo knjižnic Selenium, BeautifulSoup in pandas bomo dobili podatke o predmetih na programih UNI in VSŠ. Zaenkrat imamo za vsak predmet njegov id, ime in semester, v katerem se izvaja. Naknadno bo dodan še letnik, v katerem je predmet na voljo.

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Timetable index used by every later cell; the code reads its 'url',
# 'url_semester' and 'url_year' columns.
timetables = pd.read_csv('./timetables.csv')

In [2]:
def remove_last_element(string):
    """Return ``string`` with its final character dropped ('' stays '')."""
    trimmed = string[:-1]
    return trimmed

def year_to_slash(year):
    """Format a starting year as a two-digit academic-year label.

    Args:
        year: Starting calendar year as an int or numeric string,
            e.g. ``2019`` or ``"2019"``.

    Returns:
        A label such as ``"19/20"``. Both halves are zero-padded to two
        digits and wrap at the century, so 2008 gives ``"08/09"`` and
        2099 gives ``"99/00"``.
    """
    start = int(year) % 100
    end = (start + 1) % 100
    return f"{start:02d}/{end:02d}"

# Subject-id prefixes by study programme:
# 632 - uni
# 635 - mag
# 637 - vss
# 638 - doctoral
# the remaining prefixes belong to other faculties

# Accumulator for the scraped rows; get_subjects_for_sem appends to it and it
# is turned into a DataFrame further down in this cell.
subjects = []

def get_subjects_for_sem(url, subjects, sem, year):
    """Scrape one timetable page and append its UNI/VSS subjects to ``subjects``.

    The page is rendered in Chrome, the table cell that contains the
    "Subjects" heading is located, and every link inside it is parsed as
    ``"<name> (<id>)"``.

    Args:
        url: Address of the timetable page.
        subjects: List that is extended in place with
            ``[subject_id, subject_name, sem, year, program]`` rows.
        sem: Semester value stored with every row.
        year: Year value stored with every row.

    Raises:
        AttributeError: If the page has no ``<h2>`` with the text "Subjects".

    Note:
        ``subject_id`` still carries the closing ")" of the link text; the
        caller strips it afterwards.
    """
    # Only ids with these prefixes are kept; everything else is ignored.
    program_by_prefix = {"632": "UNI", "637": "VSS"}

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        # Close the browser even when loading fails, otherwise one Chrome
        # process is leaked per timetable.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    subjects_section = soup.find('h2', string='Subjects').find_parent('td')

    for a_tag in subjects_section.find_all('a'):
        # Split on the last " (" so names that contain parentheses stay intact.
        subject_name, _, subject_id = a_tag.text.rpartition(" (")
        program = program_by_prefix.get(subject_id[:3])
        if program is not None:
            subjects.append([subject_id, subject_name, sem, year, program])
            
            
# Scrape every timetable listed in timetables.csv; each call appends its rows
# to the shared `subjects` list.
for timetable in timetables.itertuples(index=False):
    get_subjects_for_sem(timetable.url, subjects, timetable.url_semester, timetable.url_year)


# Strip the closing ")" that the link-text split leaves on every id. The year
# values are stored exactly as they come from timetables.csv.
subjects = pd.DataFrame(
    subjects,
    columns=['subject_id', 'subject_name', 'subject_semester', 'subject_year', 'subject_program'],
)
subjects['subject_id'] = subjects['subject_id'].apply(remove_last_element)


# Aggregate by id: one row per subject, counting the timetable rows it appeared
# in and collecting the years of those rows.
subjects = subjects.groupby('subject_id').agg(
    subject_name=('subject_name', 'first'),
    subject_program=('subject_program', 'first'),
    subject_semester=('subject_semester', 'first'),
    year_count=('subject_year', 'count'),
    years_offered=('subject_year', list),
).reset_index()


# Flag subjects that were offered every year. A subject counts as "every year"
# when it has exactly 6 entries; the vectorised comparison also works when the
# table is empty.
subjects['offered_every_year'] = subjects['year_count'] == 6

subjects.to_csv('subjects_1.csv', index=False)

In [3]:
subjects.head(20)

Unnamed: 0,subject_id,subject_name,subject_program,subject_semester,year_count,years_offered,offered_every_year
0,63202,Osnove matematične analize,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
1,63203,Diskretne strukture,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
2,63204,Osnove digitalnih vezij,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
3,63205,Fizika,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
4,63207,Linearna algebra,UNI,2,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
5,63208,Osnove podatkovnih baz,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
6,63209,Računalniške komunikacije,UNI,2,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
7,63212,Arhitektura računalniških sistemov,UNI,2,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
8,63213,Verjetnost in statistika,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True
9,63214,Osnove umetne inteligence,UNI,1,6,"[19/20, 20/21, 21/22, 22/23, 23/24, 24/25]",True


In [None]:
import ast

subjects = pd.read_csv("./subjects_1.csv")
# years_offered was written as the text form of a Python list; parse it back
# with literal_eval, which accepts only literals and therefore cannot execute
# code that ends up in the CSV (unlike eval).
subjects['years_offered'] = subjects['years_offered'].apply(ast.literal_eval)

def get_subject_url(subject_id, year, sem, lectures_only):
    """Build the allocations URL of one subject.

    Args:
        subject_id: Subject identifier; may be a string or an integer (ids
            read back from CSV arrive as integers).
        year: Value matched against the ``url_year`` column of ``timetables``.
        sem: Value matched against the ``url_semester`` column of ``timetables``.
        lectures_only: When truthy, ``type=P`` is added to the query.

    Returns:
        ``<timetable url>/allocations?[type=P&]subject=<subject_id>``, using
        the first timetable row that matches ``year`` and ``sem``.

    Raises:
        IndexError: If no timetable row matches ``year`` and ``sem``.
    """
    matches = timetables[(timetables['url_year'] == year) & (timetables['url_semester'] == sem)]
    if matches.empty:
        raise IndexError(f"no timetable found for year {year!r}, semester {sem!r}")

    url = matches.iloc[0]['url'] + "/allocations?"
    if lectures_only:
        url += "type=P&"
    # Formatting (instead of "+") accepts integer ids as well as strings.
    return f"{url}subject={subject_id}"


def get_codes(table, lectures_only):
    """Fill ``table['subject_codes']`` with the group codes of every subject.

    For each row the allocations page of the subject is opened for the last
    entry of ``years_offered`` and the row's ``subject_semester``. The text of
    every ``<a class="link-group">`` element is collected and stored as one
    ", "-separated string.

    Args:
        table: DataFrame with ``subject_id``, ``years_offered`` and
            ``subject_semester`` columns; modified in place.
        lectures_only: Forwarded to ``get_subject_url``.

    Failures are reported with ``print`` and the affected row is left
    unchanged, so one broken page does not stop the whole run.
    """
    for index, subject in table.iterrows():
        # Stays None until a browser was actually started, so the cleanup
        # below never touches an unbound or already-closed driver.
        driver = None
        try:
            url = get_subject_url(subject['subject_id'], subject['years_offered'][-1], subject['subject_semester'], lectures_only)

            driver = webdriver.Chrome()
            driver.get(url)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            codes = [a_tag.text for a_tag in soup.find_all('a', class_='link-group')]

            table.at[index, 'subject_codes'] = ", ".join(codes).rstrip(", ")
        except Exception as e:
            print(f"Error processing subject {subject['subject_id']}: {e}")
        finally:
            if driver is not None:
                driver.quit()

In [5]:
# First pass: a truthy lectures_only makes get_subject_url add "type=P" to the query.
get_codes(subjects, 1)

print(subjects)

#filtered_subjects = subjects[subjects['subject_codes'] == ""]
#print(filtered_subjects)

# filter only the subjects whose codes are still empty
# get_codes(filtrirani, 0)

# function for fetching the codes; it should be given the table of subjects to fill in, plus the lectures_only parameter

    subject_id                              subject_name subject_program  \
0        63202                Osnove matematične analize             UNI   
1        63203                       Diskretne strukture             UNI   
2        63204                   Osnove digitalnih vezij             UNI   
3        63205                                    Fizika             UNI   
4        63207                          Linearna algebra             UNI   
..         ...                                       ...             ...   
112      63774                         Varnost programov             VSS   
113      63775                          Varnost podatkov             VSS   
114      63776  Programiranje energijsko omejenih naprav             VSS   
115      63779                  Človeški vidiki varnosti             VSS   
116      63780                          Varnost sistemov             VSS   

     subject_semester  year_count                               years_offered  \
0     

In [6]:
# Checkpoint the table after get_codes so the next cell can reload it from
# subjects_2.csv without repeating the scrape.
subjects.to_csv('subjects_2.csv', index=False)

In [None]:
# Reload the checkpoint written after get_codes.
# NOTE(review): rows without stored codes presumably come back as NaN; the
# code below converts the column with astype(str) before splitting it.
subjects = pd.read_csv("./subjects_2.csv")

def remove_last_element(string):
    """Return ``string`` without its final character; '' is returned as is."""
    return string if string == '' else string[:-1]

def strings_to_year(string):
    """Split a ", "-separated code string into a list of codes.

    Args:
        string: Text such as ``"1_BUN-RI, 2_BVS-RI"``.

    Returns:
        The pieces produced by ``string.split(', ')``. The original computed
        this list but never returned it.
    """
    return string.split(', ')

# Second "_"-separated field of a code -> programme suffix used in the output.
_PROGRAM_SUFFIX_BY_CODE = {'BUN-RI': 'UNI', 'BVS-RI': 'VSS'}


def extract_years(el):
    """Translate group codes into ``"<prefix>_UNI"`` / ``"<prefix>_VSS"`` labels.

    Each code is split on "_"; the first field is kept as the prefix
    (presumably the study year - confirm against the timetable site) and the
    second field selects the programme.

    Args:
        el: List of code strings, e.g. ``['1_BUN-RI', '2_BVS-RI']``.

    Returns:
        ``['0']`` when ``el`` is exactly ``['nan']`` (elective subjects that
        have no timetable entries), otherwise the list of translated labels
        in input order, or ``None`` when no code belongs to BUN-RI or BVS-RI.
    """
    if isinstance(el, list) and len(el) == 1 and el[0] == 'nan':
        return ['0']

    labels = []
    for code in el:
        fields = code.split('_')
        if len(fields) < 2:
            # A code without "_" carries no programme; skip it instead of
            # failing with IndexError and aborting the whole column.
            continue
        suffix = _PROGRAM_SUFFIX_BY_CODE.get(fields[1])
        if suffix is not None:
            labels.append(f"{fields[0]}_{suffix}")

    return labels if labels else None
    
    
# Split the ", "-separated codes of every subject into a list. astype(str)
# turns a missing value into the text 'nan', which extract_years maps to ['0'].
code_lists = subjects['subject_codes'].astype(str).apply(
    lambda text: [code for code in text.split(', ') if code]
)
subjects['student_year'] = code_lists.apply(extract_years)

subjects = subjects.drop(columns=['subject_codes'])

# Subjects left without a value belong to the multimedia or administrative
# informatics programmes, which have no code of their own.
# .copy() keeps the assignment below from writing into a slice of the old frame.
subjects = subjects.dropna(subset=['student_year']).copy()
# Remove duplicate labels; sorting makes the order, and with it the CSV,
# identical on every run (plain set order is not).
subjects['student_year'] = subjects['student_year'].apply(
    lambda labels: sorted(set(labels)) if isinstance(labels, list) else labels
)

In [5]:
# Final table: student_year has been added and subject_codes dropped.
subjects.to_csv('subjects_3.csv', index=False)